#include <asm/pmc.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
+#include <asm/ptrace.h>
struct cpu_hw_counters {
int n_counters;
int n_percpu;
int disabled;
int n_added;
+ int n_limited;
+ u8 pmcs_enabled;
struct perf_counter *counter[MAX_HWCOUNTERS];
- unsigned int events[MAX_HWCOUNTERS];
+ u64 events[MAX_HWCOUNTERS];
+ unsigned int flags[MAX_HWCOUNTERS];
u64 mmcr[3];
- u8 pmcs_enabled;
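+ /* counters currently on limited PMCs, and their hardware PMC numbers */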
+ struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
+ u8 limited_hwidx[MAX_LIMITED_HWCOUNTERS];
};
DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
*/
static unsigned int freeze_counters_kernel = MMCR0_FCS;
+static void perf_counter_interrupt(struct pt_regs *regs);
+
void perf_counter_print_debug(void)
{
}
* and see if any combination of alternative codes is feasible.
* The feasible set is returned in event[].
*/
-static int power_check_constraints(unsigned int event[], int n_ev)
+static int power_check_constraints(u64 event[], unsigned int cflags[],
+ int n_ev)
{
u64 mask, value, nv;
- unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+ u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
/* First see if the events will go on as-is */
for (i = 0; i < n_ev; ++i) {
- alternatives[i][0] = event[i];
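+ /*
+ * This counter has to go on a limited PMC, but its event
+ * code isn't a limited-PMC event; ask get_alternatives
+ * (which sees the REQD flag) for a substitute code and
+ * take the first one.
+ */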
+ if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
+ && !ppmu->limited_pmc_event(event[i])) {
+ ppmu->get_alternatives(event[i], cflags[i],
+ alternatives[i]);
+ event[i] = alternatives[i][0];
+ }
if (ppmu->get_constraint(event[i], &amasks[i][0],
&avalues[i][0]))
return -1;
- choice[i] = 0;
}
value = mask = 0;
for (i = 0; i < n_ev; ++i) {
if (!ppmu->get_alternatives)
return -1;
for (i = 0; i < n_ev; ++i) {
- n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
+ choice[i] = 0;
+ n_alt[i] = ppmu->get_alternatives(event[i], cflags[i],
+ alternatives[i]);
for (j = 1; j < n_alt[i]; ++j)
ppmu->get_constraint(alternatives[i][j],
&amasks[i][j], &avalues[i][j]);
* exclude_{user,kernel,hv} with each other and any previously
* added counters.
*/
-static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
+static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
+ int n_prev, int n_new)
{
- int eu, ek, eh;
- int i, n;
+ int eu = 0, ek = 0, eh = 0;
+ int i, n, first;
struct perf_counter *counter;
n = n_prev + n_new;
if (n <= 1)
return 0;
- eu = ctrs[0]->hw_event.exclude_user;
- ek = ctrs[0]->hw_event.exclude_kernel;
- eh = ctrs[0]->hw_event.exclude_hv;
- if (n_prev == 0)
- n_prev = 1;
- for (i = n_prev; i < n; ++i) {
+ first = 1;
+ for (i = 0; i < n; ++i) {
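+ /*
+ * Counters that can go on a limited PMC don't set any
+ * exclude bits, so they don't constrain the check; clear
+ * REQD for now, it is set again below if needed.
+ */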
+ if (cflags[i] & PPMU_LIMITED_PMC_OK) {
+ cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
+ continue;
+ }
counter = ctrs[i];
- if (counter->hw_event.exclude_user != eu ||
- counter->hw_event.exclude_kernel != ek ||
- counter->hw_event.exclude_hv != eh)
+ if (first) {
+ eu = counter->attr.exclude_user;
+ ek = counter->attr.exclude_kernel;
+ eh = counter->attr.exclude_hv;
+ first = 0;
+ } else if (counter->attr.exclude_user != eu ||
+ counter->attr.exclude_kernel != ek ||
+ counter->attr.exclude_hv != eh) {
return -EAGAIN;
+ }
}
+
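+ /*
+ * If any exclude bits are in use, the counters that can go
+ * on a limited PMC must actually use one, because limited
+ * PMCs don't respect the freeze bits that implement the
+ * excludes.
+ */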
+ if (eu || ek || eh)
+ for (i = 0; i < n; ++i)
+ if (cflags[i] & PPMU_LIMITED_PMC_OK)
+ cflags[i] |= PPMU_LIMITED_PMC_REQD;
+
return 0;
}
-static void power_perf_read(struct perf_counter *counter)
+static void power_pmu_read(struct perf_counter *counter)
{
long val, delta, prev;
atomic64_sub(delta, &counter->hw.period_left);
}
+/*
+ * On some machines, PMC5 and PMC6 can't be written, don't respect
+ * the freeze conditions, and don't generate interrupts. This tells
+ * us whether `pmcnum' refers to such a PMC.
+ */
+static int is_limited_pmc(int pmcnum)
+{
+ return (ppmu->flags & PPMU_LIMITED_PMC5_6)
+ && (pmcnum == 5 || pmcnum == 6);
+}
+
+static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
+ unsigned long pmc5, unsigned long pmc6)
+{
+ struct perf_counter *counter;
+ u64 val, prev, delta;
+ int i;
+
+ for (i = 0; i < cpuhw->n_limited; ++i) {
+ counter = cpuhw->limited_counter[i];
+ if (!counter->hw.idx)
+ continue;
+ val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+ prev = atomic64_read(&counter->hw.prev_count);
+ counter->hw.idx = 0;
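+ /* the PMCs are only 32 bits wide, handle wraparound */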
+ delta = (val - prev) & 0xfffffffful;
+ atomic64_add(delta, &counter->count);
+ }
+}
+
+static void thaw_limited_counters(struct cpu_hw_counters *cpuhw,
+ unsigned long pmc5, unsigned long pmc6)
+{
+ struct perf_counter *counter;
+ u64 val;
+ int i;
+
+ for (i = 0; i < cpuhw->n_limited; ++i) {
+ counter = cpuhw->limited_counter[i];
+ counter->hw.idx = cpuhw->limited_hwidx[i];
+ val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+ atomic64_set(&counter->hw.prev_count, val);
+ perf_counter_update_userpage(counter);
+ }
+}
+
+/*
+ * Since limited counters don't respect the freeze conditions, we
+ * have to read them immediately after freezing or unfreezing the
+ * other counters. We try to keep the values from the limited
+ * counters as consistent with the others as possible by keeping
+ * the delay (in cycles and instructions) between freezing/unfreezing
+ * and reading the limited counters as small and uniform as possible.
+ * Therefore, if any limited counters are in use, we read them
+ * both, and always in the same order, to minimize variability,
+ * and do it inside the same asm that writes MMCR0.
+ */
+static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
+{
+ unsigned long pmc5, pmc6;
+
+ if (!cpuhw->n_limited) {
+ mtspr(SPRN_MMCR0, mmcr0);
+ return;
+ }
+
+ /*
+ * Write MMCR0, then read PMC5 and PMC6 immediately.
+ */
+ asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
+ : "=&r" (pmc5), "=&r" (pmc6)
+ : "r" (mmcr0), "i" (SPRN_MMCR0),
+ "i" (SPRN_PMC5), "i" (SPRN_PMC6));
+
+ if (mmcr0 & MMCR0_FC)
+ freeze_limited_counters(cpuhw, pmc5, pmc6);
+ else
+ thaw_limited_counters(cpuhw, pmc5, pmc6);
+}
+
/*
* Disable all counters to prevent PMU interrupts and to allow
* counters to be added or removed.
*/
-u64 hw_perf_save_disable(void)
+void hw_perf_disable(void)
{
struct cpu_hw_counters *cpuhw;
unsigned long ret;
cpuhw->pmcs_enabled = 1;
}
+ /*
+ * Disable instruction sampling if it was enabled
+ */
+ if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+ mtspr(SPRN_MMCRA,
+ cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+ mb();
+ }
+
/*
* Set the 'freeze counters' bit.
* The barrier is to make sure the mtspr has been
* executed and the PMU has frozen the counters
* before we return.
*/
- mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
+ write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
mb();
}
local_irq_restore(flags);
- return ret;
}
/*
* If we were previously disabled and counters were added, then
* put the new config on the PMU.
*/
-void hw_perf_restore(u64 disable)
+void hw_perf_enable(void)
{
struct perf_counter *counter;
struct cpu_hw_counters *cpuhw;
unsigned long val;
s64 left;
unsigned int hwc_index[MAX_HWCOUNTERS];
+ int n_lim;
+ int idx;
- if (disable)
- return;
local_irq_save(flags);
cpuhw = &__get_cpu_var(cpu_hw_counters);
+ if (!cpuhw->disabled) {
+ local_irq_restore(flags);
+ return;
+ }
cpuhw->disabled = 0;
/*
* (possibly updated for removal of counters).
*/
if (!cpuhw->n_added) {
- mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+ mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
- mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
if (cpuhw->n_counters == 0)
get_lppaca()->pmcregs_in_use = 0;
- goto out;
+ goto out_enable;
}
/*
/*
* Add in MMCR0 freeze bits corresponding to the
- * hw_event.exclude_* bits for the first counter.
+ * attr.exclude_* bits for the first counter.
* We have already checked that all counters have the
* same values for these bits as the first counter.
*/
counter = cpuhw->counter[0];
- if (counter->hw_event.exclude_user)
+ if (counter->attr.exclude_user)
cpuhw->mmcr[0] |= MMCR0_FCP;
- if (counter->hw_event.exclude_kernel)
+ if (counter->attr.exclude_kernel)
cpuhw->mmcr[0] |= freeze_counters_kernel;
- if (counter->hw_event.exclude_hv)
+ if (counter->attr.exclude_hv)
cpuhw->mmcr[0] |= MMCR0_FCHV;
/*
* Then unfreeze the counters.
*/
get_lppaca()->pmcregs_in_use = 1;
- mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+ mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
| MMCR0_FC);
for (i = 0; i < cpuhw->n_counters; ++i) {
counter = cpuhw->counter[i];
if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
- power_perf_read(counter);
+ power_pmu_read(counter);
write_pmc(counter->hw.idx, 0);
counter->hw.idx = 0;
}
/*
* Initialize the PMCs for all the new and moved counters.
*/
+ cpuhw->n_limited = n_lim = 0;
for (i = 0; i < cpuhw->n_counters; ++i) {
counter = cpuhw->counter[i];
if (counter->hw.idx)
continue;
+ idx = hwc_index[i] + 1;
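+ /*
+ * Counters that go on a limited PMC are only recorded
+ * here; they get started later by write_mmcr0() via
+ * thaw_limited_counters().
+ */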
+ if (is_limited_pmc(idx)) {
+ cpuhw->limited_counter[n_lim] = counter;
+ cpuhw->limited_hwidx[n_lim] = idx;
+ ++n_lim;
+ continue;
+ }
val = 0;
- if (counter->hw_event.irq_period) {
+ if (counter->hw.sample_period) {
left = atomic64_read(&counter->hw.period_left);
if (left < 0x80000000L)
val = 0x80000000L - left;
}
atomic64_set(&counter->hw.prev_count, val);
- counter->hw.idx = hwc_index[i] + 1;
- write_pmc(counter->hw.idx, val);
+ counter->hw.idx = idx;
+ write_pmc(idx, val);
+ perf_counter_update_userpage(counter);
}
- mb();
+ cpuhw->n_limited = n_lim;
cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
- mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+
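+ /*
+ * Both the fast path above (no new counters added) and the
+ * full reconfiguration end up here to write MMCR0 and
+ * unfreeze the counters.
+ */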
+ out_enable:
+ mb();
+ write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+
+ /*
+ * Enable instruction sampling if necessary
+ */
+ if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+ mb();
+ mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+ }
out:
local_irq_restore(flags);
}
static int collect_events(struct perf_counter *group, int max_count,
- struct perf_counter *ctrs[], unsigned int *events)
+ struct perf_counter *ctrs[], u64 *events,
+ unsigned int *flags)
{
int n = 0;
struct perf_counter *counter;
if (n >= max_count)
return -1;
ctrs[n] = group;
+ flags[n] = group->hw.counter_base;
events[n++] = group->hw.config;
}
list_for_each_entry(counter, &group->sibling_list, list_entry) {
if (n >= max_count)
return -1;
ctrs[n] = counter;
+ flags[n] = counter->hw.counter_base;
events[n++] = counter->hw.config;
}
}
{
counter->state = PERF_COUNTER_STATE_ACTIVE;
counter->oncpu = cpu;
+ counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
if (is_software_counter(counter))
- counter->hw_ops->enable(counter);
+ counter->pmu->enable(counter);
}
/*
cpuhw = &__get_cpu_var(cpu_hw_counters);
n0 = cpuhw->n_counters;
n = collect_events(group_leader, ppmu->n_counter - n0,
- &cpuhw->counter[n0], &cpuhw->events[n0]);
+ &cpuhw->counter[n0], &cpuhw->events[n0],
+ &cpuhw->flags[n0]);
if (n < 0)
return -EAGAIN;
- if (check_excludes(cpuhw->counter, n0, n))
+ if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n))
return -EAGAIN;
- if (power_check_constraints(cpuhw->events, n + n0))
+ i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0);
+ if (i < 0)
return -EAGAIN;
cpuhw->n_counters = n0 + n;
cpuhw->n_added += n;
/*
* Add a counter to the PMU.
* If all counters are not already frozen, then we disable and
- * re-enable the PMU in order to get hw_perf_restore to do the
+ * re-enable the PMU in order to get hw_perf_enable to do the
* actual work of reconfiguring the PMU.
*/
-static int power_perf_enable(struct perf_counter *counter)
+static int power_pmu_enable(struct perf_counter *counter)
{
struct cpu_hw_counters *cpuhw;
unsigned long flags;
- u64 pmudis;
int n0;
int ret = -EAGAIN;
local_irq_save(flags);
- pmudis = hw_perf_save_disable();
+ perf_disable();
/*
* Add the counter to the list (if there is room)
goto out;
cpuhw->counter[n0] = counter;
cpuhw->events[n0] = counter->hw.config;
- if (check_excludes(cpuhw->counter, n0, 1))
+ cpuhw->flags[n0] = counter->hw.counter_base;
+ if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1))
goto out;
- if (power_check_constraints(cpuhw->events, n0 + 1))
+ if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1))
goto out;
counter->hw.config = cpuhw->events[n0];
ret = 0;
out:
- hw_perf_restore(pmudis);
+ perf_enable();
local_irq_restore(flags);
return ret;
}
/*
* Remove a counter from the PMU.
*/
-static void power_perf_disable(struct perf_counter *counter)
+static void power_pmu_disable(struct perf_counter *counter)
{
struct cpu_hw_counters *cpuhw;
long i;
- u64 pmudis;
unsigned long flags;
local_irq_save(flags);
- pmudis = hw_perf_save_disable();
+ perf_disable();
- power_perf_read(counter);
+ power_pmu_read(counter);
cpuhw = &__get_cpu_var(cpu_hw_counters);
for (i = 0; i < cpuhw->n_counters; ++i) {
cpuhw->counter[i-1] = cpuhw->counter[i];
--cpuhw->n_counters;
ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
- write_pmc(counter->hw.idx, 0);
- counter->hw.idx = 0;
+ if (counter->hw.idx) {
+ write_pmc(counter->hw.idx, 0);
+ counter->hw.idx = 0;
+ }
+ perf_counter_update_userpage(counter);
+ break;
+ }
+ }
+ for (i = 0; i < cpuhw->n_limited; ++i)
+ if (counter == cpuhw->limited_counter[i])
break;
+ if (i < cpuhw->n_limited) {
+ while (++i < cpuhw->n_limited) {
+ cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
+ cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
}
+ --cpuhw->n_limited;
}
if (cpuhw->n_counters == 0) {
/* disable exceptions if no counters are running */
cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
}
- hw_perf_restore(pmudis);
+ perf_enable();
+ local_irq_restore(flags);
+}
+
+/*
+ * Re-enable interrupts on a counter after they were throttled
+ * because they were coming too fast.
+ */
+static void power_pmu_unthrottle(struct perf_counter *counter)
+{
+ s64 val, left;
+ unsigned long flags;
+
+ if (!counter->hw.idx || !counter->hw.sample_period)
+ return;
+ local_irq_save(flags);
+ perf_disable();
+ power_pmu_read(counter);
+ left = counter->hw.sample_period;
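+ /*
+ * Reprogram the PMC so that it goes negative, and hence
+ * interrupts again, after another full sample period; for
+ * periods of 2^31 or more, count up from 0.
+ */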
+ val = 0;
+ if (left < 0x80000000L)
+ val = 0x80000000L - left;
+ write_pmc(counter->hw.idx, val);
+ atomic64_set(&counter->hw.prev_count, val);
+ atomic64_set(&counter->hw.period_left, left);
+ perf_counter_update_userpage(counter);
+ perf_enable();
local_irq_restore(flags);
}
-struct hw_perf_counter_ops power_perf_ops = {
- .enable = power_perf_enable,
- .disable = power_perf_disable,
- .read = power_perf_read
+struct pmu power_pmu = {
+ .enable = power_pmu_enable,
+ .disable = power_pmu_disable,
+ .read = power_pmu_read,
+ .unthrottle = power_pmu_unthrottle,
};
-const struct hw_perf_counter_ops *
-hw_perf_counter_init(struct perf_counter *counter)
+/*
+ * Return 1 if we might be able to put counter on a limited PMC,
+ * or 0 if not.
+ * A counter can only go on a limited PMC if it counts something
+ * that a limited PMC can count, doesn't require interrupts, and
+ * doesn't exclude any processor mode.
+ */
+static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
+ unsigned int flags)
+{
+ int n;
+ u64 alt[MAX_EVENT_ALTERNATIVES];
+
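+ /*
+ * Limited PMCs don't generate interrupts and don't respect
+ * the freeze conditions, so they can't be used for sampling
+ * or for counters that exclude any processor mode.
+ */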
+ if (counter->attr.exclude_user
+ || counter->attr.exclude_kernel
+ || counter->attr.exclude_hv
+ || counter->attr.sample_period)
+ return 0;
+
+ if (ppmu->limited_pmc_event(ev))
+ return 1;
+
+ /*
+ * The requested event code isn't itself a limited-PMC event;
+ * see if any alternative code goes on a limited PMC.
+ */
+ if (!ppmu->get_alternatives)
+ return 0;
+
+ flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
+ n = ppmu->get_alternatives(ev, flags, alt);
+
+ return n > 0;
+}
+
+/*
+ * Find an alternative event that goes on a normal PMC, if possible,
+ * and return the event code, or 0 if there is no such alternative.
+ * (Note: event code 0 is "don't count" on all machines.)
+ */
+static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
{
- unsigned long ev;
+ u64 alt[MAX_EVENT_ALTERNATIVES];
+ int n;
+
+ flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
+ n = ppmu->get_alternatives(ev, flags, alt);
+ if (!n)
+ return 0;
+ return alt[0];
+}
+
+/* Number of perf_counters counting hardware events */
+static atomic_t num_counters;
+/* Used to avoid races in calling reserve/release_pmc_hardware */
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Release the PMU if this is the last perf_counter.
+ */
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
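+ /*
+ * Decrement num_counters without the mutex unless this is
+ * the last counter; for the last one, take the mutex so
+ * that release_pmc_hardware() can't race with a concurrent
+ * reserve_pmc_hardware().
+ */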
+ if (!atomic_add_unless(&num_counters, -1, 1)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_dec_return(&num_counters) == 0)
+ release_pmc_hardware();
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+}
+
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+ u64 ev;
+ unsigned long flags;
struct perf_counter *ctrs[MAX_HWCOUNTERS];
- unsigned int events[MAX_HWCOUNTERS];
+ u64 events[MAX_HWCOUNTERS];
+ unsigned int cflags[MAX_HWCOUNTERS];
int n;
+ int err;
if (!ppmu)
- return NULL;
- if ((s64)counter->hw_event.irq_period < 0)
- return NULL;
- ev = counter->hw_event.type;
- if (!counter->hw_event.raw) {
- if (ev >= ppmu->n_generic ||
- ppmu->generic_events[ev] == 0)
- return NULL;
+ return ERR_PTR(-ENXIO);
+ if (!perf_event_raw(&counter->attr)) {
+ ev = perf_event_id(&counter->attr);
+ if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
+ return ERR_PTR(-EOPNOTSUPP);
ev = ppmu->generic_events[ev];
+ } else {
+ ev = perf_event_config(&counter->attr);
}
counter->hw.config_base = ev;
counter->hw.idx = 0;
* the user set it to.
*/
if (!firmware_has_feature(FW_FEATURE_LPAR))
- counter->hw_event.exclude_hv = 0;
-
+ counter->attr.exclude_hv = 0;
+
+ /*
+ * If this is a per-task counter, then we can use
+ * PM_RUN_* events interchangeably with their non-RUN_*
+ * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
+ * XXX we should check if the task is an idle task.
+ */
+ flags = 0;
+ if (counter->ctx->task)
+ flags |= PPMU_ONLY_COUNT_RUN;
+
+ /*
+ * If this machine has limited counters, check whether this
+ * event could go on a limited counter.
+ */
+ if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
+ if (can_go_on_limited_pmc(counter, ev, flags)) {
+ flags |= PPMU_LIMITED_PMC_OK;
+ } else if (ppmu->limited_pmc_event(ev)) {
+ /*
+ * The requested event can only go on a limited
+ * PMC, but we can't use one here; see if any
+ * alternative goes on a normal PMC.
+ */
+ ev = normal_pmc_alternative(ev, flags);
+ if (!ev)
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
/*
* If this is in a group, check if it can go on with all the
* other hardware counters in the group. We assume the counter
n = 0;
if (counter->group_leader != counter) {
n = collect_events(counter->group_leader, ppmu->n_counter - 1,
- ctrs, events);
+ ctrs, events, cflags);
if (n < 0)
- return NULL;
+ return ERR_PTR(-EINVAL);
}
events[n] = ev;
ctrs[n] = counter;
- if (check_excludes(ctrs, n, 1))
- return NULL;
- if (power_check_constraints(events, n + 1))
- return NULL;
+ cflags[n] = flags;
+ if (check_excludes(ctrs, cflags, n, 1))
+ return ERR_PTR(-EINVAL);
+ if (power_check_constraints(events, cflags, n + 1))
+ return ERR_PTR(-EINVAL);
counter->hw.config = events[n];
- atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
- return &power_perf_ops;
-}
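+ /*
+ * Stash the constraint flags in hw.counter_base so that
+ * collect_events() can find them later.
+ */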
+ counter->hw.counter_base = cflags[n];
+ atomic64_set(&counter->hw.period_left, counter->hw.sample_period);
-/*
- * Handle wakeups.
- */
-void perf_counter_do_pending(void)
-{
- int i;
- struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
- struct perf_counter *counter;
-
- for (i = 0; i < cpuhw->n_counters; ++i) {
- counter = cpuhw->counter[i];
- if (counter && counter->wakeup_pending) {
- counter->wakeup_pending = 0;
- wake_up(&counter->waitq);
- }
+ /*
+ * See if we need to reserve the PMU.
+ * If no counters are currently in use, then we have to take a
+ * mutex to ensure that we don't race with another task doing
+ * reserve_pmc_hardware or release_pmc_hardware.
+ */
+ err = 0;
+ if (!atomic_inc_not_zero(&num_counters)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_read(&num_counters) == 0 &&
+ reserve_pmc_hardware(perf_counter_interrupt))
+ err = -EBUSY;
+ else
+ atomic_inc(&num_counters);
+ mutex_unlock(&pmc_reserve_mutex);
}
-}
-
-/*
- * Record data for an irq counter.
- * This function was lifted from the x86 code; maybe it should
- * go in the core?
- */
-static void perf_store_irq_data(struct perf_counter *counter, u64 data)
-{
- struct perf_data *irqdata = counter->irqdata;
+ counter->destroy = hw_perf_counter_destroy;
- if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
- irqdata->overrun++;
- } else {
- u64 *p = (u64 *) &irqdata->data[irqdata->len];
-
- *p = data;
- irqdata->len += sizeof(u64);
- }
-}
-
-/*
- * Record all the values of the counters in a group
- */
-static void perf_handle_group(struct perf_counter *counter)
-{
- struct perf_counter *leader, *sub;
-
- leader = counter->group_leader;
- list_for_each_entry(sub, &leader->sibling_list, list_entry) {
- if (sub != counter)
- sub->hw_ops->read(sub);
- perf_store_irq_data(counter, sub->hw_event.type);
- perf_store_irq_data(counter, atomic64_read(&sub->count));
- }
+ if (err)
+ return ERR_PTR(err);
+ return &power_pmu;
}
/*
* here so there is no possibility of being interrupted.
*/
static void record_and_restart(struct perf_counter *counter, long val,
- struct pt_regs *regs)
+ struct pt_regs *regs, int nmi)
{
+ u64 period = counter->hw.sample_period;
s64 prev, delta, left;
int record = 0;
+ u64 addr, mmcra, sdsync;
/* we don't have to worry about interrupts here */
prev = atomic64_read(&counter->hw.prev_count);
*/
val = 0;
left = atomic64_read(&counter->hw.period_left) - delta;
- if (counter->hw_event.irq_period) {
+ if (period) {
if (left <= 0) {
- left += counter->hw_event.irq_period;
+ left += period;
if (left <= 0)
- left = counter->hw_event.irq_period;
+ left = period;
record = 1;
}
if (left < 0x80000000L)
val = 0x80000000L - left;
}
- write_pmc(counter->hw.idx, val);
- atomic64_set(&counter->hw.prev_count, val);
- atomic64_set(&counter->hw.period_left, left);
/*
* Finally record data if requested.
*/
if (record) {
- switch (counter->hw_event.record_type) {
- case PERF_RECORD_SIMPLE:
- break;
- case PERF_RECORD_IRQ:
- perf_store_irq_data(counter, instruction_pointer(regs));
- counter->wakeup_pending = 1;
- break;
- case PERF_RECORD_GROUP:
- perf_handle_group(counter);
- counter->wakeup_pending = 1;
- break;
+ addr = 0;
+ if (counter->attr.record_type & PERF_RECORD_ADDR) {
+ /*
+ * The user wants a data address recorded.
+ * If we're not doing instruction sampling,
+ * give them the SDAR (sampled data address).
+ * If we are doing instruction sampling, then only
+ * give them the SDAR if it corresponds to the
+ * instruction pointed to by SIAR; this is indicated
+ * by the [POWER6_]MMCRA_SDSYNC bit in MMCRA.
+ */
+ mmcra = regs->dsisr;
+ sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
+ POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
+ if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
+ addr = mfspr(SPRN_SDAR);
+ }
+ if (perf_counter_overflow(counter, nmi, regs, addr)) {
+ /*
+ * Interrupts are coming too fast - throttle them
+ * by setting the counter to 0, so it will be
+ * at least 2^30 cycles until the next interrupt
+ * (assuming each counter counts at most 2 counts
+ * per cycle).
+ */
+ val = 0;
+ left = ~0ULL >> 1;
}
}
+
+ write_pmc(counter->hw.idx, val);
+ atomic64_set(&counter->hw.prev_count, val);
+ atomic64_set(&counter->hw.period_left, left);
+ perf_counter_update_userpage(counter);
+}
+
+/*
+ * Called from generic code to get the misc flags (i.e. processor mode)
+ * for an event.
+ */
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+ unsigned long mmcra;
+
+ if (TRAP(regs) != 0xf00) {
+ /* not a PMU interrupt */
+ return user_mode(regs) ? PERF_EVENT_MISC_USER :
+ PERF_EVENT_MISC_KERNEL;
+ }
+
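+ /* perf_counter_interrupt() saved MMCRA in regs->dsisr for us */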
+ mmcra = regs->dsisr;
+ if (ppmu->flags & PPMU_ALT_SIPR) {
+ if (mmcra & POWER6_MMCRA_SIHV)
+ return PERF_EVENT_MISC_HYPERVISOR;
+ return (mmcra & POWER6_MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+ PERF_EVENT_MISC_KERNEL;
+ }
+ if (mmcra & MMCRA_SIHV)
+ return PERF_EVENT_MISC_HYPERVISOR;
+ return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+ PERF_EVENT_MISC_KERNEL;
+}
+
+/*
+ * Called from generic code to get the instruction pointer
+ * for an event.
+ */
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+ unsigned long mmcra;
+ unsigned long ip;
+ unsigned long slot;
+
+ if (TRAP(regs) != 0xf00)
+ return regs->nip; /* not a PMU interrupt */
+
+ ip = mfspr(SPRN_SIAR);
+ mmcra = regs->dsisr;
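+ /*
+ * If instruction sampling was active, SIAR points to the
+ * start of the sampled group; use the MMCRA slot field to
+ * get the address of the instruction actually sampled.
+ */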
+ if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
+ slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
+ if (slot > 1)
+ ip += 4 * (slot - 1);
+ }
+ return ip;
}
/*
struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
struct perf_counter *counter;
long val;
- int need_wakeup = 0, found = 0;
+ int found = 0;
+ int nmi;
+
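+ /*
+ * The regular counters freeze themselves on an exception
+ * (MMCR0_FCECE), but the limited ones don't, so capture
+ * PMC5 and PMC6 as early as possible.
+ */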
+ if (cpuhw->n_limited)
+ freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
+ mfspr(SPRN_PMC6));
+
+ /*
+ * Overload regs->dsisr to store MMCRA so we only need to read it once.
+ */
+ regs->dsisr = mfspr(SPRN_MMCRA);
+
+ /*
+ * If interrupts were soft-disabled when this PMU interrupt
+ * occurred, treat it as an NMI.
+ */
+ nmi = !regs->softe;
+ if (nmi)
+ nmi_enter();
+ else
+ irq_enter();
for (i = 0; i < cpuhw->n_counters; ++i) {
counter = cpuhw->counter[i];
+ if (is_limited_pmc(counter->hw.idx))
+ continue;
val = read_pmc(counter->hw.idx);
if ((int)val < 0) {
/* counter has overflowed */
found = 1;
- record_and_restart(counter, val, regs);
- if (counter->wakeup_pending)
- need_wakeup = 1;
+ record_and_restart(counter, val, regs, nmi);
}
}
*/
if (!found) {
for (i = 0; i < ppmu->n_counter; ++i) {
+ if (is_limited_pmc(i + 1))
+ continue;
val = read_pmc(i + 1);
if ((int)val < 0)
write_pmc(i + 1, 0);
* XXX might want to use MSR.PM to keep the counters frozen until
* we get back out of this interrupt.
*/
- mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+ write_mmcr0(cpuhw, cpuhw->mmcr[0]);
- /*
- * If we need a wakeup, check whether interrupts were soft-enabled
- * when we took the interrupt. If they were, we can wake stuff up
- * immediately; otherwise we'll have to set a flag and do the
- * wakeup when interrupts get soft-enabled.
- */
- if (need_wakeup) {
- if (regs->softe) {
- irq_enter();
- perf_counter_do_pending();
- irq_exit();
- } else {
- set_perf_counter_pending();
- }
- }
+ if (nmi)
+ nmi_exit();
+ else
+ irq_exit();
}
void hw_perf_counter_setup(int cpu)
{
unsigned long pvr;
- if (reserve_pmc_hardware(perf_counter_interrupt)) {
- printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
- return -EBUSY;
- }
-
/* XXX should get this from cputable */
pvr = mfspr(SPRN_PVR);
switch (PVR_VER(pvr)) {