perf_counter: Rename perf_counter_hw_event => perf_counter_attr
[deliverable/linux.git] arch/powerpc/kernel/perf_counter.c
index 26f69dc7130e9e9088c601f39e8bfa292a2df398..ea54686cb7878dae52a72496d6eff89fdfe639cb 100644
 #include <asm/pmc.h>
 #include <asm/machdep.h>
 #include <asm/firmware.h>
+#include <asm/ptrace.h>
 
 struct cpu_hw_counters {
        int n_counters;
        int n_percpu;
        int disabled;
        int n_added;
+       int n_limited;
+       u8  pmcs_enabled;
        struct perf_counter *counter[MAX_HWCOUNTERS];
-       unsigned int events[MAX_HWCOUNTERS];
+       u64 events[MAX_HWCOUNTERS];
+       unsigned int flags[MAX_HWCOUNTERS];
        u64 mmcr[3];
-       u8 pmcs_enabled;
+       struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
+       u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
 };
 DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
@@ -41,6 +46,8 @@ struct power_pmu *ppmu;
  */
 static unsigned int freeze_counters_kernel = MMCR0_FCS;
 
+static void perf_counter_interrupt(struct pt_regs *regs);
+
 void perf_counter_print_debug(void)
 {
 }
@@ -125,10 +132,11 @@ static void write_pmc(int idx, unsigned long val)
  * and see if any combination of alternative codes is feasible.
  * The feasible set is returned in event[].
  */
-static int power_check_constraints(unsigned int event[], int n_ev)
+static int power_check_constraints(u64 event[], unsigned int cflags[],
+                                  int n_ev)
 {
        u64 mask, value, nv;
-       unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+       u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
        u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
        u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
        u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
@@ -142,11 +150,15 @@ static int power_check_constraints(unsigned int event[], int n_ev)
 
        /* First see if the events will go on as-is */
        for (i = 0; i < n_ev; ++i) {
-               alternatives[i][0] = event[i];
+               if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
+                   && !ppmu->limited_pmc_event(event[i])) {
+                       ppmu->get_alternatives(event[i], cflags[i],
+                                              alternatives[i]);
+                       event[i] = alternatives[i][0];
+               }
                if (ppmu->get_constraint(event[i], &amasks[i][0],
                                         &avalues[i][0]))
                        return -1;
-               choice[i] = 0;
        }
        value = mask = 0;
        for (i = 0; i < n_ev; ++i) {
@@ -164,7 +176,9 @@ static int power_check_constraints(unsigned int event[], int n_ev)
        if (!ppmu->get_alternatives)
                return -1;
        for (i = 0; i < n_ev; ++i) {
-               n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
+               choice[i] = 0;
+               n_alt[i] = ppmu->get_alternatives(event[i], cflags[i],
+                                                 alternatives[i]);
                for (j = 1; j < n_alt[i]; ++j)
                        ppmu->get_constraint(alternatives[i][j],
                                             &amasks[i][j], &avalues[i][j]);
@@ -229,32 +243,45 @@ static int power_check_constraints(unsigned int event[], int n_ev)
  * exclude_{user,kernel,hv} with each other and any previously
  * added counters.
  */
-static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
+static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
+                         int n_prev, int n_new)
 {
-       int eu, ek, eh;
-       int i, n;
+       int eu = 0, ek = 0, eh = 0;
+       int i, n, first;
        struct perf_counter *counter;
 
        n = n_prev + n_new;
        if (n <= 1)
                return 0;
 
-       eu = ctrs[0]->hw_event.exclude_user;
-       ek = ctrs[0]->hw_event.exclude_kernel;
-       eh = ctrs[0]->hw_event.exclude_hv;
-       if (n_prev == 0)
-               n_prev = 1;
-       for (i = n_prev; i < n; ++i) {
+       first = 1;
+       for (i = 0; i < n; ++i) {
+               if (cflags[i] & PPMU_LIMITED_PMC_OK) {
+                       cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
+                       continue;
+               }
                counter = ctrs[i];
-               if (counter->hw_event.exclude_user != eu ||
-                   counter->hw_event.exclude_kernel != ek ||
-                   counter->hw_event.exclude_hv != eh)
+               if (first) {
+                       eu = counter->attr.exclude_user;
+                       ek = counter->attr.exclude_kernel;
+                       eh = counter->attr.exclude_hv;
+                       first = 0;
+               } else if (counter->attr.exclude_user != eu ||
+                          counter->attr.exclude_kernel != ek ||
+                          counter->attr.exclude_hv != eh) {
                        return -EAGAIN;
+               }
        }
+
+       if (eu || ek || eh)
+               for (i = 0; i < n; ++i)
+                       if (cflags[i] & PPMU_LIMITED_PMC_OK)
+                               cflags[i] |= PPMU_LIMITED_PMC_REQD;
+
        return 0;
 }
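
An aside on the limited-PMC exemption above, not part of the patch: PMC5/PMC6 ignore the MMCR0 freeze bits, so a counter that can live on a limited PMC cannot conflict with anyone's exclude settings. Conversely, once any counter in the group excludes a mode the freeze bits are armed, and a no-exclude counter placed on a normal PMC would be wrongly frozen along with the rest; that is what PPMU_LIMITED_PMC_REQD records. A sketch with hypothetical event names:

        /*
         * Group: cycles - no excludes, limited-PMC capable
         *        instr  - attr.exclude_kernel = 1
         * check_excludes() sets PPMU_LIMITED_PMC_REQD on "cycles":
         * MMCR0 carries freeze_counters_kernel for the group, and only
         * a limited PMC, which ignores that bit, lets "cycles" keep
         * counting in kernel mode as requested.
         */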
 
-static void power_perf_read(struct perf_counter *counter)
+static void power_pmu_read(struct perf_counter *counter)
 {
        long val, delta, prev;
 
@@ -277,11 +304,91 @@ static void power_perf_read(struct perf_counter *counter)
        atomic64_sub(delta, &counter->hw.period_left);
 }
 
+/*
+ * On some machines, PMC5 and PMC6 can't be written, don't respect
+ * the freeze conditions, and don't generate interrupts.  This tells
+ * us if `counter' is using such a PMC.
+ */
+static int is_limited_pmc(int pmcnum)
+{
+       return (ppmu->flags & PPMU_LIMITED_PMC5_6)
+               && (pmcnum == 5 || pmcnum == 6);
+}
+
+static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
+                                   unsigned long pmc5, unsigned long pmc6)
+{
+       struct perf_counter *counter;
+       u64 val, prev, delta;
+       int i;
+
+       for (i = 0; i < cpuhw->n_limited; ++i) {
+               counter = cpuhw->limited_counter[i];
+               if (!counter->hw.idx)
+                       continue;
+               val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+               prev = atomic64_read(&counter->hw.prev_count);
+               counter->hw.idx = 0;
+               delta = (val - prev) & 0xfffffffful;
+               atomic64_add(delta, &counter->count);
+       }
+}
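
The mask in the delta computation above makes the subtraction safe across 32-bit PMC wraparound. A minimal standalone sketch of the same arithmetic, helper name hypothetical:

        /*
         * 32-bit PMC delta, tolerant of one wrap:
         * prev = 0xfffffff0, val = 0x10  =>  delta = 0x20
         */
        static u64 pmc_delta32(u64 prev, u64 val)
        {
                return (val - prev) & 0xfffffffful;
        }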
+
+static void thaw_limited_counters(struct cpu_hw_counters *cpuhw,
+                                 unsigned long pmc5, unsigned long pmc6)
+{
+       struct perf_counter *counter;
+       u64 val;
+       int i;
+
+       for (i = 0; i < cpuhw->n_limited; ++i) {
+               counter = cpuhw->limited_counter[i];
+               counter->hw.idx = cpuhw->limited_hwidx[i];
+               val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+               atomic64_set(&counter->hw.prev_count, val);
+               perf_counter_update_userpage(counter);
+       }
+}
+
+/*
+ * Since limited counters don't respect the freeze conditions, we
+ * have to read them immediately after freezing or unfreezing the
+ * other counters.  We try to keep the values from the limited
+ * counters as consistent as possible by keeping the delay (in
+ * cycles and instructions) between freezing/unfreezing and reading
+ * the limited counters as small and consistent as possible.
+ * Therefore, if any limited counters are in use, we read them
+ * both, and always in the same order, to minimize variability,
+ * and do it inside the same asm that writes MMCR0.
+ */
+static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
+{
+       unsigned long pmc5, pmc6;
+
+       if (!cpuhw->n_limited) {
+               mtspr(SPRN_MMCR0, mmcr0);
+               return;
+       }
+
+       /*
+        * Write MMCR0, then read PMC5 and PMC6 immediately.
+        */
+       asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
+                    : "=&r" (pmc5), "=&r" (pmc6)
+                    : "r" (mmcr0), "i" (SPRN_MMCR0),
+                      "i" (SPRN_PMC5), "i" (SPRN_PMC6));
+
+       if (mmcr0 & MMCR0_FC)
+               freeze_limited_counters(cpuhw, pmc5, pmc6);
+       else
+               thaw_limited_counters(cpuhw, pmc5, pmc6);
+}
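
The single asm above pins the three SPR accesses together: the "i" constraints make the SPR numbers immediates to mtspr/mfspr, and the "=&r" early-clobbers keep pmc5/pmc6 out of the register holding mmcr0. Spelled out as separate intrinsics it would behave as below, but the compiler could then schedule unrelated instructions in between, widening the window the comment above tries to keep small (illustrative equivalent, not a drop-in replacement):

        mtspr(SPRN_MMCR0, mmcr0);       /* freeze or unfreeze ... */
        pmc5 = mfspr(SPRN_PMC5);        /* ... then sample the limited */
        pmc6 = mfspr(SPRN_PMC6);        /* counters immediately after */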
+
 /*
  * Disable all counters to prevent PMU interrupts and to allow
  * counters to be added or removed.
  */
-u64 hw_perf_save_disable(void)
+void hw_perf_disable(void)
 {
        struct cpu_hw_counters *cpuhw;
        unsigned long ret;
@@ -304,17 +411,25 @@ u64 hw_perf_save_disable(void)
                        cpuhw->pmcs_enabled = 1;
                }
 
+               /*
+                * Disable instruction sampling if it was enabled
+                */
+               if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+                       mtspr(SPRN_MMCRA,
+                             cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+                       mb();
+               }
+
                /*
                 * Set the 'freeze counters' bit.
                 * The barrier is to make sure the mtspr has been
                 * executed and the PMU has frozen the counters
                 * before we return.
                 */
-               mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
+               write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
                mb();
        }
        local_irq_restore(flags);
-       return ret;
 }
 
 /*
@@ -322,7 +437,7 @@ u64 hw_perf_save_disable(void)
  * If we were previously disabled and counters were added, then
  * put the new config on the PMU.
  */
-void hw_perf_restore(u64 disable)
+void hw_perf_enable(void)
 {
        struct perf_counter *counter;
        struct cpu_hw_counters *cpuhw;
@@ -331,11 +446,15 @@ void hw_perf_restore(u64 disable)
        unsigned long val;
        s64 left;
        unsigned int hwc_index[MAX_HWCOUNTERS];
+       int n_lim;
+       int idx;
 
-       if (disable)
-               return;
        local_irq_save(flags);
        cpuhw = &__get_cpu_var(cpu_hw_counters);
+       if (!cpuhw->disabled) {
+               local_irq_restore(flags);
+               return;
+       }
        cpuhw->disabled = 0;
 
        /*
@@ -345,12 +464,11 @@ void hw_perf_restore(u64 disable)
         * (possibly updated for removal of counters).
         */
        if (!cpuhw->n_added) {
-               mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+               mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
                mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
-               mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
                if (cpuhw->n_counters == 0)
                        get_lppaca()->pmcregs_in_use = 0;
-               goto out;
+               goto out_enable;
        }
 
        /*
@@ -365,16 +483,16 @@ void hw_perf_restore(u64 disable)
 
        /*
         * Add in MMCR0 freeze bits corresponding to the
-        * hw_event.exclude_* bits for the first counter.
+        * attr.exclude_* bits for the first counter.
         * We have already checked that all counters have the
         * same values for these bits as the first counter.
         */
        counter = cpuhw->counter[0];
-       if (counter->hw_event.exclude_user)
+       if (counter->attr.exclude_user)
                cpuhw->mmcr[0] |= MMCR0_FCP;
-       if (counter->hw_event.exclude_kernel)
+       if (counter->attr.exclude_kernel)
                cpuhw->mmcr[0] |= freeze_counters_kernel;
-       if (counter->hw_event.exclude_hv)
+       if (counter->attr.exclude_hv)
                cpuhw->mmcr[0] |= MMCR0_FCHV;
 
        /*
@@ -383,7 +501,7 @@ void hw_perf_restore(u64 disable)
         * Then unfreeze the counters.
         */
        get_lppaca()->pmcregs_in_use = 1;
-       mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+       mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
        mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
        mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
                                | MMCR0_FC);
@@ -395,7 +513,7 @@ void hw_perf_restore(u64 disable)
        for (i = 0; i < cpuhw->n_counters; ++i) {
                counter = cpuhw->counter[i];
                if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
-                       power_perf_read(counter);
+                       power_pmu_read(counter);
                        write_pmc(counter->hw.idx, 0);
                        counter->hw.idx = 0;
                }
@@ -404,30 +522,51 @@ void hw_perf_restore(u64 disable)
        /*
         * Initialize the PMCs for all the new and moved counters.
         */
+       cpuhw->n_limited = n_lim = 0;
        for (i = 0; i < cpuhw->n_counters; ++i) {
                counter = cpuhw->counter[i];
                if (counter->hw.idx)
                        continue;
+               idx = hwc_index[i] + 1;
+               if (is_limited_pmc(idx)) {
+                       cpuhw->limited_counter[n_lim] = counter;
+                       cpuhw->limited_hwidx[n_lim] = idx;
+                       ++n_lim;
+                       continue;
+               }
                val = 0;
-               if (counter->hw_event.irq_period) {
+               if (counter->hw.sample_period) {
                        left = atomic64_read(&counter->hw.period_left);
                        if (left < 0x80000000L)
                                val = 0x80000000L - left;
                }
                atomic64_set(&counter->hw.prev_count, val);
-               counter->hw.idx = hwc_index[i] + 1;
-               write_pmc(counter->hw.idx, val);
+               counter->hw.idx = idx;
+               write_pmc(idx, val);
+               perf_counter_update_userpage(counter);
        }
-       mb();
+       cpuhw->n_limited = n_lim;
        cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
-       mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+
+ out_enable:
+       mb();
+       write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+
+       /*
+        * Enable instruction sampling if necessary
+        */
+       if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+               mb();
+               mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+       }
 
  out:
        local_irq_restore(flags);
 }
 
 static int collect_events(struct perf_counter *group, int max_count,
-                         struct perf_counter *ctrs[], unsigned int *events)
+                         struct perf_counter *ctrs[], u64 *events,
+                         unsigned int *flags)
 {
        int n = 0;
        struct perf_counter *counter;
@@ -436,6 +575,7 @@ static int collect_events(struct perf_counter *group, int max_count,
                if (n >= max_count)
                        return -1;
                ctrs[n] = group;
+               flags[n] = group->hw.counter_base;
                events[n++] = group->hw.config;
        }
        list_for_each_entry(counter, &group->sibling_list, list_entry) {
@@ -444,6 +584,7 @@ static int collect_events(struct perf_counter *group, int max_count,
                        if (n >= max_count)
                                return -1;
                        ctrs[n] = counter;
+                       flags[n] = counter->hw.counter_base;
                        events[n++] = counter->hw.config;
                }
        }
@@ -454,8 +595,9 @@ static void counter_sched_in(struct perf_counter *counter, int cpu)
 {
        counter->state = PERF_COUNTER_STATE_ACTIVE;
        counter->oncpu = cpu;
+       counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
        if (is_software_counter(counter))
-               counter->hw_ops->enable(counter);
+               counter->pmu->enable(counter);
 }
 
 /*
@@ -475,12 +617,14 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
        cpuhw = &__get_cpu_var(cpu_hw_counters);
        n0 = cpuhw->n_counters;
        n = collect_events(group_leader, ppmu->n_counter - n0,
-                          &cpuhw->counter[n0], &cpuhw->events[n0]);
+                          &cpuhw->counter[n0], &cpuhw->events[n0],
+                          &cpuhw->flags[n0]);
        if (n < 0)
                return -EAGAIN;
-       if (check_excludes(cpuhw->counter, n0, n))
+       if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n))
                return -EAGAIN;
-       if (power_check_constraints(cpuhw->events, n + n0))
+       i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0);
+       if (i < 0)
                return -EAGAIN;
        cpuhw->n_counters = n0 + n;
        cpuhw->n_added += n;
@@ -508,19 +652,18 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 /*
  * Add a counter to the PMU.
  * If all counters are not already frozen, then we disable and
- * re-enable the PMU in order to get hw_perf_restore to do the
+ * re-enable the PMU in order to get hw_perf_enable to do the
  * actual work of reconfiguring the PMU.
  */
-static int power_perf_enable(struct perf_counter *counter)
+static int power_pmu_enable(struct perf_counter *counter)
 {
        struct cpu_hw_counters *cpuhw;
        unsigned long flags;
-       u64 pmudis;
        int n0;
        int ret = -EAGAIN;
 
        local_irq_save(flags);
-       pmudis = hw_perf_save_disable();
+       perf_disable();
 
        /*
         * Add the counter to the list (if there is room)
@@ -532,9 +675,10 @@ static int power_perf_enable(struct perf_counter *counter)
                goto out;
        cpuhw->counter[n0] = counter;
        cpuhw->events[n0] = counter->hw.config;
-       if (check_excludes(cpuhw->counter, n0, 1))
+       cpuhw->flags[n0] = counter->hw.counter_base;
+       if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1))
                goto out;
-       if (power_check_constraints(cpuhw->events, n0 + 1))
+       if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1))
                goto out;
 
        counter->hw.config = cpuhw->events[n0];
@@ -543,7 +687,7 @@ static int power_perf_enable(struct perf_counter *counter)
 
        ret = 0;
  out:
-       hw_perf_restore(pmudis);
+       perf_enable();
        local_irq_restore(flags);
        return ret;
 }
@@ -551,17 +695,16 @@ static int power_perf_enable(struct perf_counter *counter)
 /*
  * Remove a counter from the PMU.
  */
-static void power_perf_disable(struct perf_counter *counter)
+static void power_pmu_disable(struct perf_counter *counter)
 {
        struct cpu_hw_counters *cpuhw;
        long i;
-       u64 pmudis;
        unsigned long flags;
 
        local_irq_save(flags);
-       pmudis = hw_perf_save_disable();
+       perf_disable();
 
-       power_perf_read(counter);
+       power_pmu_read(counter);
 
        cpuhw = &__get_cpu_var(cpu_hw_counters);
        for (i = 0; i < cpuhw->n_counters; ++i) {
@@ -570,44 +713,155 @@ static void power_perf_disable(struct perf_counter *counter)
                                cpuhw->counter[i-1] = cpuhw->counter[i];
                        --cpuhw->n_counters;
                        ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
-                       write_pmc(counter->hw.idx, 0);
-                       counter->hw.idx = 0;
+                       if (counter->hw.idx) {
+                               write_pmc(counter->hw.idx, 0);
+                               counter->hw.idx = 0;
+                       }
+                       perf_counter_update_userpage(counter);
+                       break;
+               }
+       }
+       for (i = 0; i < cpuhw->n_limited; ++i)
+               if (counter == cpuhw->limited_counter[i])
                        break;
+       if (i < cpuhw->n_limited) {
+               while (++i < cpuhw->n_limited) {
+                       cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
+                       cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
                }
+               --cpuhw->n_limited;
        }
        if (cpuhw->n_counters == 0) {
                /* disable exceptions if no counters are running */
                cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
        }
 
-       hw_perf_restore(pmudis);
+       perf_enable();
+       local_irq_restore(flags);
+}
+
+/*
+ * Re-enable interrupts on a counter after they were throttled
+ * because they were coming too fast.
+ */
+static void power_pmu_unthrottle(struct perf_counter *counter)
+{
+       s64 val, left;
+       unsigned long flags;
+
+       if (!counter->hw.idx || !counter->hw.sample_period)
+               return;
+       local_irq_save(flags);
+       perf_disable();
+       power_pmu_read(counter);
+       left = counter->hw.sample_period;
+       val = 0;
+       if (left < 0x80000000L)
+               val = 0x80000000L - left;
+       write_pmc(counter->hw.idx, val);
+       atomic64_set(&counter->hw.prev_count, val);
+       atomic64_set(&counter->hw.period_left, left);
+       perf_counter_update_userpage(counter);
+       perf_enable();
        local_irq_restore(flags);
 }
 
-struct hw_perf_counter_ops power_perf_ops = {
-       .enable = power_perf_enable,
-       .disable = power_perf_disable,
-       .read = power_perf_read
+struct pmu power_pmu = {
+       .enable         = power_pmu_enable,
+       .disable        = power_pmu_disable,
+       .read           = power_pmu_read,
+       .unthrottle     = power_pmu_unthrottle,
 };
 
-const struct hw_perf_counter_ops *
-hw_perf_counter_init(struct perf_counter *counter)
+/*
+ * Return 1 if we might be able to put counter on a limited PMC,
+ * or 0 if not.
+ * A counter can only go on a limited PMC if it counts something
+ * that a limited PMC can count, doesn't require interrupts, and
+ * doesn't exclude any processor mode.
+ */
+static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
+                                unsigned int flags)
+{
+       int n;
+       u64 alt[MAX_EVENT_ALTERNATIVES];
+
+       if (counter->attr.exclude_user
+           || counter->attr.exclude_kernel
+           || counter->attr.exclude_hv
+           || counter->attr.sample_period)
+               return 0;
+
+       if (ppmu->limited_pmc_event(ev))
+               return 1;
+
+       /*
+        * The requested event isn't on a limited PMC already;
+        * see if any alternative code goes on a limited PMC.
+        */
+       if (!ppmu->get_alternatives)
+               return 0;
+
+       flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
+       n = ppmu->get_alternatives(ev, flags, alt);
+
+       return n > 0;
+}
+
+/*
+ * Find an alternative event that goes on a normal PMC, if possible,
+ * and return the event code, or 0 if there is no such alternative.
+ * (Note: event code 0 is "don't count" on all machines.)
+ */
+static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
 {
-       unsigned long ev;
+       u64 alt[MAX_EVENT_ALTERNATIVES];
+       int n;
+
+       flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
+       n = ppmu->get_alternatives(ev, flags, alt);
+       if (!n)
+               return 0;
+       return alt[0];
+}
+
+/* Number of perf_counters counting hardware events */
+static atomic_t num_counters;
+/* Used to avoid races in calling reserve/release_pmc_hardware */
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Release the PMU if this is the last perf_counter.
+ */
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+       if (!atomic_add_unless(&num_counters, -1, 1)) {
+               mutex_lock(&pmc_reserve_mutex);
+               if (atomic_dec_return(&num_counters) == 0)
+                       release_pmc_hardware();
+               mutex_unlock(&pmc_reserve_mutex);
+       }
+}
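
This destructor pairs with the reservation later in hw_perf_counter_init(): atomic_inc_not_zero() takes a fast reference while at least one counter already exists, and atomic_add_unless(&num_counters, -1, 1) drops one unless it might be the last. Both fall back to pmc_reserve_mutex exactly when the count would cross zero, so reserve_pmc_hardware() and release_pmc_hardware() always run serialized. The two guarded transitions, annotated:

        /*
         * init:    0 -> 1 only under pmc_reserve_mutex, and only after
         *          reserve_pmc_hardware() has succeeded.
         * destroy: 1 -> 0 only under pmc_reserve_mutex;
         *          atomic_add_unless(.., -1, 1) refuses the decrement
         *          when the count is 1, steering us into the slow path.
         * Every other +/-1 transition stays lock-free.
         */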
+
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+       u64 ev;
+       unsigned long flags;
        struct perf_counter *ctrs[MAX_HWCOUNTERS];
-       unsigned int events[MAX_HWCOUNTERS];
+       u64 events[MAX_HWCOUNTERS];
+       unsigned int cflags[MAX_HWCOUNTERS];
        int n;
+       int err;
 
        if (!ppmu)
-               return NULL;
-       if ((s64)counter->hw_event.irq_period < 0)
-               return NULL;
-       ev = counter->hw_event.event_id;
-       if (!counter->hw_event.raw) {
-               if (ev >= ppmu->n_generic ||
-                   ppmu->generic_events[ev] == 0)
-                       return NULL;
+               return ERR_PTR(-ENXIO);
+       if (!perf_event_raw(&counter->attr)) {
+               ev = perf_event_id(&counter->attr);
+               if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
+                       return ERR_PTR(-EOPNOTSUPP);
                ev = ppmu->generic_events[ev];
+       } else {
+               ev = perf_event_config(&counter->attr);
        }
        counter->hw.config_base = ev;
        counter->hw.idx = 0;
@@ -618,8 +872,37 @@ hw_perf_counter_init(struct perf_counter *counter)
         * the user set it to.
         */
        if (!firmware_has_feature(FW_FEATURE_LPAR))
-               counter->hw_event.exclude_hv = 0;
-       
+               counter->attr.exclude_hv = 0;
+
+       /*
+        * If this is a per-task counter, then we can use
+        * PM_RUN_* events interchangeably with their non RUN_*
+        * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
+        * XXX we should check if the task is an idle task.
+        */
+       flags = 0;
+       if (counter->ctx->task)
+               flags |= PPMU_ONLY_COUNT_RUN;
+
+       /*
+        * If this machine has limited counters, check whether this
+        * event could go on a limited counter.
+        */
+       if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
+               if (can_go_on_limited_pmc(counter, ev, flags)) {
+                       flags |= PPMU_LIMITED_PMC_OK;
+               } else if (ppmu->limited_pmc_event(ev)) {
+                       /*
+                        * The requested event is on a limited PMC,
+                        * but we can't use a limited PMC; see if any
+                        * alternative goes on a normal PMC.
+                        */
+                       ev = normal_pmc_alternative(ev, flags);
+                       if (!ev)
+                               return ERR_PTR(-EINVAL);
+               }
+       }
+
        /*
         * If this is in a group, check if it can go on with all the
         * other hardware counters in the group.  We assume the counter
@@ -628,73 +911,43 @@ hw_perf_counter_init(struct perf_counter *counter)
        n = 0;
        if (counter->group_leader != counter) {
                n = collect_events(counter->group_leader, ppmu->n_counter - 1,
-                                  ctrs, events);
+                                  ctrs, events, cflags);
                if (n < 0)
-                       return NULL;
+                       return ERR_PTR(-EINVAL);
        }
        events[n] = ev;
        ctrs[n] = counter;
-       if (check_excludes(ctrs, n, 1))
-               return NULL;
-       if (power_check_constraints(events, n + 1))
-               return NULL;
+       cflags[n] = flags;
+       if (check_excludes(ctrs, cflags, n, 1))
+               return ERR_PTR(-EINVAL);
+       if (power_check_constraints(events, cflags, n + 1))
+               return ERR_PTR(-EINVAL);
 
        counter->hw.config = events[n];
-       atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
-       return &power_perf_ops;
-}
+       counter->hw.counter_base = cflags[n];
+       atomic64_set(&counter->hw.period_left, counter->hw.sample_period);
 
-/*
- * Handle wakeups.
- */
-void perf_counter_do_pending(void)
-{
-       int i;
-       struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
-       struct perf_counter *counter;
-
-       for (i = 0; i < cpuhw->n_counters; ++i) {
-               counter = cpuhw->counter[i];
-               if (counter && counter->wakeup_pending) {
-                       counter->wakeup_pending = 0;
-                       wake_up(&counter->waitq);
-               }
+       /*
+        * See if we need to reserve the PMU.
+        * If no counters are currently in use, then we have to take a
+        * mutex to ensure that we don't race with another task doing
+        * reserve_pmc_hardware or release_pmc_hardware.
+        */
+       err = 0;
+       if (!atomic_inc_not_zero(&num_counters)) {
+               mutex_lock(&pmc_reserve_mutex);
+               if (atomic_read(&num_counters) == 0 &&
+                   reserve_pmc_hardware(perf_counter_interrupt))
+                       err = -EBUSY;
+               else
+                       atomic_inc(&num_counters);
+               mutex_unlock(&pmc_reserve_mutex);
        }
-}
-
-/*
- * Record data for an irq counter.
- * This function was lifted from the x86 code; maybe it should
- * go in the core?
- */
-static void perf_store_irq_data(struct perf_counter *counter, u64 data)
-{
-       struct perf_data *irqdata = counter->irqdata;
+       counter->destroy = hw_perf_counter_destroy;
 
-       if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
-               irqdata->overrun++;
-       } else {
-               u64 *p = (u64 *) &irqdata->data[irqdata->len];
-
-               *p = data;
-               irqdata->len += sizeof(u64);
-       }
-}
-
-/*
- * Record all the values of the counters in a group
- */
-static void perf_handle_group(struct perf_counter *counter)
-{
-       struct perf_counter *leader, *sub;
-
-       leader = counter->group_leader;
-       list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-               if (sub != counter)
-                       sub->hw_ops->read(sub);
-               perf_store_irq_data(counter, sub->hw_event.event_config);
-               perf_store_irq_data(counter, atomic64_read(&sub->count));
-       }
+       if (err)
+               return ERR_PTR(err);
+       return &power_pmu;
 }
 
 /*
@@ -703,10 +956,12 @@ static void perf_handle_group(struct perf_counter *counter)
  * here so there is no possibility of being interrupted.
  */
 static void record_and_restart(struct perf_counter *counter, long val,
-                              struct pt_regs *regs)
+                              struct pt_regs *regs, int nmi)
 {
+       u64 period = counter->hw.sample_period;
        s64 prev, delta, left;
        int record = 0;
+       u64 addr, mmcra, sdsync;
 
        /* we don't have to worry about interrupts here */
        prev = atomic64_read(&counter->hw.prev_count);
@@ -719,37 +974,105 @@ static void record_and_restart(struct perf_counter *counter, long val,
         */
        val = 0;
        left = atomic64_read(&counter->hw.period_left) - delta;
-       if (counter->hw_event.irq_period) {
+       if (period) {
                if (left <= 0) {
-                       left += counter->hw_event.irq_period;
+                       left += period;
                        if (left <= 0)
-                               left = counter->hw_event.irq_period;
+                               left = period;
                        record = 1;
                }
                if (left < 0x80000000L)
                        val = 0x80000000L - left;
        }
-       write_pmc(counter->hw.idx, val);
-       atomic64_set(&counter->hw.prev_count, val);
-       atomic64_set(&counter->hw.period_left, left);
 
        /*
         * Finally record data if requested.
         */
        if (record) {
-               switch (counter->hw_event.record_type) {
-               case PERF_RECORD_SIMPLE:
-                       break;
-               case PERF_RECORD_IRQ:
-                       perf_store_irq_data(counter, instruction_pointer(regs));
-                       counter->wakeup_pending = 1;
-                       break;
-               case PERF_RECORD_GROUP:
-                       perf_handle_group(counter);
-                       counter->wakeup_pending = 1;
-                       break;
+               addr = 0;
+               if (counter->attr.record_type & PERF_RECORD_ADDR) {
+                       /*
+                        * The user wants a data address recorded.
+                        * If we're not doing instruction sampling,
+                        * give them the SDAR (sampled data address).
+                        * If we are doing instruction sampling, then only
+                        * give them the SDAR if it corresponds to the
+                        * instruction pointed to by SIAR; this is indicated
+                        * by the [POWER6_]MMCRA_SDSYNC bit in MMCRA.
+                        */
+                       mmcra = regs->dsisr;
+                       sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
+                               POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
+                       if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
+                               addr = mfspr(SPRN_SDAR);
+               }
+               if (perf_counter_overflow(counter, nmi, regs, addr)) {
+                       /*
+                        * Interrupts are coming too fast - throttle them
+                        * by setting the counter to 0, so it will be
+                        * at least 2^30 cycles until the next interrupt
+                        * (assuming each counter counts at most 2 counts
+                        * per cycle).
+                        */
+                       val = 0;
+                       left = ~0ULL >> 1;
                }
        }
+
+       write_pmc(counter->hw.idx, val);
+       atomic64_set(&counter->hw.prev_count, val);
+       atomic64_set(&counter->hw.period_left, left);
+       perf_counter_update_userpage(counter);
+}
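
For reference: the PMCs flag an overflow when bit 31 (the sign bit) becomes set, which is why the interrupt handler tests (int)val < 0. Writing 0x80000000 - left therefore schedules the next overflow after exactly `left' events, and writing 0 defers it for a full 2^31. A worked example, assuming a sample_period of 100000:

        left = 100000;                  /* events until the next sample */
        val  = 0x80000000L - left;      /* = 0x7ffe7960 */
        /*
         * The PMC counts up from val and overflows after 100000 events.
         * The throttle path writes val = 0 instead: 2^31 events until
         * the next interrupt, i.e. at least 2^30 cycles when a counter
         * ticks at most twice per cycle, as the comment above assumes.
         */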
+
+/*
+ * Called from generic code to get the misc flags (i.e. processor mode)
+ * for an event.
+ */
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+       unsigned long mmcra;
+
+       if (TRAP(regs) != 0xf00) {
+               /* not a PMU interrupt */
+               return user_mode(regs) ? PERF_EVENT_MISC_USER :
+                       PERF_EVENT_MISC_KERNEL;
+       }
+
+       mmcra = regs->dsisr;
+       if (ppmu->flags & PPMU_ALT_SIPR) {
+               if (mmcra & POWER6_MMCRA_SIHV)
+                       return PERF_EVENT_MISC_HYPERVISOR;
+               return (mmcra & POWER6_MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+                       PERF_EVENT_MISC_KERNEL;
+       }
+       if (mmcra & MMCRA_SIHV)
+               return PERF_EVENT_MISC_HYPERVISOR;
+       return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+                       PERF_EVENT_MISC_KERNEL;
+}
+
+/*
+ * Called from generic code to get the instruction pointer
+ * for an event.
+ */
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+       unsigned long mmcra;
+       unsigned long ip;
+       unsigned long slot;
+
+       if (TRAP(regs) != 0xf00)
+               return regs->nip;       /* not a PMU interrupt */
+
+       ip = mfspr(SPRN_SIAR);
+       mmcra = regs->dsisr;
+       if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
+               slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
+               if (slot > 1)
+                       ip += 4 * (slot - 1);
+       }
+       return ip;
 }
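
A worked example of the slot adjustment, assuming, as the code implies, that SIAR points at the start of the sampled dispatch group and MMCRA_SLOT holds the 1-based position of the marked instruction within it:

        /* slot == 3: marked instruction is third in the group */
        ip = mfspr(SPRN_SIAR) + 4 * (3 - 1);    /* SIAR + 8 bytes */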
 
 /*
@@ -761,17 +1084,37 @@ static void perf_counter_interrupt(struct pt_regs *regs)
        struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
        struct perf_counter *counter;
        long val;
-       int need_wakeup = 0, found = 0;
+       int found = 0;
+       int nmi;
+
+       if (cpuhw->n_limited)
+               freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
+                                       mfspr(SPRN_PMC6));
+
+       /*
+        * Overload regs->dsisr to store MMCRA so we only need to read it once.
+        */
+       regs->dsisr = mfspr(SPRN_MMCRA);
+
+       /*
+        * If interrupts were soft-disabled when this PMU interrupt
+        * occurred, treat it as an NMI.
+        */
+       nmi = !regs->softe;
+       if (nmi)
+               nmi_enter();
+       else
+               irq_enter();
 
        for (i = 0; i < cpuhw->n_counters; ++i) {
                counter = cpuhw->counter[i];
+               if (is_limited_pmc(counter->hw.idx))
+                       continue;
                val = read_pmc(counter->hw.idx);
                if ((int)val < 0) {
                        /* counter has overflowed */
                        found = 1;
-                       record_and_restart(counter, val, regs);
-                       if (counter->wakeup_pending)
-                               need_wakeup = 1;
+                       record_and_restart(counter, val, regs, nmi);
                }
        }
 
@@ -783,6 +1126,8 @@ static void perf_counter_interrupt(struct pt_regs *regs)
         */
        if (!found) {
                for (i = 0; i < ppmu->n_counter; ++i) {
+                       if (is_limited_pmc(i + 1))
+                               continue;
                        val = read_pmc(i + 1);
                        if ((int)val < 0)
                                write_pmc(i + 1, 0);
@@ -796,23 +1141,12 @@ static void perf_counter_interrupt(struct pt_regs *regs)
         * XXX might want to use MSR.PM to keep the counters frozen until
         * we get back out of this interrupt.
         */
-       mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+       write_mmcr0(cpuhw, cpuhw->mmcr[0]);
 
-       /*
-        * If we need a wakeup, check whether interrupts were soft-enabled
-        * when we took the interrupt.  If they were, we can wake stuff up
-        * immediately; otherwise we'll have to set a flag and do the
-        * wakeup when interrupts get soft-enabled.
-        */
-       if (need_wakeup) {
-               if (regs->softe) {
-                       irq_enter();
-                       perf_counter_do_pending();
-                       irq_exit();
-               } else {
-                       set_perf_counter_pending();
-               }
-       }
+       if (nmi)
+               nmi_exit();
+       else
+               irq_exit();
 }
 
 void hw_perf_counter_setup(int cpu)
@@ -833,11 +1167,6 @@ static int init_perf_counters(void)
 {
        unsigned long pvr;
 
-       if (reserve_pmc_hardware(perf_counter_interrupt)) {
-               printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
-               return -EBUSY;
-       }
-
        /* XXX should get this from cputable */
        pvr = mfspr(SPRN_PVR);
        switch (PVR_VER(pvr)) {