#include <asm/reg.h>
#include <asm/pmc.h>
#include <asm/machdep.h>
+#include <asm/firmware.h>
struct cpu_hw_counters {
int n_counters;
struct power_pmu *ppmu;
+/*
+ * Normally, to ignore kernel events we set the FCS (freeze counters
+ * in supervisor mode) bit in MMCR0, but if the kernel runs with the
+ * hypervisor bit set in the MSR, or if we are running on a processor
+ * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
+ * then we need to use the FCHV bit to ignore kernel events.
+ */
+static unsigned int freeze_counters_kernel = MMCR0_FCS;
+
+static void perf_counter_interrupt(struct pt_regs *regs);
+
void perf_counter_print_debug(void)
{
}
return 0;
}
+/*
+ * Check if newly-added counters have consistent settings for
+ * exclude_{user,kernel,hv} with each other and any previously
+ * added counters.
+ */
+static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
+{
+ int eu, ek, eh;
+ int i, n;
+ struct perf_counter *counter;
+
+ n = n_prev + n_new;
+ if (n <= 1)
+ return 0;
+
+ eu = ctrs[0]->hw_event.exclude_user;
+ ek = ctrs[0]->hw_event.exclude_kernel;
+ eh = ctrs[0]->hw_event.exclude_hv;
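+ /*
+ * ctrs[0] provides the reference settings; previously added
+ * counters are already known to be consistent, so only check
+ * the new ones (skipping ctrs[0] itself when there are none).
+ */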
+ if (n_prev == 0)
+ n_prev = 1;
+ for (i = n_prev; i < n; ++i) {
+ counter = ctrs[i];
+ if (counter->hw_event.exclude_user != eu ||
+ counter->hw_event.exclude_kernel != ek ||
+ counter->hw_event.exclude_hv != eh)
+ return -EAGAIN;
+ }
+ return 0;
+}
+
static void power_perf_read(struct perf_counter *counter)
{
long val, delta, prev;
goto out;
}
+ /*
+ * Add in MMCR0 freeze bits corresponding to the
+ * hw_event.exclude_* bits for the first counter.
+ * We have already checked that all counters have the
+ * same values for these bits as the first counter.
+ */
+ counter = cpuhw->counter[0];
+ if (counter->hw_event.exclude_user)
+ cpuhw->mmcr[0] |= MMCR0_FCP;
+ if (counter->hw_event.exclude_kernel)
+ cpuhw->mmcr[0] |= freeze_counters_kernel;
+ if (counter->hw_event.exclude_hv)
+ cpuhw->mmcr[0] |= MMCR0_FCHV;
+
/*
* Write the new configuration to MMCR* with the freeze
* bit set and set the hardware counters to their initial values.
atomic64_set(&counter->hw.prev_count, val);
counter->hw.idx = hwc_index[i] + 1;
write_pmc(counter->hw.idx, val);
+ perf_counter_update_userpage(counter);
}
mb();
cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
{
counter->state = PERF_COUNTER_STATE_ACTIVE;
counter->oncpu = cpu;
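+ /* exclude the time the counter was stopped from its running time */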
+ counter->tstamp_running += counter->ctx->time_now -
+ counter->tstamp_stopped;
if (is_software_counter(counter))
counter->hw_ops->enable(counter);
}
&cpuhw->counter[n0], &cpuhw->events[n0]);
if (n < 0)
return -EAGAIN;
+ if (check_excludes(cpuhw->counter, n0, n))
+ return -EAGAIN;
if (power_check_constraints(cpuhw->events, n + n0))
return -EAGAIN;
cpuhw->n_counters = n0 + n;
goto out;
cpuhw->counter[n0] = counter;
cpuhw->events[n0] = counter->hw.config;
+ if (check_excludes(cpuhw->counter, n0, 1))
+ goto out;
if (power_check_constraints(cpuhw->events, n0 + 1))
goto out;
ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
write_pmc(counter->hw.idx, 0);
counter->hw.idx = 0;
+ perf_counter_update_userpage(counter);
break;
}
}
.read = power_perf_read
};
+/* Number of perf_counters counting hardware events */
+static atomic_t num_counters;
+/* Used to avoid races in calling reserve/release_pmc_hardware */
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Release the PMU if this is the last perf_counter.
+ */
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
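+ /*
+ * atomic_add_unless() decrements num_counters unless it is 1;
+ * if this is the last counter, take the mutex so that the final
+ * decrement and release_pmc_hardware() can't race with a
+ * concurrent reserve in hw_perf_counter_init().
+ */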
+ if (!atomic_add_unless(&num_counters, -1, 1)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_dec_return(&num_counters) == 0)
+ release_pmc_hardware();
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+}
+
const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
struct perf_counter *ctrs[MAX_HWCOUNTERS];
unsigned int events[MAX_HWCOUNTERS];
int n;
+ int err;
if (!ppmu)
return NULL;
if ((s64)counter->hw_event.irq_period < 0)
return NULL;
- ev = counter->hw_event.type;
- if (!counter->hw_event.raw) {
- if (ev >= ppmu->n_generic ||
- ppmu->generic_events[ev] == 0)
+ if (!perf_event_raw(&counter->hw_event)) {
+ ev = perf_event_id(&counter->hw_event);
+ if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
return NULL;
ev = ppmu->generic_events[ev];
+ } else {
+ ev = perf_event_config(&counter->hw_event);
}
counter->hw.config_base = ev;
counter->hw.idx = 0;
+ /*
+ * If we are not running on a hypervisor, force the
+ * exclude_hv bit to 0 so that we don't care what
+ * the user set it to.
+ */
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ counter->hw_event.exclude_hv = 0;
+
/*
* If this is in a group, check if it can go on with all the
* other hardware counters in the group. We assume the counter
if (n < 0)
return NULL;
}
- events[n++] = ev;
- if (power_check_constraints(events, n))
+ events[n] = ev;
+ ctrs[n] = counter;
+ if (check_excludes(ctrs, n, 1))
+ return NULL;
+ if (power_check_constraints(events, n + 1))
return NULL;
- counter->hw.config = events[n - 1];
+ counter->hw.config = events[n];
atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
- return &power_perf_ops;
-}
-
-/*
- * Handle wakeups.
- */
-void perf_counter_do_pending(void)
-{
- int i;
- struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
- struct perf_counter *counter;
- set_perf_counter_pending(0);
- for (i = 0; i < cpuhw->n_counters; ++i) {
- counter = cpuhw->counter[i];
- if (counter && counter->wakeup_pending) {
- counter->wakeup_pending = 0;
- wake_up(&counter->waitq);
- }
- }
-}
-
-/*
- * Record data for an irq counter.
- * This function was lifted from the x86 code; maybe it should
- * go in the core?
- */
-static void perf_store_irq_data(struct perf_counter *counter, u64 data)
-{
- struct perf_data *irqdata = counter->irqdata;
-
- if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
- irqdata->overrun++;
- } else {
- u64 *p = (u64 *) &irqdata->data[irqdata->len];
-
- *p = data;
- irqdata->len += sizeof(u64);
+ /*
+ * See if we need to reserve the PMU.
+ * If no counters are currently in use, then we have to take a
+ * mutex to ensure that we don't race with another task doing
+ * reserve_pmc_hardware or release_pmc_hardware.
+ */
+ err = 0;
+ if (!atomic_inc_not_zero(&num_counters)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_read(&num_counters) == 0 &&
+ reserve_pmc_hardware(perf_counter_interrupt))
+ err = -EBUSY;
+ else
+ atomic_inc(&num_counters);
+ mutex_unlock(&pmc_reserve_mutex);
}
-}
+ counter->destroy = hw_perf_counter_destroy;
-/*
- * Record all the values of the counters in a group
- */
-static void perf_handle_group(struct perf_counter *counter)
-{
- struct perf_counter *leader, *sub;
-
- leader = counter->group_leader;
- list_for_each_entry(sub, &leader->sibling_list, list_entry) {
- if (sub != counter)
- sub->hw_ops->read(sub);
- perf_store_irq_data(counter, sub->hw_event.type);
- perf_store_irq_data(counter, atomic64_read(&sub->count));
- }
+ if (err)
+ return NULL;
+ return &power_perf_ops;
}
/*
write_pmc(counter->hw.idx, val);
atomic64_set(&counter->hw.prev_count, val);
atomic64_set(&counter->hw.period_left, left);
+ perf_counter_update_userpage(counter);
/*
* Finally record data if requested.
*/
- if (record) {
- switch (counter->hw_event.record_type) {
- case PERF_RECORD_SIMPLE:
- break;
- case PERF_RECORD_IRQ:
- perf_store_irq_data(counter, instruction_pointer(regs));
- counter->wakeup_pending = 1;
- break;
- case PERF_RECORD_GROUP:
- perf_handle_group(counter);
- counter->wakeup_pending = 1;
- break;
- }
- }
+ if (record)
+ perf_counter_output(counter, 1, regs);
}
/*
struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
struct perf_counter *counter;
long val;
- int need_wakeup = 0, found = 0;
+ int found = 0;
for (i = 0; i < cpuhw->n_counters; ++i) {
counter = cpuhw->counter[i];
/* counter has overflowed */
found = 1;
record_and_restart(counter, val, regs);
- if (counter->wakeup_pending)
- need_wakeup = 1;
}
}
/*
* If we need a wakeup, check whether interrupts were soft-enabled
* when we took the interrupt. If they were, we can wake stuff up
- * immediately; otherwise we'll have to set a flag and do the
- * wakeup when interrupts get soft-enabled.
+ * immediately; otherwise we'll have to do the wakeup when interrupts
+ * get soft-enabled.
*/
- if (need_wakeup) {
- if (regs->softe) {
- irq_enter();
- perf_counter_do_pending();
- irq_exit();
- } else {
- set_perf_counter_pending(1);
- }
+ if (test_perf_counter_pending() && regs->softe) {
+ irq_enter();
+ clear_perf_counter_pending();
+ perf_counter_do_pending();
+ irq_exit();
}
}
cpuhw->mmcr[0] = MMCR0_FC;
}
+extern struct power_pmu power4_pmu;
extern struct power_pmu ppc970_pmu;
+extern struct power_pmu power5_pmu;
+extern struct power_pmu power5p_pmu;
extern struct power_pmu power6_pmu;
static int init_perf_counters(void)
{
unsigned long pvr;
- if (reserve_pmc_hardware(perf_counter_interrupt)) {
- printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
- return -EBUSY;
- }
-
/* XXX should get this from cputable */
pvr = mfspr(SPRN_PVR);
switch (PVR_VER(pvr)) {
+ case PV_POWER4:
+ case PV_POWER4p:
+ ppmu = &power4_pmu;
+ break;
case PV_970:
case PV_970FX:
case PV_970MP:
ppmu = &ppc970_pmu;
break;
+ case PV_POWER5:
+ ppmu = &power5_pmu;
+ break;
+ case PV_POWER5p:
+ ppmu = &power5p_pmu;
+ break;
case 0x3e:
ppmu = &power6_pmu;
break;
}
+
+ /*
+ * Use FCHV to ignore kernel events if MSR.HV is set.
+ */
+ if (mfmsr() & MSR_HV)
+ freeze_counters_kernel = MMCR0_FCHV;
+
return 0;
}