sched: Use resched IPI to kick off the nohz idle balance
[deliverable/linux.git] kernel/sched_fair.c
index 76411950ff3bd367777e54c06637c25e8a20d8ab..6c5fa1099229191e98e4daec7e01c8f6b8dac70f 100644
@@ -706,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+/* we need this in update_cfs_load and load-balance functions below */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 # ifdef CONFIG_SMP
 static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
                                            int global_update)
@@ -728,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
        u64 now, delta;
        unsigned long load = cfs_rq->load.weight;
 
-       if (cfs_rq->tg == &root_task_group)
+       if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
                return;
 
        now = rq_of(cfs_rq)->clock_task;
@@ -837,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
 
        tg = cfs_rq->tg;
        se = tg->se[cpu_of(rq_of(cfs_rq))];
-       if (!se)
+       if (!se || throttled_hierarchy(cfs_rq))
                return;
 #ifndef CONFIG_SMP
        if (likely(se->load.weight == tg->shares))
@@ -968,6 +970,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
        se->vruntime = vruntime;
 }
 
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -997,8 +1001,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
                __enqueue_entity(cfs_rq, se);
        se->on_rq = 1;
 
-       if (cfs_rq->nr_running == 1)
+       if (cfs_rq->nr_running == 1) {
                list_add_leaf_cfs_rq(cfs_rq);
+               check_enqueue_throttle(cfs_rq);
+       }
 }
 
 static void __clear_buddies_last(struct sched_entity *se)
@@ -1046,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
                __clear_buddies_skip(se);
 }
 
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -1084,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        if (!(flags & DEQUEUE_SLEEP))
                se->vruntime -= cfs_rq->min_vruntime;
 
+       /* return excess runtime on last dequeue */
+       return_cfs_rq_runtime(cfs_rq);
+
        update_min_vruntime(cfs_rq);
        update_cfs_shares(cfs_rq);
 }
@@ -1095,6 +1106,8 @@ static void
 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
        unsigned long ideal_runtime, delta_exec;
+       struct sched_entity *se;
+       s64 delta;
 
        ideal_runtime = sched_slice(cfs_rq, curr);
        delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -1116,16 +1129,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
        if (delta_exec < sysctl_sched_min_granularity)
                return;
 
-       if (cfs_rq->nr_running > 1) {
-               struct sched_entity *se = __pick_first_entity(cfs_rq);
-               s64 delta = curr->vruntime - se->vruntime;
+       se = __pick_first_entity(cfs_rq);
+       delta = curr->vruntime - se->vruntime;
 
-               if (delta < 0)
-                       return;
+       if (delta < 0)
+               return;
 
-               if (delta > ideal_runtime)
-                       resched_task(rq_of(cfs_rq)->curr);
-       }
+       if (delta > ideal_runtime)
+               resched_task(rq_of(cfs_rq)->curr);
 }
 
 static void
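
The refactor above can drop the nr_running check because the sole caller, entity_tick(), already guards the call; __pick_first_entity() would otherwise return NULL for an empty rbtree. A sketch of that calling context, abbreviated and not part of this diff:

	static void
	entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
	{
		/* update run-time statistics of the 'current' task */
		update_curr(cfs_rq);

		/* ... */

		/*
		 * curr is not kept in the rbtree while running, so there is
		 * a leftmost entity to compare against only when somebody
		 * else is runnable as well.
		 */
		if (cfs_rq->nr_running > 1)
			check_preempt_tick(cfs_rq, curr);
	}
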
@@ -1200,6 +1211,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
        return se;
 }
 
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
        /*
@@ -1209,6 +1222,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
        if (prev->on_rq)
                update_curr(cfs_rq);
 
+       /* throttle cfs_rqs exceeding runtime */
+       check_cfs_rq_runtime(cfs_rq);
+
        check_spread(cfs_rq, prev);
        if (prev->on_rq) {
                update_stats_wait_start(cfs_rq, prev);
@@ -1403,7 +1419,66 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
        return cfs_rq->throttled;
 }
 
-static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+/* check whether cfs_rq, or any parent, is throttled */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+       return cfs_rq->throttle_count;
+}
+
+/*
+ * Ensure that neither of the group entities corresponding to src_cpu or
+ * dest_cpu are members of a throttled hierarchy when performing group
+ * load-balance operations.
+ */
+static inline int throttled_lb_pair(struct task_group *tg,
+                                   int src_cpu, int dest_cpu)
+{
+       struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+
+       src_cfs_rq = tg->cfs_rq[src_cpu];
+       dest_cfs_rq = tg->cfs_rq[dest_cpu];
+
+       return throttled_hierarchy(src_cfs_rq) ||
+              throttled_hierarchy(dest_cfs_rq);
+}
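
throttled_hierarchy() can stay O(1) because tg_throttle_down() and tg_unthrottle_up() below propagate a count to every descendant cfs_rq. For contrast, a hypothetical variant without throttle_count would have to climb the ancestor chain on every query (sketch only, not part of the patch):

	static int throttled_hierarchy_by_walking(struct cfs_rq *cfs_rq)
	{
		int cpu = cpu_of(rq_of(cfs_rq));
		struct task_group *tg;

		/* climb from this group towards the root, checking each level */
		for (tg = cfs_rq->tg; tg; tg = tg->parent)
			if (tg->cfs_rq[cpu] && tg->cfs_rq[cpu]->throttled)
				return 1;

		return 0;
	}
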
+
+/* updated child weight may affect parent, so we have to do this bottom up */
+static int tg_unthrottle_up(struct task_group *tg, void *data)
+{
+       struct rq *rq = data;
+       struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+       cfs_rq->throttle_count--;
+#ifdef CONFIG_SMP
+       if (!cfs_rq->throttle_count) {
+               u64 delta = rq->clock_task - cfs_rq->load_stamp;
+
+               /* leaving throttled state, advance shares averaging windows */
+               cfs_rq->load_stamp += delta;
+               cfs_rq->load_last += delta;
+
+               /* update entity weight now that we are on_rq again */
+               update_cfs_shares(cfs_rq);
+       }
+#endif
+
+       return 0;
+}
+
+static int tg_throttle_down(struct task_group *tg, void *data)
+{
+       struct rq *rq = data;
+       struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+       /* group is entering throttled state, record last load */
+       if (!cfs_rq->throttle_count)
+               update_cfs_load(cfs_rq, 0);
+       cfs_rq->throttle_count++;
+
+       return 0;
+}
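
Taken together the two walkers keep throttle_count consistent under nesting. An illustrative sequence for a hypothetical parent group A with child B (counts shown as A, B):

	throttle A:    walk_tg_tree_from(A, tg_throttle_down, ...)  ->  1, 1
	throttle B:    walk_tg_tree_from(B, tg_throttle_down, ...)  ->  1, 2
	unthrottle A:  walk_tg_tree_from(A, ..., tg_unthrottle_up)  ->  0, 1
	unthrottle B:  walk_tg_tree_from(B, ..., tg_unthrottle_up)  ->  0, 0

B reports throttled_hierarchy() != 0 throughout, and only the final decrement to zero triggers the load-window adjustment in tg_unthrottle_up().
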
+
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
        struct rq *rq = rq_of(cfs_rq);
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
@@ -1413,7 +1488,9 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
        se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
        /* account load preceding throttle */
-       update_cfs_load(cfs_rq, 0);
+       rcu_read_lock();
+       walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
+       rcu_read_unlock();
 
        task_delta = cfs_rq->h_nr_running;
        for_each_sched_entity(se) {
@@ -1434,6 +1511,7 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
                rq->nr_running -= task_delta;
 
        cfs_rq->throttled = 1;
+       cfs_rq->throttled_timestamp = rq->clock;
        raw_spin_lock(&cfs_b->lock);
        list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
        raw_spin_unlock(&cfs_b->lock);
@@ -1451,8 +1529,14 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
        cfs_rq->throttled = 0;
        raw_spin_lock(&cfs_b->lock);
+       cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
        list_del_rcu(&cfs_rq->throttled_list);
        raw_spin_unlock(&cfs_b->lock);
+       cfs_rq->throttled_timestamp = 0;
+
+       update_rq_clock(rq);
+       /* update hierarchical throttle state */
+       walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
 
        if (!cfs_rq->load.weight)
                return;
@@ -1536,6 +1620,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
        throttled = !list_empty(&cfs_b->throttled_cfs_rq);
        /* idle depends on !throttled (for the case of a large deficit) */
        idle = cfs_b->idle && !throttled;
+       cfs_b->nr_periods += overrun;
 
        /* if we're going inactive then everything else can be deferred */
        if (idle)
@@ -1549,6 +1634,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
                goto out_unlock;
        }
 
+       /* account preceding periods in which throttling occurred */
+       cfs_b->nr_throttled += overrun;
+
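
nr_periods and nr_throttled, together with the throttled_time accumulated in unthrottle_cfs_rq() above, are the raw counters behind the group's bandwidth statistics; they surface to userspace as the nr_periods, nr_throttled and throttled_time fields of the cgroup's cpu.stat file.
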
        /*
         * There are throttled entities so we must first use the new bandwidth
         * to unthrottle them before making it generally available.  This
@@ -1590,14 +1678,167 @@ out_unlock:
 
        return idle;
 }
+
+/* a cfs_rq won't donate quota below this amount */
+static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
+/* minimum remaining period time to redistribute slack quota */
+static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
+/* how long we wait to gather additional slack before distributing */
+static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
+
+/* are we near the end of the current quota period? */
+static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
+{
+       struct hrtimer *refresh_timer = &cfs_b->period_timer;
+       u64 remaining;
+
+       /* if the call-back is running a quota refresh is already occurring */
+       if (hrtimer_callback_running(refresh_timer))
+               return 1;
+
+       /* is a quota refresh about to occur? */
+       remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
+       if (remaining < min_expire)
+               return 1;
+
+       return 0;
+}
+
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+       u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
+
+       /* if there's a quota refresh soon, don't bother with slack */
+       if (runtime_refresh_within(cfs_b, min_left))
+               return;
+
+       start_bandwidth_timer(&cfs_b->slack_timer,
+                               ns_to_ktime(cfs_bandwidth_slack_period));
+}
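
In numbers, with the constants as defined above: the slack timer fires cfs_bandwidth_slack_period = 5ms after arming, and distributing is only worthwhile if at least min_bandwidth_expiration = 2ms of the period remain when it does, so arming is skipped whenever the next quota refresh is due within min_left = 5 + 2 = 7ms.
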
+
+/* we know any runtime found here is valid as update_curr() precedes return */
+static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+       s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
+
+       if (slack_runtime <= 0)
+               return;
+
+       raw_spin_lock(&cfs_b->lock);
+       if (cfs_b->quota != RUNTIME_INF &&
+           cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+               cfs_b->runtime += slack_runtime;
+
+               /* we are under rq->lock, defer unthrottling using a timer */
+               if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
+                   !list_empty(&cfs_b->throttled_cfs_rq))
+                       start_cfs_slack_bandwidth(cfs_b);
+       }
+       raw_spin_unlock(&cfs_b->lock);
+
+       /* even if it's not valid for return, we don't want to try again */
+       cfs_rq->runtime_remaining -= slack_runtime;
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+               return;
+
+       __return_cfs_rq_runtime(cfs_rq);
+}
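
A worked example with hypothetical numbers: a cfs_rq dequeuing its last task while holding 4ms of unused runtime_remaining keeps min_cfs_rq_runtime = 1ms against a quick re-wakeup and donates slack_runtime = 3ms back to cfs_b->runtime; if the pool then exceeds a slice while other cfs_rqs sit on the throttled list, the slack timer is armed to hand the surplus out.
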
+
+/*
+ * This is done with a timer (instead of inline with bandwidth return) since
+ * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
+ */
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+       u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+       u64 expires;
+
+       /* confirm we're still not at a refresh boundary */
+       if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+               return;
+
+       raw_spin_lock(&cfs_b->lock);
+       if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+               runtime = cfs_b->runtime;
+               cfs_b->runtime = 0;
+       }
+       expires = cfs_b->runtime_expires;
+       raw_spin_unlock(&cfs_b->lock);
+
+       if (!runtime)
+               return;
+
+       runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+
+       raw_spin_lock(&cfs_b->lock);
+       if (expires == cfs_b->runtime_expires)
+               cfs_b->runtime = runtime;
+       raw_spin_unlock(&cfs_b->lock);
+}
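
The closing expires == cfs_b->runtime_expires test mirrors the one in __return_cfs_rq_runtime(): distribute_cfs_runtime() runs with cfs_b->lock dropped, so a period refresh can slip in underneath, and leftover slack from the old period must not be written over a freshly issued quota.
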
+
+/*
+ * When a group wakes up we want to make sure that its quota is not already
+ * expired/exceeded, otherwise it may be allowed to steal additional ticks of
+ * runtime as update_curr() throttling cannot trigger until it's on-rq.
+ */
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
+{
+       /* an active group must be handled by the update_curr()->put() path */
+       if (!cfs_rq->runtime_enabled || cfs_rq->curr)
+               return;
+
+       /* ensure the group is not already throttled */
+       if (cfs_rq_throttled(cfs_rq))
+               return;
+
+       /* update runtime allocation */
+       account_cfs_rq_runtime(cfs_rq, 0);
+       if (cfs_rq->runtime_remaining <= 0)
+               throttle_cfs_rq(cfs_rq);
+}
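
The zero-delta call is the point here: account_cfs_rq_runtime(cfs_rq, 0) charges nothing but still runs the quota expiry/refresh path, so a runtime_remaining value cached from before the group slept is revalidated, and the group throttled if it is actually out of quota, before its tasks become runnable.
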
+
+/* conditionally throttle active cfs_rq's from put_prev_entity() */
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+               return;
+
+       /*
+        * it's possible for a throttled entity to be forced into a running
+        * state (e.g. set_curr_task); in this case we're finished.
+        */
+       if (cfs_rq_throttled(cfs_rq))
+               return;
+
+       throttle_cfs_rq(cfs_rq);
+}
 #else
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
                                     unsigned long delta_exec) {}
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
        return 0;
 }
+
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+       return 0;
+}
+
+static inline int throttled_lb_pair(struct task_group *tg,
+                                   int src_cpu, int dest_cpu)
+{
+       return 0;
+}
 #endif
 
 /**************************************************
@@ -2270,6 +2511,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
        if (unlikely(se == pse))
                return;
 
+       /*
+        * This is possible from callers such as pull_task(), in which we
+        * unconditionally check_preempt_curr() after an enqueue (which may have
+        * led to a throttle).  This both saves work and prevents false
+        * next-buddy nomination below.
+        */
+       if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+               return;
+
        if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
                set_next_buddy(pse);
                next_buddy_marked = 1;
@@ -2278,6 +2528,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
        /*
         * We can come here with TIF_NEED_RESCHED already set from new task
         * wake up path.
+        *
+        * Note: this also catches the edge-case of curr being in a throttled
+        * group (e.g. via set_curr_task), since update_curr() (in the
+        * enqueue of curr) will have resulted in resched being set.  This
+        * prevents us from potentially nominating it as a false LAST_BUDDY
+        * below.
         */
        if (test_tsk_need_resched(curr))
                return;
@@ -2396,7 +2652,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 {
        struct sched_entity *se = &p->se;
 
-       if (!se->on_rq)
+       /* throttled hierarchies are not runnable */
+       if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
                return false;
 
        /* Tell the scheduler that we'd really like pse to run next. */
@@ -2493,6 +2750,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
        for_each_leaf_cfs_rq(busiest, cfs_rq) {
                list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
+                       if (throttled_lb_pair(task_group(p),
+                                             busiest->cpu, this_cpu))
+                               break;
 
                        if (!can_migrate_task(p, busiest, this_cpu,
                                                sd, idle, &pinned))
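
The added check uses break rather than continue deliberately: every task on a cfs_rq's ->tasks list belongs to that cfs_rq's task_group, so once the (busiest, this_cpu) pair is throttled for the group, nothing on the list can migrate and the scan moves straight on to the next leaf cfs_rq.
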
@@ -2608,8 +2868,13 @@ static void update_shares(int cpu)
         * Iterates the task_group tree in a bottom up fashion, see
         * list_add_leaf_cfs_rq() for details.
         */
-       for_each_leaf_cfs_rq(rq, cfs_rq)
+       for_each_leaf_cfs_rq(rq, cfs_rq) {
+               /* throttled entities do not contribute to load */
+               if (throttled_hierarchy(cfs_rq))
+                       continue;
+
                update_shares_cpu(cfs_rq->tg, cpu);
+       }
        rcu_read_unlock();
 }
 
@@ -2659,9 +2924,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                u64 rem_load, moved_load;
 
                /*
-                * empty group
+                * empty group or part of a throttled hierarchy
                 */
-               if (!busiest_cfs_rq->task_weight)
+               if (!busiest_cfs_rq->task_weight ||
+                   throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
                        continue;
 
                rem_load = (u64)rem_load_move * busiest_weight;
@@ -4003,22 +4269,6 @@ out_unlock:
 }
 
 #ifdef CONFIG_NO_HZ
-
-static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
-
-static void trigger_sched_softirq(void *data)
-{
-       raise_softirq_irqoff(SCHED_SOFTIRQ);
-}
-
-static inline void init_sched_softirq_csd(struct call_single_data *csd)
-{
-       csd->func = trigger_sched_softirq;
-       csd->info = NULL;
-       csd->flags = 0;
-       csd->priv = 0;
-}
-
 /*
  * idle load balancing details
  * - One of the idle CPUs nominates itself as idle load_balancer, while
@@ -4184,11 +4434,16 @@ static void nohz_balancer_kick(int cpu)
        }
 
        if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
-               struct call_single_data *cp;
-
                cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
-               cp = &per_cpu(remote_sched_softirq_cb, cpu);
-               __smp_call_function_single(ilb_cpu, cp, 0);
+
+               smp_mb();
+               /*
+                * Use smp_send_reschedule() instead of resched_cpu().
+                * This way we generate a sched IPI on the target cpu which
+                * is idle. And the softirq performing nohz idle load balance
+                * will be run before returning from the IPI.
+                */
+               smp_send_reschedule(ilb_cpu);
        }
        return;
 }
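
The receiving side of this kick lives in kernel/sched.c and is not part of this hunk. A minimal sketch of its shape, assuming the rq->nohz_balance_kick flag used above (exact guards and helpers in the companion change may differ):

	void scheduler_ipi(void)
	{
		struct rq *rq = this_rq();

		/* ... pending try_to_wake_up() work elided ... */

		irq_enter();
		/*
		 * nohz_balancer_kick() nominated this idle cpu: run the
		 * SCHED_SOFTIRQ performing the nohz idle balance before
		 * the IPI returns.
		 */
		if (unlikely(idle_cpu(smp_processor_id()) &&
			     rq->nohz_balance_kick && !need_resched()))
			raise_softirq_irqoff(SCHED_SOFTIRQ);
		irq_exit();
	}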