sched: Improve balance_cpu() to consider other cpus in its group as target of (pinned...
[deliverable/linux.git] / kernel / sched / fair.c
index f9f9aa0edf3c47f8731d4a7f43ca3ef1872fdfce..22321db64952f9461b9e03a3e90bd99217478f8f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3054,6 +3054,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED 0x01
 #define LBF_NEED_BREAK 0x02
+#define LBF_SOME_PINNED 0x04
 
 struct lb_env {
        struct sched_domain     *sd;
@@ -3064,6 +3065,8 @@ struct lb_env {
        int                     dst_cpu;
        struct rq               *dst_rq;
 
+       struct cpumask          *dst_grpmask;
+       int                     new_dst_cpu;
        enum cpu_idle_type      idle;
        long                    imbalance;
        unsigned int            flags;
@@ -3131,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
         * 3) are cache-hot on their current CPU.
         */
        if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+               int new_dst_cpu;
+
                schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+               /*
+                * Remember if this task can be migrated to any other cpu in
+                * our sched_group. We may want to revisit it if we couldn't
+                * meet load balance goals by pulling other tasks on src_cpu.
+                *
+                * Also avoid computing new_dst_cpu if we have already computed
+                * one in current iteration.
+                */
+               if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+                       return 0;
+
+               new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+                                               tsk_cpus_allowed(p));
+               if (new_dst_cpu < nr_cpu_ids) {
+                       env->flags |= LBF_SOME_PINNED;
+                       env->new_dst_cpu = new_dst_cpu;
+               }
                return 0;
        }
+
+       /* Record that we found at least one task that could run on dst_cpu */
        env->flags &= ~LBF_ALL_PINNED;
 
        if (task_running(env->src_rq, p)) {
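Illustrative aside: the can_migrate_task() hunk above can be modelled in plain userspace C. The sketch below is a toy model only, not kernel code; names such as toy_env, first_cpu_in() and check_affinity() are invented for the example, and cpumasks are reduced to 64-bit words. It mimics how a task pinned away from dst_cpu still records one alternate cpu of the destination group via the new LBF_SOME_PINNED flag.

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS          64
#define LBF_ALL_PINNED   0x01
#define LBF_SOME_PINNED  0x04

struct toy_env {
    uint64_t dst_grpmask;   /* cpus in the destination sched_group */
    int      dst_cpu;
    int      new_dst_cpu;
    unsigned flags;
};

/* Index of the lowest set bit, or NR_CPUS if the mask is empty
 * (mirrors cpumask_first_and() returning >= nr_cpu_ids). */
static int first_cpu_in(uint64_t mask)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        if (mask & (1ULL << cpu))
            return cpu;
    return NR_CPUS;
}

/* Toy affinity check: the task cannot run on dst_cpu itself, but
 * remember one alternate cpu of the destination group it could use. */
static int check_affinity(struct toy_env *env, uint64_t cpus_allowed)
{
    if (!(cpus_allowed & (1ULL << env->dst_cpu))) {
        int new_dst_cpu;

        /* Only look for an alternate cpu once per pass, as the patch does. */
        if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
            return 0;

        new_dst_cpu = first_cpu_in(env->dst_grpmask & cpus_allowed);
        if (new_dst_cpu < NR_CPUS) {
            env->flags |= LBF_SOME_PINNED;
            env->new_dst_cpu = new_dst_cpu;
        }
        return 0;
    }
    /* At least one task could run on dst_cpu itself. */
    env->flags &= ~LBF_ALL_PINNED;
    return 1;
}

int main(void)
{
    /* dst_cpu is cpu 0; its sched_group also contains cpus 1-3. */
    struct toy_env env = { .dst_grpmask = 0xf, .dst_cpu = 0,
                           .flags = LBF_ALL_PINNED };

    /* Task pinned to cpus 2-3: not movable to cpu 0, but cpu 2 would do. */
    check_affinity(&env, 0xc);
    printf("flags=%#x new_dst_cpu=%d\n", env.flags, env.new_dst_cpu);
    return 0;
}

Running this prints flags=0x5 new_dst_cpu=2: LBF_ALL_PINNED stays set because no task was movable to dst_cpu, while LBF_SOME_PINNED records that cpu 2 in the group could take the task.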
@@ -4213,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
                        int *balance)
 {
-       int ld_moved, active_balance = 0;
+       int ld_moved, cur_ld_moved, active_balance = 0;
+       int lb_iterations, max_lb_iterations;
        struct sched_group *group;
        struct rq *busiest;
        unsigned long flags;
@@ -4223,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                .sd             = sd,
                .dst_cpu        = this_cpu,
                .dst_rq         = this_rq,
+               .dst_grpmask    = sched_group_cpus(sd->groups),
                .idle           = idle,
                .loop_break     = sched_nr_migrate_break,
        };
 
        cpumask_copy(cpus, cpu_active_mask);
+       max_lb_iterations = cpumask_weight(env.dst_grpmask);
 
        schedstat_inc(sd, lb_count[idle]);
 
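Aside on the retry budget set up above: cpumask_weight() is a population count over the group's cpumask, so max_lb_iterations is simply the number of cpus in the destination sched_group. A tiny userspace sketch, with toy_cpumask_weight() as an invented stand-in:

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for cpumask_weight(): count of set bits in the mask. */
static int toy_cpumask_weight(uint64_t mask)
{
    return __builtin_popcountll(mask);
}

int main(void)
{
    uint64_t dst_grpmask = 0xf;     /* group spans cpus 0-3 */
    int max_lb_iterations = toy_cpumask_weight(dst_grpmask);

    printf("max_lb_iterations=%d\n", max_lb_iterations);  /* prints 4 */
    return 0;
}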
@@ -4253,6 +4281,7 @@ redo:
        schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
        ld_moved = 0;
+       lb_iterations = 1;
        if (busiest->nr_running > 1) {
                /*
                 * Attempt to move tasks. If find_busiest_group has found
@@ -4270,7 +4299,13 @@ more_balance:
                double_rq_lock(this_rq, busiest);
                if (!env.loop)
                        update_h_load(env.src_cpu);
-               ld_moved += move_tasks(&env);
+
+               /*
+                * cur_ld_moved - load moved in current iteration
+                * ld_moved     - cumulative load moved across iterations
+                */
+               cur_ld_moved = move_tasks(&env);
+               ld_moved += cur_ld_moved;
                double_rq_unlock(this_rq, busiest);
                local_irq_restore(flags);
 
@@ -4282,8 +4317,43 @@ more_balance:
                /*
                 * some other cpu did the load balance for us.
                 */
-               if (ld_moved && this_cpu != smp_processor_id())
-                       resched_cpu(this_cpu);
+               if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+                       resched_cpu(env.dst_cpu);
+
+               /*
+                * Revisit (affine) tasks on src_cpu that couldn't be moved to
+                * us and move them to an alternate dst_cpu in our sched_group
+                * where they can run. The upper limit on how many times we
+                * iterate on the same src_cpu depends on the number of cpus
+                * in our sched_group.
+                *
+                * This changes load balance semantics a bit on who can move
+                * load to a given_cpu. In addition to the given_cpu itself
+                * (or an ilb_cpu acting on its behalf where given_cpu is
+                * nohz-idle), we now have balance_cpu in a position to move
+                * load to given_cpu. In rare situations, this may cause
+                * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+                * _independently_ and at the _same_ time to move some load to
+                * given_cpu) causing excess load to be moved to given_cpu.
+                * This however should not happen often in practice and
+                * moreover subsequent load balance cycles should correct the
+                * excess load moved.
+                */
+               if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+                               lb_iterations++ < max_lb_iterations) {
+
+                       this_rq          = cpu_rq(env.new_dst_cpu);
+                       env.dst_rq       = this_rq;
+                       env.dst_cpu      = env.new_dst_cpu;
+                       env.flags       &= ~LBF_SOME_PINNED;
+                       env.loop         = 0;
+                       env.loop_break   = sched_nr_migrate_break;
+                       /*
+                        * Go back to "more_balance" rather than "redo" since we
+                        * need to continue with the same src_cpu.
+                        */
+                       goto more_balance;
+               }
 
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(env.flags & LBF_ALL_PINNED)) {
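Illustrative aside: the retry control flow added above, end to end, as a compact userspace simulation. toy_move_tasks(), toy_task and the two-task workload are invented for the example; the sketch only mirrors the flow of the patch (pull load towards dst_cpu, and if pinned tasks remain and an imbalance persists, retarget to new_dst_cpu and pull again from the same src_cpu, bounded by the group's cpu count), not the kernel's actual load accounting.

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS          64
#define LBF_SOME_PINNED  0x04

struct toy_task {
    uint64_t cpus_allowed;  /* affinity mask */
    long     load;
    int      on_src;        /* still queued on src_cpu? */
};

struct toy_env {
    uint64_t dst_grpmask;
    int      dst_cpu;
    int      new_dst_cpu;
    long     imbalance;
    unsigned flags;
};

static int first_cpu_in(uint64_t mask)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        if (mask & (1ULL << cpu))
            return cpu;
    return NR_CPUS;
}

/* Pull tasks that may run on env->dst_cpu until the imbalance is met;
 * for pinned tasks, remember an alternate cpu of the destination group. */
static long toy_move_tasks(struct toy_env *env, struct toy_task *tasks, int nr)
{
    long moved = 0;

    for (int i = 0; i < nr && env->imbalance > 0; i++) {
        struct toy_task *p = &tasks[i];

        if (!p->on_src)
            continue;
        if (!(p->cpus_allowed & (1ULL << env->dst_cpu))) {
            int alt = first_cpu_in(env->dst_grpmask & p->cpus_allowed);

            if (!(env->flags & LBF_SOME_PINNED) && alt < NR_CPUS) {
                env->flags |= LBF_SOME_PINNED;
                env->new_dst_cpu = alt;
            }
            continue;
        }
        p->on_src = 0;
        moved += p->load;
        env->imbalance -= p->load;
    }
    return moved;
}

int main(void)
{
    /* Two tasks on src_cpu: one pinned to cpus 2-3, one free to run anywhere. */
    struct toy_task tasks[] = {
        { .cpus_allowed = 0xc,   .load = 100, .on_src = 1 },
        { .cpus_allowed = ~0ULL, .load = 100, .on_src = 1 },
    };
    struct toy_env env = { .dst_grpmask = 0xf, .dst_cpu = 0, .imbalance = 200 };
    int max_lb_iterations = __builtin_popcountll(env.dst_grpmask);
    int lb_iterations = 1;
    long ld_moved = 0, cur_ld_moved;

more_balance:
    cur_ld_moved = toy_move_tasks(&env, tasks, 2); /* load moved this pass */
    ld_moved += cur_ld_moved;                      /* cumulative total     */

    if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
        lb_iterations++ < max_lb_iterations) {
        env.dst_cpu = env.new_dst_cpu;             /* retarget the pull    */
        env.flags  &= ~LBF_SOME_PINNED;
        goto more_balance;                         /* same src_cpu         */
    }

    printf("ld_moved=%ld remaining imbalance=%ld final dst_cpu=%d\n",
           ld_moved, env.imbalance, env.dst_cpu);
    return 0;
}

With this workload, the first pass moves only the unpinned task to cpu 0 and notes cpu 2 as an alternate target; the second pass, retargeted to cpu 2, moves the pinned task as well, leaving no residual imbalance (ld_moved=200, imbalance=0).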