sched: Improve balance_cpu() to consider other cpus in its group as target of (pinned...
[deliverable/linux.git] / kernel / sched / fair.c
index f9f9aa0edf3c47f8731d4a7f43ca3ef1872fdfce..22321db64952f9461b9e03a3e90bd99217478f8f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3054,6 +3054,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED 0x01
 #define LBF_NEED_BREAK 0x02
+#define LBF_SOME_PINNED 0x04
 
 struct lb_env {
        struct sched_domain     *sd;
@@ -3064,6 +3065,8 @@ struct lb_env {
        int                     dst_cpu;
        struct rq               *dst_rq;
 
+       struct cpumask          *dst_grpmask;
+       int                     new_dst_cpu;
        enum cpu_idle_type      idle;
        long                    imbalance;
        unsigned int            flags;
@@ -3131,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
         * 3) are cache-hot on their current CPU.
         */
        if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+               int new_dst_cpu;
+
                schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+               /*
+                * Remember if this task can be migrated to any other cpu in
+                * our sched_group. We may want to revisit it if we couldn't
+                * meet load balance goals by pulling other tasks on src_cpu.
+                *
+                * Also avoid computing new_dst_cpu if we have already computed
+                * one in current iteration.
+                */
+               if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+                       return 0;
+
+               new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+                                               tsk_cpus_allowed(p));
+               if (new_dst_cpu < nr_cpu_ids) {
+                       env->flags |= LBF_SOME_PINNED;
+                       env->new_dst_cpu = new_dst_cpu;
+               }
                return 0;
        }
+
+       /* Record that we found at least one task that could run on dst_cpu */
        env->flags &= ~LBF_ALL_PINNED;
 
        if (task_running(env->src_rq, p)) {
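Illustrative aside: the can_migrate_task() hunk above can be modelled in plain userspace C. The sketch below is a toy model only, not kernel code; names such as toy_env, first_cpu_in() and check_affinity() are invented for the example, and cpumasks are reduced to 64-bit words. It mimics how a task pinned away from dst_cpu still records one alternate cpu of the destination group via the new LBF_SOME_PINNED flag.

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS          64
#define LBF_ALL_PINNED   0x01
#define LBF_SOME_PINNED  0x04

struct toy_env {
    uint64_t dst_grpmask;   /* cpus in the destination sched_group */
    int      dst_cpu;
    int      new_dst_cpu;
    unsigned flags;
};

/* Index of the lowest set bit, or NR_CPUS if the mask is empty
 * (mirrors cpumask_first_and() returning >= nr_cpu_ids). */
static int first_cpu_in(uint64_t mask)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        if (mask & (1ULL << cpu))
            return cpu;
    return NR_CPUS;
}

/* Toy affinity check: the task cannot run on dst_cpu itself, but
 * remember one alternate cpu of the destination group it could use. */
static int check_affinity(struct toy_env *env, uint64_t cpus_allowed)
{
    if (!(cpus_allowed & (1ULL << env->dst_cpu))) {
        int new_dst_cpu;

        /* Only look for an alternate cpu once per pass, as the patch does. */
        if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
            return 0;

        new_dst_cpu = first_cpu_in(env->dst_grpmask & cpus_allowed);
        if (new_dst_cpu < NR_CPUS) {
            env->flags |= LBF_SOME_PINNED;
            env->new_dst_cpu = new_dst_cpu;
        }
        return 0;
    }
    /* At least one task could run on dst_cpu itself. */
    env->flags &= ~LBF_ALL_PINNED;
    return 1;
}

int main(void)
{
    /* dst_cpu is cpu 0; its sched_group also contains cpus 1-3. */
    struct toy_env env = { .dst_grpmask = 0xf, .dst_cpu = 0,
                           .flags = LBF_ALL_PINNED };

    /* Task pinned to cpus 2-3: not movable to cpu 0, but cpu 2 would do. */
    check_affinity(&env, 0xc);
    printf("flags=%#x new_dst_cpu=%d\n", env.flags, env.new_dst_cpu);
    return 0;
}

Running this prints flags=0x5 new_dst_cpu=2: LBF_ALL_PINNED stays set because no task was movable to dst_cpu, while LBF_SOME_PINNED records that cpu 2 in the group could take the task.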
@@ -4213,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
                        int *balance)
 {
-       int ld_moved, active_balance = 0;
+       int ld_moved, cur_ld_moved, active_balance = 0;
+       int lb_iterations, max_lb_iterations;
        struct sched_group *group;
        struct rq *busiest;
        unsigned long flags;
@@ -4223,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                .sd             = sd,
                .dst_cpu        = this_cpu,
                .dst_rq         = this_rq,
+               .dst_grpmask    = sched_group_cpus(sd->groups),
                .idle           = idle,
                .loop_break     = sched_nr_migrate_break,
        };
 
        cpumask_copy(cpus, cpu_active_mask);
+       max_lb_iterations = cpumask_weight(env.dst_grpmask);
 
        schedstat_inc(sd, lb_count[idle]);
 
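Aside on the retry budget set up above: cpumask_weight() is a population count over the group's cpumask, so max_lb_iterations is simply the number of cpus in the destination sched_group. A tiny userspace sketch, with toy_cpumask_weight() as an invented stand-in:

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for cpumask_weight(): count of set bits in the mask. */
static int toy_cpumask_weight(uint64_t mask)
{
    return __builtin_popcountll(mask);
}

int main(void)
{
    uint64_t dst_grpmask = 0xf;     /* group spans cpus 0-3 */
    int max_lb_iterations = toy_cpumask_weight(dst_grpmask);

    printf("max_lb_iterations=%d\n", max_lb_iterations);  /* prints 4 */
    return 0;
}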
@@ -4253,6 +4281,7 @@ redo:
        schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
        ld_moved = 0;
+       lb_iterations = 1;
        if (busiest->nr_running > 1) {
                /*
                 * Attempt to move tasks. If find_busiest_group has found
@@ -4270,7 +4299,13 @@ more_balance:
                double_rq_lock(this_rq, busiest);
                if (!env.loop)
                        update_h_load(env.src_cpu);
-               ld_moved += move_tasks(&env);
+
+               /*
+                * cur_ld_moved - load moved in current iteration
+                * ld_moved     - cumulative load moved across iterations
+                */
+               cur_ld_moved = move_tasks(&env);
+               ld_moved += cur_ld_moved;
                double_rq_unlock(this_rq, busiest);
                local_irq_restore(flags);
 
@@ -4282,8 +4317,43 @@ more_balance:
                /*
                 * some other cpu did the load balance for us.
                 */
-               if (ld_moved && this_cpu != smp_processor_id())
-                       resched_cpu(this_cpu);
+               if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+                       resched_cpu(env.dst_cpu);
+
+               /*
+                * Revisit (affine) tasks on src_cpu that couldn't be moved to
+                * us and move them to an alternate dst_cpu in our sched_group
+                * where they can run. The upper limit on how many times we
+                * iterate on the same src_cpu depends on the number of cpus
+                * in our sched_group.
+                *
+                * This changes load balance semantics a bit on who can move
+                * load to a given_cpu. In addition to the given_cpu itself
+                * (or an ilb_cpu acting on its behalf where given_cpu is
+                * nohz-idle), we now have balance_cpu in a position to move
+                * load to given_cpu. In rare situations, this may cause
+                * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+                * _independently_ and at the _same_ time to move some load to
+                * given_cpu) causing excess load to be moved to given_cpu.
+                * This however should not happen often in practice and
+                * moreover subsequent load balance cycles should correct the
+                * excess load moved.
+                */
+               if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+                               lb_iterations++ < max_lb_iterations) {
+
+                       this_rq          = cpu_rq(env.new_dst_cpu);
+                       env.dst_rq       = this_rq;
+                       env.dst_cpu      = env.new_dst_cpu;
+                       env.flags       &= ~LBF_SOME_PINNED;
+                       env.loop         = 0;
+                       env.loop_break   = sched_nr_migrate_break;
+                       /*
+                        * Go back to "more_balance" rather than "redo" since we
+                        * need to continue with the same src_cpu.
+                        */
+                       goto more_balance;
+               }
 
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(env.flags & LBF_ALL_PINNED)) {
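Illustrative aside: the retry control flow added above, end to end, as a compact userspace simulation. toy_move_tasks(), toy_task and the two-task workload are invented for the example; the sketch only mirrors the flow of the patch (pull load towards dst_cpu, and if pinned tasks remain and an imbalance persists, retarget to new_dst_cpu and pull again from the same src_cpu, bounded by the group's cpu count), not the kernel's actual load accounting.

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS          64
#define LBF_SOME_PINNED  0x04

struct toy_task {
    uint64_t cpus_allowed;  /* affinity mask */
    long     load;
    int      on_src;        /* still queued on src_cpu? */
};

struct toy_env {
    uint64_t dst_grpmask;
    int      dst_cpu;
    int      new_dst_cpu;
    long     imbalance;
    unsigned flags;
};

static int first_cpu_in(uint64_t mask)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        if (mask & (1ULL << cpu))
            return cpu;
    return NR_CPUS;
}

/* Pull tasks that may run on env->dst_cpu until the imbalance is met;
 * for pinned tasks, remember an alternate cpu of the destination group. */
static long toy_move_tasks(struct toy_env *env, struct toy_task *tasks, int nr)
{
    long moved = 0;

    for (int i = 0; i < nr && env->imbalance > 0; i++) {
        struct toy_task *p = &tasks[i];

        if (!p->on_src)
            continue;
        if (!(p->cpus_allowed & (1ULL << env->dst_cpu))) {
            int alt = first_cpu_in(env->dst_grpmask & p->cpus_allowed);

            if (!(env->flags & LBF_SOME_PINNED) && alt < NR_CPUS) {
                env->flags |= LBF_SOME_PINNED;
                env->new_dst_cpu = alt;
            }
            continue;
        }
        p->on_src = 0;
        moved += p->load;
        env->imbalance -= p->load;
    }
    return moved;
}

int main(void)
{
    /* Two tasks on src_cpu: one pinned to cpus 2-3, one free to run anywhere. */
    struct toy_task tasks[] = {
        { .cpus_allowed = 0xc,   .load = 100, .on_src = 1 },
        { .cpus_allowed = ~0ULL, .load = 100, .on_src = 1 },
    };
    struct toy_env env = { .dst_grpmask = 0xf, .dst_cpu = 0, .imbalance = 200 };
    int max_lb_iterations = __builtin_popcountll(env.dst_grpmask);
    int lb_iterations = 1;
    long ld_moved = 0, cur_ld_moved;

more_balance:
    cur_ld_moved = toy_move_tasks(&env, tasks, 2); /* load moved this pass */
    ld_moved += cur_ld_moved;                      /* cumulative total     */

    if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
        lb_iterations++ < max_lb_iterations) {
        env.dst_cpu = env.new_dst_cpu;             /* retarget the pull    */
        env.flags  &= ~LBF_SOME_PINNED;
        goto more_balance;                         /* same src_cpu         */
    }

    printf("ld_moved=%ld remaining imbalance=%ld final dst_cpu=%d\n",
           ld_moved, env.imbalance, env.dst_cpu);
    return 0;
}

With this workload, the first pass moves only the unpinned task to cpu 0 and notes cpu 2 as an alternate target; the second pass, retargeted to cpu 2, moves the pinned task as well, leaving no residual imbalance (ld_moved=200, imbalance=0).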