sched: Check for pushing rt tasks after all scheduling
diff --git a/kernel/sched.c b/kernel/sched.c
index 98972d366fdc5bd8c2bf6232ee212e90d20be5cb..a030d4514cdc00f0143d2f7eb94dabd29604da9f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -693,6 +693,7 @@ static inline int cpu_of(struct rq *rq)
 #define this_rq()              (&__get_cpu_var(runqueues))
 #define task_rq(p)             cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
+#define raw_rq()               (&__raw_get_cpu_var(runqueues))
 
 inline void update_rq_clock(struct rq *rq)
 {
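
Note: the new raw_rq() pairs with this_rq() above, but uses __raw_get_cpu_var(), which skips the smp_processor_id() preemption-safety check. It is meant for callers that either run with preemption disabled or can tolerate a momentarily stale CPU reference; the io_schedule() hunks further down are the intended users. A rough open-coded equivalent, assuming the usual per-cpu helpers:

        struct rq *rq = &per_cpu(runqueues, raw_smp_processor_id());
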
@@ -1522,13 +1523,18 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
                        unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-       unsigned long shares;
        unsigned long rq_weight;
+       unsigned long shares;
+       int boost = 0;
 
        if (!tg->se[cpu])
                return;
 
        rq_weight = tg->cfs_rq[cpu]->rq_weight;
+       if (!rq_weight) {
+               boost = 1;
+               rq_weight = NICE_0_LOAD;
+       }
 
        /*
         *           \Sum shares * rq_weight
@@ -1545,8 +1551,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
                unsigned long flags;
 
                spin_lock_irqsave(&rq->lock, flags);
-               tg->cfs_rq[cpu]->shares = shares;
-
+               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
                __set_se_shares(tg->se[cpu], shares);
                spin_unlock_irqrestore(&rq->lock, flags);
        }
@@ -1559,7 +1564,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-       unsigned long weight, rq_weight = 0;
+       unsigned long weight, rq_weight = 0, eff_weight = 0;
        unsigned long shares = 0;
        struct sched_domain *sd = data;
        int i;
@@ -1571,11 +1576,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
                 * run here it will not get delayed by group starvation.
                 */
                weight = tg->cfs_rq[i]->load.weight;
+               tg->cfs_rq[i]->rq_weight = weight;
+               rq_weight += weight;
+
                if (!weight)
                        weight = NICE_0_LOAD;
 
-               tg->cfs_rq[i]->rq_weight = weight;
-               rq_weight += weight;
+               eff_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
 
@@ -1585,8 +1592,14 @@ static int tg_shares_up(struct task_group *tg, void *data)
        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                shares = tg->shares;
 
-       for_each_cpu(i, sched_domain_span(sd))
-               update_group_shares_cpu(tg, i, shares, rq_weight);
+       for_each_cpu(i, sched_domain_span(sd)) {
+               unsigned long sd_rq_weight = rq_weight;
+
+               if (!tg->cfs_rq[i]->rq_weight)
+                       sd_rq_weight = eff_weight;
+
+               update_group_shares_cpu(tg, i, shares, sd_rq_weight);
+       }
 
        return 0;
 }
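
Note: the two hunks above fix group fairness when part of the domain is idle. An idle CPU's group entity gets a boosted weight (its zero rq_weight is replaced by NICE_0_LOAD and the division uses eff_weight) so a task waking there is serviced at a sane priority, while its cfs_rq->shares is reported as 0 so the boost never shows up as real load. A worked example under assumed values (tg->shares = 1024, NICE_0_LOAD = 1024, two CPUs, CPU0 busy with rq_weight 1024, CPU1 idle):

        rq_weight  = 1024 + 0    = 1024
        eff_weight = 1024 + 1024 = 2048
        CPU0: shares = 1024 * 1024 / 1024 = 1024  /* cfs_rq->shares = 1024 */
        CPU1: shares = 1024 * 1024 / 2048 =  512  /* boosted: cfs_rq->shares = 0 */
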
@@ -1616,8 +1629,14 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_shares(struct sched_domain *sd)
 {
-       u64 now = cpu_clock(raw_smp_processor_id());
-       s64 elapsed = now - sd->last_update;
+       s64 elapsed;
+       u64 now;
+
+       if (root_task_group_empty())
+               return;
+
+       now = cpu_clock(raw_smp_processor_id());
+       elapsed = now - sd->last_update;
 
        if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
                sd->last_update = now;
@@ -1627,6 +1646,9 @@ static void update_shares(struct sched_domain *sd)
 
 static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 {
+       if (root_task_group_empty())
+               return;
+
        spin_unlock(&rq->lock);
        update_shares(sd);
        spin_lock(&rq->lock);
@@ -1634,6 +1656,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 static void update_h_load(long cpu)
 {
+       if (root_task_group_empty())
+               return;
+
        walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
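
Note: all three early returns rely on root_task_group_empty(), defined earlier in sched.c (outside these hunks); at the time it amounted to:

        static int root_task_group_empty(void)
        {
                return list_empty(&root_task_group.children);
        }

When no child task groups exist, walking the group tree to recompute shares and h_load is pure overhead, so it is skipped entirely.
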
 
@@ -2637,9 +2662,32 @@ void sched_fork(struct task_struct *p, int clone_flags)
        set_task_cpu(p, cpu);
 
        /*
-        * Make sure we do not leak PI boosting priority to the child:
+        * Make sure we do not leak PI boosting priority to the child.
         */
        p->prio = current->normal_prio;
+
+       /*
+        * Revert to default priority/policy on fork if requested.
+        */
+       if (unlikely(p->sched_reset_on_fork)) {
+               if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
+                       p->policy = SCHED_NORMAL;
+
+               if (p->normal_prio < DEFAULT_PRIO)
+                       p->prio = DEFAULT_PRIO;
+
+               if (PRIO_TO_NICE(p->static_prio) < 0) {
+                       p->static_prio = NICE_TO_PRIO(0);
+                       set_load_weight(p);
+               }
+
+               /*
+                * We don't need the reset flag anymore after the fork. It has
+                * fulfilled its duty:
+                */
+               p->sched_reset_on_fork = 0;
+       }
+
        if (!rt_prio(p->prio))
                p->sched_class = &fair_sched_class;
 
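Note: userspace requests the new behaviour by OR-ing the flag into the policy argument of sched_setscheduler(); see the __sched_setscheduler() hunks below. A minimal, hypothetical caller (assuming a libc that exposes SCHED_RESET_ON_FORK):

        #include <sched.h>
        #include <stdio.h>

        int main(void)
        {
                struct sched_param sp = { .sched_priority = 10 };

                /* RT for this task; children revert to SCHED_NORMAL, nice 0 */
                if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp) == -1)
                        perror("sched_setscheduler");
                return 0;
        }
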
@@ -2791,14 +2839,14 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
  */
-static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+static int finish_task_switch(struct rq *rq, struct task_struct *prev)
        __releases(rq->lock)
 {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
-#ifdef CONFIG_SMP
        int post_schedule = 0;
 
+#ifdef CONFIG_SMP
        if (current->sched_class->needs_post_schedule)
                post_schedule = current->sched_class->needs_post_schedule(rq);
 #endif
@@ -2820,10 +2868,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        finish_arch_switch(prev);
        perf_counter_task_sched_in(current, cpu_of(rq));
        finish_lock_switch(rq, prev);
-#ifdef CONFIG_SMP
-       if (post_schedule)
-               current->sched_class->post_schedule(rq);
-#endif
 
        fire_sched_in_preempt_notifiers(current);
        if (mm)
@@ -2836,6 +2880,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
                kprobe_flush_task(prev);
                put_task_struct(prev);
        }
+
+       return post_schedule;
 }
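
Note: finish_task_switch() now only samples whether a post-schedule pass is needed and reports that to its caller; the ->post_schedule() hook itself (for the RT class, the task-push logic the commit title refers to) runs later, after the runqueue lock has been dropped and every scheduling decision is complete. The resulting flow, as a sketch:

        schedule()
            context_switch()                 /* returns post_schedule */
                finish_task_switch()         /* samples ->needs_post_schedule() */
        /* ... rq->lock released ... */
        if (post_schedule)
                current->sched_class->post_schedule(rq);
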
 
 /**
@@ -2846,8 +2892,15 @@ asmlinkage void schedule_tail(struct task_struct *prev)
        __releases(rq->lock)
 {
        struct rq *rq = this_rq();
+       int post_schedule;
+
+       post_schedule = finish_task_switch(rq, prev);
+
+#ifdef CONFIG_SMP
+       if (post_schedule)
+               current->sched_class->post_schedule(rq);
+#endif
 
-       finish_task_switch(rq, prev);
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
        /* In this case, finish_task_switch does not reenable preemption */
        preempt_enable();
@@ -2860,7 +2913,7 @@ asmlinkage void schedule_tail(struct task_struct *prev)
  * context_switch - switch to the new MM and the new
  * thread's register state.
  */
-static inline void
+static inline int
 context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next)
 {
@@ -2907,7 +2960,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
         * CPUs since it called schedule(), thus the 'rq' on its stack
         * frame will be invalid.
         */
-       finish_task_switch(this_rq(), prev);
+       return finish_task_switch(this_rq(), prev);
 }
 
 /*
@@ -5318,6 +5371,7 @@ asmlinkage void __sched schedule(void)
 {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
+       int post_schedule = 0;
        struct rq *rq;
        int cpu;
 
@@ -5368,15 +5422,25 @@ need_resched_nonpreemptible:
                rq->curr = next;
                ++*switch_count;
 
-               context_switch(rq, prev, next); /* unlocks the rq */
+               post_schedule = context_switch(rq, prev, next); /* unlocks the rq */
                /*
                 * the context switch might have flipped the stack from under
                 * us, hence refresh the local variables.
                 */
                cpu = smp_processor_id();
                rq = cpu_rq(cpu);
-       } else
+       } else {
+#ifdef CONFIG_SMP
+               if (current->sched_class->needs_post_schedule)
+                       post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif
                spin_unlock_irq(&rq->lock);
+       }
+
+#ifdef CONFIG_SMP
+       if (post_schedule)
+               current->sched_class->post_schedule(rq);
+#endif
 
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
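
Note: the new else branch handles the case where schedule() decides not to switch at all: the runqueue may still hold RT tasks that should be pushed to other CPUs (for instance after a wakeup overloaded this queue), so needs_post_schedule is evaluated on the no-switch path too, and the hook is called only once rq->lock is released. For the RT class this ends up in push_rt_tasks(); a sketch of the hook in kernel/sched_rt.c (it must retake the lock because it runs unlocked):

        static void post_schedule_rt(struct rq *rq)
        {
                spin_lock_irq(&rq->lock);
                push_rt_tasks(rq);
                spin_unlock_irq(&rq->lock);
        }
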
@@ -6123,17 +6187,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
        unsigned long flags;
        const struct sched_class *prev_class = p->sched_class;
        struct rq *rq;
+       int reset_on_fork;
 
        /* may grab non-irq protected spin_locks */
        BUG_ON(in_interrupt());
 recheck:
        /* double check policy once rq lock held */
-       if (policy < 0)
+       if (policy < 0) {
+               reset_on_fork = p->sched_reset_on_fork;
                policy = oldpolicy = p->policy;
-       else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-                       policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-                       policy != SCHED_IDLE)
-               return -EINVAL;
+       } else {
+               reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+               policy &= ~SCHED_RESET_ON_FORK;
+
+               if (policy != SCHED_FIFO && policy != SCHED_RR &&
+                               policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+                               policy != SCHED_IDLE)
+                       return -EINVAL;
+       }
+
        /*
         * Valid priorities for SCHED_FIFO and SCHED_RR are
         * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6177,6 +6249,10 @@ recheck:
                /* can't change other user's priorities */
                if (!check_same_owner(p))
                        return -EPERM;
+
+               /* Normal users shall not reset the sched_reset_on_fork flag */
+               if (p->sched_reset_on_fork && !reset_on_fork)
+                       return -EPERM;
        }
 
        if (user) {
@@ -6220,6 +6296,8 @@ recheck:
        if (running)
                p->sched_class->put_prev_task(rq, p);
 
+       p->sched_reset_on_fork = reset_on_fork;
+
        oldprio = p->prio;
        __setscheduler(rq, p, policy, param->sched_priority);
 
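Note: SCHED_RESET_ON_FORK travels in a high bit of the policy word and is masked off before the remaining policy value is validated. The permission check above is deliberately asymmetric: an unprivileged task may set the flag on itself, but may not clear one that is already set (for example by a privileged launcher). The flag value, from include/linux/sched.h:

        #define SCHED_RESET_ON_FORK     0x40000000
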
@@ -6336,14 +6414,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
        if (p) {
                retval = security_task_getscheduler(p);
                if (!retval)
-                       retval = p->policy;
+                       retval = p->policy
+                               | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
        }
        read_unlock(&tasklist_lock);
        return retval;
 }
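
Note: since sys_sched_getscheduler() now ORs the flag into its return value, callers that want the bare policy must mask it back out; a hypothetical userspace reader:

        #include <sched.h>
        #include <unistd.h>

        static void show_policy(void)
        {
                int ret = sched_getscheduler(getpid());

                if (ret >= 0) {
                        int policy        = ret & ~SCHED_RESET_ON_FORK;
                        int reset_on_fork = !!(ret & SCHED_RESET_ON_FORK);
                        /* policy is now the bare SCHED_* value */
                }
        }
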
 
 /**
- * sys_sched_getscheduler - get the RT priority of a thread
+ * sys_sched_getparam - get the RT priority of a thread
  * @pid: the pid in question.
  * @param: structure containing the RT priority.
  */
@@ -6571,19 +6650,9 @@ static inline int should_resched(void)
 
 static void __cond_resched(void)
 {
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-       __might_sleep(__FILE__, __LINE__);
-#endif
-       /*
-        * The BKS might be reacquired before we have dropped
-        * PREEMPT_ACTIVE, which could trigger a second
-        * cond_resched() call.
-        */
-       do {
-               add_preempt_count(PREEMPT_ACTIVE);
-               schedule();
-               sub_preempt_count(PREEMPT_ACTIVE);
-       } while (need_resched());
+       add_preempt_count(PREEMPT_ACTIVE);
+       schedule();
+       sub_preempt_count(PREEMPT_ACTIVE);
 }
 
 int __sched _cond_resched(void)
@@ -6597,14 +6666,14 @@ int __sched _cond_resched(void)
 EXPORT_SYMBOL(_cond_resched);
 
 /*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
  *
  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
  * operations here to prevent schedule() from being called twice (once via
  * spin_unlock(), once by hand).
  */
-int cond_resched_lock(spinlock_t *lock)
+int __cond_resched_lock(spinlock_t *lock)
 {
        int resched = should_resched();
        int ret = 0;
@@ -6620,9 +6689,9 @@ int cond_resched_lock(spinlock_t *lock)
        }
        return ret;
 }
-EXPORT_SYMBOL(cond_resched_lock);
+EXPORT_SYMBOL(__cond_resched_lock);
 
-int __sched cond_resched_softirq(void)
+int __sched __cond_resched_softirq(void)
 {
        BUG_ON(!in_softirq());
 
@@ -6634,7 +6703,7 @@ int __sched cond_resched_softirq(void)
        }
        return 0;
 }
-EXPORT_SYMBOL(cond_resched_softirq);
+EXPORT_SYMBOL(__cond_resched_softirq);
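
Note: the underscore-prefixed names exist so the might-sleep check can move out of __cond_resched() and into the calling macros, which record the caller's file and line and pass the preempt-count offset legitimately expected at the call site (one held spinlock for cond_resched_lock()). The matching include/linux/sched.h side of this change reads roughly:

        #define cond_resched() ({                                       \
                __might_sleep(__FILE__, __LINE__, 0);                   \
                _cond_resched();                                        \
        })

        #define cond_resched_lock(lock) ({                              \
                __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
                __cond_resched_lock(lock);                              \
        })
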
 
 /**
  * yield - yield the current processor to other threads.
@@ -6658,7 +6727,7 @@ EXPORT_SYMBOL(yield);
  */
 void __sched io_schedule(void)
 {
-       struct rq *rq = &__raw_get_cpu_var(runqueues);
+       struct rq *rq = raw_rq();
 
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
@@ -6670,7 +6739,7 @@ EXPORT_SYMBOL(io_schedule);
 
 long __sched io_schedule_timeout(long timeout)
 {
-       struct rq *rq = &__raw_get_cpu_var(runqueues);
+       struct rq *rq = raw_rq();
        long ret;
 
        delayacct_blkio_start();
@@ -7289,6 +7358,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 static void calc_global_load_remove(struct rq *rq)
 {
        atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+       rq->calc_load_active = 0;
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
@@ -7515,6 +7585,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                task_rq_unlock(rq, &flags);
                get_task_struct(p);
                cpu_rq(cpu)->migration_thread = p;
+               rq->calc_load_update = calc_load_update;
                break;
 
        case CPU_ONLINE:
@@ -7525,8 +7596,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                /* Update our root-domain */
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
-               rq->calc_load_update = calc_load_update;
-               rq->calc_load_active = 0;
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
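Note: both hotplug hunks fix load-average accounting across CPU offline/online. calc_global_load_remove() now also zeroes the per-rq counter, so a stale value cannot be folded into calc_load_tasks again later, and calc_load_update is initialized at CPU_UP_PREPARE, before the CPU can start sampling, rather than at CPU_ONLINE. For context, the periodic per-rq fold elsewhere in sched.c looked roughly like:

        static void calc_load_account_active(struct rq *this_rq)
        {
                long nr_active, delta;

                nr_active = this_rq->nr_running;
                nr_active += (long)this_rq->nr_uninterruptible;

                if (nr_active != this_rq->calc_load_active) {
                        delta = nr_active - this_rq->calc_load_active;
                        this_rq->calc_load_active = nr_active;
                        atomic_long_add(delta, &calc_load_tasks);
                }
        }
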
@@ -7625,7 +7694,7 @@ static int __init migration_init(void)
        migration_call(&migration_notifier, CPU_ONLINE, cpu);
        register_cpu_notifier(&migration_notifier);
 
-       return err;
+       return 0;
 }
 early_initcall(migration_init);
 #endif
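
Note: the migration_init() fix matters because err holds the return value of the CPU_UP_PREPARE notifier call (NOTIFY_OK, a positive value), not an errno, so returning it made the initcall core complain about a failed initcall. The preceding lines, unchanged by this diff, are roughly:

        void *cpu = (void *)(long)smp_processor_id();
        int err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);

        BUG_ON(err == NOTIFY_BAD);
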
@@ -9398,13 +9467,20 @@ void __init sched_init(void)
 }
 
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
+static inline int preempt_count_equals(int preempt_offset)
+{
+       int nested = preempt_count() & ~PREEMPT_ACTIVE;
+
+       return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+}
+
+void __might_sleep(char *file, int line, int preempt_offset)
 {
 #ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */
 
-       if ((!in_atomic() && !irqs_disabled()) ||
-                   system_state != SYSTEM_RUNNING || oops_in_progress)
+       if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+           system_state != SYSTEM_RUNNING || oops_in_progress)
                return;
        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                return;
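
Note: preempt_count_equals() generalizes the old !in_atomic() test. The base term accounts for the big kernel lock raising preempt_count on CONFIG_PREEMPT kernels, and preempt_offset lets callers such as cond_resched_lock() declare nesting they legitimately hold. The base constant, from include/linux/hardirq.h at the time, roughly:

        #ifdef CONFIG_PREEMPT
        # define PREEMPT_INATOMIC_BASE  kernel_locked()
        #else
        # define PREEMPT_INATOMIC_BASE  0
        #endif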