nohz: Re-evaluate the tick for the new task after a context switch

[deliverable/linux.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 3a673a3b0c6bb7ffbfcb2897220a2806b320b6d5..dd09def88567bf9f418825c0f684b105b0667be7 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -549,7 +549,7 @@ void resched_cpu(int cpu)
         raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
  /*
   * In the semi idle case, use the nearest busy cpu for migrating timers
   * from an idle cpu.  This is good for power-savings.
@@ -587,7 +587,7 @@ unlock:
   * account when the CPU goes back to idle and evaluates the timer
   * wheel for the next timer event.
   */
-void wake_up_idle_cpu(int cpu)
+static void wake_up_idle_cpu(int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
  
@@ -617,20 +617,56 @@ void wake_up_idle_cpu(int cpu)
                 smp_send_reschedule(cpu);
  }
  
+static bool wake_up_full_nohz_cpu(int cpu)
+{
+       if (tick_nohz_full_cpu(cpu)) {
+               if (cpu != smp_processor_id() ||        /* remote CPU: always send the IPI */
+                   tick_nohz_tick_stopped())           /* local CPU: only if this returns true */
+                       smp_send_reschedule(cpu);
+               return true;    /* reported as handled whether or not an IPI was sent */
+       }
+
+       return false;           /* tick_nohz_full_cpu() said no: nothing was done here */
+}
+
+void wake_up_nohz_cpu(int cpu)
+{
+       if (!wake_up_full_nohz_cpu(cpu))        /* not a full-nohz CPU: use the idle-CPU path */
+               wake_up_idle_cpu(cpu);
+}
+
  static inline bool got_nohz_idle_kick(void)
  {
         int cpu = smp_processor_id();
         return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
  }
  
-#else /* CONFIG_NO_HZ */
+#else /* CONFIG_NO_HZ_COMMON */
  
  static inline bool got_nohz_idle_kick(void)
  {
         return false;
  }
  
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
+
+#ifdef CONFIG_NO_HZ_FULL
+bool sched_can_stop_tick(void)
+{
+       struct rq *rq;
+
+       rq = this_rq();
+
+       /* Make sure the rq->nr_running update is visible after the IPI */
+       smp_rmb();
+
+       /* More than one running task needs preemption: the tick cannot stop */
+       if (rq->nr_running > 1)
+               return false;
+
+       return true;
+}
+#endif /* CONFIG_NO_HZ_FULL */
  
  void sched_avg_update(struct rq *rq)
  {
@@ -1132,18 +1168,28 @@ EXPORT_SYMBOL_GPL(kick_process);
   */
  static int select_fallback_rq(int cpu, struct task_struct *p)
  {
-       const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+       int nid = cpu_to_node(cpu);
+       const struct cpumask *nodemask = NULL;
         enum { cpuset, possible, fail } state = cpuset;
         int dest_cpu;
  
-       /* Look for allowed, online CPU in same node. */
-       for_each_cpu(dest_cpu, nodemask) {
-               if (!cpu_online(dest_cpu))
-                       continue;
-               if (!cpu_active(dest_cpu))
-                       continue;
-               if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-                       return dest_cpu;
+       /*
+        * If the node that the cpu is on has been offlined, cpu_to_node()
+        * will return -1. There is no cpu on that node, so skip the
+        * same-node search and select a cpu from another node.
+        */
+       if (nid != -1) {
+               nodemask = cpumask_of_node(nid);
+
+               /* Look for allowed, online CPU in same node. */
+               for_each_cpu(dest_cpu, nodemask) {
+                       if (!cpu_online(dest_cpu))
+                               continue;
+                       if (!cpu_active(dest_cpu))
+                               continue;
+                       if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+                               return dest_cpu;
+               }
         }
  
         for (;;) {
@@ -1278,8 +1324,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
  static void
  ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
  {
-       trace_sched_wakeup(p, true);
         check_preempt_curr(rq, p, wake_flags);
+       trace_sched_wakeup(p, true);
  
         p->state = TASK_RUNNING;
  #ifdef CONFIG_SMP
@@ -1352,7 +1398,8 @@ static void sched_ttwu_pending(void)
  
  void scheduler_ipi(void)
  {
-       if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+       if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
+           && !tick_nohz_full_cpu(smp_processor_id()))
                 return;
  
         /*
@@ -1369,6 +1416,7 @@ void scheduler_ipi(void)
          * somewhat pessimize the simple resched case.
          */
         irq_enter();
+       tick_nohz_full_check();
         sched_ttwu_pending();
  
         /*
@@ -1742,9 +1790,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
  static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
  {
         struct preempt_notifier *notifier;
-       struct hlist_node *node;
  
-       hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+       hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
                 notifier->ops->sched_in(notifier, raw_smp_processor_id());
  }
  
@@ -1753,9 +1800,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
                                  struct task_struct *next)
  {
         struct preempt_notifier *notifier;
-       struct hlist_node *node;
  
-       hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+       hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
                 notifier->ops->sched_out(notifier, next);
  }
  
@@ -1850,6 +1896,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
                 kprobe_flush_task(prev);
                 put_task_struct(prev);
         }
+
+       tick_nohz_task_switch(current);
  }
  
  #ifdef CONFIG_SMP
@@ -1969,11 +2017,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
  }
  
  /*
- * nr_running, nr_uninterruptible and nr_context_switches:
+ * nr_running and nr_context_switches:
   *
   * externally visible scheduler statistics: current number of runnable
- * threads, current number of uninterruptible-sleeping threads, total
- * number of context switches performed since bootup.
+ * threads, total number of context switches performed since bootup.
   */
  unsigned long nr_running(void)
  {
@@ -1985,23 +2032,6 @@ unsigned long nr_running(void)
         return sum;
  }
  
-unsigned long nr_uninterruptible(void)
-{
-       unsigned long i, sum = 0;
-
-       for_each_possible_cpu(i)
-               sum += cpu_rq(i)->nr_uninterruptible;
-
-       /*
-        * Since we read the counters lockless, it might be slightly
-        * inaccurate. Do not allow it to go below zero though:
-        */
-       if (unlikely((long)sum < 0))
-               sum = 0;
-
-       return sum;
-}
-
  unsigned long long nr_context_switches(void)
  {
         int i;
@@ -2131,7 +2161,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
         return load >> FSHIFT;
  }
  
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
  /*
   * Handle NO_HZ for the global load-average.
   *
@@ -2357,12 +2387,12 @@ static void calc_global_nohz(void)
         smp_wmb();
         calc_load_idx++;
  }
-#else /* !CONFIG_NO_HZ */
+#else /* !CONFIG_NO_HZ_COMMON */
  
  static inline long calc_load_fold_idle(void) { return 0; }
  static inline void calc_global_nohz(void) { }
  
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
  
  /*
   * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2522,7 +2552,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
         sched_avg_update(this_rq);
  }
  
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
  /*
   * There is no sane way to deal with nohz on smp when using jiffies because the
   * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -2582,7 +2612,7 @@ void update_cpu_load_nohz(void)
         }
         raw_spin_unlock(&this_rq->lock);
  }
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
  
  /*
   * Called from scheduler_tick()
@@ -2786,7 +2816,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
         if (irqs_disabled())
                 print_irqtrace_events(prev);
         dump_stack();
-       add_taint(TAINT_WARN);
+       add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
  }
  
  /*
@@ -3092,11 +3122,13 @@ EXPORT_SYMBOL(preempt_schedule);
  asmlinkage void __sched preempt_schedule_irq(void)
  {
         struct thread_info *ti = current_thread_info();
+       enum ctx_state prev_state;
  
         /* Catch callers which need to be fixed */
         BUG_ON(ti->preempt_count || !irqs_disabled());
  
-       user_exit();
+       prev_state = exception_enter();
+
         do {
                 add_preempt_count(PREEMPT_ACTIVE);
                 local_irq_enable();
@@ -3110,6 +3142,8 @@ asmlinkage void __sched preempt_schedule_irq(void)
                  */
                 barrier();
         } while (need_resched());
+
+       exception_exit(prev_state);
  }
  
  #endif /* CONFIG_PREEMPT */
@@ -3268,7 +3302,8 @@ void complete_all(struct completion *x)
  EXPORT_SYMBOL(complete_all);
  
  static inline long __sched
-do_wait_for_common(struct completion *x, long timeout, int state)
+do_wait_for_common(struct completion *x,
+                  long (*action)(long), long timeout, int state)
  {
         if (!x->done) {
                 DECLARE_WAITQUEUE(wait, current);
@@ -3281,7 +3316,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
                         }
                         __set_current_state(state);
                         spin_unlock_irq(&x->wait.lock);
-                       timeout = schedule_timeout(timeout);
+                       timeout = action(timeout);
                         spin_lock_irq(&x->wait.lock);
                 } while (!x->done && timeout);
                 __remove_wait_queue(&x->wait, &wait);
@@ -3292,17 +3327,30 @@ do_wait_for_common(struct completion *x, long timeout, int state)
         return timeout ?: 1;
  }
  
-static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
+static inline long __sched
+__wait_for_common(struct completion *x,
+                 long (*action)(long), long timeout, int state)
  {
         might_sleep();
  
         spin_lock_irq(&x->wait.lock);
-       timeout = do_wait_for_common(x, timeout, state);
+       timeout = do_wait_for_common(x, action, timeout, state);
         spin_unlock_irq(&x->wait.lock);
         return timeout;
  }
  
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{      /* Non-IO variant: the wait loop sleeps in schedule_timeout(). */
+       return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{      /* IO variant: the wait loop sleeps in io_schedule_timeout(). */
+       return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
  /**
   * wait_for_completion: - waits for completion of a task
   * @x:  holds the state of this particular completion
@@ -3338,6 +3386,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
  }
  EXPORT_SYMBOL(wait_for_completion_timeout);
  
+/**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The sleep goes through
+ * io_schedule_timeout(), so the caller is accounted as waiting for IO.
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+       wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either completion of a specific task to be signaled or for
+ * @timeout (in jiffies) to expire. It is not interruptible. The sleep goes
+ * through io_schedule_timeout(), so the caller is accounted as waiting for IO.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+       return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
+
  /**
   * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
   * @x:  holds the state of this particular completion
@@ -4364,7 +4445,10 @@ EXPORT_SYMBOL(yield);
   * It's the caller's job to ensure that the target task struct
   * can't go away on us before we can do any checks.
   *
- * Returns true if we indeed boosted the target task.
+ * Returns (NOTE(review): declared bool, so -ESRCH reads as true - confirm):
+ *     true (>0) if we indeed boosted the target task.
+ *     false (0) if we failed to boost the target.
+ *     -ESRCH if there's no task to yield to.
   */
  bool __sched yield_to(struct task_struct *p, bool preempt)
  {
@@ -4378,6 +4462,15 @@ bool __sched yield_to(struct task_struct *p, bool preempt)
  
  again:
         p_rq = task_rq(p);
+       /*
+        * If we're the only runnable task on the rq and target rq also
+        * has only one task, there's absolutely no point in yielding.
+        */
+       if (rq->nr_running == 1 && p_rq->nr_running == 1) {
+               yielded = -ESRCH;
+               goto out_irq;
+       }
+
         double_rq_lock(rq, p_rq);
         while (task_rq(p) != p_rq) {
                 double_rq_unlock(rq, p_rq);
@@ -4385,13 +4478,13 @@ again:
         }
  
         if (!curr->sched_class->yield_to_task)
-               goto out;
+               goto out_unlock;
  
         if (curr->sched_class != p->sched_class)
-               goto out;
+               goto out_unlock;
  
         if (task_running(p_rq, p) || p->state)
-               goto out;
+               goto out_unlock;
  
         yielded = curr->sched_class->yield_to_task(rq, p, preempt);
         if (yielded) {
@@ -4404,11 +4497,12 @@ again:
                         resched_task(p_rq->curr);
         }
  
-out:
+out_unlock:
         double_rq_unlock(rq, p_rq);
+out_irq:
         local_irq_restore(flags);
  
-       if (yielded)
+       if (yielded > 0)
                 schedule();
  
         return yielded;
@@ -6811,6 +6905,10 @@ int in_sched_functions(unsigned long addr)
  }
  
  #ifdef CONFIG_CGROUP_SCHED
+/*
+ * Default task group.
+ * Every task in system belongs to this group at bootup.
+ */
  struct task_group root_task_group;
  LIST_HEAD(task_groups);
  #endif
@@ -6947,7 +7045,7 @@ void __init sched_init(void)
                 INIT_LIST_HEAD(&rq->cfs_tasks);
  
                 rq_attach_root(rq, &def_root_domain);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
                 rq->nohz_flags = 0;
  #endif
  #endif
@@ -7405,7 +7503,7 @@ unlock:
         return err;
  }
  
-int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
  {
         u64 rt_runtime, rt_period;
  
@@ -7417,7 +7515,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
  }
  
-long sched_group_rt_runtime(struct task_group *tg)
+static long sched_group_rt_runtime(struct task_group *tg)
  {
         u64 rt_runtime_us;
  
@@ -7429,7 +7527,7 @@ long sched_group_rt_runtime(struct task_group *tg)
         return rt_runtime_us;
  }
  
-int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
  {
         u64 rt_runtime, rt_period;
  
@@ -7442,7 +7540,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
  }
  
-long sched_group_rt_period(struct task_group *tg)
+static long sched_group_rt_period(struct task_group *tg)
  {
         u64 rt_period_us;
  
@@ -7477,7 +7575,7 @@ static int sched_rt_global_constraints(void)
         return ret;
  }
  
-int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
  {
         /* Don't accept realtime tasks when there is no way for them to run */
         if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)