Merge branch 'for-linus' of git://git.kernel.dk/linux-block

[deliverable/linux.git] / include / linux / sched.h
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 52c4847b05e2882a72d04c3c75fc4d55c2b4a6b9..6e42ada26345d507ffdec856107382a9fb67656a 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -40,7 +40,6 @@ struct sched_param {
  #include <linux/pid.h>
  #include <linux/percpu.h>
  #include <linux/topology.h>
-#include <linux/proportions.h>
  #include <linux/seccomp.h>
  #include <linux/rcupdate.h>
  #include <linux/rculist.h>
@@ -178,9 +177,11 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
  extern void calc_global_load(unsigned long ticks);
  
  #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void update_cpu_load_nohz(int active);
+extern void cpu_load_update_nohz_start(void);
+extern void cpu_load_update_nohz_stop(void);
  #else
-static inline void update_cpu_load_nohz(int active) { }
+static inline void cpu_load_update_nohz_start(void) { }
+static inline void cpu_load_update_nohz_stop(void) { }
  #endif
  
  extern void dump_cpu_task(int cpu);
@@ -372,6 +373,15 @@ extern void cpu_init (void);
  extern void trap_init(void);
  extern void update_process_times(int user);
  extern void scheduler_tick(void);
+extern int sched_cpu_starting(unsigned int cpu);
+extern int sched_cpu_activate(unsigned int cpu);
+extern int sched_cpu_deactivate(unsigned int cpu);
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_dying(unsigned int cpu);
+#else
+# define sched_cpu_dying       NULL
+#endif
  
  extern void sched_show_task(struct task_struct *p);
  
@@ -511,6 +521,7 @@ static inline int get_dumpable(struct mm_struct *mm)
  
  #define MMF_HAS_UPROBES                19      /* has uprobes */
  #define MMF_RECALC_UPROBES     20      /* MMF_HAS_UPROBES can be wrong */
+#define MMF_OOM_REAPED         21      /* mm has been already reaped */
  
  #define MMF_INIT_MASK          (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
  
@@ -658,6 +669,7 @@ struct signal_struct {
         atomic_t                sigcnt;
         atomic_t                live;
         int                     nr_threads;
+       atomic_t oom_victims; /* # of TIF_MEMDIE threads in this thread group */
         struct list_head        thread_head;
  
         wait_queue_head_t       wait_chldexit;  /* for wait4() */
@@ -782,7 +794,11 @@ struct signal_struct {
         struct tty_audit_buf *tty_audit_buf;
  #endif
  
-       oom_flags_t oom_flags;
+       /*
+        * Thread is the potential origin of an oom condition; kill first on
+        * oom
+        */
+       bool oom_flag_origin;
         short oom_score_adj;            /* OOM kill score adjustment */
         short oom_score_adj_min;        /* OOM kill score adjustment min value.
                                          * Only settable by CAP_SYS_RESOURCE. */
@@ -934,10 +950,20 @@ enum cpu_idle_type {
         CPU_MAX_IDLE_TYPES
  };
  
+/*
+ * Integer metrics need fixed point arithmetic, e.g., sched/fair
+ * has a few: load, load_avg, util_avg, freq, and capacity.
+ *
+ * We define a basic fixed point arithmetic range, and then formalize
+ * all these metrics based on that basic range.
+ */
+# define SCHED_FIXEDPOINT_SHIFT        10
+# define SCHED_FIXEDPOINT_SCALE        (1L << SCHED_FIXEDPOINT_SHIFT)
+
  /*
   * Increase resolution of cpu_capacity calculations
   */
-#define SCHED_CAPACITY_SHIFT   10
+#define SCHED_CAPACITY_SHIFT   SCHED_FIXEDPOINT_SHIFT
  #define SCHED_CAPACITY_SCALE   (1L << SCHED_CAPACITY_SHIFT)
  
  /*
@@ -1199,18 +1225,56 @@ struct load_weight {
  };
  
  /*
- * The load_avg/util_avg accumulates an infinite geometric series.
- * 1) load_avg factors frequency scaling into the amount of time that a
- * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
- * aggregated such weights of all runnable and blocked sched_entities.
- * 2) util_avg factors frequency and cpu scaling into the amount of time
- * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
- * For cfs_rq, it is the aggregated such times of all runnable and
+ * The load_avg/util_avg accumulates an infinite geometric series
+ * (see __update_load_avg() in kernel/sched/fair.c).
+ *
+ * [load_avg definition]
+ *
+ *   load_avg = runnable% * scale_load_down(load)
+ *
+ * where runnable% is the time ratio that a sched_entity is runnable.
+ * For cfs_rq, it is the aggregated load_avg of all runnable and
   * blocked sched_entities.
- * The 64 bit load_sum can:
- * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with
- * the highest weight (=88761) always runnable, we should not overflow
- * 2) for entity, support any load.weight always runnable
+ *
+ * load_avg may also take frequency scaling into account:
+ *
+ *   load_avg = runnable% * scale_load_down(load) * freq%
+ *
+ * where freq% is the CPU frequency normalized to the highest frequency.
+ *
+ * [util_avg definition]
+ *
+ *   util_avg = running% * SCHED_CAPACITY_SCALE
+ *
+ * where running% is the time ratio that a sched_entity is running on
+ * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable
+ * and blocked sched_entities.
+ *
+ * util_avg may also factor frequency scaling and CPU capacity scaling:
+ *
+ *   util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity%
+ *
+ * where freq% is the same as above, and capacity% is the CPU capacity
+ * normalized to the greatest capacity (due to uarch differences, etc).
+ *
+ * N.B., the above ratios (runnable%, running%, freq%, and capacity%)
+ * themselves are in the range of [0, 1]. To do fixed point arithmetic,
+ * we therefore scale them to as large a range as necessary. This is for
+ * example reflected by util_avg's SCHED_CAPACITY_SCALE.
+ *
+ * [Overflow issue]
+ *
+ * The 64-bit load_sum can accommodate 4353082796 (=2^64/47742/88761)
+ * entities with the highest load (=88761), always runnable on a single
+ * cfs_rq, without overflowing; that count is already beyond PID_MAX_LIMIT.
+ *
+ * For all other cases (including 32-bit kernels), struct load_weight's
+ * weight will overflow before we do, because:
+ *
+ *    Max(load_avg) <= Max(load.weight)
+ *
+ * Then it is the load_weight's responsibility to consider overflow
+ * issues.
   */
  struct sched_avg {
         u64 last_update_time, load_sum;
@@ -1475,6 +1539,7 @@ struct task_struct {
         unsigned sched_reset_on_fork:1;
         unsigned sched_contributes_to_load:1;
         unsigned sched_migrated:1;
+       unsigned sched_remote_wakeup:1;
         unsigned :0; /* force alignment to the next boundary */
  
         /* unserialized, strictly 'current' */
@@ -1596,6 +1661,7 @@ struct task_struct {
  
         unsigned long sas_ss_sp;
         size_t sas_ss_size;
+       unsigned sas_ss_flags;
  
         struct callback_head *task_works;
  
@@ -1871,6 +1937,11 @@ extern int arch_task_struct_size __read_mostly;
  /* Future-safe accessor for struct task_struct's cpus_allowed. */
  #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
  
+static inline int tsk_nr_cpus_allowed(struct task_struct *p)
+{
+       return p->nr_cpus_allowed;
+}
+
  #define TNF_MIGRATED   0x01
  #define TNF_NO_GROUP   0x02
  #define TNF_SHARED     0x04
@@ -2184,6 +2255,7 @@ static inline void memalloc_noio_restore(unsigned int flags)
  #define PFA_NO_NEW_PRIVS 0     /* May not gain new privileges. */
  #define PFA_SPREAD_PAGE  1      /* Spread page cache over cpuset */
  #define PFA_SPREAD_SLAB  2      /* Spread some slab caches over cpuset */
+#define PFA_LMK_WAITING  3      /* Lowmemorykiller is waiting */
  
  
  #define TASK_PFA_TEST(name, func)                                      \
@@ -2207,6 +2279,9 @@ TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
  TASK_PFA_SET(SPREAD_SLAB, spread_slab)
  TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
  
+TASK_PFA_TEST(LMK_WAITING, lmk_waiting)
+TASK_PFA_SET(LMK_WAITING, lmk_waiting)
+
  /*
   * task->jobctl flags
   */
@@ -2303,8 +2378,6 @@ extern unsigned long long notrace sched_clock(void);
  /*
   * See the comment in kernel/sched/clock.c
   */
-extern u64 cpu_clock(int cpu);
-extern u64 local_clock(void);
  extern u64 running_clock(void);
  extern u64 sched_clock_cpu(int cpu);
  
@@ -2323,6 +2396,16 @@ static inline void sched_clock_idle_sleep_event(void)
  static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
  {
  }
+
+static inline u64 cpu_clock(int cpu)
+{
+       return sched_clock();
+}
+
+static inline u64 local_clock(void)
+{
+       return sched_clock();
+}
  #else
  /*
   * Architectures can set this to 1 if they have specified
@@ -2337,6 +2420,26 @@ extern void clear_sched_clock_stable(void);
  extern void sched_clock_tick(void);
  extern void sched_clock_idle_sleep_event(void);
  extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+
+/*
+ * As outlined in clock.c, provides a fast, high resolution, nanosecond
+ * time source that is monotonic per cpu argument and has bounded drift
+ * between cpus.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !!                                                  #
+ * ####################################################################
+ */
+static inline u64 cpu_clock(int cpu)
+{
+       return sched_clock_cpu(cpu);
+}
+
+static inline u64 local_clock(void)
+{
+       return sched_clock_cpu(raw_smp_processor_id());
+}
  #endif
  
  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2575,6 +2678,18 @@ static inline int kill_cad_pid(int sig, int priv)
   */
  static inline int on_sig_stack(unsigned long sp)
  {
+       /*
+        * If the signal stack is SS_AUTODISARM then, by construction, we
+        * can't be on the signal stack unless user code deliberately set
+        * SS_AUTODISARM when we were already on it.
+        *
+        * This improves reliability: if user state gets corrupted such that
+        * the stack pointer points very close to the end of the signal stack,
+        * then this check will enable the signal to be handled anyway.
+        */
+       if (current->sas_ss_flags & SS_AUTODISARM)
+               return 0;
+
  #ifdef CONFIG_STACK_GROWSUP
         return sp >= current->sas_ss_sp &&
                 sp - current->sas_ss_sp < current->sas_ss_size;
@@ -2592,6 +2707,13 @@ static inline int sas_ss_flags(unsigned long sp)
         return on_sig_stack(sp) ? SS_ONSTACK : 0;
  }
  
+static inline void sas_ss_reset(struct task_struct *p)
+{
+       p->sas_ss_sp = 0;
+       p->sas_ss_size = 0;
+       p->sas_ss_flags = SS_DISABLE;
+}
+
  static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
  {
         if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp))
@@ -2610,14 +2732,26 @@ extern struct mm_struct * mm_alloc(void);
  
  /* mmdrop drops the mm and the page tables */
  extern void __mmdrop(struct mm_struct *);
-static inline void mmdrop(struct mm_struct * mm)
+static inline void mmdrop(struct mm_struct *mm)
  {
         if (unlikely(atomic_dec_and_test(&mm->mm_count)))
                 __mmdrop(mm);
  }
  
+static inline bool mmget_not_zero(struct mm_struct *mm)
+{
+       return atomic_inc_not_zero(&mm->mm_users);
+}
+
  /* mmput gets rid of the mappings and all user-space */
  extern void mmput(struct mm_struct *);
+#ifdef CONFIG_MMU
+/* Same as mmput() above, but performs the slow path from an async context.
+ * It can therefore also be called from atomic context.
+ */
+extern void mmput_async(struct mm_struct *);
+#endif
+
  /* Grab a reference to a task's mm, if it is not already going away */
  extern struct mm_struct *get_task_mm(struct task_struct *task);
  /*
@@ -2646,7 +2780,14 @@ static inline int copy_thread_tls(
  }
  #endif
  extern void flush_thread(void);
-extern void exit_thread(void);
+
+#ifdef CONFIG_HAVE_EXIT_THREAD
+extern void exit_thread(struct task_struct *tsk);
+#else
+static inline void exit_thread(struct task_struct *tsk)
+{
+}
+#endif
  
  extern void exit_files(struct task_struct *);
  extern void __cleanup_sighand(struct sighand_struct *);
@@ -3240,7 +3381,10 @@ struct update_util_data {
                      u64 time, unsigned long util, unsigned long max);
  };
  
-void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
+void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
+                       void (*func)(struct update_util_data *data, u64 time,
+                                    unsigned long util, unsigned long max));
+void cpufreq_remove_update_util_hook(int cpu);
  #endif /* CONFIG_CPU_FREQ */
  
  #endif