Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 3 Jun 2014 21:00:15 +0000 (14:00 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 3 Jun 2014 21:00:15 +0000 (14:00 -0700)
Pull scheduler updates from Ingo Molnar:
 "The main scheduling related changes in this cycle were:

   - various sched/numa updates, for better performance

   - tree wide cleanup of open coded nice levels

   - nohz fix related to rq->nr_running use

   - cpuidle changes and continued consolidation to improve the
     kernel/sched/idle.c high level idle scheduling logic.  As part of
     this effort I pulled cpuidle driver changes from Rafael as well.

   - standardized idle polling amongst architectures

   - continued work on preparing better power/energy aware scheduling

   - sched/rt updates

   - misc fixlets and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (49 commits)
  sched/numa: Decay ->wakee_flips instead of zeroing
  sched/numa: Update migrate_improves/degrades_locality()
  sched/numa: Allow task switch if load imbalance improves
  sched/rt: Fix 'struct sched_dl_entity' and dl_task_timer() comments, to match the current upstream code
  sched: Consolidate open coded implementations of nice level frobbing into nice_to_rlimit() and rlimit_to_nice()
  sched: Initialize rq->age_stamp on processor start
  sched, nohz: Change rq->nr_running to always use wrappers
  sched: Fix the rq->next_balance logic in rebalance_domains() and idle_balance()
  sched: Use clamp() and clamp_val() to make sys_nice() more readable
  sched: Do not zero sg->cpumask and sg->sgp->power in build_sched_groups()
  sched/numa: Fix initialization of sched_domain_topology for NUMA
  sched: Call select_idle_sibling() when not affine_sd
  sched: Simplify return logic in sched_read_attr()
  sched: Simplify return logic in sched_copy_attr()
  sched: Fix exec_start/task_hot on migrated tasks
  arm64: Remove TIF_POLLING_NRFLAG
  metag: Remove TIF_POLLING_NRFLAG
  sched/idle: Make cpuidle_idle_call() void
  sched/idle: Reflow cpuidle_idle_call()
  sched/idle: Delay clearing the polling bit
  ...

48 files changed:
arch/alpha/include/asm/thread_info.h
arch/arm/kernel/topology.c
arch/arm64/include/asm/thread_info.h
arch/ia64/include/asm/thread_info.h
arch/ia64/include/asm/topology.h
arch/metag/include/asm/thread_info.h
arch/powerpc/kernel/smp.c
arch/s390/include/asm/topology.h
arch/s390/kernel/topology.c
arch/tile/include/asm/thread_info.h
arch/tile/include/asm/topology.h
arch/x86/include/asm/thread_info.h
arch/x86/kernel/apm_32.c
drivers/block/loop.c
drivers/block/nbd.c
drivers/block/pktcdvd.c
drivers/char/ipmi/ipmi_si_intf.c
drivers/cpuidle/cpuidle.c
drivers/cpuidle/governors/menu.c
drivers/s390/crypto/ap_bus.c
drivers/scsi/bnx2fc/bnx2fc_fcoe.c
drivers/scsi/bnx2i/bnx2i_hwi.c
drivers/scsi/fcoe/fcoe.c
drivers/scsi/ibmvscsi/ibmvfc.c
drivers/scsi/ibmvscsi/ibmvscsi.c
drivers/scsi/lpfc/lpfc_hbadisc.c
drivers/scsi/qla2xxx/qla_os.c
drivers/staging/android/binder.c
drivers/staging/lustre/lustre/llite/lloop.c
fs/ocfs2/cluster/heartbeat.c
include/linux/cpuidle.h
include/linux/sched.h
include/linux/sched/prio.h
include/linux/thread_info.h
include/linux/topology.h
kernel/locking/locktorture.c
kernel/power/suspend.c
kernel/sched/core.c
kernel/sched/deadline.c
kernel/sched/fair.c
kernel/sched/idle.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/sched/stop_task.c
kernel/sys.c
kernel/workqueue.c
mm/huge_memory.c
mm/memory.c

diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h
index 3d6ce6d56fc9ee9ad6fc4276f97f3d5633df6886..48bbea6898b30c23c4846d486347e1a17c16d5c2 100644
@@ -73,12 +73,14 @@ register struct thread_info *__current_thread_info __asm__("$8");
 #define TIF_SYSCALL_AUDIT      4       /* syscall audit active */
 #define TIF_DIE_IF_KERNEL      9       /* dik recursion lock */
 #define TIF_MEMDIE             13      /* is terminating due to OOM killer */
+#define TIF_POLLING_NRFLAG     14      /* idle is polling for TIF_NEED_RESCHED */
 
 #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING                (1<<TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED      (1<<TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME     (1<<TIF_NOTIFY_RESUME)
 #define _TIF_SYSCALL_AUDIT     (1<<TIF_SYSCALL_AUDIT)
+#define _TIF_POLLING_NRFLAG    (1<<TIF_POLLING_NRFLAG)
 
 /* Work to do on interrupt/exception return.  */
 #define _TIF_WORK_MASK         (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
@@ -92,8 +94,6 @@ register struct thread_info *__current_thread_info __asm__("$8");
 #define TS_UAC_NOFIX           0x0002  /* ! flags as they match          */
 #define TS_UAC_SIGBUS          0x0004  /* ! userspace part of 'osf_sysinfo' */
 #define TS_RESTORE_SIGMASK     0x0008  /* restore signal mask in do_signal() */
-#define TS_POLLING             0x0010  /* idle task polling need_resched,
-                                          skip sending interrupt */
 
 #ifndef __ASSEMBLY__
 #define HAVE_SET_RESTORE_SIGMASK       1
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 0bc94b1fd1ae9e73bdc95987797012396d9ef6bb..71e1fec6d31a5b76d6a82249092e47b88c7ad4da 100644
@@ -185,6 +185,15 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
        return &cpu_topology[cpu].core_sibling;
 }
 
+/*
+ * The current assumption is that we can power gate each core independently.
+ * This will be superseded by DT binding once available.
+ */
+const struct cpumask *cpu_corepower_mask(int cpu)
+{
+       return &cpu_topology[cpu].thread_sibling;
+}
+
 static void update_siblings_masks(unsigned int cpuid)
 {
        struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
@@ -266,6 +275,20 @@ void store_cpu_topology(unsigned int cpuid)
                cpu_topology[cpuid].socket_id, mpidr);
 }
 
+static inline const int cpu_corepower_flags(void)
+{
+       return SD_SHARE_PKG_RESOURCES  | SD_SHARE_POWERDOMAIN;
+}
+
+static struct sched_domain_topology_level arm_topology[] = {
+#ifdef CONFIG_SCHED_MC
+       { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
+       { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+       { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+       { NULL, },
+};
+
 /*
  * init_cpu_topology is called at boot when only one cpu is running
  * which prevent simultaneous write access to cpu_topology array
@@ -289,4 +312,7 @@ void __init init_cpu_topology(void)
        smp_wmb();
 
        parse_dt_topology();
+
+       /* Set scheduler topology descriptor */
+       set_sched_topology(arm_topology);
 }
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 720e70b66ffdcf6eff60efc964c32e9a0584325b..7b8e3a2a00fb3e54214c44d663e6cb37720c7d67 100644
@@ -95,13 +95,11 @@ static inline struct thread_info *current_thread_info(void)
  *  TIF_NEED_RESCHED   - rescheduling necessary
  *  TIF_NOTIFY_RESUME  - callback before returning to user
  *  TIF_USEDFPU                - FPU was used by this task this quantum (SMP)
- *  TIF_POLLING_NRFLAG - true if poll_idle() is polling TIF_NEED_RESCHED
  */
 #define TIF_SIGPENDING         0
 #define TIF_NEED_RESCHED       1
 #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
 #define TIF_SYSCALL_TRACE      8
-#define TIF_POLLING_NRFLAG     16
 #define TIF_MEMDIE             18      /* is terminating due to OOM killer */
 #define TIF_FREEZE             19
 #define TIF_RESTORE_SIGMASK    20
diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
index 5957cf61f8980641c54271ab4030b1d765570001..5b17418b42239be81694cdc7b3c1f60ae68c5347 100644
@@ -107,6 +107,7 @@ struct thread_info {
 #define TIF_MCA_INIT           18      /* this task is processing MCA or INIT */
 #define TIF_DB_DISABLED                19      /* debug trap disabled for fsyscall */
 #define TIF_RESTORE_RSE                21      /* user RBS is newer than kernel RBS */
+#define TIF_POLLING_NRFLAG     22      /* idle is polling for TIF_NEED_RESCHED */
 
 #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
@@ -118,6 +119,7 @@ struct thread_info {
 #define _TIF_MCA_INIT          (1 << TIF_MCA_INIT)
 #define _TIF_DB_DISABLED       (1 << TIF_DB_DISABLED)
 #define _TIF_RESTORE_RSE       (1 << TIF_RESTORE_RSE)
+#define _TIF_POLLING_NRFLAG    (1 << TIF_POLLING_NRFLAG)
 
 /* "work to do on user-return" bits */
 #define TIF_ALLWORK_MASK       (_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\
@@ -125,7 +127,6 @@ struct thread_info {
 /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */
 #define TIF_WORK_MASK          (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT))
 
-#define TS_POLLING             1       /* true if in idle loop and not sleeping */
 #define TS_RESTORE_SIGMASK     2       /* restore signal mask in do_signal() */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 5cb55a1e606b0b5cbed352f1e7187c41d31371de..3202aa74e0d62838e104b0497e8a8bb6080aa92f 100644
 
 void build_cpu_to_node_map(void);
 
-#define SD_CPU_INIT (struct sched_domain) {            \
-       .parent                 = NULL,                 \
-       .child                  = NULL,                 \
-       .groups                 = NULL,                 \
-       .min_interval           = 1,                    \
-       .max_interval           = 4,                    \
-       .busy_factor            = 64,                   \
-       .imbalance_pct          = 125,                  \
-       .cache_nice_tries       = 2,                    \
-       .busy_idx               = 2,                    \
-       .idle_idx               = 1,                    \
-       .newidle_idx            = 0,                    \
-       .wake_idx               = 0,                    \
-       .forkexec_idx           = 0,                    \
-       .flags                  = SD_LOAD_BALANCE       \
-                               | SD_BALANCE_NEWIDLE    \
-                               | SD_BALANCE_EXEC       \
-                               | SD_BALANCE_FORK       \
-                               | SD_WAKE_AFFINE,       \
-       .last_balance           = jiffies,              \
-       .balance_interval       = 1,                    \
-       .nr_balance_failed      = 0,                    \
-}
-
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SMP
diff --git a/arch/metag/include/asm/thread_info.h b/arch/metag/include/asm/thread_info.h
index b19e9c588a16733e81f5495593c881d2d9c7c170..47711336119e0f7132754957896ef4b19dc03e01 100644
@@ -117,10 +117,8 @@ static inline int kstack_end(void *addr)
 #define TIF_SECCOMP            5       /* secure computing */
 #define TIF_RESTORE_SIGMASK    6       /* restore signal mask in do_signal() */
 #define TIF_NOTIFY_RESUME      7       /* callback before returning to user */
-#define TIF_POLLING_NRFLAG      8      /* true if poll_idle() is polling
-                                          TIF_NEED_RESCHED */
-#define TIF_MEMDIE             9       /* is terminating due to OOM killer */
-#define TIF_SYSCALL_TRACEPOINT  10     /* syscall tracepoint instrumentation */
+#define TIF_MEMDIE             8       /* is terminating due to OOM killer */
+#define TIF_SYSCALL_TRACEPOINT 9       /* syscall tracepoint instrumentation */
 
 
 #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index e2a4232c5871a19056b0b5b666adb540a17b1403..10ffffef041413dc6da76a7c9564e4b79804c72a 100644
@@ -766,6 +766,28 @@ int setup_profiling_timer(unsigned int multiplier)
        return 0;
 }
 
+#ifdef CONFIG_SCHED_SMT
+/* cpumask of CPUs with asymmetric SMT dependency */
+static const int powerpc_smt_flags(void)
+{
+       int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
+
+       if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+               printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+               flags |= SD_ASYM_PACKING;
+       }
+       return flags;
+}
+#endif
+
+static struct sched_domain_topology_level powerpc_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+       { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+       { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+       { NULL, },
+};
+
 void __init smp_cpus_done(unsigned int max_cpus)
 {
        cpumask_var_t old_mask;
@@ -790,15 +812,8 @@ void __init smp_cpus_done(unsigned int max_cpus)
 
        dump_numa_cpu_topology();
 
-}
+       set_sched_topology(powerpc_topology);
 
-int arch_sd_sibling_asym_packing(void)
-{
-       if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
-               printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
-               return SD_ASYM_PACKING;
-       }
-       return 0;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index 05425b18c0aab7af89205a716f2ba05cec89cd0a..56af53093d24d3660ed1267e229eccb6925b56ef 100644
@@ -26,21 +26,12 @@ extern struct cpu_topology_s390 cpu_topology[NR_CPUS];
 
 #define mc_capable() 1
 
-static inline const struct cpumask *cpu_coregroup_mask(int cpu)
-{
-       return &cpu_topology[cpu].core_mask;
-}
-
-static inline const struct cpumask *cpu_book_mask(int cpu)
-{
-       return &cpu_topology[cpu].book_mask;
-}
-
 int topology_cpu_init(struct cpu *);
 int topology_set_cpu_management(int fc);
 void topology_schedule_update(void);
 void store_topology(struct sysinfo_15_1_x *info);
 void topology_expect_change(void);
+const struct cpumask *cpu_coregroup_mask(int cpu);
 
 #else /* CONFIG_SCHED_BOOK */
 
@@ -64,8 +55,6 @@ static inline void s390_init_cpu_topology(void)
 };
 #endif
 
-#define SD_BOOK_INIT   SD_CPU_INIT
-
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_S390_TOPOLOGY_H */
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index fa3b8cdaadacf4cf14cf24976074f0319a0ebcf9..355a16c557026a21f50fbb39b51a5f1320b0773a 100644
@@ -445,6 +445,23 @@ int topology_cpu_init(struct cpu *cpu)
        return sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group);
 }
 
+const struct cpumask *cpu_coregroup_mask(int cpu)
+{
+       return &cpu_topology[cpu].core_mask;
+}
+
+static const struct cpumask *cpu_book_mask(int cpu)
+{
+       return &cpu_topology[cpu].book_mask;
+}
+
+static struct sched_domain_topology_level s390_topology[] = {
+       { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+       { cpu_book_mask, SD_INIT_NAME(BOOK) },
+       { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+       { NULL, },
+};
+
 static int __init topology_init(void)
 {
        if (!MACHINE_HAS_TOPOLOGY) {
@@ -453,6 +470,9 @@ static int __init topology_init(void)
        }
        set_topology_timer();
 out:
+
+       set_sched_topology(s390_topology);
+
        return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching);
 }
 device_initcall(topology_init);
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index 729aa107f64e6391cb435ff1f5c038065eafe3b2..d767ff9f59b9c1532aa14dd8c8b342cd9a82155d 100644
@@ -129,6 +129,7 @@ extern void _cpu_idle(void);
 #define TIF_MEMDIE             7       /* OOM killer at work */
 #define TIF_NOTIFY_RESUME      8       /* callback before returning to user */
 #define TIF_SYSCALL_TRACEPOINT 9       /* syscall tracepoint instrumentation */
+#define TIF_POLLING_NRFLAG     10      /* idle is polling for TIF_NEED_RESCHED */
 
 #define _TIF_SIGPENDING                (1<<TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED      (1<<TIF_NEED_RESCHED)
@@ -140,6 +141,7 @@ extern void _cpu_idle(void);
 #define _TIF_MEMDIE            (1<<TIF_MEMDIE)
 #define _TIF_NOTIFY_RESUME     (1<<TIF_NOTIFY_RESUME)
 #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
+#define _TIF_POLLING_NRFLAG    (1<<TIF_POLLING_NRFLAG)
 
 /* Work to do on any return to user space. */
 #define _TIF_ALLWORK_MASK \
@@ -162,7 +164,6 @@ extern void _cpu_idle(void);
 #ifdef __tilegx__
 #define TS_COMPAT              0x0001  /* 32-bit compatibility mode */
 #endif
-#define TS_POLLING             0x0004  /* in idle loop but not sleeping */
 #define TS_RESTORE_SIGMASK     0x0008  /* restore signal mask in do_signal */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index d15c0d8d550f6061368324a05b4e51d8d52a2186..938311844233b8c7e2753b5982bbdd0a302d5304 100644
@@ -44,39 +44,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
 /* For now, use numa node -1 for global allocation. */
 #define pcibus_to_node(bus)            ((void)(bus), -1)
 
-/*
- * TILE architecture has many cores integrated in one processor, so we need
- * setup bigger balance_interval for both CPU/NODE scheduling domains to
- * reduce process scheduling costs.
- */
-
-/* sched_domains SD_CPU_INIT for TILE architecture */
-#define SD_CPU_INIT (struct sched_domain) {                            \
-       .min_interval           = 4,                                    \
-       .max_interval           = 128,                                  \
-       .busy_factor            = 64,                                   \
-       .imbalance_pct          = 125,                                  \
-       .cache_nice_tries       = 1,                                    \
-       .busy_idx               = 2,                                    \
-       .idle_idx               = 1,                                    \
-       .newidle_idx            = 0,                                    \
-       .wake_idx               = 0,                                    \
-       .forkexec_idx           = 0,                                    \
-                                                                       \
-       .flags                  = 1*SD_LOAD_BALANCE                     \
-                               | 1*SD_BALANCE_NEWIDLE                  \
-                               | 1*SD_BALANCE_EXEC                     \
-                               | 1*SD_BALANCE_FORK                     \
-                               | 0*SD_BALANCE_WAKE                     \
-                               | 0*SD_WAKE_AFFINE                      \
-                               | 0*SD_SHARE_CPUPOWER                   \
-                               | 0*SD_SHARE_PKG_RESOURCES              \
-                               | 0*SD_SERIALIZE                        \
-                               ,                                       \
-       .last_balance           = jiffies,                              \
-       .balance_interval       = 32,                                   \
-}
-
 /* By definition, we create nodes based on online memory. */
 #define node_has_online_mem(nid) 1
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 47e5de25ba799f787d7f9344c28daa797fde47b7..854053889d4d2d6f74cdb3143da4f6c63e213996 100644
@@ -83,6 +83,7 @@ struct thread_info {
 #define TIF_FORK               18      /* ret_from_fork */
 #define TIF_NOHZ               19      /* in adaptive nohz mode */
 #define TIF_MEMDIE             20      /* is terminating due to OOM killer */
+#define TIF_POLLING_NRFLAG     21      /* idle is polling for TIF_NEED_RESCHED */
 #define TIF_IO_BITMAP          22      /* uses I/O bitmap */
 #define TIF_FORCED_TF          24      /* true if TF in eflags artificially */
 #define TIF_BLOCKSTEP          25      /* set when we want DEBUGCTLMSR_BTF */
@@ -106,6 +107,7 @@ struct thread_info {
 #define _TIF_IA32              (1 << TIF_IA32)
 #define _TIF_FORK              (1 << TIF_FORK)
 #define _TIF_NOHZ              (1 << TIF_NOHZ)
+#define _TIF_POLLING_NRFLAG    (1 << TIF_POLLING_NRFLAG)
 #define _TIF_IO_BITMAP         (1 << TIF_IO_BITMAP)
 #define _TIF_FORCED_TF         (1 << TIF_FORCED_TF)
 #define _TIF_BLOCKSTEP         (1 << TIF_BLOCKSTEP)
@@ -191,8 +193,6 @@ static inline struct thread_info *current_thread_info(void)
  * have to worry about atomic accesses.
  */
 #define TS_COMPAT              0x0002  /* 32bit syscall active (64BIT)*/
-#define TS_POLLING             0x0004  /* idle task polling need_resched,
-                                          skip sending interrupt */
 #define TS_RESTORE_SIGMASK     0x0008  /* restore signal mask in do_signal() */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 3ab03430211d6d4714e94234b432bac9e813b1e7..f3a1f04ed4cb80794f539cb5e2fb1c3e7e803dd6 100644
@@ -844,21 +844,10 @@ static int apm_do_idle(void)
        int polling;
        int err = 0;
 
-       polling = !!(current_thread_info()->status & TS_POLLING);
-       if (polling) {
-               current_thread_info()->status &= ~TS_POLLING;
-               /*
-                * TS_POLLING-cleared state must be visible before we
-                * test NEED_RESCHED:
-                */
-               smp_mb();
-       }
        if (!need_resched()) {
                idled = 1;
                ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
        }
-       if (polling)
-               current_thread_info()->status |= TS_POLLING;
 
        if (!idled)
                return 0;
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f70a230a2945225f89ae188909c7bc9db90bc32f..6cb1beb47c25d1d2a7db113ca9f173a9ef8b68d3 100644
@@ -548,7 +548,7 @@ static int loop_thread(void *data)
        struct loop_device *lo = data;
        struct bio *bio;
 
-       set_user_nice(current, -20);
+       set_user_nice(current, MIN_NICE);
 
        while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
 
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 3a70ea2f7cd69b2641302e6c44560f32245a078c..56a027d6115e0f5fa83c48070ff26ca6d0ba061e 100644
@@ -533,7 +533,7 @@ static int nbd_thread(void *data)
        struct nbd_device *nbd = data;
        struct request *req;
 
-       set_user_nice(current, -20);
+       set_user_nice(current, MIN_NICE);
        while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
                /* wait for something to do */
                wait_event_interruptible(nbd->waiting_wq,
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index a2af73db187b694c3112bf1c6d08e4070596cd10..ef166ad2dbadc37bdd58a4e055409898291eea82 100644
@@ -1463,7 +1463,7 @@ static int kcdrwd(void *foobar)
        struct packet_data *pkt;
        long min_sleep_time, residue;
 
-       set_user_nice(current, -20);
+       set_user_nice(current, MIN_NICE);
        set_freezable();
 
        for (;;) {
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 1c4bb4f6ce932f95043385d1387ed14b789a9996..5d665680ae33fea5bbc74084d1d79d37f62c4f15 100644
@@ -1007,7 +1007,7 @@ static int ipmi_thread(void *data)
        struct timespec busy_until;
 
        ipmi_si_set_not_busy(&busy_until);
-       set_user_nice(current, 19);
+       set_user_nice(current, MAX_NICE);
        while (!kthread_should_stop()) {
                int busy_wait;
 
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 8236746e46bb9d5afa313aff7d17f72fd1ec2326..cb7019977c50febbcd21b9ad49f4a9bfe1ca6271 100644
@@ -32,6 +32,7 @@ LIST_HEAD(cpuidle_detected_devices);
 static int enabled_devices;
 static int off __read_mostly;
 static int initialized __read_mostly;
+static bool use_deepest_state __read_mostly;
 
 int cpuidle_disabled(void)
 {
@@ -65,23 +66,42 @@ int cpuidle_play_dead(void)
 }
 
 /**
- * cpuidle_enabled - check if the cpuidle framework is ready
- * @dev: cpuidle device for this cpu
- * @drv: cpuidle driver for this cpu
+ * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode.
+ * @enable: Whether to enable or disable the feature.
+ *
+ * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and
+ * always use the state with the greatest exit latency (out of the states that
+ * are not disabled).
  *
- * Return 0 on success, otherwise:
- * -NODEV : the cpuidle framework is not available
- * -EBUSY : the cpuidle framework is not initialized
+ * This function can only be called after cpuidle_pause() to avoid races.
  */
-int cpuidle_enabled(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+void cpuidle_use_deepest_state(bool enable)
 {
-       if (off || !initialized)
-               return -ENODEV;
+       use_deepest_state = enable;
+}
 
-       if (!drv || !dev || !dev->enabled)
-               return -EBUSY;
+/**
+ * cpuidle_find_deepest_state - Find the state of the greatest exit latency.
+ * @drv: cpuidle driver for a given CPU.
+ * @dev: cpuidle device for a given CPU.
+ */
+static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
+                                     struct cpuidle_device *dev)
+{
+       unsigned int latency_req = 0;
+       int i, ret = CPUIDLE_DRIVER_STATE_START - 1;
 
-       return 0;
+       for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
+               struct cpuidle_state *s = &drv->states[i];
+               struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+               if (s->disabled || su->disable || s->exit_latency <= latency_req)
+                       continue;
+
+               latency_req = s->exit_latency;
+               ret = i;
+       }
+       return ret;
 }
 
 /**
@@ -138,6 +158,15 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
  */
 int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 {
+       if (off || !initialized)
+               return -ENODEV;
+
+       if (!drv || !dev || !dev->enabled)
+               return -EBUSY;
+
+       if (unlikely(use_deepest_state))
+               return cpuidle_find_deepest_state(drv, dev);
+
        return cpuidle_curr_governor->select(drv, dev);
 }
 
@@ -169,7 +198,7 @@ int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev,
  */
 void cpuidle_reflect(struct cpuidle_device *dev, int index)
 {
-       if (cpuidle_curr_governor->reflect)
+       if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state))
                cpuidle_curr_governor->reflect(dev, index);
 }
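
The selection rule added above can be read in isolation: walk the states, skip any that are disabled (by the driver or per device), and remember the index of the largest exit latency seen so far. A standalone sketch of that rule with a made-up state table and simplified types (the real code walks struct cpuidle_state / struct cpuidle_state_usage and starts scanning at CPUIDLE_DRIVER_STATE_START):

#include <assert.h>
#include <stdbool.h>

/* Simplified stand-in for struct cpuidle_state: only the fields the
 * deepest-state search looks at. */
struct state {
	unsigned int exit_latency;
	bool disabled;
};

/* Index of the enabled state with the greatest exit latency, or -1 if
 * every state is disabled (mirroring the CPUIDLE_DRIVER_STATE_START - 1
 * fallback in the kernel version). */
static int find_deepest_state(const struct state *s, int count)
{
	unsigned int latency_req = 0;
	int i, ret = -1;

	for (i = 0; i < count; i++) {
		if (s[i].disabled || s[i].exit_latency <= latency_req)
			continue;
		latency_req = s[i].exit_latency;
		ret = i;
	}
	return ret;
}

int main(void)
{
	struct state table[] = {
		{ .exit_latency = 1,   .disabled = false },	/* shallow */
		{ .exit_latency = 50,  .disabled = false },	/* deeper  */
		{ .exit_latency = 200, .disabled = true  },	/* deepest, but disabled */
	};

	/* The deepest state is disabled, so the middle one wins. */
	assert(find_deepest_state(table, 3) == 1);
	return 0;
}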
 
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 71b52329335472b7a888e16b4075c6ea7a4f58c5..c4f80c15a48dde7be9e16898b09f008e5d0c28fd 100644
@@ -296,7 +296,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
                data->needs_update = 0;
        }
 
-       data->last_state_idx = 0;
+       data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;
 
        /* Special case when user has set very strict latency requirement */
        if (unlikely(latency_req == 0))
@@ -310,13 +310,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 
        data->bucket = which_bucket(data->next_timer_us);
 
-       /*
-        * if the correction factor is 0 (eg first time init or cpu hotplug
-        * etc), we actually want to start out with a unity factor.
-        */
-       if (data->correction_factor[data->bucket] == 0)
-               data->correction_factor[data->bucket] = RESOLUTION * DECAY;
-
        /*
         * Force the result of multiplication to be 64 bits even if both
         * operands are 32 bits.
@@ -466,9 +459,17 @@ static int menu_enable_device(struct cpuidle_driver *drv,
                                struct cpuidle_device *dev)
 {
        struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
+       int i;
 
        memset(data, 0, sizeof(struct menu_device));
 
+       /*
+        * if the correction factor is 0 (eg first time init or cpu hotplug
+        * etc), we actually want to start out with a unity factor.
+        */
+       for(i = 0; i < BUCKETS; i++)
+               data->correction_factor[i] = RESOLUTION * DECAY;
+
        return 0;
 }
 
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index ab3baa7f95082e0cabf03906354c69ae9ed8880b..8eec1653c9cc44ec5c338f61f49b74dedbedceef 100644
@@ -1803,7 +1803,7 @@ static int ap_poll_thread(void *data)
        int requests;
        struct ap_device *ap_dev;
 
-       set_user_nice(current, 19);
+       set_user_nice(current, MAX_NICE);
        while (1) {
                if (ap_suspend_flag)
                        return 0;
diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
index 1d41f4b9114f8253e780d279799dad0ac0d27e04..f548430234663691b80f3e50ab94e2088bdf25ad 100644
@@ -464,7 +464,7 @@ static int bnx2fc_l2_rcv_thread(void *arg)
        struct fcoe_percpu_s *bg = arg;
        struct sk_buff *skb;
 
-       set_user_nice(current, -20);
+       set_user_nice(current, MIN_NICE);
        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {
                schedule();
@@ -602,7 +602,7 @@ int bnx2fc_percpu_io_thread(void *arg)
        struct bnx2fc_work *work, *tmp;
        LIST_HEAD(work_list);
 
-       set_user_nice(current, -20);
+       set_user_nice(current, MIN_NICE);
        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {
                schedule();
diff --git a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c
index b5ffd280a1aefeab817953a506484c033066ae95..d6d491c2f00463c4f3d9f32a900e7b96cc9803b3 100644
@@ -1870,7 +1870,7 @@ int bnx2i_percpu_io_thread(void *arg)
        struct bnx2i_work *work, *tmp;
        LIST_HEAD(work_list);
 
-       set_user_nice(current, -20);
+       set_user_nice(current, MIN_NICE);
 
        while (!kthread_should_stop()) {
                spin_lock_bh(&p->p_work_lock);
diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index d5e105b173f0cf121894fcb5105a5afedadc16d5..00ee0ed642aac717fd8c0b1e2976c860ccb664ff 100644
@@ -1872,7 +1872,7 @@ static int fcoe_percpu_receive_thread(void *arg)
 
        skb_queue_head_init(&tmp);
 
-       set_user_nice(current, -20);
+       set_user_nice(current, MIN_NICE);
 
 retry:
        while (!kthread_should_stop()) {
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 23f5ba5e6472581d4aa69bef39d1ca25992502d4..8dd47689d58430a1147a389c1eeed1eea1afae1d 100644
@@ -4515,7 +4515,7 @@ static int ibmvfc_work(void *data)
        struct ibmvfc_host *vhost = data;
        int rc;
 
-       set_user_nice(current, -20);
+       set_user_nice(current, MIN_NICE);
 
        while (1) {
                rc = wait_event_interruptible(vhost->work_wait_q,
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index fa764406df6872c2daa9fa3d2e75a9e5f2164c8b..2ebfb2bb0f425f78975ac2c678c9130834d500b0 100644
@@ -2213,7 +2213,7 @@ static int ibmvscsi_work(void *data)
        struct ibmvscsi_host_data *hostdata = data;
        int rc;
 
-       set_user_nice(current, -20);
+       set_user_nice(current, MIN_NICE);
 
        while (1) {
                rc = wait_event_interruptible(hostdata->work_wait_q,
diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c
index 59b51c529ba0f9666a7200928ea7b330626daac7..294c072e90835efbb72012f2789dc412bf8c9686 100644
@@ -731,7 +731,7 @@ lpfc_do_work(void *p)
        struct lpfc_hba *phba = p;
        int rc;
 
-       set_user_nice(current, -20);
+       set_user_nice(current, MIN_NICE);
        current->flags |= PF_NOFREEZE;
        phba->data_flags = 0;
 
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 19e99cc33724c526f30de23d0e98d0757cf2f99d..afc84814e9bb3b5db7682c4e5b7751edd8b051e4 100644
@@ -4828,7 +4828,7 @@ qla2x00_do_dpc(void *data)
        ha = (struct qla_hw_data *)data;
        base_vha = pci_get_drvdata(ha->pdev);
 
-       set_user_nice(current, -20);
+       set_user_nice(current, MIN_NICE);
 
        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {
diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c
index 989f809f323f447f586569e16b3a04b9fec290be..a741da77828aec517d858b1644be4102d2298749 100644
@@ -439,12 +439,12 @@ static void binder_set_nice(long nice)
                set_user_nice(current, nice);
                return;
        }
-       min_nice = 20 - current->signal->rlim[RLIMIT_NICE].rlim_cur;
+       min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur);
        binder_debug(BINDER_DEBUG_PRIORITY_CAP,
                     "%d: nice value %ld not allowed use %ld instead\n",
                      current->pid, nice, min_nice);
        set_user_nice(current, min_nice);
-       if (min_nice < 20)
+       if (min_nice <= MAX_NICE)
                return;
        binder_user_error("%d RLIMIT_NICE not set\n", current->pid);
 }
diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c
index b9694b8cb5dd6e2a200d7f94df765e68e0f788d5..0ff8c3362a8d461e2c90593ed21deeaec7150139 100644
@@ -404,7 +404,7 @@ static int loop_thread(void *data)
        int refcheck;
        int ret = 0;
 
-       set_user_nice(current, -20);
+       set_user_nice(current, MIN_NICE);
 
        lo->lo_state = LLOOP_BOUND;
 
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index bf482dfed14fecf17406a6aa2d517929d6834800..73039295d0d1f35205e060220521934afd33ff27 100644
@@ -1107,7 +1107,7 @@ static int o2hb_thread(void *data)
 
        mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
 
-       set_user_nice(current, -20);
+       set_user_nice(current, MIN_NICE);
 
        /* Pin node */
        o2nm_depend_this_node();
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index b0238cba440b4d80606b40db4effb5ebcea110a5..c51a436135c485ff354be3eec4d9a612b0822c8d 100644
@@ -120,8 +120,6 @@ struct cpuidle_driver {
 #ifdef CONFIG_CPU_IDLE
 extern void disable_cpuidle(void);
 
-extern int cpuidle_enabled(struct cpuidle_driver *drv,
-                         struct cpuidle_device *dev);
 extern int cpuidle_select(struct cpuidle_driver *drv,
                          struct cpuidle_device *dev);
 extern int cpuidle_enter(struct cpuidle_driver *drv,
@@ -145,13 +143,11 @@ extern void cpuidle_resume(void);
 extern int cpuidle_enable_device(struct cpuidle_device *dev);
 extern void cpuidle_disable_device(struct cpuidle_device *dev);
 extern int cpuidle_play_dead(void);
+extern void cpuidle_use_deepest_state(bool enable);
 
 extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
 #else
 static inline void disable_cpuidle(void) { }
-static inline int cpuidle_enabled(struct cpuidle_driver *drv,
-                                 struct cpuidle_device *dev)
-{return -ENODEV; }
 static inline int cpuidle_select(struct cpuidle_driver *drv,
                                 struct cpuidle_device *dev)
 {return -ENODEV; }
@@ -180,6 +176,7 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev)
 {return -ENODEV; }
 static inline void cpuidle_disable_device(struct cpuidle_device *dev) { }
 static inline int cpuidle_play_dead(void) {return -ENODEV; }
+static inline void cpuidle_use_deepest_state(bool enable) {}
 static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
        struct cpuidle_device *dev) {return NULL; }
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4dce5d844b7413a41748dd678f72edb7bb11b325..70f67e4e6156f32a2feefe5bcafcd5e4364a5828 100644
@@ -870,6 +870,7 @@ enum cpu_idle_type {
 #define SD_BALANCE_WAKE                0x0010  /* Balance on wakeup */
 #define SD_WAKE_AFFINE         0x0020  /* Wake task to waking CPU */
 #define SD_SHARE_CPUPOWER      0x0080  /* Domain members share cpu power */
+#define SD_SHARE_POWERDOMAIN   0x0100  /* Domain members share power domain */
 #define SD_SHARE_PKG_RESOURCES 0x0200  /* Domain members share cpu pkg resources */
 #define SD_SERIALIZE           0x0400  /* Only a single load balancing instance */
 #define SD_ASYM_PACKING                0x0800  /* Place busy groups earlier in the domain */
@@ -877,7 +878,26 @@ enum cpu_idle_type {
 #define SD_OVERLAP             0x2000  /* sched_domains of this level overlap */
 #define SD_NUMA                        0x4000  /* cross-node balancing */
 
-extern int __weak arch_sd_sibiling_asym_packing(void);
+#ifdef CONFIG_SCHED_SMT
+static inline const int cpu_smt_flags(void)
+{
+       return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static inline const int cpu_core_flags(void)
+{
+       return SD_SHARE_PKG_RESOURCES;
+}
+#endif
+
+#ifdef CONFIG_NUMA
+static inline const int cpu_numa_flags(void)
+{
+       return SD_NUMA;
+}
+#endif
 
 struct sched_domain_attr {
        int relax_domain_level;
@@ -985,6 +1005,38 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
 bool cpus_share_cache(int this_cpu, int that_cpu);
 
+typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+typedef const int (*sched_domain_flags_f)(void);
+
+#define SDTL_OVERLAP   0x01
+
+struct sd_data {
+       struct sched_domain **__percpu sd;
+       struct sched_group **__percpu sg;
+       struct sched_group_power **__percpu sgp;
+};
+
+struct sched_domain_topology_level {
+       sched_domain_mask_f mask;
+       sched_domain_flags_f sd_flags;
+       int                 flags;
+       int                 numa_level;
+       struct sd_data      data;
+#ifdef CONFIG_SCHED_DEBUG
+       char                *name;
+#endif
+};
+
+extern struct sched_domain_topology_level *sched_domain_topology;
+
+extern void set_sched_topology(struct sched_domain_topology_level *tl);
+
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(type)            .name = #type
+#else
+# define SD_INIT_NAME(type)
+#endif
+
 #else /* CONFIG_SMP */
 
 struct sched_domain_attr;
@@ -1123,8 +1175,8 @@ struct sched_dl_entity {
 
        /*
         * Original scheduling parameters. Copied here from sched_attr
-        * during sched_setscheduler2(), they will remain the same until
-        * the next sched_setscheduler2().
+        * during sched_setattr(), they will remain the same until
+        * the next sched_setattr().
         */
        u64 dl_runtime;         /* maximum runtime for each instance    */
        u64 dl_deadline;        /* relative deadline of each instance   */
@@ -2723,51 +2775,9 @@ static inline int spin_needbreak(spinlock_t *lock)
 
 /*
  * Idle thread specific functions to determine the need_resched
- * polling state. We have two versions, one based on TS_POLLING in
- * thread_info.status and one based on TIF_POLLING_NRFLAG in
- * thread_info.flags
+ * polling state.
  */
-#ifdef TS_POLLING
-static inline int tsk_is_polling(struct task_struct *p)
-{
-       return task_thread_info(p)->status & TS_POLLING;
-}
-static inline void __current_set_polling(void)
-{
-       current_thread_info()->status |= TS_POLLING;
-}
-
-static inline bool __must_check current_set_polling_and_test(void)
-{
-       __current_set_polling();
-
-       /*
-        * Polling state must be visible before we test NEED_RESCHED,
-        * paired by resched_task()
-        */
-       smp_mb();
-
-       return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
-{
-       current_thread_info()->status &= ~TS_POLLING;
-}
-
-static inline bool __must_check current_clr_polling_and_test(void)
-{
-       __current_clr_polling();
-
-       /*
-        * Polling state must be visible before we test NEED_RESCHED,
-        * paired by resched_task()
-        */
-       smp_mb();
-
-       return unlikely(tif_need_resched());
-}
-#elif defined(TIF_POLLING_NRFLAG)
+#ifdef TIF_POLLING_NRFLAG
 static inline int tsk_is_polling(struct task_struct *p)
 {
        return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index ac322583c82028233a2504fd5481c40a6cca36d4..d9cf5a5762d9d3c5fb12177543e282ba0de06e69 100644
 #define TASK_USER_PRIO(p)      USER_PRIO((p)->static_prio)
 #define MAX_USER_PRIO          (USER_PRIO(MAX_PRIO))
 
+/*
+ * Convert nice value [19,-20] to rlimit style value [1,40].
+ */
+static inline long nice_to_rlimit(long nice)
+{
+       return (MAX_NICE - nice + 1);
+}
+
+/*
+ * Convert rlimit style value [1,40] to nice value [-20, 19].
+ */
+static inline long rlimit_to_nice(long prio)
+{
+       return (MAX_NICE - prio + 1);
+}
+
 #endif /* _SCHED_PRIO_H */
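
The two helpers above are exact inverses over the nice range: nice -20 maps to the rlimit-style value 40 and nice 19 maps to 1, which is the encoding RLIMIT_NICE uses. A throwaway userspace check of that round trip, with MIN_NICE/MAX_NICE copied as local constants (not part of this commit):

#include <assert.h>
#include <stdio.h>

/* Local copies of the kernel's nice limits, for a userspace demo only. */
#define MAX_NICE  19
#define MIN_NICE -20

static long nice_to_rlimit(long nice) { return MAX_NICE - nice + 1; }
static long rlimit_to_nice(long prio) { return MAX_NICE - prio + 1; }

int main(void)
{
	long nice;

	/* Endpoints: nice -20 <-> rlimit 40, nice 19 <-> rlimit 1. */
	assert(nice_to_rlimit(MIN_NICE) == 40);
	assert(nice_to_rlimit(MAX_NICE) == 1);

	/* The conversion is its own inverse across the whole range. */
	for (nice = MIN_NICE; nice <= MAX_NICE; nice++)
		assert(rlimit_to_nice(nice_to_rlimit(nice)) == nice);

	printf("nice [%d, %d] <-> rlimit [40, 1]\n", MIN_NICE, MAX_NICE);
	return 0;
}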
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index fddbe2023a5d568717b3b90eabc692bb4612f9bd..cb0cec94fda3330c63bf233b380bbe9ee81fe2b6 100644
@@ -104,20 +104,6 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
 #define test_thread_flag(flag) \
        test_ti_thread_flag(current_thread_info(), flag)
 
-static inline __deprecated void set_need_resched(void)
-{
-       /*
-        * Use of this function in deprecated.
-        *
-        * As of this writing there are only a few users in the DRM tree left
-        * all of which are wrong and can be removed without causing too much
-        * grief.
-        *
-        * The DRM people are aware and are working on removing the last few
-        * instances.
-        */
-}
-
 #define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
 
 #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 7062330a13296188ae96448f4abec8a28617fe1d..973671ff9e7d9d433b8924762b8c3090d1cebc18 100644
@@ -66,121 +66,6 @@ int arch_update_cpu_topology(void);
 #define PENALTY_FOR_NODE_WITH_CPUS     (1)
 #endif
 
-/*
- * Below are the 3 major initializers used in building sched_domains:
- * SD_SIBLING_INIT, for SMT domains
- * SD_CPU_INIT, for SMP domains
- *
- * Any architecture that cares to do any tuning to these values should do so
- * by defining their own arch-specific initializer in include/asm/topology.h.
- * A definition there will automagically override these default initializers
- * and allow arch-specific performance tuning of sched_domains.
- * (Only non-zero and non-null fields need be specified.)
- */
-
-#ifdef CONFIG_SCHED_SMT
-/* MCD - Do we really need this?  It is always on if CONFIG_SCHED_SMT is,
- * so can't we drop this in favor of CONFIG_SCHED_SMT?
- */
-#define ARCH_HAS_SCHED_WAKE_IDLE
-/* Common values for SMT siblings */
-#ifndef SD_SIBLING_INIT
-#define SD_SIBLING_INIT (struct sched_domain) {                                \
-       .min_interval           = 1,                                    \
-       .max_interval           = 2,                                    \
-       .busy_factor            = 64,                                   \
-       .imbalance_pct          = 110,                                  \
-                                                                       \
-       .flags                  = 1*SD_LOAD_BALANCE                     \
-                               | 1*SD_BALANCE_NEWIDLE                  \
-                               | 1*SD_BALANCE_EXEC                     \
-                               | 1*SD_BALANCE_FORK                     \
-                               | 0*SD_BALANCE_WAKE                     \
-                               | 1*SD_WAKE_AFFINE                      \
-                               | 1*SD_SHARE_CPUPOWER                   \
-                               | 1*SD_SHARE_PKG_RESOURCES              \
-                               | 0*SD_SERIALIZE                        \
-                               | 0*SD_PREFER_SIBLING                   \
-                               | arch_sd_sibling_asym_packing()        \
-                               ,                                       \
-       .last_balance           = jiffies,                              \
-       .balance_interval       = 1,                                    \
-       .smt_gain               = 1178, /* 15% */                       \
-       .max_newidle_lb_cost    = 0,                                    \
-       .next_decay_max_lb_cost = jiffies,                              \
-}
-#endif
-#endif /* CONFIG_SCHED_SMT */
-
-#ifdef CONFIG_SCHED_MC
-/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
-#ifndef SD_MC_INIT
-#define SD_MC_INIT (struct sched_domain) {                             \
-       .min_interval           = 1,                                    \
-       .max_interval           = 4,                                    \
-       .busy_factor            = 64,                                   \
-       .imbalance_pct          = 125,                                  \
-       .cache_nice_tries       = 1,                                    \
-       .busy_idx               = 2,                                    \
-       .wake_idx               = 0,                                    \
-       .forkexec_idx           = 0,                                    \
-                                                                       \
-       .flags                  = 1*SD_LOAD_BALANCE                     \
-                               | 1*SD_BALANCE_NEWIDLE                  \
-                               | 1*SD_BALANCE_EXEC                     \
-                               | 1*SD_BALANCE_FORK                     \
-                               | 0*SD_BALANCE_WAKE                     \
-                               | 1*SD_WAKE_AFFINE                      \
-                               | 0*SD_SHARE_CPUPOWER                   \
-                               | 1*SD_SHARE_PKG_RESOURCES              \
-                               | 0*SD_SERIALIZE                        \
-                               ,                                       \
-       .last_balance           = jiffies,                              \
-       .balance_interval       = 1,                                    \
-       .max_newidle_lb_cost    = 0,                                    \
-       .next_decay_max_lb_cost = jiffies,                              \
-}
-#endif
-#endif /* CONFIG_SCHED_MC */
-
-/* Common values for CPUs */
-#ifndef SD_CPU_INIT
-#define SD_CPU_INIT (struct sched_domain) {                            \
-       .min_interval           = 1,                                    \
-       .max_interval           = 4,                                    \
-       .busy_factor            = 64,                                   \
-       .imbalance_pct          = 125,                                  \
-       .cache_nice_tries       = 1,                                    \
-       .busy_idx               = 2,                                    \
-       .idle_idx               = 1,                                    \
-       .newidle_idx            = 0,                                    \
-       .wake_idx               = 0,                                    \
-       .forkexec_idx           = 0,                                    \
-                                                                       \
-       .flags                  = 1*SD_LOAD_BALANCE                     \
-                               | 1*SD_BALANCE_NEWIDLE                  \
-                               | 1*SD_BALANCE_EXEC                     \
-                               | 1*SD_BALANCE_FORK                     \
-                               | 0*SD_BALANCE_WAKE                     \
-                               | 1*SD_WAKE_AFFINE                      \
-                               | 0*SD_SHARE_CPUPOWER                   \
-                               | 0*SD_SHARE_PKG_RESOURCES              \
-                               | 0*SD_SERIALIZE                        \
-                               | 1*SD_PREFER_SIBLING                   \
-                               ,                                       \
-       .last_balance           = jiffies,                              \
-       .balance_interval       = 1,                                    \
-       .max_newidle_lb_cost    = 0,                                    \
-       .next_decay_max_lb_cost = jiffies,                              \
-}
-#endif
-
-#ifdef CONFIG_SCHED_BOOK
-#ifndef SD_BOOK_INIT
-#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
-#endif
-#endif /* CONFIG_SCHED_BOOK */
-
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DECLARE_PER_CPU(int, numa_node);
 
@@ -295,4 +180,17 @@ static inline int cpu_to_mem(int cpu)
 #define topology_core_cpumask(cpu)             cpumask_of(cpu)
 #endif
 
+#ifdef CONFIG_SCHED_SMT
+static inline const struct cpumask *cpu_smt_mask(int cpu)
+{
+       return topology_thread_cpumask(cpu);
+}
+#endif
+
+static inline const struct cpumask *cpu_cpu_mask(int cpu)
+{
+       return cpumask_of_node(cpu_to_node(cpu));
+}
+
+
 #endif /* _LINUX_TOPOLOGY_H */
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index dbafeac18e4d58b2da77988ff939e8db2e860d0e..0955b885d0dc8eb5fe54aeeebe08c0eee6bd9cb7 100644
@@ -216,7 +216,7 @@ static int lock_torture_writer(void *arg)
        static DEFINE_TORTURE_RANDOM(rand);
 
        VERBOSE_TOROUT_STRING("lock_torture_writer task started");
-       set_user_nice(current, 19);
+       set_user_nice(current, MAX_NICE);
 
        do {
                if ((torture_random(&rand) & 0xfffff) == 0)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8233cd4047d776c311ef71800479f1e2b637e5da..155721f7f9090dc9bdc97065a0a1daa1c10fafa8 100644
@@ -54,9 +54,11 @@ static void freeze_begin(void)
 
 static void freeze_enter(void)
 {
+       cpuidle_use_deepest_state(true);
        cpuidle_resume();
        wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
        cpuidle_pause();
+       cpuidle_use_deepest_state(false);
 }
 
 void freeze_wake(void)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a62a7dec3986205a07307948de2e7f94c479a4ff..913c6d6cc2c15644ebe64c6cd33d2eb0d5b18a7e 100644
@@ -521,6 +521,39 @@ static inline void init_hrtick(void)
 }
 #endif /* CONFIG_SCHED_HRTICK */
 
+/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, val)                                             \
+({     typeof(*(ptr)) __old, __val = *(ptr);                           \
+       for (;;) {                                                      \
+               __old = cmpxchg((ptr), __val, __val | (val));           \
+               if (__old == __val)                                     \
+                       break;                                          \
+               __val = __old;                                          \
+       }                                                               \
+       __old;                                                          \
+})
+
+#ifdef TIF_POLLING_NRFLAG
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+       struct thread_info *ti = task_thread_info(p);
+       return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+       set_tsk_need_resched(p);
+       return true;
+}
+#endif
+
 /*
  * resched_task - mark a task 'to be rescheduled now'.
  *
@@ -537,17 +570,15 @@ void resched_task(struct task_struct *p)
        if (test_tsk_need_resched(p))
                return;
 
-       set_tsk_need_resched(p);
-
        cpu = task_cpu(p);
+
        if (cpu == smp_processor_id()) {
+               set_tsk_need_resched(p);
                set_preempt_need_resched();
                return;
        }
 
-       /* NEED_RESCHED must be visible before we test polling */
-       smp_mb();
-       if (!tsk_is_polling(p))
+       if (set_nr_and_not_polling(p))
                smp_send_reschedule(cpu);
 }
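
The point of set_nr_and_not_polling() above is that setting TIF_NEED_RESCHED and observing the previous flag word happen in a single atomic step, so the decision about sending an IPI is made against exactly the polling state that existed when the flag went in. A sketch of the same pattern using C11 atomic_fetch_or() in place of the kernel's cmpxchg()-based fetch_or(), with hypothetical flag bit values:

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

/* Hypothetical stand-ins for the thread_info flag bits. */
#define NEED_RESCHED	0x1u
#define POLLING_NRFLAG	0x2u

/* Atomically set NEED_RESCHED and report whether an IPI is needed,
 * i.e. whether the target was NOT already polling for the flag. */
static bool set_nr_and_not_polling(atomic_uint *flags)
{
	unsigned int old = atomic_fetch_or(flags, NEED_RESCHED);

	return !(old & POLLING_NRFLAG);
}

int main(void)
{
	atomic_uint idle_polling = POLLING_NRFLAG;	/* CPU polling in idle */
	atomic_uint running = 0;			/* CPU running a task  */

	/* Polling CPU: it will notice NEED_RESCHED itself, no IPI. */
	assert(!set_nr_and_not_polling(&idle_polling));
	assert(atomic_load(&idle_polling) == (POLLING_NRFLAG | NEED_RESCHED));

	/* Non-polling CPU: the flag is set and an IPI must be sent. */
	assert(set_nr_and_not_polling(&running));
	return 0;
}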
 
@@ -3018,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice);
 int can_nice(const struct task_struct *p, const int nice)
 {
        /* convert nice value [19,-20] to rlimit style value [1,40] */
-       int nice_rlim = 20 - nice;
+       int nice_rlim = nice_to_rlimit(nice);
 
        return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
                capable(CAP_SYS_NICE));
@@ -3042,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment)
         * We don't have to worry. Conceptually one call occurs first
         * and we have a single winner.
         */
-       if (increment < -40)
-               increment = -40;
-       if (increment > 40)
-               increment = 40;
-
+       increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
        nice = task_nice(current) + increment;
-       if (nice < MIN_NICE)
-               nice = MIN_NICE;
-       if (nice > MAX_NICE)
-               nice = MAX_NICE;
 
+       nice = clamp_val(nice, MIN_NICE, MAX_NICE);
        if (increment < 0 && !can_nice(current, nice))
                return -EPERM;
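
With NICE_WIDTH equal to 40, clamping the increment to [-NICE_WIDTH, NICE_WIDTH] and then the resulting nice value to [MIN_NICE, MAX_NICE] gives the same result as the four removed range checks for every input. A brute-force userspace check of that equivalence, with a local clamp() helper standing in for the kernel's clamp()/clamp_val() macros:

#include <assert.h>

#define MIN_NICE   -20
#define MAX_NICE    19
#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)	/* 40 */

static long clamp(long val, long lo, long hi)
{
	return val < lo ? lo : (val > hi ? hi : val);
}

int main(void)
{
	long cur, inc;

	for (cur = MIN_NICE; cur <= MAX_NICE; cur++) {
		for (inc = -1000; inc <= 1000; inc++) {
			/* The removed open-coded version ... */
			long i = inc, old;

			if (i < -40)
				i = -40;
			if (i > 40)
				i = 40;
			old = cur + i;
			if (old < MIN_NICE)
				old = MIN_NICE;
			if (old > MAX_NICE)
				old = MAX_NICE;

			/* ... and the clamp()-based rewrite agree. */
			assert(old == clamp(clamp(inc, -NICE_WIDTH, NICE_WIDTH) + cur,
					    MIN_NICE, MAX_NICE));
		}
	}
	return 0;
}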
 
@@ -3642,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
         */
        attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
 
-out:
-       return ret;
+       return 0;
 
 err_size:
        put_user(sizeof(*attr), &uattr->size);
-       ret = -E2BIG;
-       goto out;
+       return -E2BIG;
 }
 
 /**
@@ -3808,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
 
                for (; addr < end; addr++) {
                        if (*addr)
-                               goto err_size;
+                               return -EFBIG;
                }
 
                attr->size = usize;
@@ -3818,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
        if (ret)
                return -EFAULT;
 
-out:
-       return ret;
-
-err_size:
-       ret = -E2BIG;
-       goto out;
+       return 0;
 }
 
 /**
@@ -5093,10 +5110,20 @@ static struct notifier_block migration_notifier = {
        .priority = CPU_PRI_MIGRATION,
 };
 
+static void __cpuinit set_cpu_rq_start_time(void)
+{
+       int cpu = smp_processor_id();
+       struct rq *rq = cpu_rq(cpu);
+       rq->age_stamp = sched_clock_cpu(cpu);
+}
+
 static int sched_cpu_active(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
 {
        switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_STARTING:
+               set_cpu_rq_start_time();
+               return NOTIFY_OK;
        case CPU_DOWN_FAILED:
                set_cpu_active((long)hcpu, true);
                return NOTIFY_OK;
@@ -5305,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd)
                         SD_BALANCE_FORK |
                         SD_BALANCE_EXEC |
                         SD_SHARE_CPUPOWER |
-                        SD_SHARE_PKG_RESOURCES)) {
+                        SD_SHARE_PKG_RESOURCES |
+                        SD_SHARE_POWERDOMAIN)) {
                if (sd->groups != sd->groups->next)
                        return 0;
        }
@@ -5336,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                SD_BALANCE_EXEC |
                                SD_SHARE_CPUPOWER |
                                SD_SHARE_PKG_RESOURCES |
-                               SD_PREFER_SIBLING);
+                               SD_PREFER_SIBLING |
+                               SD_SHARE_POWERDOMAIN);
                if (nr_node_ids == 1)
                        pflags &= ~SD_SERIALIZE;
        }
@@ -5610,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-static const struct cpumask *cpu_cpu_mask(int cpu)
-{
-       return cpumask_of_node(cpu_to_node(cpu));
-}
-
-struct sd_data {
-       struct sched_domain **__percpu sd;
-       struct sched_group **__percpu sg;
-       struct sched_group_power **__percpu sgp;
-};
-
 struct s_data {
        struct sched_domain ** __percpu sd;
        struct root_domain      *rd;
@@ -5633,21 +5651,6 @@ enum s_alloc {
        sa_none,
 };
 
-struct sched_domain_topology_level;
-
-typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-
-#define SDTL_OVERLAP   0x01
-
-struct sched_domain_topology_level {
-       sched_domain_init_f init;
-       sched_domain_mask_f mask;
-       int                 flags;
-       int                 numa_level;
-       struct sd_data      data;
-};
-
 /*
  * Build an iteration mask that can exclude certain CPUs from the upwards
  * domain traversal.
@@ -5815,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
                        continue;
 
                group = get_group(i, sdd, &sg);
-               cpumask_clear(sched_group_cpus(sg));
-               sg->sgp->power = 0;
                cpumask_setall(sched_group_mask(sg));
 
                for_each_cpu(j, span) {
@@ -5866,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
        atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
 }
 
-int __weak arch_sd_sibling_asym_packing(void)
-{
-       return 0*SD_ASYM_PACKING;
-}
-
 /*
  * Initializers for schedule domains
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */
 
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(sd, type)                sd->name = #type
-#else
-# define SD_INIT_NAME(sd, type)                do { } while (0)
-#endif
-
-#define SD_INIT_FUNC(type)                                             \
-static noinline struct sched_domain *                                  \
-sd_init_##type(struct sched_domain_topology_level *tl, int cpu)        \
-{                                                                      \
-       struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);       \
-       *sd = SD_##type##_INIT;                                         \
-       SD_INIT_NAME(sd, type);                                         \
-       sd->private = &tl->data;                                        \
-       return sd;                                                      \
-}
-
-SD_INIT_FUNC(CPU)
-#ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
-#endif
-#ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
-#endif
-#ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
-#endif
-
 static int default_relax_domain_level = -1;
 int sched_domain_level_max;
 
@@ -5991,99 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
                *per_cpu_ptr(sdd->sgp, cpu) = NULL;
 }
 
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *cpu_smt_mask(int cpu)
-{
-       return topology_thread_cpumask(cpu);
-}
-#endif
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-       { sd_init_SIBLING, cpu_smt_mask, },
-#endif
-#ifdef CONFIG_SCHED_MC
-       { sd_init_MC, cpu_coregroup_mask, },
-#endif
-#ifdef CONFIG_SCHED_BOOK
-       { sd_init_BOOK, cpu_book_mask, },
-#endif
-       { sd_init_CPU, cpu_cpu_mask, },
-       { NULL, },
-};
-
-static struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
-#define for_each_sd_topology(tl)                       \
-       for (tl = sched_domain_topology; tl->init; tl++)
-
 #ifdef CONFIG_NUMA
-
 static int sched_domains_numa_levels;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
+#endif
 
-static inline int sd_local_flags(int level)
-{
-       if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
-               return 0;
-
-       return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
-}
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUPOWER      - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA                - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING        - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS              \
+       (SD_SHARE_CPUPOWER |            \
+        SD_SHARE_PKG_RESOURCES |       \
+        SD_NUMA |                      \
+        SD_ASYM_PACKING |              \
+        SD_SHARE_POWERDOMAIN)
 
 static struct sched_domain *
-sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl, int cpu)
 {
        struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
-       int level = tl->numa_level;
-       int sd_weight = cpumask_weight(
-                       sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+       int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+       /*
+        * Ugly hack to pass state to sd_numa_mask()...
+        */
+       sched_domains_curr_level = tl->numa_level;
+#endif
+
+       sd_weight = cpumask_weight(tl->mask(cpu));
+
+       if (tl->sd_flags)
+               sd_flags = (*tl->sd_flags)();
+       if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+                       "wrong sd_flags in topology description\n"))
+               sd_flags &= ~TOPOLOGY_SD_FLAGS;
 
        *sd = (struct sched_domain){
                .min_interval           = sd_weight,
                .max_interval           = 2*sd_weight,
                .busy_factor            = 32,
                .imbalance_pct          = 125,
-               .cache_nice_tries       = 2,
-               .busy_idx               = 3,
-               .idle_idx               = 2,
+
+               .cache_nice_tries       = 0,
+               .busy_idx               = 0,
+               .idle_idx               = 0,
                .newidle_idx            = 0,
                .wake_idx               = 0,
                .forkexec_idx           = 0,
 
                .flags                  = 1*SD_LOAD_BALANCE
                                        | 1*SD_BALANCE_NEWIDLE
-                                       | 0*SD_BALANCE_EXEC
-                                       | 0*SD_BALANCE_FORK
+                                       | 1*SD_BALANCE_EXEC
+                                       | 1*SD_BALANCE_FORK
                                        | 0*SD_BALANCE_WAKE
-                                       | 0*SD_WAKE_AFFINE
+                                       | 1*SD_WAKE_AFFINE
                                        | 0*SD_SHARE_CPUPOWER
                                        | 0*SD_SHARE_PKG_RESOURCES
-                                       | 1*SD_SERIALIZE
+                                       | 0*SD_SERIALIZE
                                        | 0*SD_PREFER_SIBLING
-                                       | 1*SD_NUMA
-                                       | sd_local_flags(level)
+                                       | 0*SD_NUMA
+                                       | sd_flags
                                        ,
+
                .last_balance           = jiffies,
                .balance_interval       = sd_weight,
+               .smt_gain               = 0,
                .max_newidle_lb_cost    = 0,
                .next_decay_max_lb_cost = jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+               .name                   = tl->name,
+#endif
        };
-       SD_INIT_NAME(sd, NUMA);
-       sd->private = &tl->data;
 
        /*
-        * Ugly hack to pass state to sd_numa_mask()...
+        * Convert topological properties into behaviour.
         */
-       sched_domains_curr_level = tl->numa_level;
+
+       if (sd->flags & SD_SHARE_CPUPOWER) {
+               sd->imbalance_pct = 110;
+               sd->smt_gain = 1178; /* ~15% */
+
+       } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+               sd->imbalance_pct = 117;
+               sd->cache_nice_tries = 1;
+               sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+       } else if (sd->flags & SD_NUMA) {
+               sd->cache_nice_tries = 2;
+               sd->busy_idx = 3;
+               sd->idle_idx = 2;
+
+               sd->flags |= SD_SERIALIZE;
+               if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+                       sd->flags &= ~(SD_BALANCE_EXEC |
+                                      SD_BALANCE_FORK |
+                                      SD_WAKE_AFFINE);
+               }
+
+#endif
+       } else {
+               sd->flags |= SD_PREFER_SIBLING;
+               sd->cache_nice_tries = 1;
+               sd->busy_idx = 2;
+               sd->idle_idx = 1;
+       }
+
+       sd->private = &tl->data;
 
        return sd;
 }
 
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+       { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+       { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+       { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+       { NULL, },
+};
+
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+#define for_each_sd_topology(tl)                       \
+       for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+       sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
 static const struct cpumask *sd_numa_mask(int cpu)
 {
        return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
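
The new sd_init() above derives a level's tuning from the topology flags it advertises instead of using per-level SD_*_INIT templates. The sketch below is a stand-alone user-space model of that flag-to-behaviour mapping, not kernel code; the flag values and struct are invented, while the tuning numbers mirror the branches in the hunk above.

#include <stdio.h>

/* Hypothetical stand-ins for the topology SD flags discussed above. */
#define SHARE_CPUPOWER      0x1   /* SMT siblings          */
#define SHARE_PKG_RESOURCES 0x2   /* shared cache (MC)     */
#define IS_NUMA             0x4   /* NUMA level            */

struct domain_tuning {
        int imbalance_pct;
        int cache_nice_tries;
        int busy_idx;
        int idle_idx;
};

/* Mirror the "convert topological properties into behaviour" branches. */
static struct domain_tuning tune(int flags)
{
        struct domain_tuning t = { .imbalance_pct = 125 };

        if (flags & SHARE_CPUPOWER) {             /* SMT: migrations are cheap */
                t.imbalance_pct = 110;
        } else if (flags & SHARE_PKG_RESOURCES) { /* MC: shared cache          */
                t.imbalance_pct = 117;
                t.cache_nice_tries = 1;
                t.busy_idx = 2;
        } else if (flags & IS_NUMA) {             /* NUMA: migrate reluctantly */
                t.cache_nice_tries = 2;
                t.busy_idx = 3;
                t.idle_idx = 2;
        } else {                                  /* plain package/DIE level   */
                t.cache_nice_tries = 1;
                t.busy_idx = 2;
                t.idle_idx = 1;
        }
        return t;
}

int main(void)
{
        const char *names[] = { "SMT", "MC", "NUMA", "DIE" };
        int flags[] = { SHARE_CPUPOWER, SHARE_PKG_RESOURCES, IS_NUMA, 0 };

        for (int i = 0; i < 4; i++) {
                struct domain_tuning t = tune(flags[i]);
                printf("%-4s imbalance_pct=%d cache_nice_tries=%d busy_idx=%d idle_idx=%d\n",
                       names[i], t.imbalance_pct, t.cache_nice_tries,
                       t.busy_idx, t.idle_idx);
        }
        return 0;
}
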
@@ -6227,7 +6250,10 @@ static void sched_init_numa(void)
                }
        }
 
-       tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+       /* Compute default topology size */
+       for (i = 0; sched_domain_topology[i].mask; i++);
+
+       tl = kzalloc((i + level + 1) *
                        sizeof(struct sched_domain_topology_level), GFP_KERNEL);
        if (!tl)
                return;
@@ -6235,18 +6261,19 @@ static void sched_init_numa(void)
        /*
         * Copy the default topology bits..
         */
-       for (i = 0; default_topology[i].init; i++)
-               tl[i] = default_topology[i];
+       for (i = 0; sched_domain_topology[i].mask; i++)
+               tl[i] = sched_domain_topology[i];
 
        /*
         * .. and append 'j' levels of NUMA goodness.
         */
        for (j = 0; j < level; i++, j++) {
                tl[i] = (struct sched_domain_topology_level){
-                       .init = sd_numa_init,
                        .mask = sd_numa_mask,
+                       .sd_flags = cpu_numa_flags,
                        .flags = SDTL_OVERLAP,
                        .numa_level = j,
+                       SD_INIT_NAME(NUMA)
                };
        }
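
sched_init_numa() now has to size the combined table itself: it walks the current topology (which may have been overridden via set_sched_topology()) until the NULL ->mask terminator, allocates room for those levels plus the NUMA levels plus one terminating entry, copies the defaults and appends. A reduced user-space sketch of that sizing-and-append step, with a hypothetical struct and calloc standing in for kzalloc:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical, heavily reduced topology level: NULL name terminates. */
struct topo_level {
        const char *name;
        int numa_level;
};

static struct topo_level default_topo[] = {
        { "SMT" }, { "MC" }, { "DIE" }, { NULL },
};

int main(void)
{
        int numa_levels = 3;    /* pretend three distinct NUMA distances were found */
        int i, j;

        /* Count the existing levels, mirroring "for (i = 0; ...mask; i++);" */
        for (i = 0; default_topo[i].name; i++)
                ;

        /* i defaults + numa_levels NUMA entries + 1 NULL terminator */
        struct topo_level *tl = calloc(i + numa_levels + 1, sizeof(*tl));
        if (!tl)
                return 1;

        memcpy(tl, default_topo, i * sizeof(*tl));      /* copy the default bits */

        for (j = 0; j < numa_levels; i++, j++) {        /* append NUMA levels */
                tl[i].name = "NUMA";
                tl[i].numa_level = j;
        }

        for (i = 0; tl[i].name; i++)
                printf("level %d: %s (numa_level=%d)\n", i, tl[i].name, tl[i].numa_level);

        free(tl);
        return 0;
}
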
 
@@ -6404,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                const struct cpumask *cpu_map, struct sched_domain_attr *attr,
                struct sched_domain *child, int cpu)
 {
-       struct sched_domain *sd = tl->init(tl, cpu);
+       struct sched_domain *sd = sd_init(tl, cpu);
        if (!sd)
                return child;
 
@@ -6974,6 +7001,7 @@ void __init sched_init(void)
        if (cpu_isolated_map == NULL)
                zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
        idle_thread_set_boot_cpu();
+       set_cpu_rq_start_time();
 #endif
        init_sched_fair_class();
 
index 800e99b99075141421d82f0bdc07e42f09baea9d..f9ca7d19781a5691eb915c8764873ba82b31fcf0 100644 (file)
@@ -520,7 +520,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
         * We need to take care of possible races here. In fact, the
         * task might have changed its scheduling policy to something
         * different from SCHED_DEADLINE or changed its reservation
-        * parameters (through sched_setscheduler()).
+        * parameters (through sched_setattr()).
         */
        if (!dl_task(p) || dl_se->dl_new)
                goto unlock;
@@ -741,7 +741,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 
        WARN_ON(!dl_prio(prio));
        dl_rq->dl_nr_running++;
-       inc_nr_running(rq_of_dl_rq(dl_rq));
+       add_nr_running(rq_of_dl_rq(dl_rq), 1);
 
        inc_dl_deadline(dl_rq, deadline);
        inc_dl_migration(dl_se, dl_rq);
@@ -755,7 +755,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
        WARN_ON(!dl_prio(prio));
        WARN_ON(!dl_rq->dl_nr_running);
        dl_rq->dl_nr_running--;
-       dec_nr_running(rq_of_dl_rq(dl_rq));
+       sub_nr_running(rq_of_dl_rq(dl_rq), 1);
 
        dec_dl_deadline(dl_rq, dl_se->deadline);
        dec_dl_migration(dl_se, dl_rq);
index 0fdb96de81a5b8a92c302961769cdefdb5cad915..c9617b73bcc00d6ac8b275f47f130afd15749dcd 100644 (file)
@@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env,
        env->best_cpu = env->dst_cpu;
 }
 
+static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
+                               long src_load, long dst_load,
+                               struct task_numa_env *env)
+{
+       long imb, old_imb;
+
+       /* We care about the slope of the imbalance, not the direction. */
+       if (dst_load < src_load)
+               swap(dst_load, src_load);
+
+       /* Is the difference below the threshold? */
+       imb = dst_load * 100 - src_load * env->imbalance_pct;
+       if (imb <= 0)
+               return false;
+
+       /*
+        * The imbalance is above the allowed threshold.
+        * Compare it with the old imbalance.
+        */
+       if (orig_dst_load < orig_src_load)
+               swap(orig_dst_load, orig_src_load);
+
+       old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+
+       /* Would this change make things worse? */
+       return (imb > old_imb);
+}
+
 /*
  * This checks if the overall compute and NUMA accesses of the system would
  * be improved if the source task was migrated to the target dst_cpu taking
@@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env,
        struct rq *src_rq = cpu_rq(env->src_cpu);
        struct rq *dst_rq = cpu_rq(env->dst_cpu);
        struct task_struct *cur;
-       long dst_load, src_load;
+       long orig_src_load, src_load;
+       long orig_dst_load, dst_load;
        long load;
        long imp = (groupimp > 0) ? groupimp : taskimp;
 
@@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env,
         * In the overloaded case, try and keep the load balanced.
         */
 balance:
-       dst_load = env->dst_stats.load;
-       src_load = env->src_stats.load;
+       orig_dst_load = env->dst_stats.load;
+       orig_src_load = env->src_stats.load;
 
        /* XXX missing power terms */
        load = task_h_load(env->p);
-       dst_load += load;
-       src_load -= load;
+       dst_load = orig_dst_load + load;
+       src_load = orig_src_load - load;
 
        if (cur) {
                load = task_h_load(cur);
@@ -1195,11 +1224,8 @@ balance:
                src_load += load;
        }
 
-       /* make src_load the smaller */
-       if (dst_load < src_load)
-               swap(dst_load, src_load);
-
-       if (src_load * env->imbalance_pct < dst_load * 100)
+       if (load_too_imbalanced(orig_src_load, orig_dst_load,
+                               src_load, dst_load, env))
                goto unlock;
 
 assign:
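
load_too_imbalanced() only rejects a move when the resulting imbalance both exceeds the imbalance_pct threshold and is larger than the imbalance that already existed, which is what lets a still-imbalanced but improving swap go ahead. Below is a stand-alone sketch of the same comparison with made-up load figures; the imbalance_pct of 125 is an arbitrary example value, not the one used by the NUMA code.

#include <stdbool.h>
#include <stdio.h>

static void swap_long(long *a, long *b) { long t = *a; *a = *b; *b = t; }

/* Same shape as load_too_imbalanced(), with imbalance_pct passed in directly. */
static bool too_imbalanced(long orig_src, long orig_dst,
                           long src, long dst, int imbalance_pct)
{
        long imb, old_imb;

        /* We care about the slope of the imbalance, not the direction. */
        if (dst < src)
                swap_long(&dst, &src);

        imb = dst * 100 - src * imbalance_pct;
        if (imb <= 0)           /* still within the allowed threshold */
                return false;

        if (orig_dst < orig_src)
                swap_long(&orig_dst, &orig_src);
        old_imb = orig_dst * 100 - orig_src * imbalance_pct;

        /* reject only if the new imbalance is worse than the old one */
        return imb > old_imb;
}

int main(void)
{
        /* Moving 300 units of load onto the already-busier side: worse, reject. */
        printf("worsens:  %s\n",
               too_imbalanced(1000, 900, 700, 1200, 125) ? "reject" : "allow");

        /* Still imbalanced afterwards, but less than before: allow. */
        printf("improves: %s\n",
               too_imbalanced(1500, 500, 1200, 800, 125) ? "reject" : "allow");
        return 0;
}
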
@@ -1301,7 +1327,16 @@ static int task_numa_migrate(struct task_struct *p)
        if (env.best_cpu == -1)
                return -EAGAIN;
 
-       sched_setnuma(p, env.dst_nid);
+       /*
+        * If the task is part of a workload that spans multiple NUMA nodes,
+        * and is migrating into one of the workload's active nodes, remember
+        * this node as the task's preferred numa node, so the workload can
+        * settle down.
+        * A task that migrated to a second choice node will be better off
+        * trying for a better one later. Do not set the preferred node here.
+        */
+       if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
+               sched_setnuma(p, env.dst_nid);
 
        /*
         * Reset the scan period if the task is being rescheduled on an
@@ -1326,12 +1361,15 @@ static int task_numa_migrate(struct task_struct *p)
 /* Attempt to migrate a task to a CPU on the preferred node. */
 static void numa_migrate_preferred(struct task_struct *p)
 {
+       unsigned long interval = HZ;
+
        /* This task has no NUMA fault statistics yet */
        if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
                return;
 
        /* Periodically retry migrating the task to the preferred node */
-       p->numa_migrate_retry = jiffies + HZ;
+       interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+       p->numa_migrate_retry = jiffies + interval;
 
        /* Success if task is already running on preferred CPU */
        if (task_node(p) == p->numa_preferred_nid)
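
With the change above, the retry timestamp tracks the task's NUMA scan period (one sixteenth of it, capped at HZ) instead of a flat one-second delay, so quickly-scanning tasks retry sooner. A quick sketch of the arithmetic; HZ=1000 and the 1-ms jiffy are assumptions for illustration only.

#include <stdio.h>

#define HZ 1000                         /* assumed tick rate for this example */

static unsigned long msecs_to_jiffies(unsigned long ms)
{
        return ms * HZ / 1000;          /* 1 jiffy == 1 ms at HZ=1000 */
}

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long scan_periods_ms[] = { 1000, 5000, 60000 };

        for (int i = 0; i < 3; i++) {
                unsigned long interval = HZ;    /* start from the old 1s cap */

                interval = min_ul(interval,
                                  msecs_to_jiffies(scan_periods_ms[i]) / 16);
                printf("scan period %5lu ms -> retry in %4lu jiffies\n",
                       scan_periods_ms[i], interval);
        }
        return 0;
}
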
@@ -1738,6 +1776,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
        struct task_struct *p = current;
        bool migrated = flags & TNF_MIGRATED;
        int cpu_node = task_node(current);
+       int local = !!(flags & TNF_FAULT_LOCAL);
        int priv;
 
        if (!numabalancing_enabled)
@@ -1786,6 +1825,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
                        task_numa_group(p, last_cpupid, flags, &priv);
        }
 
+       /*
+        * If a workload spans multiple NUMA nodes, a shared fault that
+        * occurs wholly within the set of nodes that the workload is
+        * actively using should be counted as local. This allows the
+        * scan rate to slow down when a workload has settled down.
+        */
+       if (!priv && !local && p->numa_group &&
+                       node_isset(cpu_node, p->numa_group->active_nodes) &&
+                       node_isset(mem_node, p->numa_group->active_nodes))
+               local = 1;
+
        task_numa_placement(p);
 
        /*
@@ -1800,7 +1850,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
        p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
        p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
-       p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
+       p->numa_faults_locality[local] += pages;
 }
 
 static void reset_ptenuma_scan(struct task_struct *p)
@@ -3301,7 +3351,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
        }
 
        if (!se)
-               rq->nr_running -= task_delta;
+               sub_nr_running(rq, task_delta);
 
        cfs_rq->throttled = 1;
        cfs_rq->throttled_clock = rq_clock(rq);
@@ -3352,7 +3402,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
        }
 
        if (!se)
-               rq->nr_running += task_delta;
+               add_nr_running(rq, task_delta);
 
        /* determine whether we need to wake up potentially idle cpu */
        if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -3884,7 +3934,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
        if (!se) {
                update_rq_runnable_avg(rq, rq->nr_running);
-               inc_nr_running(rq);
+               add_nr_running(rq, 1);
        }
        hrtick_update(rq);
 }
@@ -3944,7 +3994,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        }
 
        if (!se) {
-               dec_nr_running(rq);
+               sub_nr_running(rq, 1);
                update_rq_runnable_avg(rq, 1);
        }
        hrtick_update(rq);
@@ -4015,7 +4065,7 @@ static void record_wakee(struct task_struct *p)
         * about the loss.
         */
        if (jiffies > current->wakee_flip_decay_ts + HZ) {
-               current->wakee_flips = 0;
+               current->wakee_flips >>= 1;
                current->wakee_flip_decay_ts = jiffies;
        }
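
Halving wakee_flips on each decay period, rather than zeroing it, keeps a fading memory of how often the task has been switching wakeup partners. A toy comparison of the two behaviours (all numbers are invented):

#include <stdio.h>

int main(void)
{
        unsigned int flips_decay = 64;  /* new behaviour: >>= 1 each period */
        unsigned int flips_zero  = 64;  /* old behaviour: reset to 0        */

        for (int second = 1; second <= 5; second++) {
                flips_decay >>= 1;      /* decay: history fades gradually    */
                flips_zero   = 0;       /* zeroing: history vanishes at once */

                /* pretend the task keeps flipping 8 times per second */
                flips_decay += 8;
                flips_zero  += 8;

                printf("t=%ds  decayed=%2u  zeroed=%2u\n",
                       second, flips_decay, flips_zero);
        }
        return 0;
}
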
 
@@ -4449,10 +4499,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                        sd = tmp;
        }
 
-       if (affine_sd) {
-               if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
-                       prev_cpu = cpu;
+       if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+               prev_cpu = cpu;
 
+       if (sd_flag & SD_BALANCE_WAKE) {
                new_cpu = select_idle_sibling(p, prev_cpu);
                goto unlock;
        }
@@ -4520,6 +4570,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
                atomic_long_add(se->avg.load_avg_contrib,
                                                &cfs_rq->removed_load);
        }
+
+       /* We have migrated, no longer consider this task hot */
+       se->exec_start = 0;
 }
 #endif /* CONFIG_SMP */
 
@@ -5070,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now)
 /* Returns true if the destination node has incurred more faults */
 static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 {
+       struct numa_group *numa_group = rcu_dereference(p->numa_group);
        int src_nid, dst_nid;
 
        if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
@@ -5083,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
        if (src_nid == dst_nid)
                return false;
 
-       /* Always encourage migration to the preferred node. */
-       if (dst_nid == p->numa_preferred_nid)
-               return true;
+       if (numa_group) {
+               /* Task is already in the group's interleave set. */
+               if (node_isset(src_nid, numa_group->active_nodes))
+                       return false;
+
+               /* Task is moving into the group's interleave set. */
+               if (node_isset(dst_nid, numa_group->active_nodes))
+                       return true;
 
-       /* If both task and group weight improve, this move is a winner. */
-       if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
-           group_weight(p, dst_nid) > group_weight(p, src_nid))
+               return group_faults(p, dst_nid) > group_faults(p, src_nid);
+       }
+
+       /* Encourage migration to the preferred node. */
+       if (dst_nid == p->numa_preferred_nid)
                return true;
 
-       return false;
+       return task_faults(p, dst_nid) > task_faults(p, src_nid);
 }
 
 
 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 {
+       struct numa_group *numa_group = rcu_dereference(p->numa_group);
        int src_nid, dst_nid;
 
        if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5112,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
        if (src_nid == dst_nid)
                return false;
 
+       if (numa_group) {
+               /* Task is moving within/into the group's interleave set. */
+               if (node_isset(dst_nid, numa_group->active_nodes))
+                       return false;
+
+               /* Task is moving out of the group's interleave set. */
+               if (node_isset(src_nid, numa_group->active_nodes))
+                       return true;
+
+               return group_faults(p, dst_nid) < group_faults(p, src_nid);
+       }
+
        /* Migrating away from the preferred node is always bad. */
        if (src_nid == p->numa_preferred_nid)
                return true;
 
-       /* If either task or group weight get worse, don't do it. */
-       if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
-           group_weight(p, dst_nid) < group_weight(p, src_nid))
-               return true;
-
-       return false;
+       return task_faults(p, dst_nid) < task_faults(p, src_nid);
 }
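
With a numa_group attached, the improves/degrades helpers above first ask whether the source and destination nodes sit inside the group's active set and only then fall back to comparing fault counts. A compact user-space sketch of that decision order, with node sets as plain bitmasks and invented fault numbers:

#include <stdbool.h>
#include <stdio.h>

#define NODE(n)         (1u << (n))

/* true if moving src_nid -> dst_nid looks like a locality improvement */
static bool migrate_improves(unsigned int active_nodes,
                             const int faults[], int src_nid, int dst_nid)
{
        if (src_nid == dst_nid)
                return false;

        /* already inside the group's active set: no need to move */
        if (active_nodes & NODE(src_nid))
                return false;

        /* moving into the active set: encourage it */
        if (active_nodes & NODE(dst_nid))
                return true;

        /* otherwise compare per-node fault counts */
        return faults[dst_nid] > faults[src_nid];
}

int main(void)
{
        unsigned int active = NODE(0) | NODE(1);        /* group lives on nodes 0,1 */
        int faults[4] = { 40, 35, 5, 20 };

        printf("2 -> 1: %s\n", migrate_improves(active, faults, 2, 1) ? "improves" : "no");
        printf("0 -> 2: %s\n", migrate_improves(active, faults, 0, 2) ? "improves" : "no");
        printf("2 -> 3: %s\n", migrate_improves(active, faults, 2, 3) ? "improves" : "no");
        return 0;
}
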
 
 #else
@@ -5564,6 +5633,7 @@ static unsigned long scale_rt_power(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        u64 total, available, age_stamp, avg;
+       s64 delta;
 
        /*
         * Since we're reading these variables without serialization make sure
@@ -5572,7 +5642,11 @@ static unsigned long scale_rt_power(int cpu)
        age_stamp = ACCESS_ONCE(rq->age_stamp);
        avg = ACCESS_ONCE(rq->rt_avg);
 
-       total = sched_avg_period() + (rq_clock(rq) - age_stamp);
+       delta = rq_clock(rq) - age_stamp;
+       if (unlikely(delta < 0))
+               delta = 0;
+
+       total = sched_avg_period() + delta;
 
        if (unlikely(total < avg)) {
                /* Ensures that power won't end up being negative */
@@ -6640,17 +6714,44 @@ out:
        return ld_moved;
 }
 
+static inline unsigned long
+get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
+{
+       unsigned long interval = sd->balance_interval;
+
+       if (cpu_busy)
+               interval *= sd->busy_factor;
+
+       /* scale ms to jiffies */
+       interval = msecs_to_jiffies(interval);
+       interval = clamp(interval, 1UL, max_load_balance_interval);
+
+       return interval;
+}
+
+static inline void
+update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+{
+       unsigned long interval, next;
+
+       interval = get_sd_balance_interval(sd, cpu_busy);
+       next = sd->last_balance + interval;
+
+       if (time_after(*next_balance, next))
+               *next_balance = next;
+}
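+
+The two helpers above centralize what rebalance_domains() used to open-code: scale the per-domain interval by busy_factor when the CPU is busy, convert to jiffies, clamp, and pull the next-balance time forward if this domain is due sooner. Below is a user-space sketch of the interval computation only; HZ, the 32x busy_factor and the clamp ceiling are illustrative values, not read from a live system.
+
+#include <stdio.h>
+
+#define HZ 1000
+
+static unsigned long msecs_to_jiffies(unsigned long ms)
+{
+        return ms * HZ / 1000;
+}
+
+static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
+{
+        return v < lo ? lo : (v > hi ? hi : v);
+}
+
+/* Mirrors get_sd_balance_interval(): busy CPUs rebalance less often. */
+static unsigned long sd_balance_interval(unsigned long balance_interval_ms,
+                                         int busy_factor, int cpu_busy,
+                                         unsigned long max_interval)
+{
+        unsigned long interval = balance_interval_ms;
+
+        if (cpu_busy)
+                interval *= busy_factor;
+
+        interval = msecs_to_jiffies(interval);          /* scale ms to jiffies */
+        return clamp_ul(interval, 1UL, max_interval);
+}
+
+int main(void)
+{
+        unsigned long max_interval = HZ / 10 * 4;       /* example ceiling only */
+
+        printf("idle: %lu jiffies\n", sd_balance_interval(8, 32, 0, max_interval));
+        printf("busy: %lu jiffies\n", sd_balance_interval(8, 32, 1, max_interval));
+        return 0;
+}
+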
+
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
  */
 static int idle_balance(struct rq *this_rq)
 {
+       unsigned long next_balance = jiffies + HZ;
+       int this_cpu = this_rq->cpu;
        struct sched_domain *sd;
        int pulled_task = 0;
-       unsigned long next_balance = jiffies + HZ;
        u64 curr_cost = 0;
-       int this_cpu = this_rq->cpu;
 
        idle_enter_fair(this_rq);
 
@@ -6660,8 +6761,15 @@ static int idle_balance(struct rq *this_rq)
         */
        this_rq->idle_stamp = rq_clock(this_rq);
 
-       if (this_rq->avg_idle < sysctl_sched_migration_cost)
+       if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+               rcu_read_lock();
+               sd = rcu_dereference_check_sched_domain(this_rq->sd);
+               if (sd)
+                       update_next_balance(sd, 0, &next_balance);
+               rcu_read_unlock();
+
                goto out;
+       }
 
        /*
         * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6671,20 +6779,20 @@ static int idle_balance(struct rq *this_rq)
        update_blocked_averages(this_cpu);
        rcu_read_lock();
        for_each_domain(this_cpu, sd) {
-               unsigned long interval;
                int continue_balancing = 1;
                u64 t0, domain_cost;
 
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
-               if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+               if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+                       update_next_balance(sd, 0, &next_balance);
                        break;
+               }
 
                if (sd->flags & SD_BALANCE_NEWIDLE) {
                        t0 = sched_clock_cpu(this_cpu);
 
-                       /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance(this_cpu, this_rq,
                                                   sd, CPU_NEWLY_IDLE,
                                                   &continue_balancing);
@@ -6696,10 +6804,13 @@ static int idle_balance(struct rq *this_rq)
                        curr_cost += domain_cost;
                }
 
-               interval = msecs_to_jiffies(sd->balance_interval);
-               if (time_after(next_balance, sd->last_balance + interval))
-                       next_balance = sd->last_balance + interval;
-               if (pulled_task)
+               update_next_balance(sd, 0, &next_balance);
+
+               /*
+                * Stop searching for tasks to pull if there are
+                * now runnable tasks on this rq.
+                */
+               if (pulled_task || this_rq->nr_running > 0)
                        break;
        }
        rcu_read_unlock();
@@ -6717,20 +6828,13 @@ static int idle_balance(struct rq *this_rq)
        if (this_rq->cfs.h_nr_running && !pulled_task)
                pulled_task = 1;
 
-       if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
-               /*
-                * We are going idle. next_balance may be set based on
-                * a busy processor. So reset next_balance.
-                */
+out:
+       /* Move the next balance forward */
+       if (time_after(this_rq->next_balance, next_balance))
                this_rq->next_balance = next_balance;
-       }
 
-out:
        /* Is there a task of a high priority class? */
-       if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
-           ((this_rq->stop && this_rq->stop->on_rq) ||
-            this_rq->dl.dl_nr_running ||
-            (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
+       if (this_rq->nr_running != this_rq->cfs.h_nr_running)
                pulled_task = -1;
 
        if (pulled_task) {
@@ -7011,16 +7115,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                        break;
                }
 
-               interval = sd->balance_interval;
-               if (idle != CPU_IDLE)
-                       interval *= sd->busy_factor;
-
-               /* scale ms to jiffies */
-               interval = msecs_to_jiffies(interval);
-               interval = clamp(interval, 1UL, max_load_balance_interval);
+               interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
 
                need_serialize = sd->flags & SD_SERIALIZE;
-
                if (need_serialize) {
                        if (!spin_trylock(&balancing))
                                goto out;
@@ -7036,6 +7133,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                                idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
                        }
                        sd->last_balance = jiffies;
+                       interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
                }
                if (need_serialize)
                        spin_unlock(&balancing);
index 8f4390a079c77d18cb9253c5266b3c7af2f24273..25b9423abce9fa54052abb49946ad6561f1dfed7 100644 (file)
@@ -67,24 +67,21 @@ void __weak arch_cpu_idle(void)
  * cpuidle_idle_call - the main idle function
  *
  * NOTE: no locks or semaphores should be used here
- * return non-zero on failure
  */
-static int cpuidle_idle_call(void)
+static void cpuidle_idle_call(void)
 {
        struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
        struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
-       int next_state, entered_state, ret;
+       int next_state, entered_state;
        bool broadcast;
 
        /*
         * Check if the idle task must be rescheduled. If it is the
-        * case, exit the function after re-enabling the local irq and
-        * set again the polling flag
+        * case, exit the function after re-enabling the local irq.
         */
-       if (current_clr_polling_and_test()) {
+       if (need_resched()) {
                local_irq_enable();
-               __current_set_polling();
-               return 0;
+               return;
        }
 
        /*
@@ -101,96 +98,79 @@ static int cpuidle_idle_call(void)
        rcu_idle_enter();
 
        /*
-        * Check if the cpuidle framework is ready, otherwise fallback
-        * to the default arch specific idle method
+        * Ask the cpuidle framework to choose a convenient idle state.
+        * Fall back to the default arch idle method on errors.
         */
-       ret = cpuidle_enabled(drv, dev);
-
-       if (!ret) {
+       next_state = cpuidle_select(drv, dev);
+       if (next_state < 0) {
+use_default:
                /*
-                * Ask the governor to choose an idle state it thinks
-                * it is convenient to go to. There is *always* a
-                * convenient idle state
+                * We can't use the cpuidle framework, let's use the default
+                * idle routine.
                 */
-               next_state = cpuidle_select(drv, dev);
-
-               /*
-                * The idle task must be scheduled, it is pointless to
-                * go to idle, just update no idle residency and get
-                * out of this function
-                */
-               if (current_clr_polling_and_test()) {
-                       dev->last_residency = 0;
-                       entered_state = next_state;
+               if (current_clr_polling_and_test())
                        local_irq_enable();
-               } else {
-                       broadcast = !!(drv->states[next_state].flags &
-                                      CPUIDLE_FLAG_TIMER_STOP);
-
-                       if (broadcast)
-                               /*
-                                * Tell the time framework to switch
-                                * to a broadcast timer because our
-                                * local timer will be shutdown. If a
-                                * local timer is used from another
-                                * cpu as a broadcast timer, this call
-                                * may fail if it is not available
-                                */
-                               ret = clockevents_notify(
-                                       CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
-                                       &dev->cpu);
-
-                       if (!ret) {
-                               trace_cpu_idle_rcuidle(next_state, dev->cpu);
-
-                               /*
-                                * Enter the idle state previously
-                                * returned by the governor
-                                * decision. This function will block
-                                * until an interrupt occurs and will
-                                * take care of re-enabling the local
-                                * interrupts
-                                */
-                               entered_state = cpuidle_enter(drv, dev,
-                                                             next_state);
-
-                               trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
-                                                      dev->cpu);
-
-                               if (broadcast)
-                                       clockevents_notify(
-                                               CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
-                                               &dev->cpu);
-
-                               /*
-                                * Give the governor an opportunity to reflect on the
-                                * outcome
-                                */
-                               cpuidle_reflect(dev, entered_state);
-                       }
-               }
+               else
+                       arch_cpu_idle();
+
+               goto exit_idle;
        }
 
+
        /*
-        * We can't use the cpuidle framework, let's use the default
-        * idle routine
+        * The idle task must be rescheduled, so it is pointless to
+        * go to idle; just record a zero idle residency and get
+        * out of this function
         */
-       if (ret)
-               arch_cpu_idle();
+       if (current_clr_polling_and_test()) {
+               dev->last_residency = 0;
+               entered_state = next_state;
+               local_irq_enable();
+               goto exit_idle;
+       }
+
+       broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
 
+       /*
+        * Tell the time framework to switch to a broadcast timer
+        * because our local timer will be shutdown. If a local timer
+        * because our local timer will be shut down. If a local timer
+        * fail if it is not available
+        */
+       if (broadcast &&
+           clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
+               goto use_default;
+
+       trace_cpu_idle_rcuidle(next_state, dev->cpu);
+
+       /*
+        * Enter the idle state previously returned by the governor decision.
+        * This function will block until an interrupt occurs and will take
+        * care of re-enabling the local interrupts
+        */
+       entered_state = cpuidle_enter(drv, dev, next_state);
+
+       trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
+
+       if (broadcast)
+               clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+
+       /*
+        * Give the governor an opportunity to reflect on the outcome
+        */
+       cpuidle_reflect(dev, entered_state);
+
+exit_idle:
        __current_set_polling();
 
        /*
-        * It is up to the idle functions to enable back the local
-        * interrupt
+        * It is up to the idle functions to reenable local interrupts
         */
        if (WARN_ON_ONCE(irqs_disabled()))
                local_irq_enable();
 
        rcu_idle_exit();
        start_critical_timings();
-
-       return 0;
 }
 
 /*
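
After the reflow, cpuidle_idle_call() reads as one straight path with two exits: use_default when the governor or the broadcast notification fails, and exit_idle once a state has been entered. The stub program below imitates only that control flow; every cpuidle_* call is faked, so this is a flow sketch under assumptions rather than the kernel function.

#include <stdbool.h>
#include <stdio.h>

/* Fake stand-ins so the flow can run as an ordinary program. */
static int  cpuidle_select(void)        { return 2; }   /* pretend the governor picked state 2 */
static bool need_resched_now(void)      { return false; }
static bool broadcast_enter_fails(void) { return false; }
static void arch_cpu_idle(void)         { puts("default arch idle"); }
static void cpuidle_enter(int state)    { printf("enter state %d\n", state); }
static void cpuidle_reflect(int state)  { printf("reflect on state %d\n", state); }

static void idle_call_flow(void)
{
        int next_state, entered_state;

        if (need_resched_now())
                return;                         /* nothing to do, bail out early */

        next_state = cpuidle_select();
        if (next_state < 0) {
use_default:
                arch_cpu_idle();                /* cpuidle unusable: default idle */
                goto exit_idle;
        }

        if (broadcast_enter_fails())            /* local timer would be shut down */
                goto use_default;

        cpuidle_enter(next_state);              /* blocks until an interrupt */
        entered_state = next_state;
        cpuidle_reflect(entered_state);         /* let the governor learn */

exit_idle:
        puts("back from idle");
}

int main(void)
{
        idle_call_flow();
        return 0;
}
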
index bd2267ad404fa78de092b8bc8f228cfea2dca87b..0ebfd7a29472bdfd55b74de00cec66014372d863 100644 (file)
@@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
        rt_rq->overloaded = 0;
        plist_head_init(&rt_rq->pushable_tasks);
 #endif
+       /* We start in dequeued state, because no RT tasks are queued */
+       rt_rq->rt_queued = 0;
 
        rt_rq->rt_time = 0;
        rt_rq->rt_throttled = 0;
@@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
        return rt_se->rt_rq;
 }
 
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *rt_rq = rt_se->rt_rq;
+
+       return rt_rq->rq;
+}
+
 void free_rt_sched_group(struct task_group *tg)
 {
        int i;
@@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
        return container_of(rt_rq, struct rq, rt);
 }
 
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 {
        struct task_struct *p = rt_task_of(rt_se);
-       struct rq *rq = task_rq(p);
+
+       return task_rq(p);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+       struct rq *rq = rq_of_rt_se(rt_se);
 
        return &rq->rt;
 }
@@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)
 }
 #endif /* CONFIG_SMP */
 
+static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+
 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 {
        return !list_empty(&rt_se->run_list);
@@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
        rt_se = rt_rq->tg->rt_se[cpu];
 
        if (rt_rq->rt_nr_running) {
-               if (rt_se && !on_rt_rq(rt_se))
+               if (!rt_se)
+                       enqueue_top_rt_rq(rt_rq);
+               else if (!on_rt_rq(rt_se))
                        enqueue_rt_entity(rt_se, false);
+
                if (rt_rq->highest_prio.curr < curr->prio)
                        resched_task(curr);
        }
@@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 
        rt_se = rt_rq->tg->rt_se[cpu];
 
-       if (rt_se && on_rt_rq(rt_se))
+       if (!rt_se)
+               dequeue_top_rt_rq(rt_rq);
+       else if (on_rt_rq(rt_se))
                dequeue_rt_entity(rt_se);
 }
 
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+       return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
 static int rt_se_boosted(struct sched_rt_entity *rt_se)
 {
        struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 
 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
-       if (rt_rq->rt_nr_running)
-               resched_task(rq_of_rt_rq(rt_rq)->curr);
+       struct rq *rq = rq_of_rt_rq(rt_rq);
+
+       if (!rt_rq->rt_nr_running)
+               return;
+
+       enqueue_top_rt_rq(rt_rq);
+       resched_task(rq->curr);
 }
 
 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
+       dequeue_top_rt_rq(rt_rq);
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+       return rt_rq->rt_throttled;
 }
 
 static inline const struct cpumask *sched_rt_period_mask(void)
@@ -922,6 +961,38 @@ static void update_curr_rt(struct rq *rq)
        }
 }
 
+static void
+dequeue_top_rt_rq(struct rt_rq *rt_rq)
+{
+       struct rq *rq = rq_of_rt_rq(rt_rq);
+
+       BUG_ON(&rq->rt != rt_rq);
+
+       if (!rt_rq->rt_queued)
+               return;
+
+       BUG_ON(!rq->nr_running);
+
+       sub_nr_running(rq, rt_rq->rt_nr_running);
+       rt_rq->rt_queued = 0;
+}
+
+static void
+enqueue_top_rt_rq(struct rt_rq *rt_rq)
+{
+       struct rq *rq = rq_of_rt_rq(rt_rq);
+
+       BUG_ON(&rq->rt != rt_rq);
+
+       if (rt_rq->rt_queued)
+               return;
+       if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+               return;
+
+       add_nr_running(rq, rt_rq->rt_nr_running);
+       rt_rq->rt_queued = 1;
+}
+
 #if defined CONFIG_SMP
 
 static void
@@ -1044,13 +1115,24 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
 
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+static inline
+unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *group_rq = group_rt_rq(rt_se);
+
+       if (group_rq)
+               return group_rq->rt_nr_running;
+       else
+               return 1;
+}
+
 static inline
 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
        int prio = rt_se_prio(rt_se);
 
        WARN_ON(!rt_prio(prio));
-       rt_rq->rt_nr_running++;
+       rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
 
        inc_rt_prio(rt_rq, prio);
        inc_rt_migration(rt_se, rt_rq);
@@ -1062,7 +1144,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
        WARN_ON(!rt_rq->rt_nr_running);
-       rt_rq->rt_nr_running--;
+       rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
 
        dec_rt_prio(rt_rq, rt_se_prio(rt_se));
        dec_rt_migration(rt_se, rt_rq);
@@ -1119,6 +1201,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
                back = rt_se;
        }
 
+       dequeue_top_rt_rq(rt_rq_of_se(back));
+
        for (rt_se = back; rt_se; rt_se = rt_se->back) {
                if (on_rt_rq(rt_se))
                        __dequeue_rt_entity(rt_se);
@@ -1127,13 +1211,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 {
+       struct rq *rq = rq_of_rt_se(rt_se);
+
        dequeue_rt_stack(rt_se);
        for_each_sched_rt_entity(rt_se)
                __enqueue_rt_entity(rt_se, head);
+       enqueue_top_rt_rq(&rq->rt);
 }
 
 static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 {
+       struct rq *rq = rq_of_rt_se(rt_se);
+
        dequeue_rt_stack(rt_se);
 
        for_each_sched_rt_entity(rt_se) {
@@ -1142,6 +1231,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
                if (rt_rq && rt_rq->rt_nr_running)
                        __enqueue_rt_entity(rt_se, false);
        }
+       enqueue_top_rt_rq(&rq->rt);
 }
 
 /*
@@ -1159,8 +1249,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 
        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
-
-       inc_nr_running(rq);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1171,8 +1259,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
        dequeue_rt_entity(rt_se);
 
        dequeue_pushable_task(rq, p);
-
-       dec_nr_running(rq);
 }
 
 /*
@@ -1377,10 +1463,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
        if (prev->sched_class == &rt_sched_class)
                update_curr_rt(rq);
 
-       if (!rt_rq->rt_nr_running)
-               return NULL;
-
-       if (rt_rq_throttled(rt_rq))
+       if (!rt_rq->rt_queued)
                return NULL;
 
        put_prev_task(rq, prev);
@@ -1892,9 +1975,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
         */
        if (p->on_rq && rq->curr != p) {
 #ifdef CONFIG_SMP
-               if (rq->rt.overloaded && push_rt_task(rq) &&
+               if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
                    /* Don't resched if we changed runqueues */
-                   rq != task_rq(p))
+                   push_rt_task(rq) && rq != task_rq(p))
                        check_resched = 0;
 #endif /* CONFIG_SMP */
                if (check_resched && p->prio < rq->curr->prio)
index 456e492a3dca37c13d7cb7b57a51965bfa18d6b3..600e2291a75c4092dd6ff400f869c91edcc7903f 100644 (file)
@@ -409,6 +409,8 @@ struct rt_rq {
        int overloaded;
        struct plist_head pushable_tasks;
 #endif
+       int rt_queued;
+
        int rt_throttled;
        u64 rt_time;
        u64 rt_runtime;
@@ -423,18 +425,6 @@ struct rt_rq {
 #endif
 };
 
-#ifdef CONFIG_RT_GROUP_SCHED
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
-       return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
-}
-#else
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
-       return rt_rq->rt_throttled;
-}
-#endif
-
 /* Deadline class' related fields in a runqueue */
 struct dl_rq {
        /* runqueue is an rbtree, ordered by deadline */
@@ -1216,12 +1206,14 @@ extern void update_idle_cpu_load(struct rq *this_rq);
 
 extern void init_task_runnable_average(struct task_struct *p);
 
-static inline void inc_nr_running(struct rq *rq)
+static inline void add_nr_running(struct rq *rq, unsigned count)
 {
-       rq->nr_running++;
+       unsigned prev_nr = rq->nr_running;
+
+       rq->nr_running = prev_nr + count;
 
 #ifdef CONFIG_NO_HZ_FULL
-       if (rq->nr_running == 2) {
+       if (prev_nr < 2 && rq->nr_running >= 2) {
                if (tick_nohz_full_cpu(rq->cpu)) {
                        /* Order rq->nr_running write against the IPI */
                        smp_wmb();
@@ -1231,9 +1223,9 @@ static inline void inc_nr_running(struct rq *rq)
 #endif
 }
 
-static inline void dec_nr_running(struct rq *rq)
+static inline void sub_nr_running(struct rq *rq, unsigned count)
 {
-       rq->nr_running--;
+       rq->nr_running -= count;
 }
 
 static inline void rq_last_tick_reset(struct rq *rq)
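
The add_nr_running()/sub_nr_running() wrappers above exist so that callers which add several tasks at once (throttle/unthrottle, RT group enqueue) still trigger the nohz-full kick exactly when the runqueue crosses from fewer than two to two or more runnable tasks. A reduced user-space model of that edge detection; the IPI is replaced by a printf and the struct is trimmed down to what the example needs.

#include <stdio.h>

struct rq {
        unsigned int nr_running;
        int cpu;
};

static void kick_full_tick(struct rq *rq)
{
        /* stands in for re-enabling the tick on a nohz-full CPU */
        printf("cpu %d: re-enable the tick (now %u runnable)\n",
               rq->cpu, rq->nr_running);
}

static void add_nr_running(struct rq *rq, unsigned int count)
{
        unsigned int prev_nr = rq->nr_running;

        rq->nr_running = prev_nr + count;

        /* fire only on the transition to >=2, even when count > 1 */
        if (prev_nr < 2 && rq->nr_running >= 2)
                kick_full_tick(rq);
}

static void sub_nr_running(struct rq *rq, unsigned int count)
{
        rq->nr_running -= count;
}

int main(void)
{
        struct rq rq = { .nr_running = 0, .cpu = 3 };

        add_nr_running(&rq, 1); /* 0 -> 1: no kick            */
        add_nr_running(&rq, 3); /* 1 -> 4: crosses 2, kick    */
        sub_nr_running(&rq, 2); /* 4 -> 2                     */
        add_nr_running(&rq, 1); /* 2 -> 3: already >=2, quiet */
        return 0;
}
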
index d6ce65dde5412d4b4b9d8473caf92318ba7fcb24..bfe0edadbfbbe70b55d4cb022a42c29c1d33dda1 100644 (file)
@@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
 static void
 enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
-       inc_nr_running(rq);
+       add_nr_running(rq, 1);
 }
 
 static void
 dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
-       dec_nr_running(rq);
+       sub_nr_running(rq, 1);
 }
 
 static void yield_task_stop(struct rq *rq)
index fba0f29401eafba43b29602b0296ee16022b4633..66a751ebf9d9c77be801d263d0844d40a8b52ad0 100644 (file)
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
                        else
                                p = current;
                        if (p) {
-                               niceval = 20 - task_nice(p);
+                               niceval = nice_to_rlimit(task_nice(p));
                                if (niceval > retval)
                                        retval = niceval;
                        }
@@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
                        else
                                pgrp = task_pgrp(current);
                        do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
-                               niceval = 20 - task_nice(p);
+                               niceval = nice_to_rlimit(task_nice(p));
                                if (niceval > retval)
                                        retval = niceval;
                        } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
 
                        do_each_thread(g, p) {
                                if (uid_eq(task_uid(p), uid)) {
-                                       niceval = 20 - task_nice(p);
+                                       niceval = nice_to_rlimit(task_nice(p));
                                        if (niceval > retval)
                                                retval = niceval;
                                }
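
nice_to_rlimit() replaces the open-coded '20 - task_nice(p)', mapping nice values in [MIN_NICE..MAX_NICE] = [-20..19] onto the rlimit-style range [1..40]. A small sketch of the helper pair and the round trip; the constants below are local defines matching those values, not an include of the kernel header.

#include <stdio.h>

#define MAX_NICE        19
#define MIN_NICE        -20

/* nice  -20..19  ->  rlimit-style 40..1 */
static long nice_to_rlimit(long nice)
{
        return MAX_NICE - nice + 1;
}

/* rlimit-style 40..1  ->  nice -20..19 */
static long rlimit_to_nice(long prio)
{
        return MAX_NICE - prio + 1;
}

int main(void)
{
        for (long nice = MIN_NICE; nice <= MAX_NICE; nice += 13) {
                long prio = nice_to_rlimit(nice);
                printf("nice %3ld -> rlimit %2ld -> nice %3ld\n",
                       nice, prio, rlimit_to_nice(prio));
        }
        return 0;
}
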
index 8edc87185427cb17fa02ed93498fcf6f8301cb7e..a4bab46cd38e1ee177fe30bcf51d9d5861cac3ed 100644 (file)
@@ -100,10 +100,10 @@ enum {
 
        /*
         * Rescue workers are used only on emergencies and shared by
-        * all cpus.  Give -20.
+        * all cpus.  Give MIN_NICE.
         */
-       RESCUER_NICE_LEVEL      = -20,
-       HIGHPRI_NICE_LEVEL      = -20,
+       RESCUER_NICE_LEVEL      = MIN_NICE,
+       HIGHPRI_NICE_LEVEL      = MIN_NICE,
 
        WQ_NAME_LEN             = 24,
 };
index b4b1feba64724234dee1b66a482c79a0cd3c0f95..d199d2d919467eeddbc82127f0ffbf780e947138 100644 (file)
@@ -2740,7 +2740,7 @@ static int khugepaged(void *none)
        struct mm_slot *mm_slot;
 
        set_freezable();
-       set_user_nice(current, 19);
+       set_user_nice(current, MAX_NICE);
 
        while (!kthread_should_stop()) {
                khugepaged_do_scan();
index 037b812a953141f3dc77b1f7402b29bb54cd9e44..e302ae1dcce05bb50be6c40fa836990bdd9ac522 100644 (file)
@@ -3920,9 +3920,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                }
        }
 
-       /* THP should already have been handled */
-       BUG_ON(pmd_numa(*pmd));
-
        /*
         * Use __pte_alloc instead of pte_alloc_map, because we can't
         * run pte_offset_map on the pmd, if an huge pmd could