From 29f59db3a74b0bdf78a1f5b53ef773caa82692dc Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri
Date: Mon, 15 Oct 2007 17:00:07 +0200
Subject: [PATCH] sched: group-scheduler core

Add interface to control cpu bandwidth allocation to task-groups.

(not yet configurable, due to missing CONFIG_CONTAINERS)

Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Dhaval Giani
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
---
 init/Kconfig            |   9 ++
 kernel/sched.c          | 346 ++++++++++++++++++++++++++++++++++++++--
 kernel/sched_fair.c     |   3 +-
 kernel/sched_idletask.c |   5 +
 kernel/sched_rt.c       |   5 +
 5 files changed, 350 insertions(+), 18 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index d54d0cadcc06..11c6762a6529 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -281,6 +281,15 @@ config CPUSETS

           Say N if unsure.

+config FAIR_GROUP_SCHED
+        bool "Fair group scheduler"
+        depends on EXPERIMENTAL && CONTAINERS
+        help
+          This option enables you to group tasks and control CPU resource
+          allocation to such groups.
+
+          Say N if unsure.
+
 config SYSFS_DEPRECATED
         bool "Create deprecated sysfs files"
         default y
diff --git a/kernel/sched.c b/kernel/sched.c
index 4ad789d268fe..b2688ce54b11 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -171,6 +171,58 @@ struct rt_prio_array {
         struct list_head queue[MAX_RT_PRIO];
 };

+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+#include <linux/container.h>
+
+struct cfs_rq;
+
+/* task group related information */
+struct task_grp {
+        struct container_subsys_state css;
+        /* schedulable entities of this group on each cpu */
+        struct sched_entity **se;
+        /* runqueue "owned" by this group on each cpu */
+        struct cfs_rq **cfs_rq;
+        unsigned long shares;
+};
+
+/* Default task group's sched entity on each cpu */
+static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
+/* Default task group's cfs_rq on each cpu */
+static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+
+static struct sched_entity *init_sched_entity_p[CONFIG_NR_CPUS];
+static struct cfs_rq *init_cfs_rq_p[CONFIG_NR_CPUS];
+
+/* Default task group.
+ *        Every task in system belong to this group at bootup.
+ */
+static struct task_grp init_task_grp = {
+                        .se     = init_sched_entity_p,
+                        .cfs_rq = init_cfs_rq_p,
+                        };
+
+/* return group to which a task belongs */
+static inline struct task_grp *task_grp(struct task_struct *p)
+{
+        return container_of(task_subsys_state(p, cpu_subsys_id),
+                                struct task_grp, css);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+        p->se.cfs_rq = task_grp(p)->cfs_rq[task_cpu(p)];
+        p->se.parent = task_grp(p)->se[task_cpu(p)];
+}
+
+#else
+
+static inline void set_task_cfs_rq(struct task_struct *p) { }
+
+#endif        /* CONFIG_FAIR_GROUP_SCHED */
+
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
         struct load_weight load;
@@ -197,6 +249,7 @@ struct cfs_rq {
          * list is used during load balance.
          */
         struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+        struct task_grp *tg;        /* group that "owns" this runqueue */
 #endif
 };

@@ -419,18 +472,6 @@ unsigned long long cpu_clock(int cpu)
         return now;
 }

-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* Change a task's ->cfs_rq if it moves across CPUs */
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
-        p->se.cfs_rq = &task_rq(p)->cfs;
-}
-#else
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
-}
-#endif
-
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)        do { } while (0)
 #endif
@@ -970,8 +1011,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 #ifdef CONFIG_SMP
         task_thread_info(p)->cpu = cpu;
-        set_task_cfs_rq(p);
 #endif
+        set_task_cfs_rq(p);
 }

 #ifdef CONFIG_SMP
@@ -3885,8 +3926,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)

         oldprio = p->prio;
         on_rq = p->se.on_rq;
-        if (on_rq)
+        if (on_rq) {
                 dequeue_task(rq, p, 0);
+                if (task_running(rq, p))
+                        p->sched_class->put_prev_task(rq, p);
+        }

         if (rt_prio(prio))
                 p->sched_class = &rt_sched_class;
@@ -3905,6 +3949,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                 if (task_running(rq, p)) {
                         if (p->prio > oldprio)
                                 resched_task(rq->curr);
+                        p->sched_class->set_curr_task(rq);
                 } else {
                         check_preempt_curr(rq, p);
                 }
@@ -4190,8 +4235,11 @@ recheck:
         }
         update_rq_clock(rq);
         on_rq = p->se.on_rq;
-        if (on_rq)
+        if (on_rq) {
                 deactivate_task(rq, p, 0);
+                if (task_running(rq, p))
+                        p->sched_class->put_prev_task(rq, p);
+        }
         oldprio = p->prio;
         __setscheduler(rq, p, policy, param->sched_priority);
         if (on_rq) {
@@ -4204,6 +4252,7 @@ recheck:
                 if (task_running(rq, p)) {
                         if (p->prio > oldprio)
                                 resched_task(rq->curr);
+                        p->sched_class->set_curr_task(rq);
                 } else {
                         check_preempt_curr(rq, p);
                 }
@@ -6444,7 +6493,25 @@ void __init sched_init(void)
                 init_cfs_rq(&rq->cfs, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-                list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+                {
+                        struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
+                        struct sched_entity *se =
+                                        &per_cpu(init_sched_entity, i);
+
+                        init_cfs_rq_p[i] = cfs_rq;
+                        init_cfs_rq(cfs_rq, rq);
+                        cfs_rq->tg = &init_task_grp;
+                        list_add(&cfs_rq->leaf_cfs_rq_list,
+                                        &rq->leaf_cfs_rq_list);
+
+                        init_sched_entity_p[i] = se;
+                        se->cfs_rq = &rq->cfs;
+                        se->my_q = cfs_rq;
+                        se->load.weight = NICE_0_LOAD;
+                        se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+                        se->parent = NULL;
+                }
+                init_task_grp.shares = NICE_0_LOAD;
 #endif

                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -6632,3 +6699,250 @@ void set_curr_task(int cpu, struct task_struct *p)
 }

 #endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* return corresponding task_grp object of a container */
+static inline struct task_grp *container_tg(struct container *cont)
+{
+        return container_of(container_subsys_state(cont, cpu_subsys_id),
+                                struct task_grp, css);
+}
+
+/* allocate runqueue etc for a new task group */
+static struct container_subsys_state *
+sched_create_group(struct container_subsys *ss, struct container *cont)
+{
+        struct task_grp *tg;
+        struct cfs_rq *cfs_rq;
+        struct sched_entity *se;
+        int i;
+
+        if (!cont->parent) {
+                /* This is early initialization for the top container */
+                init_task_grp.css.container = cont;
+                return &init_task_grp.css;
+        }
+
+        /* we support only 1-level deep hierarchical scheduler atm */
+        if (cont->parent->parent)
+                return ERR_PTR(-EINVAL);
+
+        tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+        if (!tg)
+                return ERR_PTR(-ENOMEM);
+
+        tg->cfs_rq = kzalloc(sizeof(cfs_rq) * num_possible_cpus(), GFP_KERNEL);
+        if (!tg->cfs_rq)
+                goto err;
+        tg->se = kzalloc(sizeof(se) * num_possible_cpus(), GFP_KERNEL);
+        if (!tg->se)
+                goto err;
+
+        for_each_possible_cpu(i) {
+                struct rq *rq = cpu_rq(i);
+
+                cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
+                                                cpu_to_node(i));
+                if (!cfs_rq)
+                        goto err;
+
+                se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
+                                                cpu_to_node(i));
+                if (!se)
+                        goto err;
+
+                memset(cfs_rq, 0, sizeof(struct cfs_rq));
+                memset(se, 0, sizeof(struct sched_entity));
+
+                tg->cfs_rq[i] = cfs_rq;
+                init_cfs_rq(cfs_rq, rq);
+                cfs_rq->tg = tg;
+                list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+
+                tg->se[i] = se;
+                se->cfs_rq = &rq->cfs;
+                se->my_q = cfs_rq;
+                se->load.weight = NICE_0_LOAD;
+                se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+                se->parent = NULL;
+        }
+
+        tg->shares = NICE_0_LOAD;
+
+        /* Bind the container to task_grp object we just created */
+        tg->css.container = cont;
+
+        return &tg->css;
+
+err:
+        for_each_possible_cpu(i) {
+                if (tg->cfs_rq && tg->cfs_rq[i])
+                        kfree(tg->cfs_rq[i]);
+                if (tg->se && tg->se[i])
+                        kfree(tg->se[i]);
+        }
+        if (tg->cfs_rq)
+                kfree(tg->cfs_rq);
+        if (tg->se)
+                kfree(tg->se);
+        if (tg)
+                kfree(tg);
+
+        return ERR_PTR(-ENOMEM);
+}
+
+
+/* destroy runqueue etc associated with a task group */
+static void sched_destroy_group(struct container_subsys *ss,
+                                        struct container *cont)
+{
+        struct task_grp *tg = container_tg(cont);
+        struct cfs_rq *cfs_rq;
+        struct sched_entity *se;
+        int i;
+
+        for_each_possible_cpu(i) {
+                cfs_rq = tg->cfs_rq[i];
+                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+        }
+
+        /* wait for possible concurrent references to cfs_rqs complete */
+        synchronize_sched();
+
+        /* now it should be safe to free those cfs_rqs */
+        for_each_possible_cpu(i) {
+                cfs_rq = tg->cfs_rq[i];
+                kfree(cfs_rq);
+
+                se = tg->se[i];
+                kfree(se);
+        }
+
+        kfree(tg->cfs_rq);
+        kfree(tg->se);
+        kfree(tg);
+}
+
+static int sched_can_attach(struct container_subsys *ss,
+                        struct container *cont, struct task_struct *tsk)
+{
+        /* We don't support RT-tasks being in separate groups */
+        if (tsk->sched_class != &fair_sched_class)
+                return -EINVAL;
+
+        return 0;
+}
+
+/* change task's runqueue when it moves between groups */
+static void sched_move_task(struct container_subsys *ss, struct container *cont,
+                        struct container *old_cont, struct task_struct *tsk)
+{
+        int on_rq, running;
+        unsigned long flags;
+        struct rq *rq;
+
+        rq = task_rq_lock(tsk, &flags);
+
+        if (tsk->sched_class != &fair_sched_class)
+                goto done;
+
+        update_rq_clock(rq);
+
+        running = task_running(rq, tsk);
+        on_rq = tsk->se.on_rq;
+
+        if (on_rq) {
+                dequeue_task(rq, tsk, 0);
+                if (unlikely(running))
+                        tsk->sched_class->put_prev_task(rq, tsk);
+        }
+
+        set_task_cfs_rq(tsk);
+
+        if (on_rq) {
+                enqueue_task(rq, tsk, 0);
+                if (unlikely(running))
+                        tsk->sched_class->set_curr_task(rq);
+        }
+
+done:
+        task_rq_unlock(rq, &flags);
+}
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+        struct cfs_rq *cfs_rq = se->cfs_rq;
+        struct rq *rq = cfs_rq->rq;
+        int on_rq;
+
+        spin_lock_irq(&rq->lock);
+
+        on_rq = se->on_rq;
+        if (on_rq)
+                dequeue_entity(cfs_rq, se, 0);
+
+        se->load.weight = shares;
+        se->load.inv_weight = div64_64((1ULL<<32), shares);
+
+        if (on_rq)
+                enqueue_entity(cfs_rq, se, 0);
+
+        spin_unlock_irq(&rq->lock);
+}
+
+static ssize_t cpu_shares_write(struct container *cont, struct cftype *cftype,
+                                struct file *file, const char __user *userbuf,
+                                size_t nbytes, loff_t *ppos)
+{
+        int i;
+        unsigned long shareval;
+        struct task_grp *tg = container_tg(cont);
+        char buffer[2*sizeof(unsigned long) + 1];
+
+        if (nbytes > 2*sizeof(unsigned long))        /* safety check */
+                return -E2BIG;
+
+        if (copy_from_user(buffer, userbuf, nbytes))
+                return -EFAULT;
+
+        buffer[nbytes] = 0;        /* nul-terminate */
+        shareval = simple_strtoul(buffer, NULL, 10);
+
+        tg->shares = shareval;
+        for_each_possible_cpu(i)
+                set_se_shares(tg->se[i], shareval);
+
+        return nbytes;
+}
+
+static u64 cpu_shares_read_uint(struct container *cont, struct cftype *cft)
+{
+        struct task_grp *tg = container_tg(cont);
+
+        return (u64) tg->shares;
+}
+
+struct cftype cpuctl_share = {
+        .name = "shares",
+        .read_uint = cpu_shares_read_uint,
+        .write = cpu_shares_write,
+};
+
+static int sched_populate(struct container_subsys *ss, struct container *cont)
+{
+        return container_add_file(cont, ss, &cpuctl_share);
+}
+
+struct container_subsys cpu_subsys = {
+        .name           = "cpu",
+        .create         = sched_create_group,
+        .destroy        = sched_destroy_group,
+        .can_attach     = sched_can_attach,
+        .attach         = sched_move_task,
+        .populate       = sched_populate,
+        .subsys_id      = cpu_subsys_id,
+        .early_init     = 1,
+};
+
+#endif        /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ec445cadbb01..12ab9338d563 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -610,8 +610,7 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  */
 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 {
-        /* A later patch will take group into account */
-        return &cpu_rq(this_cpu)->cfs;
+        return cfs_rq->tg->cfs_rq[this_cpu];
 }

 /* Iterate thr' all leaf cfs_rq's on a runqueue */
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3503fb2d9f96..5ebf829cdd73 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -50,6 +50,10 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr)
 {
 }

+static void set_curr_task_idle(struct rq *rq)
+{
+}
+
 /*
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
@@ -66,6 +70,7 @@ static struct sched_class idle_sched_class __read_mostly = {

         .load_balance           = load_balance_idle,

+        .set_curr_task          = set_curr_task_idle,
         .task_tick              = task_tick_idle,
         /* no .task_new for idle tasks */
 };
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 4b87476a02d0..45b339f56aea 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -218,6 +218,10 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
         }
 }

+static void set_curr_task_rt(struct rq *rq)
+{
+}
+
 static struct sched_class rt_sched_class __read_mostly = {
         .enqueue_task           = enqueue_task_rt,
         .dequeue_task           = dequeue_task_rt,
@@ -230,5 +234,6 @@ static struct sched_class rt_sched_class __read_mostly = {

         .load_balance           = load_balance_rt,

+        .set_curr_task          = set_curr_task_rt,
         .task_tick              = task_tick_rt,
 };
-- 
2.34.1
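
Illustration (not part of the patch): each group carries its CPU weight in its per-cpu sched_entity as a (weight, inverse-weight) pair, with se->load.weight = shares and se->load.inv_weight = div64_64(1ULL<<32, shares), and set_se_shares() refreshes that pair whenever the group's "shares" control file is written. The stand-alone user-space sketch below shows what the 32.32 fixed-point reciprocal buys: a runtime delta can be scaled by weight/shares with a multiply and a shift instead of a division. It assumes NICE_0_LOAD is 1024 (its value in kernels of that era), uses plain 64-bit division in place of the kernel's div64_64(), and scale_by_shares() is a hypothetical helper that only mirrors the spirit of the scheduler's internal weight scaling.

/*
 * Stand-alone sketch, not code from the patch. Assumes NICE_0_LOAD == 1024
 * and uses ordinary 64-bit division where the kernel calls div64_64().
 */
#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT     32
#define NICE_0_LOAD     1024ULL

/* hypothetical helper: delta * weight / shares, using the precomputed reciprocal */
static uint64_t scale_by_shares(uint64_t delta, uint64_t weight,
                                uint64_t inv_weight)
{
        /* the product fits in 64 bits for the small demo values used below */
        return (delta * weight * inv_weight) >> WMULT_SHIFT;
}

int main(void)
{
        uint64_t shares = 2 * NICE_0_LOAD;      /* a group given twice the default shares */
        uint64_t inv_weight = (1ULL << WMULT_SHIFT) / shares;   /* ~ div64_64(1ULL<<32, shares) */
        uint64_t delta_exec = 1000000;          /* 1 ms of runtime, in ns */

        printf("inv_weight     = %llu\n", (unsigned long long)inv_weight);
        printf("weighted delta = %llu ns\n",
               (unsigned long long)scale_by_shares(delta_exec, NICE_0_LOAD,
                                                   inv_weight));
        return 0;
}

Compiled and run, this prints an inv_weight of 2097152 and a weighted delta of 500000 ns: a group holding twice the default shares has 1 ms of execution counted as only 0.5 ms of weighted runtime, which is what ultimately earns it a proportionally larger slice of the CPU.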