blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
blkg->blkcg = blkcg;
- blkg->refcnt = 1;
+ atomic_set(&blkg->refcnt, 1);
/* root blkg uses @q->root_rl, init rl only for !root blkgs */
if (blkcg != &blkcg_root) {
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
- if (blkg->parent) {
- spin_lock_irq(blkg->q->queue_lock);
+ if (blkg->parent)
blkg_put(blkg->parent);
- spin_unlock_irq(blkg->q->queue_lock);
- }
blkg_free(blkg);
}
{
lockdep_assert_held(q->queue_lock);
+ /*
+ * @q could be exiting and already have destroyed all blkgs as
+ * indicated by NULL root_blkg. If so, don't confuse policies.
+ */
+ if (!q->root_blkg)
+ return;
+
blk_throtl_drain(q);
}
.css_offline = blkcg_css_offline,
.css_free = blkcg_css_free,
.can_attach = blkcg_can_attach,
- .base_cftypes = blkcg_files,
+ .legacy_cftypes = blkcg_files,
+ #ifdef CONFIG_MEMCG
+ /*
+ * This ensures that, if available, memcg is automatically enabled
+ * together on the default hierarchy so that the owner cgroup can
+ * be retrieved from writeback pages.
+ */
+ .depends_on = 1 << memory_cgrp_id,
+ #endif
};
EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
* Register @pol with blkcg core. Might sleep and @pol may be modified on
* successful registration. Returns 0 on success and -errno on failure.
*/
-int __init blkcg_policy_register(struct blkcg_policy *pol)
+int blkcg_policy_register(struct blkcg_policy *pol)
{
int i, ret;
/* everything is in place, add intf files for the new policy */
if (pol->cftypes)
- WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes));
+ WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
+ pol->cftypes));
ret = 0;
out_unlock:
mutex_unlock(&blkcg_pol_mutex);
*/
static bool cgrp_dfl_root_visible;
+ /*
+ * Set by the boot param of the same name and makes subsystems with NULL
+ * ->dfl_files to use ->legacy_files on the default hierarchy.
+ */
+ static bool cgroup_legacy_files_on_dfl;
+
/* some controllers are not supported in the default hierarchy */
- static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
- #ifdef CONFIG_CGROUP_DEBUG
- | (1 << debug_cgrp_id)
- #endif
- ;
+ static unsigned int cgrp_dfl_root_inhibit_ss_mask;
/* The list of hierarchy roots */
*/
static int need_forkexit_callback __read_mostly;
- static struct cftype cgroup_base_files[];
+ static struct cftype cgroup_dfl_base_files[];
+ static struct cftype cgroup_legacy_base_files[];
static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned int ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
- static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+ bool visible);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
css_put(&cgrp->self);
}
+ /**
+ * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * @cgrp: the target cgroup
+ *
+ * On the default hierarchy, a subsystem may request other subsystems to be
+ * enabled together through its ->depends_on mask. In such cases, more
+ * subsystems than specified in "cgroup.subtree_control" may be enabled.
+ *
+ * This function determines which subsystems need to be enabled given the
+ * current @cgrp->subtree_control and records it in
+ * @cgrp->child_subsys_mask. The resulting mask is always a superset of
+ * @cgrp->subtree_control and follows the usual hierarchy rules.
+ */
+ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+ {
+ struct cgroup *parent = cgroup_parent(cgrp);
+ unsigned int cur_ss_mask = cgrp->subtree_control;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgrp->child_subsys_mask = cur_ss_mask;
+ return;
+ }
+
+ while (true) {
+ unsigned int new_ss_mask = cur_ss_mask;
+
+ for_each_subsys(ss, ssid)
+ if (cur_ss_mask & (1 << ssid))
+ new_ss_mask |= ss->depends_on;
+
+ /*
+ * Mask out subsystems which aren't available. This can
+ * happen only if some depended-upon subsystems were bound
+ * to non-default hierarchies.
+ */
+ if (parent)
+ new_ss_mask &= parent->child_subsys_mask;
+ else
+ new_ss_mask &= cgrp->root->subsys_mask;
+
+ if (new_ss_mask == cur_ss_mask)
+ break;
+ cur_ss_mask = new_ss_mask;
+ }
+
+ cgrp->child_subsys_mask = cur_ss_mask;
+ }
+
/**
* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
up_write(&css_set_rwsem);
src_root->subsys_mask &= ~(1 << ssid);
- src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+ src_root->cgrp.subtree_control &= ~(1 << ssid);
+ cgroup_refresh_child_subsys_mask(&src_root->cgrp);
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
- if (dst_root != &cgrp_dfl_root)
- dst_root->cgrp.child_subsys_mask |= 1 << ssid;
+ if (dst_root != &cgrp_dfl_root) {
+ dst_root->cgrp.subtree_control |= 1 << ssid;
+ cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
+ }
if (ss->bind)
ss->bind(css);
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
seq_printf(seq, ",%s", ss->name);
- if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
- seq_puts(seq, ",sane_behavior");
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
bool all_ss = false, one_ss = false;
unsigned int mask = -1U;
struct cgroup_subsys *ss;
+ int nr_opts = 0;
int i;
#ifdef CONFIG_CPUSETS
memset(opts, 0, sizeof(*opts));
while ((token = strsep(&o, ",")) != NULL) {
+ nr_opts++;
+
if (!*token)
return -EINVAL;
if (!strcmp(token, "none")) {
return -ENOENT;
}
- /* Consistency checks */
-
if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
-
- if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
- opts->cpuset_clone_children || opts->release_agent ||
- opts->name) {
- pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
+ if (nr_opts != 1) {
+ pr_err("sane_behavior: no other mount options allowed\n");
return -EINVAL;
}
- } else {
- /*
- * If the 'all' option was specified select all the
- * subsystems, otherwise if 'none', 'name=' and a subsystem
- * name options were not specified, let's default to 'all'
- */
- if (all_ss || (!one_ss && !opts->none && !opts->name))
- for_each_subsys(ss, i)
- if (!ss->disabled)
- opts->subsys_mask |= (1 << i);
-
- /*
- * We either have to specify by name or by subsystems. (So
- * all empty hierarchies must have a name).
- */
- if (!opts->subsys_mask && !opts->name)
- return -EINVAL;
+ return 0;
}
+ /*
+ * If the 'all' option was specified select all the subsystems,
+ * otherwise if 'none', 'name=' and a subsystem name options were
+ * not specified, let's default to 'all'
+ */
+ if (all_ss || (!one_ss && !opts->none && !opts->name))
+ for_each_subsys(ss, i)
+ if (!ss->disabled)
+ opts->subsys_mask |= (1 << i);
+
+ /*
+ * We either have to specify by name or by subsystems. (So all
+ * empty hierarchies must have a name).
+ */
+ if (!opts->subsys_mask && !opts->name)
+ return -EINVAL;
+
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
return -EINVAL;
-
/* Can't specify "none" and some subsystems */
if (opts->subsys_mask && opts->none)
return -EINVAL;
struct cgroup_sb_opts opts;
unsigned int added_mask, removed_mask;
- if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_err("sane_behavior: remount is not allowed\n");
+ if (root == &cgrp_dfl_root) {
+ pr_err("remount is not allowed\n");
return -EINVAL;
}
removed_mask = root->subsys_mask & ~opts.subsys_mask;
/* Don't allow flags or name to change at remount */
- if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
+ if ((opts.flags ^ root->flags) ||
(opts.name && strcmp(opts.name, root->name))) {
pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
- opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
- root->flags & CGRP_ROOT_OPTION_MASK, root->name);
+ opts.flags, opts.name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
}
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
+ struct cftype *base_files;
struct css_set *cset;
int i, ret;
}
root_cgrp->kn = root->kf_root->kn;
- ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+ if (root == &cgrp_dfl_root)
+ base_files = cgroup_dfl_base_files;
+ else
+ base_files = cgroup_legacy_base_files;
+
+ ret = cgroup_addrm_files(root_cgrp, base_files, true);
if (ret)
goto destroy_root;
exit_root_id:
cgroup_exit_root_id(root);
cancel_ref:
- percpu_ref_cancel_init(&root_cgrp->self.refcnt);
+ percpu_ref_exit(&root_cgrp->self.refcnt);
out:
free_cgrp_cset_links(&tmp_links);
return ret;
int flags, const char *unused_dev_name,
void *data)
{
+ struct super_block *pinned_sb = NULL;
+ struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
+ int i;
bool new_sb;
/*
goto out_unlock;
/* look for a matching existing root */
- if (!opts.subsys_mask && !opts.none && !opts.name) {
+ if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
cgrp_dfl_root_visible = true;
root = &cgrp_dfl_root;
cgroup_get(&root->cgrp);
goto out_unlock;
}
+ /*
+ * Destruction of cgroup root is asynchronous, so subsystems may
+ * still be dying after the previous unmount. Let's drain the
+ * dying subsystems. We just need to ensure that the ones
+ * unmounted previously finish dying and don't care about new ones
+ * starting. Testing ref liveliness is good enough.
+ */
+ for_each_subsys(ss, i) {
+ if (!(opts.subsys_mask & (1 << i)) ||
+ ss->root == &cgrp_dfl_root)
+ continue;
+
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+ cgroup_put(&ss->root->cgrp);
+ }
+
for_each_root(root) {
bool name_match = false;
goto out_unlock;
}
- if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
- if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_err("sane_behavior: new mount options should match the existing superblock\n");
- ret = -EINVAL;
- goto out_unlock;
- } else {
- pr_warn("new mount options do not match the existing superblock, will be ignored\n");
- }
- }
+ if (root->flags ^ opts.flags)
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
/*
- * A root's lifetime is governed by its root cgroup.
- * tryget_live failure indicate that the root is being
- * destroyed. Wait for destruction to complete so that the
- * subsystems are free. We can use wait_queue for the wait
- * but this path is super cold. Let's just sleep for a bit
- * and retry.
+ * We want to reuse @root whose lifetime is governed by its
+ * ->cgrp. Let's check whether @root is alive and keep it
+ * that way. As cgroup_kill_sb() can happen anytime, we
+ * want to block it by pinning the sb so that @root doesn't
+ * get killed before mount is complete.
+ *
+ * With the sb pinned, tryget_live can reliably indicate
+ * whether @root can be reused. If it's being killed,
+ * drain it. We can use wait_queue for the wait but this
+ * path is super cold. Let's just sleep a bit and retry.
*/
- if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+ pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+ if (IS_ERR(pinned_sb) ||
+ !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
mutex_unlock(&cgroup_mutex);
+ if (!IS_ERR_OR_NULL(pinned_sb))
+ deactivate_super(pinned_sb);
msleep(10);
ret = restart_syscall();
goto out_free;
CGROUP_SUPER_MAGIC, &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
+
+ /*
+ * If @pinned_sb, we're reusing an existing root and holding an
+ * extra ref on its sb. Mount is complete. Put the extra ref.
+ */
+ if (pinned_sb) {
+ WARN_ON(new_sb);
+ deactivate_super(pinned_sb);
+ }
+
return dentry;
}
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
-
- seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+ seq_puts(seq, "0\n");
return 0;
}
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
+ cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
return 0;
}
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
+ cgroup_print_ss_mask(seq, cgrp->subtree_control);
return 0;
}
loff_t off)
{
unsigned int enable = 0, disable = 0;
+ unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
for_each_subsys(ss, ssid) {
if (enable & (1 << ssid)) {
- if (cgrp->child_subsys_mask & (1 << ssid)) {
+ if (cgrp->subtree_control & (1 << ssid)) {
enable &= ~(1 << ssid);
continue;
}
+ /* unavailable or not enabled on the parent? */
+ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+ (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ /*
+ * @ss is already enabled through dependency and
+ * we'll just make it visible. Skip draining.
+ */
+ if (cgrp->child_subsys_mask & (1 << ssid))
+ continue;
+
/*
* Because css offlining is asynchronous, userland
* might try to re-enable the same controller while
return restart_syscall();
}
-
- /* unavailable or not enabled on the parent? */
- if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
- (cgroup_parent(cgrp) &&
- !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
- ret = -ENOENT;
- goto out_unlock;
- }
} else if (disable & (1 << ssid)) {
- if (!(cgrp->child_subsys_mask & (1 << ssid))) {
+ if (!(cgrp->subtree_control & (1 << ssid))) {
disable &= ~(1 << ssid);
continue;
}
/* a child has it enabled? */
cgroup_for_each_live_child(child, cgrp) {
- if (child->child_subsys_mask & (1 << ssid)) {
+ if (child->subtree_control & (1 << ssid)) {
ret = -EBUSY;
goto out_unlock;
}
}
/*
- * Except for the root, child_subsys_mask must be zero for a cgroup
+ * Except for the root, subtree_control must be zero for a cgroup
* with tasks so that child cgroups don't compete against tasks.
*/
if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
}
/*
- * Create csses for enables and update child_subsys_mask. This
- * changes cgroup_e_css() results which in turn makes the
- * subsequent cgroup_update_dfl_csses() associate all tasks in the
- * subtree to the updated csses.
+ * Update subsys masks and calculate what needs to be done. More
+ * subsystems than specified may need to be enabled or disabled
+ * depending on subsystem dependencies.
+ */
+ cgrp->subtree_control |= enable;
+ cgrp->subtree_control &= ~disable;
+
+ old_ctrl = cgrp->child_subsys_mask;
+ cgroup_refresh_child_subsys_mask(cgrp);
+ new_ctrl = cgrp->child_subsys_mask;
+
+ css_enable = ~old_ctrl & new_ctrl;
+ css_disable = old_ctrl & ~new_ctrl;
+ enable |= css_enable;
+ disable |= css_disable;
+
+ /*
+ * Create new csses or make the existing ones visible. A css is
+ * created invisible if it's being implicitly enabled through
+ * dependency. An invisible css is made visible when the userland
+ * explicitly enables it.
*/
for_each_subsys(ss, ssid) {
if (!(enable & (1 << ssid)))
continue;
cgroup_for_each_live_child(child, cgrp) {
- ret = create_css(child, ss);
+ if (css_enable & (1 << ssid))
+ ret = create_css(child, ss,
+ cgrp->subtree_control & (1 << ssid));
+ else
+ ret = cgroup_populate_dir(child, 1 << ssid);
if (ret)
goto err_undo_css;
}
}
- cgrp->child_subsys_mask |= enable;
- cgrp->child_subsys_mask &= ~disable;
-
+ /*
+ * At this point, cgroup_e_css() results reflect the new csses
+ * making the following cgroup_update_dfl_csses() properly update
+ * css associations of all tasks in the subtree.
+ */
ret = cgroup_update_dfl_csses(cgrp);
if (ret)
goto err_undo_css;
- /* all tasks are now migrated away from the old csses, kill them */
+ /*
+ * All tasks are migrated out of disabled csses. Kill or hide
+ * them. A css is hidden when the userland requests it to be
+ * disabled while other subsystems are still depending on it. The
+ * css must not actively control resources and be in the vanilla
+ * state if it's made visible again later. Controllers which may
+ * be depended upon should provide ->css_reset() for this purpose.
+ */
for_each_subsys(ss, ssid) {
if (!(disable & (1 << ssid)))
continue;
- cgroup_for_each_live_child(child, cgrp)
- kill_css(cgroup_css(child, ss));
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
+
+ if (css_disable & (1 << ssid)) {
+ kill_css(css);
+ } else {
+ cgroup_clear_dir(child, 1 << ssid);
+ if (ss->css_reset)
+ ss->css_reset(css);
+ }
+ }
}
kernfs_activate(cgrp->kn);
return ret ?: nbytes;
err_undo_css:
- cgrp->child_subsys_mask &= ~enable;
- cgrp->child_subsys_mask |= disable;
+ cgrp->subtree_control &= ~enable;
+ cgrp->subtree_control |= disable;
+ cgroup_refresh_child_subsys_mask(cgrp);
for_each_subsys(ss, ssid) {
if (!(enable & (1 << ssid)))
cgroup_for_each_live_child(child, cgrp) {
struct cgroup_subsys_state *css = cgroup_css(child, ss);
- if (css)
+
+ if (!css)
+ continue;
+
+ if (css_enable & (1 << ssid))
kill_css(css);
+ else
+ cgroup_clear_dir(child, 1 << ssid);
}
}
goto out_unlock;
/*
* This isn't a proper migration and its usefulness is very
- * limited. Disallow if sane_behavior.
+ * limited. Disallow on the default hierarchy.
*/
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
return -EPERM;
/*
for (cft = cfts; cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
- if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
+ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
- if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+ if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
continue;
kfree(cft->kf_ops);
cft->kf_ops = NULL;
cft->ss = NULL;
+
+ /* revert flags set by cgroup core while adding @cfts */
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
}
}
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
- int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
int ret;
return ret;
}
+ /**
+ * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Similar to cgroup_add_cftypes() but the added files are only used for
+ * the default hierarchy.
+ */
+ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+ {
+ struct cftype *cft;
+
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_ONLY_ON_DFL;
+ return cgroup_add_cftypes(ss, cfts);
+ }
+
+ /**
+ * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Similar to cgroup_add_cftypes() but the added files are only used for
+ * the legacy hierarchies.
+ */
+ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+ {
+ struct cftype *cft;
+
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_NOT_ON_DFL;
+ return cgroup_add_cftypes(ss, cfts);
+ }
+
/**
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
rcu_read_lock();
css_for_each_child(child, css) {
- if (css->flags & CSS_ONLINE) {
+ if (child->flags & CSS_ONLINE) {
ret = true;
break;
}
*
* All this extra complexity was caused by the original implementation
* committing to an entirely unnecessary property. In the long term, we
- * want to do away with it. Explicitly scramble sort order if
- * sane_behavior so that no such expectation exists in the new interface.
+ * want to do away with it. Explicitly scramble sort order if on the
+ * default hierarchy so that no such expectation exists in the new
+ * interface.
*
* Scrambling is done by swapping every two consecutive bits, which is
* non-identity one-to-one mapping which disturbs sort order sufficiently.
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
return pid_fry(pid);
else
return pid;
css_task_iter_end(&it);
length = n;
/* now sort & (if procs) strip out duplicates */
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
else
sort(array, length, sizeof(pid_t), cmppid, NULL);
return 0;
}
- static struct cftype cgroup_base_files[] = {
+ /* cgroup core interface files for the default hierarchy */
+ static struct cftype cgroup_dfl_base_files[] = {
{
.name = "cgroup.procs",
.seq_start = cgroup_pidlist_start,
.write = cgroup_procs_write,
.mode = S_IRUGO | S_IWUSR,
},
- {
- .name = "cgroup.clone_children",
- .flags = CFTYPE_INSANE,
- .read_u64 = cgroup_clone_children_read,
- .write_u64 = cgroup_clone_children_write,
- },
- {
- .name = "cgroup.sane_behavior",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_sane_behavior_show,
- },
{
.name = "cgroup.controllers",
- .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
+ .flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_root_controllers_show,
},
{
.name = "cgroup.controllers",
- .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_controllers_show,
},
{
.name = "cgroup.subtree_control",
- .flags = CFTYPE_ONLY_ON_DFL,
.seq_show = cgroup_subtree_control_show,
.write = cgroup_subtree_control_write,
},
{
.name = "cgroup.populated",
- .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_populated_show,
},
+ { } /* terminate */
+ };
- /*
- * Historical crazy stuff. These don't have "cgroup." prefix and
- * don't exist if sane_behavior. If you're depending on these, be
- * prepared to be burned.
- */
+ /* cgroup core interface files for the legacy hierarchies */
+ static struct cftype cgroup_legacy_base_files[] = {
+ {
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup_procs_write,
+ .mode = S_IRUGO | S_IWUSR,
+ },
+ {
+ .name = "cgroup.clone_children",
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
+ {
+ .name = "cgroup.sane_behavior",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_sane_behavior_show,
+ },
{
.name = "tasks",
- .flags = CFTYPE_INSANE, /* use "procs" instead */
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
},
{
.name = "notify_on_release",
- .flags = CFTYPE_INSANE,
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
{
.name = "release_agent",
- .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
+ .flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_release_agent_show,
.write = cgroup_release_agent_write,
.max_write_len = PATH_MAX - 1,
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup *cgrp = css->cgroup;
+ percpu_ref_exit(&css->refcnt);
+
if (css->ss) {
/* css free path */
if (css->parent)
* create_css - create a cgroup_subsys_state
* @cgrp: the cgroup new css will be associated with
* @ss: the subsys of new css
+ * @visible: whether to create control knobs for the new css or not
*
* Create a new css associated with @cgrp - @ss pair. On success, the new
- * css is online and installed in @cgrp with all interface files created.
- * Returns 0 on success, -errno on failure.
+ * css is online and installed in @cgrp with all interface files created if
+ * @visible. Returns 0 on success, -errno on failure.
*/
- static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
+ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+ bool visible)
{
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
goto err_free_percpu_ref;
css->id = err;
- err = cgroup_populate_dir(cgrp, 1 << ss->id);
- if (err)
- goto err_free_id;
+ if (visible) {
+ err = cgroup_populate_dir(cgrp, 1 << ss->id);
+ if (err)
+ goto err_free_id;
+ }
/* @css is ready to be brought online now, make it visible */
list_add_tail_rcu(&css->sibling, &parent_css->children);
err_free_id:
cgroup_idr_remove(&ss->css_idr, css->id);
err_free_percpu_ref:
- percpu_ref_cancel_init(&css->refcnt);
+ percpu_ref_exit(&css->refcnt);
err_free_css:
call_rcu(&css->rcu_head, css_free_rcu_fn);
return err;
struct cgroup_root *root;
struct cgroup_subsys *ss;
struct kernfs_node *kn;
+ struct cftype *base_files;
int ssid, ret;
parent = cgroup_kn_lock_live(parent_kn);
if (ret)
goto out_destroy;
- ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
+ if (cgroup_on_dfl(cgrp))
+ base_files = cgroup_dfl_base_files;
+ else
+ base_files = cgroup_legacy_base_files;
+
+ ret = cgroup_addrm_files(cgrp, base_files, true);
if (ret)
goto out_destroy;
/* let's create and online css's */
for_each_subsys(ss, ssid) {
if (parent->child_subsys_mask & (1 << ssid)) {
- ret = create_css(cgrp, ss);
+ ret = create_css(cgrp, ss,
+ parent->subtree_control & (1 << ssid));
if (ret)
goto out_destroy;
}
/*
* On the default hierarchy, a child doesn't automatically inherit
- * child_subsys_mask from the parent. Each is configured manually.
+ * subtree_control from the parent. Each is configured manually.
*/
- if (!cgroup_on_dfl(cgrp))
- cgrp->child_subsys_mask = parent->child_subsys_mask;
+ if (!cgroup_on_dfl(cgrp)) {
+ cgrp->subtree_control = parent->subtree_control;
+ cgroup_refresh_child_subsys_mask(cgrp);
+ }
kernfs_activate(kn);
out_free_id:
cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
out_cancel_ref:
- percpu_ref_cancel_init(&cgrp->self.refcnt);
+ percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
kfree(cgrp);
out_unlock:
*/
int __init cgroup_init_early(void)
{
- static struct cgroup_sb_opts __initdata opts =
- { .flags = CGRP_ROOT_SANE_BEHAVIOR };
+ static struct cgroup_sb_opts __initdata opts;
struct cgroup_subsys *ss;
int i;
unsigned long key;
int ssid, err;
- BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
mutex_lock(&cgroup_mutex);
* disabled flag and cftype registration needs kmalloc,
* both of which aren't available during early_init.
*/
- if (!ss->disabled) {
- cgrp_dfl_root.subsys_mask |= 1 << ss->id;
- WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
+ if (ss->disabled)
+ continue;
+
+ cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+
+ if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
+ ss->dfl_cftypes = ss->legacy_cftypes;
+
+ if (!ss->dfl_cftypes)
+ cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
+
+ if (ss->dfl_cftypes == ss->legacy_cftypes) {
+ WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
+ } else {
+ WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
+ WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
}
}
}
__setup("cgroup_disable=", cgroup_disable);
+ static int __init cgroup_set_legacy_files_on_dfl(char *str)
+ {
+ printk("cgroup: using legacy files on the default hierarchy\n");
+ cgroup_legacy_files_on_dfl = true;
+ return 0;
+ }
+ __setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
+
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
struct cgroup_subsys debug_cgrp_subsys = {
.css_alloc = debug_css_alloc,
.css_free = debug_css_free,
- .base_cftypes = debug_files,
+ .legacy_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */
struct cgroup_subsys_state css;
unsigned long flags; /* "unsigned long" so bitops work */
- cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
- nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
+
+ /*
+ * On default hierarchy:
+ *
+ * The user-configured masks can only be changed by writing to
+ * cpuset.cpus and cpuset.mems, and won't be limited by the
+ * parent masks.
+ *
+ * The effective masks is the real masks that apply to the tasks
+ * in the cpuset. They may be changed if the configured masks are
+ * changed or hotplug happens.
+ *
+ * effective_mask == configured_mask & parent's effective_mask,
+ * and if it ends up empty, it will inherit the parent's mask.
+ *
+ *
+ * On legacy hierachy:
+ *
+ * The user-configured masks are always the same with effective masks.
+ */
+
+ /* user-configured CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t cpus_allowed;
+ nodemask_t mems_allowed;
+
+ /* effective CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t effective_cpus;
+ nodemask_t effective_mems;
/*
* This is old Memory Nodes tasks took on.
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+ while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
cs = parent_cs(cs);
- cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
+ cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}
/*
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
- while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+ while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
cs = parent_cs(cs);
- nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
+ nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}
/*
if (!trial)
return NULL;
- if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
- kfree(trial);
- return NULL;
- }
- cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
+ goto free_cpus;
+ cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ cpumask_copy(trial->effective_cpus, cs->effective_cpus);
return trial;
+
+ free_cpus:
+ free_cpumask_var(trial->cpus_allowed);
+ free_cs:
+ kfree(trial);
+ return NULL;
}
/**
*/
static void free_trial_cpuset(struct cpuset *trial)
{
+ free_cpumask_var(trial->effective_cpus);
free_cpumask_var(trial->cpus_allowed);
kfree(trial);
}
par = parent_cs(cur);
- /* We must be a subset of our parent cpuset */
+ /* On legacy hiearchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!is_cpuset_subset(trial, par))
+ if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
goto out;
/*
#ifdef CONFIG_SMP
/*
* Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping cpus_allowed masks?
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
*/
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
- return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
+ return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}
static void
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
- cpumask_copy(doms[0], top_cpuset.cpus_allowed);
+ cpumask_copy(doms[0], top_cpuset.effective_cpus);
goto done;
}
struct cpuset *b = csa[j];
if (apn == b->pn) {
- cpumask_or(dp, dp, b->cpus_allowed);
+ cpumask_or(dp, dp, b->effective_cpus);
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
* passing doms with offlined cpu to partition_sched_domains().
* Anyways, hotplug work item will rebuild sched domains.
*/
- if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
goto out;
/* Generate domain masks and attrs */
mutex_unlock(&cpuset_mutex);
}
- /*
- * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
- * @cs: the cpuset in interest
- *
- * A cpuset's effective cpumask is the cpumask of the nearest ancestor
- * with non-empty cpus. We use effective cpumask whenever:
- * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
- * if the cpuset they reside in has no cpus)
- * - we want to retrieve task_cs(tsk)'s cpus_allowed.
- *
- * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
- * exception. See comments there.
- */
- static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
- {
- while (cpumask_empty(cs->cpus_allowed))
- cs = parent_cs(cs);
- return cs;
- }
-
- /*
- * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
- * @cs: the cpuset in interest
- *
- * A cpuset's effective nodemask is the nodemask of the nearest ancestor
- * with non-empty memss. We use effective nodemask whenever:
- * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
- * if the cpuset they reside in has no mems)
- * - we want to retrieve task_cs(tsk)'s mems_allowed.
- *
- * Called with cpuset_mutex held.
- */
- static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
- {
- while (nodes_empty(cs->mems_allowed))
- cs = parent_cs(cs);
- return cs;
- }
-
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
*/
static void update_tasks_cpumask(struct cpuset *cs)
{
- struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;
css_task_iter_start(&cs->css, &it);
while ((task = css_task_iter_next(&it)))
- set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+ set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end(&it);
}
/*
- * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
- * @root_cs: the root cpuset of the hierarchy
- * @update_root: update root cpuset or not?
+ * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_cpus: temp variable for calculating new effective_cpus
+ *
+ * When congifured cpumask is changed, the effective cpumasks of this cpuset
+ * and all its descendants need to be updated.
*
- * This will update cpumasks of tasks in @root_cs and all other empty cpusets
- * which take on cpumask of @root_cs.
+ * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
*
* Called with cpuset_mutex held
*/
- static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
+ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
+ bool need_rebuild_sched_domains = false;
rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- if (cp == root_cs) {
- if (!update_root)
- continue;
- } else {
- /* skip the whole subtree if @cp have some CPU */
- if (!cpumask_empty(cp->cpus_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some CPUs.
+ */
+ if (cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent->effective_cpus);
+
+ /* Skip the whole subtree if the cpumask remains the same. */
+ if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
}
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
+ mutex_lock(&callback_mutex);
+ cpumask_copy(cp->effective_cpus, new_cpus);
+ mutex_unlock(&callback_mutex);
+
+ WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
+
update_tasks_cpumask(cp);
+ /*
+ * If the effective cpumask of any non-empty cpuset is changed,
+ * we need to rebuild sched domains.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ is_sched_load_balance(cp))
+ need_rebuild_sched_domains = true;
+
rcu_read_lock();
css_put(&cp->css);
}
rcu_read_unlock();
+
+ if (need_rebuild_sched_domains)
+ rebuild_sched_domains_locked();
}
/**
const char *buf)
{
int retval;
- int is_load_balanced;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
if (retval < 0)
return retval;
- if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
+ if (!cpumask_subset(trialcs->cpus_allowed,
+ top_cpuset.cpus_allowed))
return -EINVAL;
}
if (retval < 0)
return retval;
- is_load_balanced = is_sched_load_balance(trialcs);
-
mutex_lock(&callback_mutex);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
- update_tasks_cpumask_hier(cs, true);
-
- if (is_load_balanced)
- rebuild_sched_domains_locked();
+ /* use trialcs->cpus_allowed as a temp variable */
+ update_cpumasks_hier(cs, trialcs->cpus_allowed);
return 0;
}
const nodemask_t *to)
{
struct task_struct *tsk = current;
- struct cpuset *mems_cs;
tsk->mems_allowed = *to;
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
rcu_read_lock();
- mems_cs = effective_nodemask_cpuset(task_cs(tsk));
- guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+ guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
rcu_read_unlock();
}
static void update_tasks_nodemask(struct cpuset *cs)
{
static nodemask_t newmems; /* protected by cpuset_mutex */
- struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
- guarantee_online_mems(mems_cs, &newmems);
+ guarantee_online_mems(cs, &newmems);
/*
* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
}
/*
- * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
- * @cs: the root cpuset of the hierarchy
- * @update_root: update the root cpuset or not?
+ * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_mems: a temp variable for calculating new effective_mems
*
- * This will update nodemasks of tasks in @root_cs and all other empty cpusets
- * which take on nodemask of @root_cs.
+ * When configured nodemask is changed, the effective nodemasks of this cpuset
+ * and all its descendants need to be updated.
+ *
+ * On legacy hiearchy, effective_mems will be the same with mems_allowed.
*
* Called with cpuset_mutex held
*/
- static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
+ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- if (cp == root_cs) {
- if (!update_root)
- continue;
- } else {
- /* skip the whole subtree if @cp have some CPU */
- if (!nodes_empty(cp->mems_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some MEMs.
+ */
+ if (nodes_empty(*new_mems))
+ *new_mems = parent->effective_mems;
+
+ /* Skip the whole subtree if the nodemask remains the same. */
+ if (nodes_equal(*new_mems, cp->effective_mems)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
}
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
+ mutex_lock(&callback_mutex);
+ cp->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ !nodes_equal(cp->mems_allowed, cp->effective_mems));
+
update_tasks_nodemask(cp);
rcu_read_lock();
goto done;
if (!nodes_subset(trialcs->mems_allowed,
- node_states[N_MEMORY])) {
- retval = -EINVAL;
+ top_cpuset.mems_allowed)) {
+ retval = -EINVAL;
goto done;
}
}
cs->mems_allowed = trialcs->mems_allowed;
mutex_unlock(&callback_mutex);
- update_tasks_nodemask_hier(cs, true);
+ /* use trialcs->mems_allowed as a temp variable */
+ update_nodemasks_hier(cs, &cs->mems_allowed);
done:
return retval;
}
int current_cpuset_is_being_rebound(void)
{
- return task_cs(current) == cpuset_being_rebound;
+ int ret;
+
+ rcu_read_lock();
+ ret = task_cs(current) == cpuset_being_rebound;
+ rcu_read_unlock();
+
+ return ret;
}
static int update_relax_domain_level(struct cpuset *cs, s64 val)
mutex_lock(&cpuset_mutex);
- /*
- * We allow to move tasks into an empty cpuset if sane_behavior
- * flag is set.
- */
+ /* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_sane_behavior(css->cgroup) &&
+ if (!cgroup_on_dfl(css->cgroup) &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
struct task_struct *leader = cgroup_taskset_first(tset);
struct cpuset *cs = css_cs(css);
struct cpuset *oldcs = cpuset_attach_old_cs;
- struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
- struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
mutex_lock(&cpuset_mutex);
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
else
- guarantee_online_cpus(cpus_cs, cpus_attach);
+ guarantee_online_cpus(cs, cpus_attach);
- guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, tset) {
/*
* Change mm, possibly for multiple threads in a threadgroup. This is
* expensive and may sleep.
*/
- cpuset_attach_nodemask_to = cs->mems_allowed;
+ cpuset_attach_nodemask_to = cs->effective_mems;
mm = get_task_mm(leader);
if (mm) {
- struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
-
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
/*
* mm from.
*/
if (is_memory_migrate(cs)) {
- cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
+ cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
}
mmput(mm);
FILE_MEMORY_MIGRATE,
FILE_CPULIST,
FILE_MEMLIST,
+ FILE_EFFECTIVE_CPULIST,
+ FILE_EFFECTIVE_MEMLIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
* resources, wait for the previously scheduled operations before
* proceeding, so that we don't end up keep removing tasks added
* after execution capability is restored.
+ *
+ * cpuset_hotplug_work calls back into cgroup core via
+ * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+ * operation like this one can lead to a deadlock through kernfs
+ * active_ref protection. Let's break the protection. Losing the
+ * protection is okay as we check whether @cs is online after
+ * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * hierarchies.
*/
+ css_get(&cs->css);
+ kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
mutex_lock(&cpuset_mutex);
free_trial_cpuset(trialcs);
out_unlock:
mutex_unlock(&cpuset_mutex);
+ kernfs_unbreak_active_protection(of->kn);
+ css_put(&cs->css);
return retval ?: nbytes;
}
case FILE_MEMLIST:
s += nodelist_scnprintf(s, count, cs->mems_allowed);
break;
+ case FILE_EFFECTIVE_CPULIST:
+ s += cpulist_scnprintf(s, count, cs->effective_cpus);
+ break;
+ case FILE_EFFECTIVE_MEMLIST:
+ s += nodelist_scnprintf(s, count, cs->effective_mems);
+ break;
default:
ret = -EINVAL;
goto out_unlock;
.private = FILE_MEMLIST,
},
+ {
+ .name = "effective_cpus",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "effective_mems",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
{
.name = "cpu_exclusive",
.read_u64 = cpuset_read_u64,
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
- kfree(cs);
- return ERR_PTR(-ENOMEM);
- }
+ if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
+ goto free_cpus;
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
+ cpumask_clear(cs->effective_cpus);
+ nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
return &cs->css;
+
+ free_cpus:
+ free_cpumask_var(cs->cpus_allowed);
+ free_cs:
+ kfree(cs);
+ return ERR_PTR(-ENOMEM);
}
static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
+ mutex_lock(&callback_mutex);
+ if (cgroup_on_dfl(cs->css.cgroup)) {
+ cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+ cs->effective_mems = parent->effective_mems;
+ }
+ mutex_unlock(&callback_mutex);
+
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
{
struct cpuset *cs = css_cs(css);
+ free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
+ static void cpuset_bind(struct cgroup_subsys_state *root_css)
+ {
+ mutex_lock(&cpuset_mutex);
+ mutex_lock(&callback_mutex);
+
+ if (cgroup_on_dfl(root_css->cgroup)) {
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+ top_cpuset.mems_allowed = node_possible_map;
+ } else {
+ cpumask_copy(top_cpuset.cpus_allowed,
+ top_cpuset.effective_cpus);
+ top_cpuset.mems_allowed = top_cpuset.effective_mems;
+ }
+
+ mutex_unlock(&callback_mutex);
+ mutex_unlock(&cpuset_mutex);
+ }
+
struct cgroup_subsys cpuset_cgrp_subsys = {
- .css_alloc = cpuset_css_alloc,
- .css_online = cpuset_css_online,
- .css_offline = cpuset_css_offline,
- .css_free = cpuset_css_free,
- .can_attach = cpuset_can_attach,
- .cancel_attach = cpuset_cancel_attach,
- .attach = cpuset_attach,
- .base_cftypes = files,
- .early_init = 1,
+ .css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
+ .css_free = cpuset_css_free,
+ .can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
+ .attach = cpuset_attach,
+ .bind = cpuset_bind,
+ .legacy_cftypes = files,
+ .early_init = 1,
};
/**
if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
BUG();
+ if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
+ BUG();
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
+ cpumask_setall(top_cpuset.effective_cpus);
+ nodes_setall(top_cpuset.effective_mems);
fmeter_init(&top_cpuset.fmeter);
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
}
}
+ static void
+ hotplug_update_tasks_legacy(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+ {
+ bool is_empty;
+
+ mutex_lock(&callback_mutex);
+ cpumask_copy(cs->cpus_allowed, new_cpus);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->mems_allowed = *new_mems;
+ cs->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ /*
+ * Don't call update_tasks_cpumask() if the cpuset becomes empty,
+ * as the tasks will be migratecd to an ancestor.
+ */
+ if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+ update_tasks_cpumask(cs);
+ if (mems_updated && !nodes_empty(cs->mems_allowed))
+ update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * Move tasks to the nearest ancestor with execution resources,
+ * This is full cgroup operation which will also call back into
+ * cpuset. Should be done outside any lock.
+ */
+ if (is_empty)
+ remove_tasks_in_empty_cpuset(cs);
+
+ mutex_lock(&cpuset_mutex);
+ }
+
+ static void
+ hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+ {
+ if (cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+ if (nodes_empty(*new_mems))
+ *new_mems = parent_cs(cs)->effective_mems;
+
+ mutex_lock(&callback_mutex);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ if (cpus_updated)
+ update_tasks_cpumask(cs);
+ if (mems_updated)
+ update_tasks_nodemask(cs);
+ }
+
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
*/
static void cpuset_hotplug_update_tasks(struct cpuset *cs)
{
- static cpumask_t off_cpus;
- static nodemask_t off_mems;
- bool is_empty;
- bool sane = cgroup_sane_behavior(cs->css.cgroup);
-
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated;
+ bool mems_updated;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
goto retry;
}
- cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
- nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
-
- mutex_lock(&callback_mutex);
- cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
- mutex_unlock(&callback_mutex);
-
- /*
- * If sane_behavior flag is set, we need to update tasks' cpumask
- * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
- * call update_tasks_cpumask() if the cpuset becomes empty, as
- * the tasks in it will be migrated to an ancestor.
- */
- if ((sane && cpumask_empty(cs->cpus_allowed)) ||
- (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
- update_tasks_cpumask(cs);
+ cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+ nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
- mutex_lock(&callback_mutex);
- nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
- mutex_unlock(&callback_mutex);
-
- /*
- * If sane_behavior flag is set, we need to update tasks' nodemask
- * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
- * call update_tasks_nodemask() if the cpuset becomes empty, as
- * the tasks in it will be migratd to an ancestor.
- */
- if ((sane && nodes_empty(cs->mems_allowed)) ||
- (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
- update_tasks_nodemask(cs);
+ cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
+ mems_updated = !nodes_equal(new_mems, cs->effective_mems);
- is_empty = cpumask_empty(cs->cpus_allowed) ||
- nodes_empty(cs->mems_allowed);
+ if (cgroup_on_dfl(cs->css.cgroup))
+ hotplug_update_tasks(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
+ else
+ hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
mutex_unlock(&cpuset_mutex);
-
- /*
- * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
- *
- * Otherwise move tasks to the nearest ancestor with execution
- * resources. This is full cgroup operation which will
- * also call back into cpuset. Should be done outside any lock.
- */
- if (!sane && is_empty)
- remove_tasks_in_empty_cpuset(cs);
}
/**
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
+ bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
mutex_lock(&cpuset_mutex);
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
- cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
- mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+ mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ if (!on_dfl)
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
mutex_unlock(&callback_mutex);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
mutex_lock(&callback_mutex);
- top_cpuset.mems_allowed = new_mems;
+ if (!on_dfl)
+ top_cpuset.mems_allowed = new_mems;
+ top_cpuset.effective_mems = new_mems;
mutex_unlock(&callback_mutex);
update_tasks_nodemask(&top_cpuset);
}
top_cpuset.mems_allowed = node_states[N_MEMORY];
top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
+ cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
+ top_cpuset.effective_mems = node_states[N_MEMORY];
+
register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
}
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
- struct cpuset *cpus_cs;
-
mutex_lock(&callback_mutex);
rcu_read_lock();
- cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
- guarantee_online_cpus(cpus_cs, pmask);
+ guarantee_online_cpus(task_cs(tsk), pmask);
rcu_read_unlock();
mutex_unlock(&callback_mutex);
}
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
- struct cpuset *cpus_cs;
-
rcu_read_lock();
- cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
- do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
+ do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
rcu_read_unlock();
/*
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
- struct cpuset *mems_cs;
nodemask_t mask;
mutex_lock(&callback_mutex);
rcu_read_lock();
- mems_cs = effective_nodemask_cpuset(task_cs(tsk));
- guarantee_online_mems(mems_cs, &mask);
+ guarantee_online_mems(task_cs(tsk), &mask);
rcu_read_unlock();
mutex_unlock(&callback_mutex);
int __sched _cond_resched(void)
{
- rcu_cond_resched();
if (should_resched()) {
__cond_resched();
return 1;
*/
int __cond_resched_lock(spinlock_t *lock)
{
- bool need_rcu_resched = rcu_should_resched();
int resched = should_resched();
int ret = 0;
lockdep_assert_held(lock);
- if (spin_needbreak(lock) || resched || need_rcu_resched) {
+ if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
if (resched)
__cond_resched();
- else if (unlikely(need_rcu_resched))
- rcu_resched();
else
cpu_relax();
ret = 1;
{
BUG_ON(!in_softirq());
- rcu_cond_resched(); /* BH disabled OK, just recording QSes. */
if (should_resched()) {
local_bh_enable();
__cond_resched();
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
- .base_cftypes = cpu_files,
+ .legacy_cftypes = cpu_files,
.early_init = 1,
};
{
struct mem_cgroup_eventfd_list *ev;
+ spin_lock(&memcg_oom_lock);
+
list_for_each_entry(ev, &memcg->oom_notify, list)
eventfd_signal(ev->eventfd, 1);
+
+ spin_unlock(&memcg_oom_lock);
return 0;
}
},
{
.name = "use_hierarchy",
- .flags = CFTYPE_INSANE,
.write_u64 = mem_cgroup_hierarchy_write,
.read_u64 = mem_cgroup_hierarchy_read,
},
__mem_cgroup_free(memcg);
}
+ /**
+ * mem_cgroup_css_reset - reset the states of a mem_cgroup
+ * @css: the target css
+ *
+ * Reset the states of the mem_cgroup associated with @css. This is
+ * invoked when the userland requests disabling on the default hierarchy
+ * but the memcg is pinned through dependency. The memcg should stop
+ * applying policies and should revert to the vanilla state as it may be
+ * made visible again.
+ *
+ * The current implementation only resets the essential configurations.
+ * This needs to be expanded to cover all the visible parts.
+ */
+ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
+ {
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ mem_cgroup_resize_limit(memcg, ULLONG_MAX);
+ mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX);
+ memcg_update_kmem_limit(memcg, ULLONG_MAX);
+ res_counter_set_soft_limit(&memcg->res, ULLONG_MAX);
+ }
+
#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
#define PRECHARGE_COUNT_AT_ONCE 256
/*
* Cgroup retains root cgroups across [un]mount cycles making it necessary
- * to verify sane_behavior flag on each mount attempt.
+ * to verify whether we're attached to the default hierarchy on each mount
+ * attempt.
*/
static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
{
/*
- * use_hierarchy is forced with sane_behavior. cgroup core
+ * use_hierarchy is forced on the default hierarchy. cgroup core
* guarantees that @root doesn't have any children, so turning it
* on for the root memcg is enough.
*/
- if (cgroup_sane_behavior(root_css->cgroup))
+ if (cgroup_on_dfl(root_css->cgroup))
mem_cgroup_from_css(root_css)->use_hierarchy = true;
}
.css_online = mem_cgroup_css_online,
.css_offline = mem_cgroup_css_offline,
.css_free = mem_cgroup_css_free,
+ .css_reset = mem_cgroup_css_reset,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.attach = mem_cgroup_move_task,
.bind = mem_cgroup_bind,
- .base_cftypes = mem_cgroup_files,
+ .legacy_cftypes = mem_cgroup_files,
.early_init = 0,
};
static void __init memsw_file_init(void)
{
- WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
+ memsw_cgroup_files));
}
static void __init enable_swap_cgroup(void)