Merge tag 'xfs-for-linus-4.6-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
[deliverable/linux.git] / kernel / cgroup.c
index a6d484a667aa485ca7cf330fb6a1892c9f548f9c..671dc05c0b0fd6b732cf03d6efec6a5dd5a3557d 100644 (file)
@@ -59,6 +59,8 @@
 #include <linux/delay.h>
 #include <linux/atomic.h>
 #include <linux/cpuset.h>
+#include <linux/proc_ns.h>
+#include <linux/nsproxy.h>
 #include <net/sock.h>
 
 /*
@@ -186,6 +189,9 @@ static u16 cgroup_no_v1_mask;
 /* some controllers are not supported in the default hierarchy */
 static u16 cgrp_dfl_inhibit_ss_mask;
 
+/* some controllers are implicitly enabled on the default hierarchy */
+static u16 cgrp_dfl_implicit_ss_mask;
+
 /* The list of hierarchy roots */
 
 static LIST_HEAD(cgroup_roots);
@@ -212,6 +218,15 @@ static u16 have_fork_callback __read_mostly;
 static u16 have_exit_callback __read_mostly;
 static u16 have_free_callback __read_mostly;
 
+/* cgroup namespace for init task */
+struct cgroup_namespace init_cgroup_ns = {
+       .count          = { .counter = 2, },
+       .user_ns        = &init_user_ns,
+       .ns.ops         = &cgroupns_operations,
+       .ns.inum        = PROC_CGROUP_INIT_INO,
+       .root_cset      = &init_css_set,
+};
+
 /* Ditto for the can_fork callback. */
 static u16 have_canfork_callback __read_mostly;
 
@@ -220,6 +235,9 @@ static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];
 
 static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+static int cgroup_apply_control(struct cgroup *cgrp);
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_advance(struct css_task_iter *it);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
@@ -240,6 +258,9 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
  */
 static bool cgroup_ssid_enabled(int ssid)
 {
+       if (CGROUP_SUBSYS_COUNT == 0)
+               return false;
+
        return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 }
 
@@ -346,6 +367,32 @@ static struct cgroup *cgroup_parent(struct cgroup *cgrp)
        return NULL;
 }
 
+/* subsystems visibly enabled on a cgroup */
+static u16 cgroup_control(struct cgroup *cgrp)
+{
+       struct cgroup *parent = cgroup_parent(cgrp);
+       u16 root_ss_mask = cgrp->root->subsys_mask;
+
+       if (parent)
+               return parent->subtree_control;
+
+       if (cgroup_on_dfl(cgrp))
+               root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
+                                 cgrp_dfl_implicit_ss_mask);
+       return root_ss_mask;
+}
+
+/* subsystems enabled on a cgroup */
+static u16 cgroup_ss_mask(struct cgroup *cgrp)
+{
+       struct cgroup *parent = cgroup_parent(cgrp);
+
+       if (parent)
+               return parent->subtree_ss_mask;
+
+       return cgrp->root->subsys_mask;
+}
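
The split between the two helpers above matters once dependencies and implicit controllers come into play: a subsystem can be effectively enabled on a cgroup without being visibly enabled. A hedged sketch of a predicate built on top of them (`cgroup_ss_hidden()` is hypothetical, not part of this patch):

```c
/*
 * Hypothetical predicate: @ss is effectively enabled on @cgrp but not
 * visible to userland, e.g. pulled in via ->depends_on or implicitly
 * enabled on the default hierarchy.
 */
static bool cgroup_ss_hidden(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
	return (cgroup_ss_mask(cgrp) & (1 << ss->id)) &&
	       !(cgroup_control(cgrp) & (1 << ss->id));
}
```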
+
 /**
  * cgroup_css - obtain a cgroup's css for the specified subsystem
  * @cgrp: the cgroup of interest
@@ -385,16 +432,15 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
        if (!ss)
                return &cgrp->self;
 
-       if (!(cgrp->root->subsys_mask & (1 << ss->id)))
-               return NULL;
-
        /*
         * This function is used while updating css associations and thus
-        * can't test the csses directly.  Use ->subtree_ss_mask.
+        * can't test the csses directly.  Test ss_mask.
         */
-       while (cgroup_parent(cgrp) &&
-              !(cgroup_parent(cgrp)->subtree_ss_mask & (1 << ss->id)))
+       while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
                cgrp = cgroup_parent(cgrp);
+               if (!cgrp)
+                       return NULL;
+       }
 
        return cgroup_css(cgrp, ss);
 }
@@ -548,6 +594,24 @@ static int notify_on_release(const struct cgroup *cgrp)
                        ;                                               \
                else
 
+/* walk live descendants in preorder */
+#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)         \
+       css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))  \
+               if (({ lockdep_assert_held(&cgroup_mutex);              \
+                      (dsct) = (d_css)->cgroup;                        \
+                      cgroup_is_dead(dsct); }))                        \
+                       ;                                               \
+               else
+
+/* walk live descendants in postorder */
+#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)                \
+       css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
+               if (({ lockdep_assert_held(&cgroup_mutex);              \
+                      (dsct) = (d_css)->cgroup;                        \
+                      cgroup_is_dead(dsct); }))                        \
+                       ;                                               \
+               else
+
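As a usage illustration, a minimal walker built on the new macros (`count_live_descendants()` is hypothetical and not part of this patch):

```c
/* Hypothetical helper: count live cgroups in @cgrp's subtree. */
static int count_live_descendants(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *d_css;
	struct cgroup *dsct;
	int n = 0;

	/* the macro itself asserts this; shown here for clarity */
	lockdep_assert_held(&cgroup_mutex);

	/* dead cgroups are skipped by the macro's filter */
	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
		n++;

	return n;
}
```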
 static void cgroup_release_agent(struct work_struct *work);
 static void check_for_release(struct cgroup *cgrp);
 
@@ -1116,13 +1180,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)
        struct cgroup *cgrp = &root->cgrp;
        struct cgrp_cset_link *link, *tmp_link;
 
-       mutex_lock(&cgroup_mutex);
+       cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
 
        BUG_ON(atomic_read(&root->nr_cgrps));
        BUG_ON(!list_empty(&cgrp->self.children));
 
        /* Rebind all subsystems back to the default hierarchy */
-       rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
+       WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
 
        /*
         * Release all the links from cset_links to this hierarchy's
@@ -1263,28 +1327,25 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 
 /**
  * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
- * @cgrp: the target cgroup
  * @subtree_control: the new subtree_control mask to consider
+ * @this_ss_mask: available subsystems
  *
  * On the default hierarchy, a subsystem may request other subsystems to be
  * enabled together through its ->depends_on mask.  In such cases, more
  * subsystems than specified in "cgroup.subtree_control" may be enabled.
  *
  * This function calculates which subsystems need to be enabled if
- * @subtree_control is to be applied to @cgrp.  The returned mask is always
- * a superset of @subtree_control and follows the usual hierarchy rules.
+ * @subtree_control is to be applied while restricted to @this_ss_mask.
  */
-static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control)
+static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
 {
-       struct cgroup *parent = cgroup_parent(cgrp);
        u16 cur_ss_mask = subtree_control;
        struct cgroup_subsys *ss;
        int ssid;
 
        lockdep_assert_held(&cgroup_mutex);
 
-       if (!cgroup_on_dfl(cgrp))
-               return cur_ss_mask;
+       cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
 
        while (true) {
                u16 new_ss_mask = cur_ss_mask;
@@ -1298,10 +1359,7 @@ static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control)
                 * happen only if some depended-upon subsystems were bound
                 * to non-default hierarchies.
                 */
-               if (parent)
-                       new_ss_mask &= parent->subtree_ss_mask;
-               else
-                       new_ss_mask &= cgrp->root->subsys_mask;
+               new_ss_mask &= this_ss_mask;
 
                if (new_ss_mask == cur_ss_mask)
                        break;
@@ -1311,19 +1369,6 @@ static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control)
        return cur_ss_mask;
 }
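
To make the fixed-point expansion concrete, a self-contained userspace sketch with hypothetical dependency masks (real controller IDs and ->depends_on values differ):

```c
#include <stdint.h>
#include <stdio.h>

/* Expand @control by per-subsystem dependencies until the mask is stable. */
static uint16_t calc_subtree_ss_mask(uint16_t control, uint16_t this_ss_mask,
				     const uint16_t depends_on[], int nr_ss)
{
	uint16_t cur = control;

	for (;;) {
		uint16_t next = cur;
		int ssid;

		for (ssid = 0; ssid < nr_ss; ssid++)
			if (cur & (1 << ssid))
				next |= depends_on[ssid];

		/* depended-upon subsystems bound elsewhere can't be added */
		next &= this_ss_mask;

		if (next == cur)
			return cur;	/* fixed point reached */
		cur = next;
	}
}

int main(void)
{
	/* hypothetical: ss0 depends on ss1, ss1 depends on ss2 */
	const uint16_t deps[3] = { 1 << 1, 1 << 2, 0 };

	/* prints 0x7: enabling ss0 transitively pulls in ss1 and ss2 */
	printf("0x%x\n", calc_subtree_ss_mask(1 << 0, 0x7, deps, 3));
	return 0;
}
```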
 
-/**
- * cgroup_refresh_subtree_ss_mask - update subtree_ss_mask
- * @cgrp: the target cgroup
- *
- * Update @cgrp->subtree_ss_mask according to the current
- * @cgrp->subtree_control using cgroup_calc_subtree_ss_mask().
- */
-static void cgroup_refresh_subtree_ss_mask(struct cgroup *cgrp)
-{
-       cgrp->subtree_ss_mask =
-               cgroup_calc_subtree_ss_mask(cgrp, cgrp->subtree_control);
-}
-
 /**
  * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
  * @kn: the kernfs_node being serviced
@@ -1352,19 +1397,22 @@ static void cgroup_kn_unlock(struct kernfs_node *kn)
 /**
  * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
  * @kn: the kernfs_node being serviced
+ * @drain_offline: perform offline draining on the cgroup
  *
  * This helper is to be used by a cgroup kernfs method currently servicing
  * @kn.  It breaks the active protection, performs cgroup locking and
  * verifies that the associated cgroup is alive.  Returns the cgroup if
  * alive; otherwise, %NULL.  A successful return should be undone by a
- * matching cgroup_kn_unlock() invocation.
+ * matching cgroup_kn_unlock() invocation.  If @drain_offline is %true, the
+ * cgroup is drained of offlining csses before return.
  *
  * Any cgroup kernfs method implementation which requires locking the
  * associated cgroup should use this helper.  It avoids nesting cgroup
  * locking under kernfs active protection and allows all kernfs operations
  * including self-removal.
  */
-static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
+static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
+                                         bool drain_offline)
 {
        struct cgroup *cgrp;
 
@@ -1383,7 +1431,10 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
                return NULL;
        kernfs_break_active_protection(kn);
 
-       mutex_lock(&cgroup_mutex);
+       if (drain_offline)
+               cgroup_lock_and_drain_offline(cgrp);
+       else
+               mutex_lock(&cgroup_mutex);
 
        if (!cgroup_is_dead(cgrp))
                return cgrp;
@@ -1413,12 +1464,10 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 /**
  * css_clear_dir - remove subsys files in a cgroup directory
  * @css: target css
- * @cgrp_override: specify if target cgroup is different from css->cgroup
  */
-static void css_clear_dir(struct cgroup_subsys_state *css,
-                         struct cgroup *cgrp_override)
+static void css_clear_dir(struct cgroup_subsys_state *css)
 {
-       struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+       struct cgroup *cgrp = css->cgroup;
        struct cftype *cfts;
 
        if (!(css->flags & CSS_VISIBLE))
@@ -1433,18 +1482,16 @@ static void css_clear_dir(struct cgroup_subsys_state *css,
 /**
  * css_populate_dir - create subsys files in a cgroup directory
  * @css: target css
- * @cgrp_overried: specify if target cgroup is different from css->cgroup
  *
  * On failure, no file is added.
  */
-static int css_populate_dir(struct cgroup_subsys_state *css,
-                           struct cgroup *cgrp_override)
+static int css_populate_dir(struct cgroup_subsys_state *css)
 {
-       struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+       struct cgroup *cgrp = css->cgroup;
        struct cftype *cfts, *failed_cfts;
        int ret;
 
-       if (css->flags & CSS_VISIBLE)
+       if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
                return 0;
 
        if (!css->ss) {
@@ -1480,14 +1527,18 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 {
        struct cgroup *dcgrp = &dst_root->cgrp;
        struct cgroup_subsys *ss;
-       u16 tmp_ss_mask;
        int ssid, i, ret;
 
        lockdep_assert_held(&cgroup_mutex);
 
        do_each_subsys_mask(ss, ssid, ss_mask) {
-               /* if @ss has non-root csses attached to it, can't move */
-               if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
+               /*
+                * If @ss has non-root csses attached to it, can't move.
+                * If @ss is an implicit controller, it is exempt from this
+                * rule and can be stolen.
+                */
+               if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
+                   !ss->implicit_on_dfl)
                        return -EBUSY;
 
                /* can't move between two non-dummy roots either */
@@ -1495,46 +1546,6 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
                        return -EBUSY;
        } while_each_subsys_mask();
 
-       /* skip creating root files on dfl_root for inhibited subsystems */
-       tmp_ss_mask = ss_mask;
-       if (dst_root == &cgrp_dfl_root)
-               tmp_ss_mask &= ~cgrp_dfl_inhibit_ss_mask;
-
-       do_each_subsys_mask(ss, ssid, tmp_ss_mask) {
-               struct cgroup *scgrp = &ss->root->cgrp;
-               int tssid;
-
-               ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
-               if (!ret)
-                       continue;
-
-               /*
-                * Rebinding back to the default root is not allowed to
-                * fail.  Using both default and non-default roots should
-                * be rare.  Moving subsystems back and forth even more so.
-                * Just warn about it and continue.
-                */
-               if (dst_root == &cgrp_dfl_root) {
-                       if (cgrp_dfl_visible) {
-                               pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
-                                       ret, ss_mask);
-                               pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
-                       }
-                       continue;
-               }
-
-               do_each_subsys_mask(ss, tssid, tmp_ss_mask) {
-                       if (tssid == ssid)
-                               break;
-                       css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
-               } while_each_subsys_mask();
-               return ret;
-       } while_each_subsys_mask();
-
-       /*
-        * Nothing can fail from this point on.  Remove files for the
-        * removed subsystems and rebind each subsystem.
-        */
        do_each_subsys_mask(ss, ssid, ss_mask) {
                struct cgroup_root *src_root = ss->root;
                struct cgroup *scgrp = &src_root->cgrp;
@@ -1543,8 +1554,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 
                WARN_ON(!css || cgroup_css(dcgrp, ss));
 
-               css_clear_dir(css, NULL);
+               /* disable from the source */
+               src_root->subsys_mask &= ~(1 << ssid);
+               WARN_ON(cgroup_apply_control(scgrp));
+               cgroup_finalize_control(scgrp, 0);
 
+               /* rebind */
                RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
                rcu_assign_pointer(dcgrp->subsys[ssid], css);
                ss->root = dst_root;
@@ -1556,20 +1571,20 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
                                       &dcgrp->e_csets[ss->id]);
                spin_unlock_bh(&css_set_lock);
 
-               src_root->subsys_mask &= ~(1 << ssid);
-               scgrp->subtree_control &= ~(1 << ssid);
-               cgroup_refresh_subtree_ss_mask(scgrp);
-
                /* default hierarchy doesn't enable controllers by default */
                dst_root->subsys_mask |= 1 << ssid;
                if (dst_root == &cgrp_dfl_root) {
                        static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
                } else {
                        dcgrp->subtree_control |= 1 << ssid;
-                       cgroup_refresh_subtree_ss_mask(dcgrp);
                        static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
                }
 
+               ret = cgroup_apply_control(dcgrp);
+               if (ret)
+                       pr_warn("partial failure to rebind %s controller (err=%d)\n",
+                               ss->name, ret);
+
                if (ss->bind)
                        ss->bind(css);
        } while_each_subsys_mask();
@@ -1761,7 +1776,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
                return -EINVAL;
        }
 
-       mutex_lock(&cgroup_mutex);
+       cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
 
        /* See what subsystems are wanted */
        ret = parse_cgroupfs_options(data, &opts);
@@ -1794,7 +1809,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
        if (ret)
                goto out_unlock;
 
-       rebind_subsystems(&cgrp_dfl_root, removed_mask);
+       WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
 
        if (opts.release_agent) {
                spin_lock(&release_agent_path_lock);
@@ -1925,10 +1940,11 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
        /*
         * We're accessing css_set_count without locking css_set_lock here,
         * but that's OK - it can only be increased by someone holding
-        * cgroup_lock, and that's us. The worst that can happen is that we
-        * have some link structures left over
+        * cgroup_lock, and that's us.  Later rebinding may disable
+        * controllers on the default hierarchy and thus create new csets,
+        * of which there can't be more than the existing ones.  Allocate 2x.
         */
-       ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
+       ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
        if (ret)
                goto cancel_ref;
 
@@ -1945,7 +1961,7 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
        }
        root_cgrp->kn = root->kf_root->kn;
 
-       ret = css_populate_dir(&root_cgrp->self, NULL);
+       ret = css_populate_dir(&root_cgrp->self);
        if (ret)
                goto destroy_root;
 
@@ -1998,6 +2014,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 {
        bool is_v2 = fs_type == &cgroup2_fs_type;
        struct super_block *pinned_sb = NULL;
+       struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
        struct cgroup_subsys *ss;
        struct cgroup_root *root;
        struct cgroup_sb_opts opts;
@@ -2006,6 +2023,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
        int i;
        bool new_sb;
 
+       get_cgroup_ns(ns);
+
+       /* Check if the caller has permission to mount. */
+       if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
+               put_cgroup_ns(ns);
+               return ERR_PTR(-EPERM);
+       }
+
        /*
         * The first time anyone tries to mount a cgroup, enable the list
         * linking each css_set to its tasks and fix up all existing tasks.
@@ -2016,6 +2041,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
        if (is_v2) {
                if (data) {
                        pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+                       put_cgroup_ns(ns);
                        return ERR_PTR(-EINVAL);
                }
                cgrp_dfl_visible = true;
@@ -2024,7 +2050,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                goto out_mount;
        }
 
-       mutex_lock(&cgroup_mutex);
+       cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
 
        /* First find the desired set of subsystems */
        ret = parse_cgroupfs_options(data, &opts);
@@ -2121,6 +2147,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                goto out_unlock;
        }
 
+       /*
+        * We know this subsystem has not yet been bound.  Users in a non-init
+        * user namespace may only mount hierarchies with no bound subsystems,
+        * i.e. 'none,name=user1'
+        */
+       if (!opts.none && !capable(CAP_SYS_ADMIN)) {
+               ret = -EPERM;
+               goto out_unlock;
+       }
+
        root = kzalloc(sizeof(*root), GFP_KERNEL);
        if (!root) {
                ret = -ENOMEM;
@@ -2139,12 +2175,37 @@ out_free:
        kfree(opts.release_agent);
        kfree(opts.name);
 
-       if (ret)
+       if (ret) {
+               put_cgroup_ns(ns);
                return ERR_PTR(ret);
+       }
 out_mount:
        dentry = kernfs_mount(fs_type, flags, root->kf_root,
                              is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
                              &new_sb);
+
+       /*
+        * In non-init cgroup namespace, instead of root cgroup's
+        * dentry, we return the dentry corresponding to the
+        * cgroupns->root_cgrp.
+        */
+       if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
+               struct dentry *nsdentry;
+               struct cgroup *cgrp;
+
+               mutex_lock(&cgroup_mutex);
+               spin_lock_bh(&css_set_lock);
+
+               cgrp = cset_cgroup_from_root(ns->root_cset, root);
+
+               spin_unlock_bh(&css_set_lock);
+               mutex_unlock(&cgroup_mutex);
+
+               nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
+               dput(dentry);
+               dentry = nsdentry;
+       }
+
        if (IS_ERR(dentry) || !new_sb)
                cgroup_put(&root->cgrp);
 
@@ -2157,6 +2218,7 @@ out_mount:
                deactivate_super(pinned_sb);
        }
 
+       put_cgroup_ns(ns);
        return dentry;
 }
 
@@ -2185,14 +2247,45 @@ static struct file_system_type cgroup_fs_type = {
        .name = "cgroup",
        .mount = cgroup_mount,
        .kill_sb = cgroup_kill_sb,
+       .fs_flags = FS_USERNS_MOUNT,
 };
 
 static struct file_system_type cgroup2_fs_type = {
        .name = "cgroup2",
        .mount = cgroup_mount,
        .kill_sb = cgroup_kill_sb,
+       .fs_flags = FS_USERNS_MOUNT,
 };
 
+static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+                                  struct cgroup_namespace *ns)
+{
+       struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
+       int ret;
+
+       ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
+       if (ret < 0 || ret >= buflen)
+               return NULL;
+       return buf;
+}
+
+char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+                    struct cgroup_namespace *ns)
+{
+       char *ret;
+
+       mutex_lock(&cgroup_mutex);
+       spin_lock_bh(&css_set_lock);
+
+       ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
+
+       spin_unlock_bh(&css_set_lock);
+       mutex_unlock(&cgroup_mutex);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(cgroup_path_ns);
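
A minimal caller sketch for the new interface (`print_cgroup_path()` is a hypothetical function; error handling kept to the essentials):

```c
/* Hypothetical caller: log @cgrp's path as seen from @ns. */
static void print_cgroup_path(struct cgroup *cgrp, struct cgroup_namespace *ns)
{
	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);

	if (!buf)
		return;

	/* cgroup_path_ns() returns NULL if the path doesn't fit in @buf */
	if (cgroup_path_ns(cgrp, buf, PATH_MAX, ns))
		pr_info("cgroup: %s\n", buf);

	kfree(buf);
}
```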
+
 /**
  * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
  * @task: target task
@@ -2220,7 +2313,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 
        if (root) {
                cgrp = task_cgroup_from_root(task, root);
-               path = cgroup_path(cgrp, buf, buflen);
+               path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
        } else {
                /* if no hierarchy exists, everyone is in "/" */
                if (strlcpy(buf, "/", buflen) < buflen)
@@ -2364,38 +2457,38 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 }
 
 /**
- * cgroup_taskset_migrate - migrate a taskset to a cgroup
+ * cgroup_taskset_migrate - migrate a taskset
  * @tset: target taskset
- * @dst_cgrp: destination cgroup
+ * @root: cgroup root the migration is taking place on
  *
- * Migrate tasks in @tset to @dst_cgrp.  This function fails iff one of the
- * ->can_attach callbacks fails and guarantees that either all or none of
- * the tasks in @tset are migrated.  @tset is consumed regardless of
- * success.
+ * Migrate tasks in @tset as setup by migration preparation functions.
+ * This function fails iff one of the ->can_attach callbacks fails and
+ * guarantees that either all or none of the tasks in @tset are migrated.
+ * @tset is consumed regardless of success.
  */
 static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
-                                 struct cgroup *dst_cgrp)
+                                 struct cgroup_root *root)
 {
-       struct cgroup_subsys_state *css, *failed_css = NULL;
+       struct cgroup_subsys *ss;
        struct task_struct *task, *tmp_task;
        struct css_set *cset, *tmp_cset;
-       int i, ret;
+       int ssid, failed_ssid, ret;
 
        /* methods shouldn't be called if no task is actually migrating */
        if (list_empty(&tset->src_csets))
                return 0;
 
        /* check that we can legitimately attach to the cgroup */
-       for_each_e_css(css, i, dst_cgrp) {
-               if (css->ss->can_attach) {
-                       tset->ssid = i;
-                       ret = css->ss->can_attach(tset);
+       do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+               if (ss->can_attach) {
+                       tset->ssid = ssid;
+                       ret = ss->can_attach(tset);
                        if (ret) {
-                               failed_css = css;
+                               failed_ssid = ssid;
                                goto out_cancel_attach;
                        }
                }
-       }
+       } while_each_subsys_mask();
 
        /*
         * Now that we're guaranteed success, proceed to move all tasks to
@@ -2422,25 +2515,25 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
         */
        tset->csets = &tset->dst_csets;
 
-       for_each_e_css(css, i, dst_cgrp) {
-               if (css->ss->attach) {
-                       tset->ssid = i;
-                       css->ss->attach(tset);
+       do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+               if (ss->attach) {
+                       tset->ssid = ssid;
+                       ss->attach(tset);
                }
-       }
+       } while_each_subsys_mask();
 
        ret = 0;
        goto out_release_tset;
 
 out_cancel_attach:
-       for_each_e_css(css, i, dst_cgrp) {
-               if (css == failed_css)
+       do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+               if (ssid == failed_ssid)
                        break;
-               if (css->ss->cancel_attach) {
-                       tset->ssid = i;
-                       css->ss->cancel_attach(tset);
+               if (ss->cancel_attach) {
+                       tset->ssid = ssid;
+                       ss->cancel_attach(tset);
                }
-       }
+       } while_each_subsys_mask();
 out_release_tset:
        spin_lock_bh(&css_set_lock);
        list_splice_init(&tset->dst_csets, &tset->src_csets);
@@ -2452,6 +2545,20 @@ out_release_tset:
        return ret;
 }
 
+/**
+ * cgroup_may_migrate_to - verify whether a cgroup can be a migration destination
+ * @dst_cgrp: destination cgroup to test
+ *
+ * On the default hierarchy, except for the root, subtree_control must be
+ * zero for migration destination cgroups with tasks so that child cgroups
+ * don't compete against tasks.
+ */
+static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+{
+       return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
+               !dst_cgrp->subtree_control;
+}
+
 /**
  * cgroup_migrate_finish - cleanup after attach
  * @preloaded_csets: list of preloaded css_sets
@@ -2468,6 +2575,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
        spin_lock_bh(&css_set_lock);
        list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
                cset->mg_src_cgrp = NULL;
+               cset->mg_dst_cgrp = NULL;
                cset->mg_dst_cset = NULL;
                list_del_init(&cset->mg_preload_node);
                put_css_set_locked(cset);
@@ -2500,58 +2608,56 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
        lockdep_assert_held(&cgroup_mutex);
        lockdep_assert_held(&css_set_lock);
 
+       /*
+        * If ->dead, @src_set is associated with one or more dead cgroups
+        * and doesn't contain any migratable tasks.  Ignore it early so
+        * that the rest of migration path doesn't get confused by it.
+        */
+       if (src_cset->dead)
+               return;
+
        src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
 
        if (!list_empty(&src_cset->mg_preload_node))
                return;
 
        WARN_ON(src_cset->mg_src_cgrp);
+       WARN_ON(src_cset->mg_dst_cgrp);
        WARN_ON(!list_empty(&src_cset->mg_tasks));
        WARN_ON(!list_empty(&src_cset->mg_node));
 
        src_cset->mg_src_cgrp = src_cgrp;
+       src_cset->mg_dst_cgrp = dst_cgrp;
        get_css_set(src_cset);
        list_add(&src_cset->mg_preload_node, preloaded_csets);
 }
 
 /**
  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @dst_cgrp: the destination cgroup (may be %NULL)
  * @preloaded_csets: list of preloaded source css_sets
  *
- * Tasks are about to be moved to @dst_cgrp and all the source css_sets
- * have been preloaded to @preloaded_csets.  This function looks up and
- * pins all destination css_sets, links each to its source, and append them
- * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each
- * source css_set is assumed to be its cgroup on the default hierarchy.
+ * Tasks are about to be moved and all the source css_sets have been
+ * preloaded to @preloaded_csets.  This function looks up and pins all
+ * destination css_sets, links each to its source, and appends them to
+ * @preloaded_csets.
  *
  * This function must be called after cgroup_migrate_add_src() has been
  * called on each migration source css_set.  After migration is performed
  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
  * @preloaded_csets.
  */
-static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
-                                     struct list_head *preloaded_csets)
+static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
 {
        LIST_HEAD(csets);
        struct css_set *src_cset, *tmp_cset;
 
        lockdep_assert_held(&cgroup_mutex);
 
-       /*
-        * Except for the root, subtree_control must be zero for a cgroup
-        * with tasks so that child cgroups don't compete against tasks.
-        */
-       if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
-           dst_cgrp->subtree_control)
-               return -EBUSY;
-
        /* look up the dst cset for each src cset and link it to src */
        list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
                struct css_set *dst_cset;
 
-               dst_cset = find_css_set(src_cset,
-                                       dst_cgrp ?: src_cset->dfl_cgrp);
+               dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
                if (!dst_cset)
                        goto err;
 
@@ -2564,6 +2670,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
                 */
                if (src_cset == dst_cset) {
                        src_cset->mg_src_cgrp = NULL;
+                       src_cset->mg_dst_cgrp = NULL;
                        list_del_init(&src_cset->mg_preload_node);
                        put_css_set(src_cset);
                        put_css_set(dst_cset);
@@ -2589,11 +2696,11 @@ err:
  * cgroup_migrate - migrate a process or task to a cgroup
  * @leader: the leader of the process or the task to migrate
  * @threadgroup: whether @leader points to the whole process or a single task
- * @cgrp: the destination cgroup
+ * @root: cgroup root migration is taking place on
  *
- * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
- * process, the caller must be holding cgroup_threadgroup_rwsem.  The
- * caller is also responsible for invoking cgroup_migrate_add_src() and
+ * Migrate a process or task denoted by @leader.  If migrating a process,
+ * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
+ * responsible for invoking cgroup_migrate_add_src() and
  * cgroup_migrate_prepare_dst() on the targets before invoking this
  * function and following up with cgroup_migrate_finish().
  *
@@ -2604,7 +2711,7 @@ err:
  * actually starting migrating.
  */
 static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
-                         struct cgroup *cgrp)
+                         struct cgroup_root *root)
 {
        struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
        struct task_struct *task;
@@ -2625,7 +2732,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
        rcu_read_unlock();
        spin_unlock_bh(&css_set_lock);
 
-       return cgroup_taskset_migrate(&tset, cgrp);
+       return cgroup_taskset_migrate(&tset, root);
 }
 
 /**
@@ -2643,6 +2750,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
        struct task_struct *task;
        int ret;
 
+       if (!cgroup_may_migrate_to(dst_cgrp))
+               return -EBUSY;
+
        /* look up all src csets */
        spin_lock_bh(&css_set_lock);
        rcu_read_lock();
@@ -2657,9 +2767,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
        spin_unlock_bh(&css_set_lock);
 
        /* prepare dst csets and commit */
-       ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
+       ret = cgroup_migrate_prepare_dst(&preloaded_csets);
        if (!ret)
-               ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
+               ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
 
        cgroup_migrate_finish(&preloaded_csets);
        return ret;
@@ -2722,7 +2832,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
                return -EINVAL;
 
-       cgrp = cgroup_kn_lock_live(of->kn);
+       cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENODEV;
 
@@ -2820,7 +2930,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
 
        BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
 
-       cgrp = cgroup_kn_lock_live(of->kn);
+       cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENODEV;
        spin_lock(&release_agent_path_lock);
@@ -2864,22 +2974,12 @@ static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
                seq_putc(seq, '\n');
 }
 
-/* show controllers which are currently attached to the default hierarchy */
-static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
-{
-       struct cgroup *cgrp = seq_css(seq)->cgroup;
-
-       cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
-                            ~cgrp_dfl_inhibit_ss_mask);
-       return 0;
-}
-
 /* show controllers which are enabled from the parent */
 static int cgroup_controllers_show(struct seq_file *seq, void *v)
 {
        struct cgroup *cgrp = seq_css(seq)->cgroup;
 
-       cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
+       cgroup_print_ss_mask(seq, cgroup_control(cgrp));
        return 0;
 }
 
@@ -2896,16 +2996,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
  * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
  * @cgrp: root of the subtree to update csses for
  *
- * @cgrp's subtree_ss_mask has changed and its subtree's (self excluded)
- * css associations need to be updated accordingly.  This function looks up
- * all css_sets which are attached to the subtree, creates the matching
- * updated css_sets and migrates the tasks to the new ones.
+ * @cgrp's control masks have changed and its subtree's css associations
+ * need to be updated accordingly.  This function looks up all css_sets
+ * which are attached to the subtree, creates the matching updated css_sets
+ * and migrates the tasks to the new ones.
  */
 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 {
        LIST_HEAD(preloaded_csets);
        struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
-       struct cgroup_subsys_state *css;
+       struct cgroup_subsys_state *d_css;
+       struct cgroup *dsct;
        struct css_set *src_cset;
        int ret;
 
@@ -2915,21 +3016,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
        /* look up all csses currently attached to @cgrp's subtree */
        spin_lock_bh(&css_set_lock);
-       css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+       cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
                struct cgrp_cset_link *link;
 
-               /* self is not affected by subtree_ss_mask change */
-               if (css->cgroup == cgrp)
-                       continue;
-
-               list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
-                       cgroup_migrate_add_src(link->cset, cgrp,
+               list_for_each_entry(link, &dsct->cset_links, cset_link)
+                       cgroup_migrate_add_src(link->cset, dsct,
                                               &preloaded_csets);
        }
        spin_unlock_bh(&css_set_lock);
 
        /* each source cset's mg_dst_cgrp was set by cgroup_migrate_add_src() */
-       ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+       ret = cgroup_migrate_prepare_dst(&preloaded_csets);
        if (ret)
                goto out_finish;
 
@@ -2947,20 +3044,272 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
        }
        spin_unlock_bh(&css_set_lock);
 
-       ret = cgroup_taskset_migrate(&tset, cgrp);
+       ret = cgroup_taskset_migrate(&tset, cgrp->root);
 out_finish:
        cgroup_migrate_finish(&preloaded_csets);
        percpu_up_write(&cgroup_threadgroup_rwsem);
        return ret;
 }
 
+/**
+ * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
+ * @cgrp: root of the target subtree
+ *
+ * Because css offlining is asynchronous, userland may try to re-enable a
+ * controller while the previous css is still around.  This function grabs
+ * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
+ */
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
+       __acquires(&cgroup_mutex)
+{
+       struct cgroup *dsct;
+       struct cgroup_subsys_state *d_css;
+       struct cgroup_subsys *ss;
+       int ssid;
+
+restart:
+       mutex_lock(&cgroup_mutex);
+
+       cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+               for_each_subsys(ss, ssid) {
+                       struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+                       DEFINE_WAIT(wait);
+
+                       if (!css || !percpu_ref_is_dying(&css->refcnt))
+                               continue;
+
+                       cgroup_get(dsct);
+                       prepare_to_wait(&dsct->offline_waitq, &wait,
+                                       TASK_UNINTERRUPTIBLE);
+
+                       mutex_unlock(&cgroup_mutex);
+                       schedule();
+                       finish_wait(&dsct->offline_waitq, &wait);
+
+                       cgroup_put(dsct);
+                       goto restart;
+               }
+       }
+}
+
+/**
+ * cgroup_save_control - save control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * Save ->subtree_control and ->subtree_ss_mask to the respective old_
+ * prefixed fields for @cgrp's subtree including @cgrp itself.
+ */
+static void cgroup_save_control(struct cgroup *cgrp)
+{
+       struct cgroup *dsct;
+       struct cgroup_subsys_state *d_css;
+
+       cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+               dsct->old_subtree_control = dsct->subtree_control;
+               dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
+       }
+}
+
+/**
+ * cgroup_propagate_control - refresh control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
+ * ->subtree_control and propagate controller availability through the
+ * subtree so that descendants don't have unavailable controllers enabled.
+ */
+static void cgroup_propagate_control(struct cgroup *cgrp)
+{
+       struct cgroup *dsct;
+       struct cgroup_subsys_state *d_css;
+
+       cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+               dsct->subtree_control &= cgroup_control(dsct);
+               dsct->subtree_ss_mask =
+                       cgroup_calc_subtree_ss_mask(dsct->subtree_control,
+                                                   cgroup_ss_mask(dsct));
+       }
+}
+
+/**
+ * cgroup_restore_control - restore control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
+ * prefixed fields for @cgrp's subtree including @cgrp itself.
+ */
+static void cgroup_restore_control(struct cgroup *cgrp)
+{
+       struct cgroup *dsct;
+       struct cgroup_subsys_state *d_css;
+
+       cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+               dsct->subtree_control = dsct->old_subtree_control;
+               dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
+       }
+}
+
+static bool css_visible(struct cgroup_subsys_state *css)
+{
+       struct cgroup_subsys *ss = css->ss;
+       struct cgroup *cgrp = css->cgroup;
+
+       if (cgroup_control(cgrp) & (1 << ss->id))
+               return true;
+       if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
+               return false;
+       return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
+}
+
+/**
+ * cgroup_apply_control_enable - enable or show csses according to control
+ * @cgrp: root of the target subtree
+ *
+ * Walk @cgrp's subtree and create new csses or make the existing ones
+ * visible.  A css is created invisible if it's being implicitly enabled
+ * through dependency.  An invisible css is made visible when the userland
+ * explicitly enables it.
+ *
+ * Returns 0 on success, -errno on failure.  On failure, csses which have
+ * been processed already aren't cleaned up.  The caller is responsible for
+ * cleaning up with cgroup_apply_control_disable().
+ */
+static int cgroup_apply_control_enable(struct cgroup *cgrp)
+{
+       struct cgroup *dsct;
+       struct cgroup_subsys_state *d_css;
+       struct cgroup_subsys *ss;
+       int ssid, ret;
+
+       cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+               for_each_subsys(ss, ssid) {
+                       struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+
+                       WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+
+                       if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
+                               continue;
+
+                       if (!css) {
+                               css = css_create(dsct, ss);
+                               if (IS_ERR(css))
+                                       return PTR_ERR(css);
+                       }
+
+                       if (css_visible(css)) {
+                               ret = css_populate_dir(css);
+                               if (ret)
+                                       return ret;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * cgroup_apply_control_disable - kill or hide csses according to control
+ * @cgrp: root of the target subtree
+ *
+ * Walk @cgrp's subtree and kill and hide csses so that they match
+ * cgroup_ss_mask() and css_visible().
+ *
+ * A css is hidden when the userland requests it to be disabled while other
+ * subsystems are still depending on it.  The css must not actively control
+ * resources and must be in the vanilla state if it's made visible again later.
+ * Controllers which may be depended upon should provide ->css_reset() for
+ * this purpose.
+ */
+static void cgroup_apply_control_disable(struct cgroup *cgrp)
+{
+       struct cgroup *dsct;
+       struct cgroup_subsys_state *d_css;
+       struct cgroup_subsys *ss;
+       int ssid;
+
+       cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+               for_each_subsys(ss, ssid) {
+                       struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+
+                       WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+
+                       if (!css)
+                               continue;
+
+                       if (css->parent &&
+                           !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
+                               kill_css(css);
+                       } else if (!css_visible(css)) {
+                               css_clear_dir(css);
+                               if (ss->css_reset)
+                                       ss->css_reset(css);
+                       }
+               }
+       }
+}
+
+/**
+ * cgroup_apply_control - apply control mask updates to the subtree
+ * @cgrp: root of the target subtree
+ *
+ * Subsystems can be enabled and disabled in a subtree using the following
+ * steps.
+ *
+ * 1. Call cgroup_save_control() to stash the current state.
+ * 2. Update ->subtree_control masks in the subtree as desired.
+ * 3. Call cgroup_apply_control() to apply the changes.
+ * 4. Optionally perform other related operations.
+ * 5. Call cgroup_finalize_control() to finish up.
+ *
+ * This function implements step 3 and propagates the mask changes
+ * throughout @cgrp's subtree, updates csses accordingly and performs
+ * process migrations.
+ */
+static int cgroup_apply_control(struct cgroup *cgrp)
+{
+       int ret;
+
+       cgroup_propagate_control(cgrp);
+
+       ret = cgroup_apply_control_enable(cgrp);
+       if (ret)
+               return ret;
+
+       /*
+        * At this point, cgroup_e_css() results reflect the new csses
+        * making the following cgroup_update_dfl_csses() properly update
+        * css associations of all tasks in the subtree.
+        */
+       ret = cgroup_update_dfl_csses(cgrp);
+       if (ret)
+               return ret;
+
+       return 0;
+}
+
+/**
+ * cgroup_finalize_control - finalize control mask update
+ * @cgrp: root of the target subtree
+ * @ret: the result of the update
+ *
+ * Finalize control mask update.  See cgroup_apply_control() for more info.
+ */
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
+{
+       if (ret) {
+               cgroup_restore_control(cgrp);
+               cgroup_propagate_control(cgrp);
+       }
+
+       cgroup_apply_control_disable(cgrp);
+}
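
Putting the five steps together, a hedged sketch of a caller enabling controllers on a subtree (`example_enable_mask()` is hypothetical; cgroup_subtree_control_write() below is the real in-tree user of this protocol):

```c
/* Hypothetical caller of the save/apply/finalize protocol. */
static int example_enable_mask(struct cgroup *cgrp, u16 enable)
{
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	cgroup_save_control(cgrp);		/* 1. stash current masks */
	cgrp->subtree_control |= enable;	/* 2. update the mask */
	ret = cgroup_apply_control(cgrp);	/* 3. apply, may fail */
	/* 4. other related operations would go here */
	cgroup_finalize_control(cgrp, ret);	/* 5. restore on failure */

	return ret;
}
```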
+
 /* change the enabled child controllers for a cgroup in the default hierarchy */
 static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                            char *buf, size_t nbytes,
                                            loff_t off)
 {
        u16 enable = 0, disable = 0;
-       u16 css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
        struct cgroup *cgrp, *child;
        struct cgroup_subsys *ss;
        char *tok;
@@ -2994,7 +3343,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                        return -EINVAL;
        }
 
-       cgrp = cgroup_kn_lock_live(of->kn);
+       cgrp = cgroup_kn_lock_live(of->kn, true);
        if (!cgrp)
                return -ENODEV;
 
@@ -3005,10 +3354,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                continue;
                        }
 
-                       /* unavailable or not enabled on the parent? */
-                       if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
-                           (cgroup_parent(cgrp) &&
-                            !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
+                       if (!(cgroup_control(cgrp) & (1 << ssid))) {
                                ret = -ENOENT;
                                goto out_unlock;
                        }
@@ -3042,135 +3388,21 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                goto out_unlock;
        }
 
-       /*
-        * Update subsys masks and calculate what needs to be done.  More
-        * subsystems than specified may need to be enabled or disabled
-        * depending on subsystem dependencies.
-        */
-       old_sc = cgrp->subtree_control;
-       old_ss = cgrp->subtree_ss_mask;
-       new_sc = (old_sc | enable) & ~disable;
-       new_ss = cgroup_calc_subtree_ss_mask(cgrp, new_sc);
-
-       css_enable = ~old_ss & new_ss;
-       css_disable = old_ss & ~new_ss;
-       enable |= css_enable;
-       disable |= css_disable;
-
-       /*
-        * Because css offlining is asynchronous, userland might try to
-        * re-enable the same controller while the previous instance is
-        * still around.  In such cases, wait till it's gone using
-        * offline_waitq.
-        */
-       do_each_subsys_mask(ss, ssid, css_enable) {
-               cgroup_for_each_live_child(child, cgrp) {
-                       DEFINE_WAIT(wait);
-
-                       if (!cgroup_css(child, ss))
-                               continue;
-
-                       cgroup_get(child);
-                       prepare_to_wait(&child->offline_waitq, &wait,
-                                       TASK_UNINTERRUPTIBLE);
-                       cgroup_kn_unlock(of->kn);
-                       schedule();
-                       finish_wait(&child->offline_waitq, &wait);
-                       cgroup_put(child);
-
-                       return restart_syscall();
-               }
-       } while_each_subsys_mask();
-
-       cgrp->subtree_control = new_sc;
-       cgrp->subtree_ss_mask = new_ss;
+       /* save and update control masks and prepare csses */
+       cgroup_save_control(cgrp);
 
-       /*
-        * Create new csses or make the existing ones visible.  A css is
-        * created invisible if it's being implicitly enabled through
-        * dependency.  An invisible css is made visible when the userland
-        * explicitly enables it.
-        */
-       do_each_subsys_mask(ss, ssid, enable) {
-               cgroup_for_each_live_child(child, cgrp) {
-                       if (css_enable & (1 << ssid)) {
-                               struct cgroup_subsys_state *css;
-
-                               css = css_create(child, ss);
-                               if (IS_ERR(css)) {
-                                       ret = PTR_ERR(css);
-                                       goto err_undo_css;
-                               }
-
-                               if (cgrp->subtree_control & (1 << ssid)) {
-                                       ret = css_populate_dir(css, NULL);
-                                       if (ret)
-                                               goto err_undo_css;
-                               }
-                       } else {
-                               ret = css_populate_dir(cgroup_css(child, ss),
-                                                      NULL);
-                               if (ret)
-                                       goto err_undo_css;
-                       }
-               }
-       } while_each_subsys_mask();
-
-       /*
-        * At this point, cgroup_e_css() results reflect the new csses
-        * making the following cgroup_update_dfl_csses() properly update
-        * css associations of all tasks in the subtree.
-        */
-       ret = cgroup_update_dfl_csses(cgrp);
-       if (ret)
-               goto err_undo_css;
+       cgrp->subtree_control |= enable;
+       cgrp->subtree_control &= ~disable;
 
-       /*
-        * All tasks are migrated out of disabled csses.  Kill or hide
-        * them.  A css is hidden when the userland requests it to be
-        * disabled while other subsystems are still depending on it.  The
-        * css must not actively control resources and be in the vanilla
-        * state if it's made visible again later.  Controllers which may
-        * be depended upon should provide ->css_reset() for this purpose.
-        */
-       do_each_subsys_mask(ss, ssid, disable) {
-               cgroup_for_each_live_child(child, cgrp) {
-                       struct cgroup_subsys_state *css = cgroup_css(child, ss);
+       ret = cgroup_apply_control(cgrp);
 
-                       if (css_disable & (1 << ssid)) {
-                               kill_css(css);
-                       } else {
-                               css_clear_dir(css, NULL);
-                               if (ss->css_reset)
-                                       ss->css_reset(css);
-                       }
-               }
-       } while_each_subsys_mask();
+       cgroup_finalize_control(cgrp, ret);
 
        kernfs_activate(cgrp->kn);
        ret = 0;
 out_unlock:
        cgroup_kn_unlock(of->kn);
        return ret ?: nbytes;
-
-err_undo_css:
-       cgrp->subtree_control = old_sc;
-       cgrp->subtree_ss_mask = old_ss;
-
-       do_each_subsys_mask(ss, ssid, enable) {
-               cgroup_for_each_live_child(child, cgrp) {
-                       struct cgroup_subsys_state *css = cgroup_css(child, ss);
-
-                       if (!css)
-                               continue;
-
-                       if (css_enable & (1 << ssid))
-                               kill_css(css);
-                       else
-                               css_clear_dir(css, NULL);
-               }
-       } while_each_subsys_mask();
-       goto out_unlock;
 }
 
 static int cgroup_events_show(struct seq_file *seq, void *v)
@@ -4035,6 +4267,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
        struct task_struct *task;
        int ret;
 
+       if (!cgroup_may_migrate_to(to))
+               return -EBUSY;
+
        mutex_lock(&cgroup_mutex);
 
        /* all tasks in @from are being moved, all csets are source */
@@ -4043,7 +4278,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
                cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
        spin_unlock_bh(&css_set_lock);
 
-       ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
+       ret = cgroup_migrate_prepare_dst(&preloaded_csets);
        if (ret)
                goto out_err;
 
@@ -4059,7 +4294,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
                css_task_iter_end(&it);
 
                if (task) {
-                       ret = cgroup_migrate(task, false, to);
+                       ret = cgroup_migrate(task, false, to->root);
                        put_task_struct(task);
                }
        } while (task && !ret);
@@ -4566,12 +4801,6 @@ static struct cftype cgroup_dfl_base_files[] = {
        },
        {
                .name = "cgroup.controllers",
-               .flags = CFTYPE_ONLY_ON_ROOT,
-               .seq_show = cgroup_root_controllers_show,
-       },
-       {
-               .name = "cgroup.controllers",
-               .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_controllers_show,
        },
        {
@@ -4888,33 +5117,18 @@ err_free_css:
        return ERR_PTR(err);
 }
 
-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
-                       umode_t mode)
+static struct cgroup *cgroup_create(struct cgroup *parent)
 {
-       struct cgroup *parent, *cgrp, *tcgrp;
-       struct cgroup_root *root;
-       struct cgroup_subsys *ss;
-       struct kernfs_node *kn;
-       int level, ssid, ret;
-
-       /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
-        */
-       if (strchr(name, '\n'))
-               return -EINVAL;
-
-       parent = cgroup_kn_lock_live(parent_kn);
-       if (!parent)
-               return -ENODEV;
-       root = parent->root;
-       level = parent->level + 1;
+       struct cgroup_root *root = parent->root;
+       struct cgroup *cgrp, *tcgrp;
+       int level = parent->level + 1;
+       int ret;
 
        /* allocate the cgroup and its ID, 0 is reserved for the root */
        cgrp = kzalloc(sizeof(*cgrp) +
                       sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
-       if (!cgrp) {
-               ret = -ENOMEM;
-               goto out_unlock;
-       }
+       if (!cgrp)
+               return ERR_PTR(-ENOMEM);
 
        ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
        if (ret)
@@ -4958,24 +5172,51 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
         */
        cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
 
-       /* create the csses */
-       do_each_subsys_mask(ss, ssid, parent->subtree_ss_mask) {
-               struct cgroup_subsys_state *css;
-
-               css = css_create(cgrp, ss);
-               if (IS_ERR(css)) {
-                       ret = PTR_ERR(css);
-                       goto out_destroy;
-               }
-       } while_each_subsys_mask();
-
        /*
         * On the default hierarchy, a child doesn't automatically inherit
         * subtree_control from the parent.  Each is configured manually.
         */
-       if (!cgroup_on_dfl(cgrp)) {
-               cgrp->subtree_control = parent->subtree_control;
-               cgroup_refresh_subtree_ss_mask(cgrp);
+       if (!cgroup_on_dfl(cgrp))
+               cgrp->subtree_control = cgroup_control(cgrp);
+
+       cgroup_propagate_control(cgrp);
+
+       /* @cgrp doesn't have dir yet so the following will only create csses */
+       ret = cgroup_apply_control_enable(cgrp);
+       if (ret)
+               goto out_destroy;
+
+       return cgrp;
+
+out_cancel_ref:
+       percpu_ref_exit(&cgrp->self.refcnt);
+out_free_cgrp:
+       kfree(cgrp);
+       return ERR_PTR(ret);
+out_destroy:
+       cgroup_destroy_locked(cgrp);
+       return ERR_PTR(ret);
+}
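
cgroup_create() now owns the kernfs-less half of child creation under
cgroup_mutex: allocate and link the cgroup, propagate control masks down
the subtree, and instantiate csses.  cgroup_propagate_control() is added
elsewhere in this series; a rough sketch of what it is assumed to do for
each live descendant dsct:

	/* clamp subtree_control to what the parent actually grants ... */
	dsct->subtree_control &= cgroup_control(dsct);
	/* ... then recompute dsct->subtree_ss_mask from the clamped value,
	 * folding controller dependencies back in */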
+
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+                       umode_t mode)
+{
+       struct cgroup *parent, *cgrp;
+       struct kernfs_node *kn;
+       int ret;
+
+       /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
+       if (strchr(name, '\n'))
+               return -EINVAL;
+
+       parent = cgroup_kn_lock_live(parent_kn, false);
+       if (!parent)
+               return -ENODEV;
+
+       cgrp = cgroup_create(parent);
+       if (IS_ERR(cgrp)) {
+               ret = PTR_ERR(cgrp);
+               goto out_unlock;
        }
 
        /* create the directory */
@@ -4996,15 +5237,13 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
        if (ret)
                goto out_destroy;
 
-       ret = css_populate_dir(&cgrp->self, NULL);
+       ret = css_populate_dir(&cgrp->self);
        if (ret)
                goto out_destroy;
 
-       do_each_subsys_mask(ss, ssid, parent->subtree_control) {
-               ret = css_populate_dir(cgroup_css(cgrp, ss), NULL);
-               if (ret)
-                       goto out_destroy;
-       } while_each_subsys_mask();
+       ret = cgroup_apply_control_enable(cgrp);
+       if (ret)
+               goto out_destroy;
 
        /* let's create and online css's */
        kernfs_activate(kn);
@@ -5012,17 +5251,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
        ret = 0;
        goto out_unlock;
 
-out_cancel_ref:
-       percpu_ref_exit(&cgrp->self.refcnt);
-out_free_cgrp:
-       kfree(cgrp);
+out_destroy:
+       cgroup_destroy_locked(cgrp);
 out_unlock:
        cgroup_kn_unlock(parent_kn);
        return ret;
-
-out_destroy:
-       cgroup_destroy_locked(cgrp);
-       goto out_unlock;
 }
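
The userspace contract is unchanged: children are created with plain
mkdir(2) and controllers are delegated through cgroup.subtree_control.  A
minimal cgroup v2 example that exercises the rewritten mkdir path (paths
are hypothetical; assumes a cgroup2 mount at /sys/fs/cgroup):

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int main(void)
	{
		int fd;

		if (mkdir("/sys/fs/cgroup/demo", 0755) && errno != EEXIST) {
			perror("mkdir");
			return 1;
		}

		/* delegate the pids controller to demo's children; this only
		 * succeeds if "pids" shows up in demo's cgroup.controllers */
		fd = open("/sys/fs/cgroup/demo/cgroup.subtree_control", O_WRONLY);
		if (fd < 0 || write(fd, "+pids", 5) < 0)
			perror("subtree_control");
		if (fd >= 0)
			close(fd);
		return 0;
	}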
 
 /*
@@ -5076,7 +5309,7 @@ static void kill_css(struct cgroup_subsys_state *css)
         * This must happen before css is disassociated with its cgroup.
         * See seq_css() for details.
         */
-       css_clear_dir(css, NULL);
+       css_clear_dir(css);
 
        /*
         * Killing would put the base ref, but we need to keep it alive
@@ -5125,6 +5358,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
        __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
        struct cgroup_subsys_state *css;
+       struct cgrp_cset_link *link;
        int ssid;
 
        lockdep_assert_held(&cgroup_mutex);
@@ -5145,11 +5379,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
                return -EBUSY;
 
        /*
-        * Mark @cgrp dead.  This prevents further task migration and child
-        * creation by disabling cgroup_lock_live_group().
+        * Mark @cgrp and the associated csets dead.  The former prevents
+        * further task migration and child creation by disabling
+        * cgroup_lock_live_group().  The latter makes the csets ignored by
+        * the migration path.
         */
        cgrp->self.flags &= ~CSS_ONLINE;
 
+       spin_lock_bh(&css_set_lock);
+       list_for_each_entry(link, &cgrp->cset_links, cset_link)
+               link->cset->dead = true;
+       spin_unlock_bh(&css_set_lock);
+
        /* initiate massacre of all css's */
        for_each_css(css, ssid, cgrp)
                kill_css(css);
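
Per the comment above, flagging the csets makes the migration path skip
them, so a task can no longer race its way into a cgroup that has started
dying.  The consumer side is presumably a one-line check in
cgroup_migrate_add_src() (not part of these hunks):

	/* a dead cset belongs to a dying cgroup; refuse to use it as a source */
	if (src_cset->dead)
		return;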
@@ -5173,7 +5414,7 @@ static int cgroup_rmdir(struct kernfs_node *kn)
        struct cgroup *cgrp;
        int ret = 0;
 
-       cgrp = cgroup_kn_lock_live(kn);
+       cgrp = cgroup_kn_lock_live(kn, false);
        if (!cgrp)
                return 0;
 
@@ -5298,6 +5539,8 @@ int __init cgroup_init(void)
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
+       get_user_ns(init_cgroup_ns.user_ns);
+
        mutex_lock(&cgroup_mutex);
 
        /*
@@ -5344,7 +5587,9 @@ int __init cgroup_init(void)
 
                cgrp_dfl_root.subsys_mask |= 1 << ss->id;
 
-               if (!ss->dfl_cftypes)
+               if (ss->implicit_on_dfl)
+                       cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
+               else if (!ss->dfl_cftypes)
                        cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
 
                if (ss->dfl_cftypes == ss->legacy_cftypes) {
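
cgroup_init() now sorts each controller into one of three buckets on the
default hierarchy: implicitly enabled (implicit_on_dfl), inhibited (no
dfl_cftypes), or regular.  An implicit controller opts in from its
subsystem definition; a hypothetical sketch, since no in-tree controller
sets the flag in this hunk:

	struct cgroup_subsys foo_cgrp_subsys = {
		.css_alloc	 = foo_css_alloc,
		.css_free	 = foo_css_free,
		/* active on the default hierarchy without appearing in
		 * cgroup.controllers or cgroup.subtree_control */
		.implicit_on_dfl = true,
	};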
@@ -5447,7 +5692,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                 * " (deleted)" is appended to the cgroup path.
                 */
                if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
-                       path = cgroup_path(cgrp, buf, PATH_MAX);
+                       path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
+                                               current->nsproxy->cgroup_ns);
                        if (!path) {
                                retval = -ENAMETOOLONG;
                                goto out_unlock;
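
With this change, /proc/<pid>/cgroup reports paths relative to the
reader's cgroup namespace root rather than the global root.  A task
sitting at the root of its namespace would plausibly see

	0::/

on the v2 hierarchy, where the same task previously saw the full global
path (e.g. 0::/user.slice/demo, a hypothetical placement).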
@@ -5732,7 +5978,9 @@ static void cgroup_release_agent(struct work_struct *work)
        if (!pathbuf || !agentbuf)
                goto out;
 
-       path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+       spin_lock_bh(&css_set_lock);
+       path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+       spin_unlock_bh(&css_set_lock);
        if (!path)
                goto out;
 
@@ -5944,6 +6192,133 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 
 #endif /* CONFIG_SOCK_CGROUP_DATA */
 
+/* cgroup namespaces */
+
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+       struct cgroup_namespace *new_ns;
+       int ret;
+
+       new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+       if (!new_ns)
+               return ERR_PTR(-ENOMEM);
+       ret = ns_alloc_inum(&new_ns->ns);
+       if (ret) {
+               kfree(new_ns);
+               return ERR_PTR(ret);
+       }
+       atomic_set(&new_ns->count, 1);
+       new_ns->ns.ops = &cgroupns_operations;
+       return new_ns;
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+       put_css_set(ns->root_cset);
+       put_user_ns(ns->user_ns);
+       ns_free_inum(&ns->ns);
+       kfree(ns);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+
+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+                                       struct user_namespace *user_ns,
+                                       struct cgroup_namespace *old_ns)
+{
+       struct cgroup_namespace *new_ns;
+       struct css_set *cset;
+
+       BUG_ON(!old_ns);
+
+       if (!(flags & CLONE_NEWCGROUP)) {
+               get_cgroup_ns(old_ns);
+               return old_ns;
+       }
+
+       /* Allow only sysadmin to create cgroup namespace. */
+       if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       mutex_lock(&cgroup_mutex);
+       spin_lock_bh(&css_set_lock);
+
+       cset = task_css_set(current);
+       get_css_set(cset);
+
+       spin_unlock_bh(&css_set_lock);
+       mutex_unlock(&cgroup_mutex);
+
+       new_ns = alloc_cgroup_ns();
+       if (IS_ERR(new_ns)) {
+               put_css_set(cset);
+               return new_ns;
+       }
+
+       new_ns->user_ns = get_user_ns(user_ns);
+       new_ns->root_cset = cset;
+
+       return new_ns;
+}
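
copy_cgroup_ns() is what backs clone(2) and unshare(2) with
CLONE_NEWCGROUP: the caller's current css_set is pinned and becomes the
root of the new namespace.  A minimal runnable demonstration (needs
CAP_SYS_ADMIN; the constant is defined locally in case pre-4.6 headers
lack it):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>
	#include <stdlib.h>

	#ifndef CLONE_NEWCGROUP
	#define CLONE_NEWCGROUP 0x02000000
	#endif

	int main(void)
	{
		system("cat /proc/self/cgroup");	/* globally rooted paths */

		if (unshare(CLONE_NEWCGROUP)) {
			perror("unshare");
			return 1;
		}

		/* the current cgroup is now the namespace root, so the
		 * children spawned by system() see paths collapse to "/" */
		system("cat /proc/self/cgroup");
		return 0;
	}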
+
+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
+{
+       return container_of(ns, struct cgroup_namespace, ns);
+}
+
+static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+       struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
+
+       if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+           !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+               return -EPERM;
+
+       /* Don't need to do anything if we are attaching to our own cgroupns. */
+       if (cgroup_ns == nsproxy->cgroup_ns)
+               return 0;
+
+       get_cgroup_ns(cgroup_ns);
+       put_cgroup_ns(nsproxy->cgroup_ns);
+       nsproxy->cgroup_ns = cgroup_ns;
+
+       return 0;
+}
+
+static struct ns_common *cgroupns_get(struct task_struct *task)
+{
+       struct cgroup_namespace *ns = NULL;
+       struct nsproxy *nsproxy;
+
+       task_lock(task);
+       nsproxy = task->nsproxy;
+       if (nsproxy) {
+               ns = nsproxy->cgroup_ns;
+               get_cgroup_ns(ns);
+       }
+       task_unlock(task);
+
+       return ns ? &ns->ns : NULL;
+}
+
+static void cgroupns_put(struct ns_common *ns)
+{
+       put_cgroup_ns(to_cg_ns(ns));
+}
+
+const struct proc_ns_operations cgroupns_operations = {
+       .name           = "cgroup",
+       .type           = CLONE_NEWCGROUP,
+       .get            = cgroupns_get,
+       .put            = cgroupns_put,
+       .install        = cgroupns_install,
+};
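
Registering cgroupns_operations makes /proc/<pid>/ns/cgroup a first-class
namespace file, so an existing namespace can be entered with setns(2),
subject to the capability checks in cgroupns_install() above.  A small
sketch (the target PID is supplied by the user):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sched.h>
	#include <stdio.h>

	#ifndef CLONE_NEWCGROUP
	#define CLONE_NEWCGROUP 0x02000000
	#endif

	int main(int argc, char **argv)
	{
		char path[64];
		int fd;

		if (argc < 2) {
			fprintf(stderr, "usage: %s <pid>\n", argv[0]);
			return 1;
		}

		snprintf(path, sizeof(path), "/proc/%s/ns/cgroup", argv[1]);
		fd = open(path, O_RDONLY);
		if (fd < 0 || setns(fd, CLONE_NEWCGROUP)) {
			perror("setns");
			return 1;
		}
		/* this task now resolves cgroup paths in the target namespace */
		return 0;
	}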
+
+static __init int cgroup_namespaces_init(void)
+{
+       return 0;
+}
+subsys_initcall(cgroup_namespaces_init);
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *
 debug_css_alloc(struct cgroup_subsys_state *parent_css)