cgroup: reorganize cgroup_create()
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3f1ca934a2378495e5129dbe807bfc7111f53e8b..1d6106c3fb4e22d41b270bc10de15f79450da7e2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -26,6 +26,8 @@
  *  distribution for more details.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/cgroup.h>
 #include <linux/cred.h>
 #include <linux/ctype.h>
@@ -97,6 +99,12 @@ static DEFINE_MUTEX(cgroup_mutex);
 static DECLARE_RWSEM(css_set_rwsem);
 #endif
 
+/*
+ * Protects cgroup_idr and css_idr so that IDs can be released without
+ * grabbing cgroup_mutex.
+ */
+static DEFINE_SPINLOCK(cgroup_idr_lock);
+
 /*
  * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
  * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
@@ -179,13 +187,46 @@ static struct cftype cgroup_base_files[];
 
 static void cgroup_put(struct cgroup *cgrp);
 static int rebind_subsystems(struct cgroup_root *dst_root,
-                            unsigned long ss_mask);
+                            unsigned int ss_mask);
 static void cgroup_destroy_css_killed(struct cgroup *cgrp);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
                              bool is_add);
 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 
+/* IDR wrappers which synchronize using cgroup_idr_lock */
+static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
+                           gfp_t gfp_mask)
+{
+       int ret;
+
+       idr_preload(gfp_mask);
+       spin_lock_bh(&cgroup_idr_lock);
+       ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+       spin_unlock_bh(&cgroup_idr_lock);
+       idr_preload_end();
+       return ret;
+}
+
+static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
+{
+       void *ret;
+
+       spin_lock_bh(&cgroup_idr_lock);
+       ret = idr_replace(idr, ptr, id);
+       spin_unlock_bh(&cgroup_idr_lock);
+       return ret;
+}
+
+static void cgroup_idr_remove(struct idr *idr, int id)
+{
+       spin_lock_bh(&cgroup_idr_lock);
+       idr_remove(idr, id);
+       spin_unlock_bh(&cgroup_idr_lock);
+}
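
For illustration, a minimal sketch of how these wrappers are meant to be used (the helper below is hypothetical, not part of the patch). Callers pass GFP_NOWAIT for the allocation itself since idr_alloc() runs under cgroup_idr_lock; idr_preload() has already done any blocking work up front:

static int example_assign_cgroup_id(struct cgroup *cgrp)
{
        int id;

        /* allocate an ID >= 2; GFP_NOWAIT as we allocate under the lock */
        id = cgroup_idr_alloc(&cgrp->root->cgroup_idr, cgrp, 2, 0, GFP_NOWAIT);
        if (id < 0)
                return id;
        cgrp->id = id;
        return 0;
}

The matching cgroup_idr_remove() can then run from contexts which cannot take cgroup_mutex, which is the point of the separate spinlock (see the cgroup_put() change below).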
+
 /**
  * cgroup_css - obtain a cgroup's css for the specified subsystem
  * @cgrp: the cgroup of interest
@@ -208,17 +249,44 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
                return &cgrp->dummy_css;
 }
 
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns the dummy_css)
+ *
+ * Similar to cgroup_css() but returns the effective css, which is defined
+ * as the matching css of the nearest ancestor including self which has @ss
+ * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
+ * function is guaranteed to return a non-NULL css.
+ */
+static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+                                               struct cgroup_subsys *ss)
+{
+       lockdep_assert_held(&cgroup_mutex);
+
+       if (!ss)
+               return &cgrp->dummy_css;
+
+       if (!(cgrp->root->subsys_mask & (1 << ss->id)))
+               return NULL;
+
+       while (cgrp->parent &&
+              !(cgrp->parent->child_subsys_mask & (1 << ss->id)))
+               cgrp = cgrp->parent;
+
+       return cgroup_css(cgrp, ss);
+}
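
A hypothetical layout may make the walk concrete (controller and cgroup names are illustrative only):

        root  (child_subsys_mask: memory)
          `-- A  (child_subsys_mask: none)
                `-- B

Here cgroup_e_css(B, memory_ss) steps up from B to A, because A does not enable memory for its children, and stops there, because root does; it returns A's memory css. In other words, tasks in B are covered by A's memory css, the nearest ancestor (including self) with the controller enabled.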
+
 /* convenient tests for these bits */
 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
        return test_bit(CGRP_DEAD, &cgrp->flags);
 }
 
-struct cgroup_subsys_state *seq_css(struct seq_file *seq)
+struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 {
-       struct kernfs_open_file *of = seq->private;
        struct cgroup *cgrp = of->kn->parent->priv;
-       struct cftype *cft = seq_cft(seq);
+       struct cftype *cft = of_cft(of);
 
        /*
         * This is open and unprotected implementation of cgroup_css().
@@ -233,7 +301,7 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq)
        else
                return &cgrp->dummy_css;
 }
-EXPORT_SYMBOL_GPL(seq_css);
+EXPORT_SYMBOL_GPL(of_css);
 
 /**
  * cgroup_is_descendant - test ancestry
@@ -273,7 +341,7 @@ static int notify_on_release(const struct cgroup *cgrp)
  * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
  * @cgrp: the target cgroup to iterate css's of
  *
- * Should be called under cgroup_mutex.
+ * Should be called under cgroup_[tree_]mutex.
  */
 #define for_each_css(css, ssid, cgrp)                                  \
        for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
@@ -283,6 +351,20 @@ static int notify_on_release(const struct cgroup *cgrp)
                                lockdep_is_held(&cgroup_mutex)))) { }   \
                else
 
+/**
+ * for_each_e_css - iterate all effective css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
+ */
+#define for_each_e_css(css, ssid, cgrp)                                        \
+       for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
+               if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
+                       ;                                               \
+               else
+
 /**
  * for_each_subsys - iterate all enabled cgroup subsystems
  * @ss: the iteration cursor
@@ -296,6 +378,14 @@ static int notify_on_release(const struct cgroup *cgrp)
 #define for_each_root(root)                                            \
        list_for_each_entry((root), &cgroup_roots, root_list)
 
+/* iterate over child cgrps, lock should be held throughout iteration */
+#define cgroup_for_each_live_child(child, cgrp)                                \
+       list_for_each_entry((child), &(cgrp)->children, sibling)        \
+               if (({ lockdep_assert_held(&cgroup_tree_mutex);         \
+                      cgroup_is_dead(child); }))                       \
+                       ;                                               \
+               else
+
 /**
  * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
  * @cgrp: the cgroup to be checked for liveness
@@ -359,6 +449,43 @@ struct css_set init_css_set = {
 
 static int css_set_count       = 1;    /* 1 for init_css_set */
 
+/**
+ * cgroup_update_populated - update populated count of a cgroup
+ * @cgrp: the target cgroup
+ * @populated: inc or dec populated count
+ *
+ * @cgrp is either getting the first task (css_set) or losing the last.
+ * Update @cgrp->populated_cnt accordingly.  The count is propagated
+ * towards root so that a given cgroup's populated_cnt is zero iff the
+ * cgroup and all its descendants are empty.
+ *
+ * @cgrp's interface file "cgroup.populated" reads 0 if
+ * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
+ * changes from or to zero, userland is notified that the content of the
+ * interface file has changed.  This can be used to detect when @cgrp and
+ * its descendants become populated or empty.
+ */
+static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+{
+       lockdep_assert_held(&css_set_rwsem);
+
+       do {
+               bool trigger;
+
+               if (populated)
+                       trigger = !cgrp->populated_cnt++;
+               else
+                       trigger = !--cgrp->populated_cnt;
+
+               if (!trigger)
+                       break;
+
+               if (cgrp->populated_kn)
+                       kernfs_notify(cgrp->populated_kn);
+               cgrp = cgrp->parent;
+       } while (cgrp);
+}
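
On the consuming side, a hedged userspace sketch (not part of the patch; the path and helper name are made up): kernfs_notify() makes poll() on "cgroup.populated" return POLLPRI, after which the value is re-read:

#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

/* Block until the cgroup and all of its descendants are empty. */
static int wait_until_empty(const char *populated_path)
{
        char buf[4] = "";
        int fd = open(populated_path, O_RDONLY);

        if (fd < 0)
                return -1;

        for (;;) {
                struct pollfd pfd = { .fd = fd, .events = POLLPRI };

                if (pread(fd, buf, sizeof(buf) - 1, 0) <= 0)
                        break;
                if (buf[0] == '0')      /* populated_cnt reached zero */
                        break;
                poll(&pfd, 1, -1);      /* woken by kernfs_notify() */
        }
        close(fd);
        return buf[0] == '0' ? 0 : -1;
}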
+
 /*
  * hash table for cgroup groups. This improves the performance to find
  * an existing css_set. This hash doesn't (currently) take into
@@ -383,6 +510,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 static void put_css_set_locked(struct css_set *cset, bool taskexit)
 {
        struct cgrp_cset_link *link, *tmp_link;
+       struct cgroup_subsys *ss;
+       int ssid;
 
        lockdep_assert_held(&css_set_rwsem);
 
@@ -390,6 +519,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
                return;
 
        /* This css_set is dead. unlink it and release cgroup refcounts */
+       for_each_subsys(ss, ssid)
+               list_del(&cset->e_cset_node[ssid]);
        hash_del(&cset->hlist);
        css_set_count--;
 
@@ -400,10 +531,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
                list_del(&link->cgrp_link);
 
                /* @cgrp can't go away while we're holding css_set_rwsem */
-               if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
-                       if (taskexit)
-                               set_bit(CGRP_RELEASABLE, &cgrp->flags);
-                       check_for_release(cgrp);
+               if (list_empty(&cgrp->cset_links)) {
+                       cgroup_update_populated(cgrp, false);
+                       if (notify_on_release(cgrp)) {
+                               if (taskexit)
+                                       set_bit(CGRP_RELEASABLE, &cgrp->flags);
+                               check_for_release(cgrp);
+                       }
                }
 
                kfree(link);
@@ -452,20 +586,20 @@ static bool compare_css_sets(struct css_set *cset,
 {
        struct list_head *l1, *l2;
 
-       if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
-               /* Not all subsystems matched */
+       /*
+        * On the default hierarchy, there can be csets which are
+        * associated with the same set of cgroups but different csses.
+        * Let's first ensure that csses match.
+        */
+       if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
                return false;
-       }
 
        /*
         * Compare cgroup pointers in order to distinguish between
-        * different cgroups in heirarchies with no subsystems. We
-        * could get by with just this check alone (and skip the
-        * memcmp above) but on most setups the memcmp check will
-        * avoid the need for this more expensive check on almost all
-        * candidates.
+        * different cgroups in hierarchies.  As different cgroups may
+        * share the same effective css, this comparison is always
+        * necessary.
         */
-
        l1 = &cset->cgrp_links;
        l2 = &old_cset->cgrp_links;
        while (1) {
@@ -529,14 +663,17 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
         * won't change, so no need for locking.
         */
        for_each_subsys(ss, i) {
-               if (root->cgrp.subsys_mask & (1UL << i)) {
-                       /* Subsystem is in this hierarchy. So we want
-                        * the subsystem state from the new
-                        * cgroup */
-                       template[i] = cgroup_css(cgrp, ss);
+               if (root->subsys_mask & (1UL << i)) {
+                       /*
+                        * @ss is in this hierarchy, so we want the
+                        * effective css from @cgrp.
+                        */
+                       template[i] = cgroup_e_css(cgrp, ss);
                } else {
-                       /* Subsystem is not in this hierarchy, so we
-                        * don't want to change the subsystem state */
+                       /*
+                        * @ss is not in this hierarchy, so we don't want
+                        * to change the css.
+                        */
                        template[i] = old_cset->subsys[i];
                }
        }
@@ -602,10 +739,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
        struct cgrp_cset_link *link;
 
        BUG_ON(list_empty(tmp_links));
+
+       if (cgroup_on_dfl(cgrp))
+               cset->dfl_cgrp = cgrp;
+
        link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
        link->cset = cset;
        link->cgrp = cgrp;
+
+       if (list_empty(&cgrp->cset_links))
+               cgroup_update_populated(cgrp, true);
        list_move(&link->cset_link, &cgrp->cset_links);
+
        /*
         * Always add links to the tail of the list so that the list
         * is sorted by order of hierarchy creation
@@ -628,7 +773,9 @@ static struct css_set *find_css_set(struct css_set *old_cset,
        struct css_set *cset;
        struct list_head tmp_links;
        struct cgrp_cset_link *link;
+       struct cgroup_subsys *ss;
        unsigned long key;
+       int ssid;
 
        lockdep_assert_held(&cgroup_mutex);
 
@@ -679,10 +826,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
        css_set_count++;
 
-       /* Add this cgroup group to the hash table */
+       /* Add @cset to the hash table */
        key = css_set_hash(cset->subsys);
        hash_add(css_set_table, &cset->hlist, key);
 
+       for_each_subsys(ss, ssid)
+               list_add_tail(&cset->e_cset_node[ssid],
+                             &cset->subsys[ssid]->cgroup->e_csets[ssid]);
+
        up_write(&css_set_rwsem);
 
        return cset;
@@ -742,7 +893,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
        BUG_ON(!list_empty(&cgrp->children));
 
        /* Rebind all subsystems back to the default hierarchy */
-       rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask);
+       rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
 
        /*
         * Release all the links from cset_links to this hierarchy's
@@ -848,7 +999,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * update of a tasks cgroup pointer by cgroup_attach_task()
  */
 
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
 static const struct file_operations proc_cgroupstats_operations;
 
@@ -883,8 +1034,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
        if (cft->read_u64 || cft->read_s64 || cft->seq_show)
                mode |= S_IRUGO;
 
-       if (cft->write_u64 || cft->write_s64 || cft->write_string ||
-           cft->trigger)
+       if (cft->write_u64 || cft->write_s64 || cft->write)
                mode |= S_IWUSR;
 
        return mode;
@@ -937,15 +1087,7 @@ static void cgroup_put(struct cgroup *cgrp)
        if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
                return;
 
-       /*
-        * XXX: cgrp->id is only used to look up css's.  As cgroup and
-        * css's lifetimes will be decoupled, it should be made
-        * per-subsystem and moved to css->id so that lookups are
-        * successful until the target css is released.
-        */
-       mutex_lock(&cgroup_mutex);
-       idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
-       mutex_unlock(&cgroup_mutex);
+       cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
        cgrp->id = -1;
 
        call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
@@ -964,7 +1106,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
  * @cgrp: target cgroup
  * @subsys_mask: mask of the subsystem ids whose files should be removed
  */
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
 {
        struct cgroup_subsys *ss;
        int i;
@@ -972,18 +1114,17 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
        for_each_subsys(ss, i) {
                struct cftype *cfts;
 
-               if (!test_bit(i, &subsys_mask))
+               if (!(subsys_mask & (1 << i)))
                        continue;
                list_for_each_entry(cfts, &ss->cfts, node)
                        cgroup_addrm_files(cgrp, cfts, false);
        }
 }
 
-static int rebind_subsystems(struct cgroup_root *dst_root,
-                            unsigned long ss_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
 {
        struct cgroup_subsys *ss;
-       int ssid, ret;
+       int ssid, i, ret;
 
        lockdep_assert_held(&cgroup_tree_mutex);
        lockdep_assert_held(&cgroup_mutex);
@@ -992,16 +1133,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
                if (!(ss_mask & (1 << ssid)))
                        continue;
 
-               /* if @ss is on the dummy_root, we can always move it */
-               if (ss->root == &cgrp_dfl_root)
-                       continue;
-
-               /* if @ss has non-root cgroups attached to it, can't move */
-               if (!list_empty(&ss->root->cgrp.children))
+               /* if @ss has non-root csses attached to it, can't move */
+               if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
                        return -EBUSY;
 
                /* can't move between two non-dummy roots either */
-               if (dst_root != &cgrp_dfl_root)
+               if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
                        return -EBUSY;
        }
 
@@ -1017,9 +1154,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
                 * Just warn about it and continue.
                 */
                if (cgrp_dfl_root_visible) {
-                       pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",
-                                  ret, ss_mask);
-                       pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");
+                       pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+                               ret, ss_mask);
+                       pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
                }
        }
 
@@ -1036,6 +1173,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
        for_each_subsys(ss, ssid) {
                struct cgroup_root *src_root;
                struct cgroup_subsys_state *css;
+               struct css_set *cset;
 
                if (!(ss_mask & (1 << ssid)))
                        continue;
@@ -1050,8 +1188,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
                ss->root = dst_root;
                css->cgroup = &dst_root->cgrp;
 
-               src_root->cgrp.subsys_mask &= ~(1 << ssid);
-               dst_root->cgrp.subsys_mask |= 1 << ssid;
+               down_write(&css_set_rwsem);
+               hash_for_each(css_set_table, i, cset, hlist)
+                       list_move_tail(&cset->e_cset_node[ss->id],
+                                      &dst_root->cgrp.e_csets[ss->id]);
+               up_write(&css_set_rwsem);
+
+               src_root->subsys_mask &= ~(1 << ssid);
+               src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+
+               /* default hierarchy doesn't enable controllers by default */
+               dst_root->subsys_mask |= 1 << ssid;
+               if (dst_root != &cgrp_dfl_root)
+                       dst_root->cgrp.child_subsys_mask |= 1 << ssid;
 
                if (ss->bind)
                        ss->bind(css);
@@ -1069,7 +1218,7 @@ static int cgroup_show_options(struct seq_file *seq,
        int ssid;
 
        for_each_subsys(ss, ssid)
-               if (root->cgrp.subsys_mask & (1 << ssid))
+               if (root->subsys_mask & (1 << ssid))
                        seq_printf(seq, ",%s", ss->name);
        if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
                seq_puts(seq, ",sane_behavior");
@@ -1091,8 +1240,8 @@ static int cgroup_show_options(struct seq_file *seq,
 }
 
 struct cgroup_sb_opts {
-       unsigned long subsys_mask;
-       unsigned long flags;
+       unsigned int subsys_mask;
+       unsigned int flags;
        char *release_agent;
        bool cpuset_clone_children;
        char *name;
@@ -1100,24 +1249,16 @@ struct cgroup_sb_opts {
        bool none;
 };
 
-/*
- * Convert a hierarchy specifier into a bitmask of subsystems and
- * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
- * array. This function takes refcounts on subsystems to be used, unless it
- * returns error, in which case no refcounts are taken.
- */
 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
        char *token, *o = data;
        bool all_ss = false, one_ss = false;
-       unsigned long mask = (unsigned long)-1;
+       unsigned int mask = -1U;
        struct cgroup_subsys *ss;
        int i;
 
-       BUG_ON(!mutex_is_locked(&cgroup_mutex));
-
 #ifdef CONFIG_CPUSETS
-       mask = ~(1UL << cpuset_cgrp_id);
+       mask = ~(1U << cpuset_cgrp_id);
 #endif
 
        memset(opts, 0, sizeof(*opts));
@@ -1198,7 +1339,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
                        /* Mutually exclusive option 'all' + subsystem name */
                        if (all_ss)
                                return -EINVAL;
-                       set_bit(i, &opts->subsys_mask);
+                       opts->subsys_mask |= (1 << i);
                        one_ss = true;
 
                        break;
@@ -1210,12 +1351,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
        /* Consistency checks */
 
        if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
-               pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
+               pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
 
                if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
                    opts->cpuset_clone_children || opts->release_agent ||
                    opts->name) {
-                       pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
+                       pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
                        return -EINVAL;
                }
        } else {
@@ -1227,7 +1368,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
                if (all_ss || (!one_ss && !opts->none && !opts->name))
                        for_each_subsys(ss, i)
                                if (!ss->disabled)
-                                       set_bit(i, &opts->subsys_mask);
+                                       opts->subsys_mask |= (1 << i);
 
                /*
                 * We either have to specify by name or by subsystems. (So
@@ -1258,10 +1399,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
        int ret = 0;
        struct cgroup_root *root = cgroup_root_from_kf(kf_root);
        struct cgroup_sb_opts opts;
-       unsigned long added_mask, removed_mask;
+       unsigned int added_mask, removed_mask;
 
        if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
-               pr_err("cgroup: sane_behavior: remount is not allowed\n");
+               pr_err("sane_behavior: remount is not allowed\n");
                return -EINVAL;
        }
 
@@ -1273,17 +1414,17 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
        if (ret)
                goto out_unlock;
 
-       if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent)
-               pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
-                          task_tgid_nr(current), current->comm);
+       if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+               pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+                       task_tgid_nr(current), current->comm);
 
-       added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask;
-       removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask;
+       added_mask = opts.subsys_mask & ~root->subsys_mask;
+       removed_mask = root->subsys_mask & ~opts.subsys_mask;
 
        /* Don't allow flags or name to change at remount */
        if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
            (opts.name && strcmp(opts.name, root->name))) {
-               pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
+               pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
                       opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
                       root->flags & CGRP_ROOT_OPTION_MASK, root->name);
                ret = -EINVAL;
@@ -1369,6 +1510,9 @@ out_unlock:
 
 static void init_cgroup_housekeeping(struct cgroup *cgrp)
 {
+       struct cgroup_subsys *ss;
+       int ssid;
+
        atomic_set(&cgrp->refcnt, 1);
        INIT_LIST_HEAD(&cgrp->sibling);
        INIT_LIST_HEAD(&cgrp->children);
@@ -1377,6 +1521,11 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
        INIT_LIST_HEAD(&cgrp->pidlists);
        mutex_init(&cgrp->pidlist_mutex);
        cgrp->dummy_css.cgroup = cgrp;
+
+       for_each_subsys(ss, ssid)
+               INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
+
+       init_waitqueue_head(&cgrp->offline_waitq);
 }
 
 static void init_cgroup_root(struct cgroup_root *root,
@@ -1399,7 +1548,7 @@ static void init_cgroup_root(struct cgroup_root *root,
                set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
 
-static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
+static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
 {
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
@@ -1409,7 +1558,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
        lockdep_assert_held(&cgroup_tree_mutex);
        lockdep_assert_held(&cgroup_mutex);
 
-       ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
+       ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
        if (ret < 0)
                goto out;
        root_cgrp->id = ret;
@@ -1535,7 +1684,7 @@ retry:
                 * subsystems) then they must match.
                 */
                if ((opts.subsys_mask || opts.none) &&
-                   (opts.subsys_mask != root->cgrp.subsys_mask)) {
+                   (opts.subsys_mask != root->subsys_mask)) {
                        if (!name_match)
                                continue;
                        ret = -EBUSY;
@@ -1544,11 +1693,11 @@ retry:
 
                if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
                        if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
-                               pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
+                               pr_err("sane_behavior: new mount options should match the existing superblock\n");
                                ret = -EINVAL;
                                goto out_unlock;
                        } else {
-                               pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
+                               pr_warn("new mount options do not match the existing superblock, will be ignored\n");
                        }
                }
 
@@ -1737,7 +1886,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
 
 /**
  * cgroup_task_migrate - move a task from one cgroup to another.
- * @old_cgrp; the cgroup @tsk is being migrated from
+ * @old_cgrp: the cgroup @tsk is being migrated from
  * @tsk: the task being migrated
  * @new_cset: the new css_set @tsk is being attached to
  *
@@ -1829,10 +1978,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
 
        src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
 
-       /* nothing to do if this cset already belongs to the cgroup */
-       if (src_cgrp == dst_cgrp)
-               return;
-
        if (!list_empty(&src_cset->mg_preload_node))
                return;
 
@@ -1847,13 +1992,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
 
 /**
  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @dst_cgrp: the destination cgroup
+ * @dst_cgrp: the destination cgroup (may be %NULL)
  * @preloaded_csets: list of preloaded source css_sets
  *
  * Tasks are about to be moved to @dst_cgrp and all the source css_sets
  * have been preloaded to @preloaded_csets.  This function looks up and
- * pins all destination css_sets, links each to its source, and put them on
- * @preloaded_csets.
+ * pins all destination css_sets, links each to its source, and appends them
+ * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each
+ * source css_set is assumed to be its cgroup on the default hierarchy.
  *
  * This function must be called after cgroup_migrate_add_src() has been
  * called on each migration source css_set.  After migration is performed
@@ -1864,19 +2010,42 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
                                      struct list_head *preloaded_csets)
 {
        LIST_HEAD(csets);
-       struct css_set *src_cset;
+       struct css_set *src_cset, *tmp_cset;
 
        lockdep_assert_held(&cgroup_mutex);
 
+       /*
+        * Except for the root, child_subsys_mask must be zero for a cgroup
+        * with tasks so that child cgroups don't compete against tasks.
+        */
+       if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent &&
+           dst_cgrp->child_subsys_mask)
+               return -EBUSY;
+
        /* look up the dst cset for each src cset and link it to src */
-       list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {
+       list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
                struct css_set *dst_cset;
 
-               dst_cset = find_css_set(src_cset, dst_cgrp);
+               dst_cset = find_css_set(src_cset,
+                                       dst_cgrp ?: src_cset->dfl_cgrp);
                if (!dst_cset)
                        goto err;
 
                WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
+
+               /*
+                * If src cset equals dst, it's a noop.  Drop the src.
+                * cgroup_migrate() will skip the cset too.  Note that we
+                * can't handle src == dst as some nodes are used by both.
+                */
+               if (src_cset == dst_cset) {
+                       src_cset->mg_src_cgrp = NULL;
+                       list_del_init(&src_cset->mg_preload_node);
+                       put_css_set(src_cset, false);
+                       put_css_set(dst_cset, false);
+                       continue;
+               }
+
                src_cset->mg_dst_cset = dst_cset;
 
                if (list_empty(&dst_cset->mg_preload_node))
@@ -1885,7 +2054,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
                        put_css_set(dst_cset, false);
        }
 
-       list_splice(&csets, preloaded_csets);
+       list_splice_tail(&csets, preloaded_csets);
        return 0;
 err:
        cgroup_migrate_finish(&csets);
@@ -1966,7 +2135,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
                return 0;
 
        /* check that we can legitimately attach to the cgroup */
-       for_each_css(css, i, cgrp) {
+       for_each_e_css(css, i, cgrp) {
                if (css->ss->can_attach) {
                        ret = css->ss->can_attach(css, &tset);
                        if (ret) {
@@ -1996,7 +2165,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
         */
        tset.csets = &tset.dst_csets;
 
-       for_each_css(css, i, cgrp)
+       for_each_e_css(css, i, cgrp)
                if (css->ss->attach)
                        css->ss->attach(css, &tset);
 
@@ -2004,7 +2173,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
        goto out_release_tset;
 
 out_cancel_attach:
-       for_each_css(css, i, cgrp) {
+       for_each_e_css(css, i, cgrp) {
                if (css == failed_css)
                        break;
                if (css->ss->cancel_attach)
@@ -2063,12 +2232,18 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
  * function to attach either it or all tasks in its threadgroup. Will lock
  * cgroup_mutex and threadgroup.
  */
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+                                   size_t nbytes, loff_t off, bool threadgroup)
 {
        struct task_struct *tsk;
        const struct cred *cred = current_cred(), *tcred;
+       struct cgroup *cgrp = of_css(of)->cgroup;
+       pid_t pid;
        int ret;
 
+       if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+               return -EINVAL;
+
        if (!cgroup_lock_live_group(cgrp))
                return -ENODEV;
 
@@ -2136,7 +2311,7 @@ retry_find_task:
        put_task_struct(tsk);
 out_unlock_cgroup:
        mutex_unlock(&cgroup_mutex);
-       return ret;
+       return ret ?: nbytes;
 }
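
As a usage sketch of the reworked write path (userspace, hypothetical helper; error handling kept minimal), a single PID is written to "cgroup.procs" and the whole thread group is migrated; on success the write returns the number of bytes written:

#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Move the thread group led by @pid into the cgroup at @cgrp_dir. */
static int move_process(const char *cgrp_dir, pid_t pid)
{
        char path[PATH_MAX], buf[16];
        int fd;
        ssize_t ret;

        snprintf(path, sizeof(path), "%s/cgroup.procs", cgrp_dir);
        snprintf(buf, sizeof(buf), "%d", (int)pid);

        fd = open(path, O_WRONLY);
        if (fd < 0)
                return -1;
        ret = write(fd, buf, strlen(buf));
        close(fd);
        return ret < 0 ? -1 : 0;
}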
 
 /**
@@ -2170,43 +2345,43 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 }
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
-static int cgroup_tasks_write(struct cgroup_subsys_state *css,
-                             struct cftype *cft, u64 pid)
+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
+                                 char *buf, size_t nbytes, loff_t off)
 {
-       return attach_task_by_pid(css->cgroup, pid, false);
+       return __cgroup_procs_write(of, buf, nbytes, off, false);
 }
 
-static int cgroup_procs_write(struct cgroup_subsys_state *css,
-                             struct cftype *cft, u64 tgid)
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+                                 char *buf, size_t nbytes, loff_t off)
 {
-       return attach_task_by_pid(css->cgroup, tgid, true);
+       return __cgroup_procs_write(of, buf, nbytes, off, true);
 }
 
-static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
-                                     struct cftype *cft, char *buffer)
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+                                         char *buf, size_t nbytes, loff_t off)
 {
-       struct cgroup_root *root = css->cgroup->root;
+       struct cgroup *cgrp = of_css(of)->cgroup;
+       struct cgroup_root *root = cgrp->root;
 
        BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
-       if (!cgroup_lock_live_group(css->cgroup))
+       if (!cgroup_lock_live_group(cgrp))
                return -ENODEV;
        spin_lock(&release_agent_path_lock);
-       strlcpy(root->release_agent_path, buffer,
+       strlcpy(root->release_agent_path, strstrip(buf),
                sizeof(root->release_agent_path));
        spin_unlock(&release_agent_path_lock);
        mutex_unlock(&cgroup_mutex);
-       return 0;
+       return nbytes;
 }
 
 static int cgroup_release_agent_show(struct seq_file *seq, void *v)
 {
        struct cgroup *cgrp = seq_css(seq)->cgroup;
 
-       if (!cgroup_lock_live_group(cgrp))
-               return -ENODEV;
+       spin_lock(&release_agent_path_lock);
        seq_puts(seq, cgrp->root->release_agent_path);
+       spin_unlock(&release_agent_path_lock);
        seq_putc(seq, '\n');
-       mutex_unlock(&cgroup_mutex);
        return 0;
 }
 
@@ -2218,6 +2393,337 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
        return 0;
 }
 
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+{
+       struct cgroup_subsys *ss;
+       bool printed = false;
+       int ssid;
+
+       for_each_subsys(ss, ssid) {
+               if (ss_mask & (1 << ssid)) {
+                       if (printed)
+                               seq_putc(seq, ' ');
+                       seq_printf(seq, "%s", ss->name);
+                       printed = true;
+               }
+       }
+       if (printed)
+               seq_putc(seq, '\n');
+}
+
+/* show controllers which are currently attached to the default hierarchy */
+static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
+{
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+       cgroup_print_ss_mask(seq, cgrp->root->subsys_mask);
+       return 0;
+}
+
+/* show controllers which are enabled from the parent */
+static int cgroup_controllers_show(struct seq_file *seq, void *v)
+{
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+       cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask);
+       return 0;
+}
+
+/* show controllers which are enabled for a given cgroup's children */
+static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
+{
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+       cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
+       return 0;
+}
+
+/**
+ * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
+ * @cgrp: root of the subtree to update csses for
+ *
+ * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
+ * css associations need to be updated accordingly.  This function looks up
+ * all css_sets which are attached to the subtree, creates the matching
+ * updated css_sets and migrates the tasks to the new ones.
+ */
+static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+{
+       LIST_HEAD(preloaded_csets);
+       struct cgroup_subsys_state *css;
+       struct css_set *src_cset;
+       int ret;
+
+       lockdep_assert_held(&cgroup_tree_mutex);
+       lockdep_assert_held(&cgroup_mutex);
+
+       /* look up all csses currently attached to @cgrp's subtree */
+       down_read(&css_set_rwsem);
+       css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+               struct cgrp_cset_link *link;
+
+               /* self is not affected by child_subsys_mask change */
+               if (css->cgroup == cgrp)
+                       continue;
+
+               list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
+                       cgroup_migrate_add_src(link->cset, cgrp,
+                                              &preloaded_csets);
+       }
+       up_read(&css_set_rwsem);
+
+       /* NULL dst indicates self on default hierarchy */
+       ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+       if (ret)
+               goto out_finish;
+
+       list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+               struct task_struct *last_task = NULL, *task;
+
+               /* src_csets precede dst_csets, break on the first dst_cset */
+               if (!src_cset->mg_src_cgrp)
+                       break;
+
+               /*
+                * All tasks in src_cset need to be migrated to the
+                * matching dst_cset.  Empty it process by process.  We
+                * walk tasks but migrate processes.  The leader might even
+                * belong to a different cset but such src_cset would also
+                * be among the target src_csets because the default
+                * hierarchy enforces per-process membership.
+                */
+               while (true) {
+                       down_read(&css_set_rwsem);
+                       task = list_first_entry_or_null(&src_cset->tasks,
+                                               struct task_struct, cg_list);
+                       if (task) {
+                               task = task->group_leader;
+                               WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
+                               get_task_struct(task);
+                       }
+                       up_read(&css_set_rwsem);
+
+                       if (!task)
+                               break;
+
+                       /* guard against possible infinite loop */
+                       if (WARN(last_task == task,
+                                "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
+                               goto out_finish;
+                       last_task = task;
+
+                       threadgroup_lock(task);
+                       /* raced against de_thread() from another thread? */
+                       if (!thread_group_leader(task)) {
+                               threadgroup_unlock(task);
+                               put_task_struct(task);
+                               continue;
+                       }
+
+                       ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
+
+                       threadgroup_unlock(task);
+                       put_task_struct(task);
+
+                       if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
+                               goto out_finish;
+               }
+       }
+
+out_finish:
+       cgroup_migrate_finish(&preloaded_csets);
+       return ret;
+}
+
+/* change the enabled child controllers for a cgroup in the default hierarchy */
+static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
+                                           char *buf, size_t nbytes,
+                                           loff_t off)
+{
+       unsigned int enable = 0, disable = 0;
+       struct cgroup *cgrp = of_css(of)->cgroup, *child;
+       struct cgroup_subsys *ss;
+       char *tok;
+       int ssid, ret;
+
+       /*
+        * Parse input - space separated list of subsystem names prefixed
+        * with either + or -.
+        */
+       buf = strstrip(buf);
+       while ((tok = strsep(&buf, " "))) {
+               if (tok[0] == '\0')
+                       continue;
+               for_each_subsys(ss, ssid) {
+                       if (ss->disabled || strcmp(tok + 1, ss->name))
+                               continue;
+
+                       if (*tok == '+') {
+                               enable |= 1 << ssid;
+                               disable &= ~(1 << ssid);
+                       } else if (*tok == '-') {
+                               disable |= 1 << ssid;
+                               enable &= ~(1 << ssid);
+                       } else {
+                               return -EINVAL;
+                       }
+                       break;
+               }
+               if (ssid == CGROUP_SUBSYS_COUNT)
+                       return -EINVAL;
+       }
+
+       /*
+        * We're going to grab cgroup_tree_mutex, which nests outside kernfs
+        * active_ref.  cgroup_lock_live_group() already provides enough
+        * protection.  Ensure @cgrp stays accessible and break the
+        * active_ref protection.
+        */
+       cgroup_get(cgrp);
+       kernfs_break_active_protection(of->kn);
+
+       mutex_lock(&cgroup_tree_mutex);
+
+       for_each_subsys(ss, ssid) {
+               if (enable & (1 << ssid)) {
+                       if (cgrp->child_subsys_mask & (1 << ssid)) {
+                               enable &= ~(1 << ssid);
+                               continue;
+                       }
+
+                       /*
+                        * Because css offlining is asynchronous, userland
+                        * might try to re-enable the same controller while
+                        * the previous instance is still around.  In such
+                        * cases, wait till it's gone using offline_waitq.
+                        */
+                       cgroup_for_each_live_child(child, cgrp) {
+                               DEFINE_WAIT(wait);
+
+                               if (!cgroup_css(child, ss))
+                                       continue;
+
+                               cgroup_get(child);
+                               prepare_to_wait(&child->offline_waitq, &wait,
+                                               TASK_UNINTERRUPTIBLE);
+                               mutex_unlock(&cgroup_tree_mutex);
+                               schedule();
+                               finish_wait(&child->offline_waitq, &wait);
+                               cgroup_put(child);
+
+                               ret = restart_syscall();
+                               goto out_unbreak;
+                       }
+
+                       /* unavailable or not enabled on the parent? */
+                       if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+                           (cgrp->parent &&
+                            !(cgrp->parent->child_subsys_mask & (1 << ssid)))) {
+                               ret = -ENOENT;
+                               goto out_unlock_tree;
+                       }
+               } else if (disable & (1 << ssid)) {
+                       if (!(cgrp->child_subsys_mask & (1 << ssid))) {
+                               disable &= ~(1 << ssid);
+                               continue;
+                       }
+
+                       /* a child has it enabled? */
+                       cgroup_for_each_live_child(child, cgrp) {
+                               if (child->child_subsys_mask & (1 << ssid)) {
+                                       ret = -EBUSY;
+                                       goto out_unlock_tree;
+                               }
+                       }
+               }
+       }
+
+       if (!enable && !disable) {
+               ret = 0;
+               goto out_unlock_tree;
+       }
+
+       if (!cgroup_lock_live_group(cgrp)) {
+               ret = -ENODEV;
+               goto out_unlock_tree;
+       }
+
+       /*
+        * Except for the root, child_subsys_mask must be zero for a cgroup
+        * with tasks so that child cgroups don't compete against tasks.
+        */
+       if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) {
+               ret = -EBUSY;
+               goto out_unlock;
+       }
+
+       /*
+        * Create csses for the newly enabled controllers and update
+        * child_subsys_mask.  This changes cgroup_e_css() results, which
+        * in turn makes the subsequent cgroup_update_dfl_csses() associate
+        * all tasks in the subtree with the updated csses.
+        */
+       for_each_subsys(ss, ssid) {
+               if (!(enable & (1 << ssid)))
+                       continue;
+
+               cgroup_for_each_live_child(child, cgrp) {
+                       ret = create_css(child, ss);
+                       if (ret)
+                               goto err_undo_css;
+               }
+       }
+
+       cgrp->child_subsys_mask |= enable;
+       cgrp->child_subsys_mask &= ~disable;
+
+       ret = cgroup_update_dfl_csses(cgrp);
+       if (ret)
+               goto err_undo_css;
+
+       /* all tasks are now migrated away from the old csses, kill them */
+       for_each_subsys(ss, ssid) {
+               if (!(disable & (1 << ssid)))
+                       continue;
+
+               cgroup_for_each_live_child(child, cgrp)
+                       kill_css(cgroup_css(child, ss));
+       }
+
+       kernfs_activate(cgrp->kn);
+       ret = 0;
+out_unlock:
+       mutex_unlock(&cgroup_mutex);
+out_unlock_tree:
+       mutex_unlock(&cgroup_tree_mutex);
+out_unbreak:
+       kernfs_unbreak_active_protection(of->kn);
+       cgroup_put(cgrp);
+       return ret ?: nbytes;
+
+err_undo_css:
+       cgrp->child_subsys_mask &= ~enable;
+       cgrp->child_subsys_mask |= disable;
+
+       for_each_subsys(ss, ssid) {
+               if (!(enable & (1 << ssid)))
+                       continue;
+
+               cgroup_for_each_live_child(child, cgrp) {
+                       struct cgroup_subsys_state *css = cgroup_css(child, ss);
+                       if (css)
+                               kill_css(css);
+               }
+       }
+       goto out_unlock;
+}
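
Usage-wise, enables and disables are parsed as one transaction, so a hedged userspace sketch (controller names and the helper are illustrative) can flip two controllers with a single write; per the checks above, the kernel returns -ENOENT if a controller isn't enabled on the parent and -EBUSY if tasks or enabling children stand in the way:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Enable memory and disable cpu for the children of the cgroup at @dirfd. */
static int toggle_child_controllers(int dirfd)
{
        int fd = openat(dirfd, "cgroup.subtree_control", O_WRONLY);
        ssize_t ret;

        if (fd < 0)
                return -1;
        ret = write(fd, "+memory -cpu", strlen("+memory -cpu"));
        close(fd);
        return ret < 0 ? -1 : 0;
}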
+
+static int cgroup_populated_show(struct seq_file *seq, void *v)
+{
+       seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
+       return 0;
+}
+
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
 {
@@ -2226,6 +2732,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
        struct cgroup_subsys_state *css;
        int ret;
 
+       if (cft->write)
+               return cft->write(of, buf, nbytes, off);
+
        /*
         * kernfs guarantees that a file isn't deleted with operations in
         * flight, which means that the matching css is and stays alive and
@@ -2236,9 +2745,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
        css = cgroup_css(cgrp, cft->ss);
        rcu_read_unlock();
 
-       if (cft->write_string) {
-               ret = cft->write_string(css, cft, strstrip(buf));
-       } else if (cft->write_u64) {
+       if (cft->write_u64) {
                unsigned long long v;
                ret = kstrtoull(buf, 0, &v);
                if (!ret)
@@ -2248,8 +2755,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
                ret = kstrtoll(buf, 0, &v);
                if (!ret)
                        ret = cft->write_s64(css, cft, v);
-       } else if (cft->trigger) {
-               ret = cft->trigger(css, (unsigned int)cft->private);
        } else {
                ret = -EINVAL;
        }
@@ -2377,9 +2882,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
                return PTR_ERR(kn);
 
        ret = cgroup_kn_set_ugid(kn);
-       if (ret)
+       if (ret) {
                kernfs_remove(kn);
-       return ret;
+               return ret;
+       }
+
+       if (cft->seq_show == cgroup_populated_show)
+               cgrp->populated_kn = kn;
+       return 0;
 }
 
 /**
@@ -2415,8 +2925,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
                if (is_add) {
                        ret = cgroup_add_file(cgrp, cft);
                        if (ret) {
-                               pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
-                                       cft->name, ret);
+                               pr_warn("%s: failed to add %s, err=%d\n",
+                                       __func__, cft->name, ret);
                                return ret;
                        }
                } else {
@@ -2436,10 +2946,6 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
 
        lockdep_assert_held(&cgroup_tree_mutex);
 
-       /* don't bother if @ss isn't attached */
-       if (ss->root == &cgrp_dfl_root)
-               return 0;
-
        /* add/rm files for all cgroups created before */
        css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
                struct cgroup *cgrp = css->cgroup;
@@ -2641,10 +3147,19 @@ css_next_child(struct cgroup_subsys_state *pos_css,
                                break;
        }
 
-       if (&next->sibling == &cgrp->children)
-               return NULL;
+       /*
+        * @next, if not pointing to the head, can be dereferenced and is
+        * the next sibling; however, it might have @ss disabled.  If so,
+        * fast-forward to the next enabled one.
+        */
+       while (&next->sibling != &cgrp->children) {
+               struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss);
 
-       return cgroup_css(next, parent_css->ss);
+               if (next_css)
+                       return next_css;
+               next = list_entry_rcu(next->sibling.next, struct cgroup, sibling);
+       }
+       return NULL;
 }
 
 /**
@@ -2781,27 +3296,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
  */
 static void css_advance_task_iter(struct css_task_iter *it)
 {
-       struct list_head *l = it->cset_link;
+       struct list_head *l = it->cset_pos;
        struct cgrp_cset_link *link;
        struct css_set *cset;
 
        /* Advance to the next non-empty css_set */
        do {
                l = l->next;
-               if (l == &it->origin_css->cgroup->cset_links) {
-                       it->cset_link = NULL;
+               if (l == it->cset_head) {
+                       it->cset_pos = NULL;
                        return;
                }
-               link = list_entry(l, struct cgrp_cset_link, cset_link);
-               cset = link->cset;
+
+               if (it->ss) {
+                       cset = container_of(l, struct css_set,
+                                           e_cset_node[it->ss->id]);
+               } else {
+                       link = list_entry(l, struct cgrp_cset_link, cset_link);
+                       cset = link->cset;
+               }
        } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
 
-       it->cset_link = l;
+       it->cset_pos = l;
 
        if (!list_empty(&cset->tasks))
-               it->task = cset->tasks.next;
+               it->task_pos = cset->tasks.next;
        else
-               it->task = cset->mg_tasks.next;
+               it->task_pos = cset->mg_tasks.next;
+
+       it->tasks_head = &cset->tasks;
+       it->mg_tasks_head = &cset->mg_tasks;
 }
 
 /**
@@ -2827,8 +3351,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 
        down_read(&css_set_rwsem);
 
-       it->origin_css = css;
-       it->cset_link = &css->cgroup->cset_links;
+       it->ss = css->ss;
+
+       if (it->ss)
+               it->cset_pos = &css->cgroup->e_csets[css->ss->id];
+       else
+               it->cset_pos = &css->cgroup->cset_links;
+
+       it->cset_head = it->cset_pos;
 
        css_advance_task_iter(it);
 }
@@ -2844,12 +3374,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 struct task_struct *css_task_iter_next(struct css_task_iter *it)
 {
        struct task_struct *res;
-       struct list_head *l = it->task;
-       struct cgrp_cset_link *link = list_entry(it->cset_link,
-                                       struct cgrp_cset_link, cset_link);
+       struct list_head *l = it->task_pos;
 
        /* If the iterator cg is NULL, we have no tasks */
-       if (!it->cset_link)
+       if (!it->cset_pos)
                return NULL;
        res = list_entry(l, struct task_struct, cg_list);
 
@@ -2860,13 +3388,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
         */
        l = l->next;
 
-       if (l == &link->cset->tasks)
-               l = link->cset->mg_tasks.next;
+       if (l == it->tasks_head)
+               l = it->mg_tasks_head->next;
 
-       if (l == &link->cset->mg_tasks)
+       if (l == it->mg_tasks_head)
                css_advance_task_iter(it);
        else
-               it->task = l;
+               it->task_pos = l;
 
        return res;
 }
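
For reference, a minimal sketch of the iterator pair in action (the counting helper is hypothetical); css_set_rwsem, read-acquired in css_task_iter_start(), is dropped by css_task_iter_end():

static int example_count_css_tasks(struct cgroup_subsys_state *css)
{
        struct css_task_iter it;
        struct task_struct *task;
        int count = 0;

        css_task_iter_start(css, &it);
        while ((task = css_task_iter_next(&it)))
                count++;
        css_task_iter_end(&it);

        return count;
}
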
@@ -3246,7 +3774,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
 
        /*
         * We aren't being called from kernfs and there's no guarantee on
-        * @kn->priv's validity.  For this and css_tryget_from_dir(),
+        * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
         * @kn->priv is RCU safe.  Let's do the RCU dancing.
         */
        rcu_read_lock();
@@ -3388,17 +3916,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
        return seq_printf(s, "%d\n", *(int *)v);
 }
 
-/*
- * seq_operations functions for iterating on pidlists through seq_file -
- * independent of whether it's tasks or procs
- */
-static const struct seq_operations cgroup_pidlist_seq_operations = {
-       .start = cgroup_pidlist_start,
-       .stop = cgroup_pidlist_stop,
-       .next = cgroup_pidlist_next,
-       .show = cgroup_pidlist_show,
-};
-
 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
                                         struct cftype *cft)
 {
@@ -3440,7 +3957,7 @@ static struct cftype cgroup_base_files[] = {
                .seq_stop = cgroup_pidlist_stop,
                .seq_show = cgroup_pidlist_show,
                .private = CGROUP_FILE_PROCS,
-               .write_u64 = cgroup_procs_write,
+               .write = cgroup_procs_write,
                .mode = S_IRUGO | S_IWUSR,
        },
        {
@@ -3454,6 +3971,27 @@ static struct cftype cgroup_base_files[] = {
                .flags = CFTYPE_ONLY_ON_ROOT,
                .seq_show = cgroup_sane_behavior_show,
        },
+       {
+               .name = "cgroup.controllers",
+               .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
+               .seq_show = cgroup_root_controllers_show,
+       },
+       {
+               .name = "cgroup.controllers",
+               .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+               .seq_show = cgroup_controllers_show,
+       },
+       {
+               .name = "cgroup.subtree_control",
+               .flags = CFTYPE_ONLY_ON_DFL,
+               .seq_show = cgroup_subtree_control_show,
+               .write = cgroup_subtree_control_write,
+       },
+       {
+               .name = "cgroup.populated",
+               .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+               .seq_show = cgroup_populated_show,
+       },
 
        /*
         * Historical crazy stuff.  These don't have "cgroup."  prefix and
@@ -3468,7 +4006,7 @@ static struct cftype cgroup_base_files[] = {
                .seq_stop = cgroup_pidlist_stop,
                .seq_show = cgroup_pidlist_show,
                .private = CGROUP_FILE_TASKS,
-               .write_u64 = cgroup_tasks_write,
+               .write = cgroup_tasks_write,
                .mode = S_IRUGO | S_IWUSR,
        },
        {
@@ -3481,7 +4019,7 @@ static struct cftype cgroup_base_files[] = {
                .name = "release_agent",
                .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
                .seq_show = cgroup_release_agent_show,
-               .write_string = cgroup_release_agent_write,
+               .write = cgroup_release_agent_write,
                .max_write_len = PATH_MAX - 1,
        },
        { }     /* terminate */
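
The .write conversions in this table replace the typed ->write_u64/->write_string callbacks with the single ->write method, which receives the raw buffer.  A minimal controller-side table in the same shape might look like the sketch below (all example_* names are hypothetical):

static int example_seq_show(struct seq_file *sf, void *v)
{
	seq_printf(sf, "%d\n", 0);	/* placeholder value */
	return 0;
}

static ssize_t example_write(struct kernfs_open_file *of, char *buf,
			     size_t nbytes, loff_t off)
{
	/* parse @buf here; return @nbytes to report full consumption */
	return nbytes;
}

static struct cftype example_files[] = {
	{
		.name = "example.value",
		.seq_show = example_seq_show,
		.write = example_write,
	},
	{ }	/* terminate */
};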
@@ -3494,7 +4032,7 @@ static struct cftype cgroup_base_files[] = {
  *
  * On failure, no file is added.
  */
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
 {
        struct cgroup_subsys *ss;
        int i, ret = 0;
@@ -3503,7 +4041,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
        for_each_subsys(ss, i) {
                struct cftype *cfts;
 
-               if (!test_bit(i, &subsys_mask))
+               if (!(subsys_mask & (1 << i)))
                        continue;
 
                list_for_each_entry(cfts, &ss->cfts, node) {
@@ -3525,9 +4063,9 @@ err:
  *    Implemented in kill_css().
  *
  * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
- *    and thus css_tryget() is guaranteed to fail, the css can be offlined
- *    by invoking offline_css().  After offlining, the base ref is put.
- *    Implemented in css_killed_work_fn().
+ *    and thus css_tryget_online() is guaranteed to fail, the css can be
+ *    offlined by invoking offline_css().  After offlining, the base ref is
+ *    put.  Implemented in css_killed_work_fn().
  *
  * 3. When the percpu_ref reaches zero, the only possible remaining
  *    accessors are inside RCU read sections.  css_release() schedules the
@@ -3566,22 +4104,28 @@ static void css_release(struct percpu_ref *ref)
 {
        struct cgroup_subsys_state *css =
                container_of(ref, struct cgroup_subsys_state, refcnt);
+       struct cgroup_subsys *ss = css->ss;
+
+       cgroup_idr_remove(&ss->css_idr, css->id);
 
-       RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);
        call_rcu(&css->rcu_head, css_free_rcu_fn);
 }
 
-static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
-                    struct cgroup *cgrp)
+static void init_and_link_css(struct cgroup_subsys_state *css,
+                             struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
+       cgroup_get(cgrp);
+
        css->cgroup = cgrp;
        css->ss = ss;
        css->flags = 0;
 
-       if (cgrp->parent)
+       if (cgrp->parent) {
                css->parent = cgroup_css(cgrp->parent, ss);
-       else
+               css_get(css->parent);
+       } else {
                css->flags |= CSS_ROOT;
+       }
 
        BUG_ON(cgroup_css(cgrp, ss));
 }
@@ -3621,7 +4165,9 @@ static void offline_css(struct cgroup_subsys_state *css)
 
        css->flags &= ~CSS_ONLINE;
        css->cgroup->nr_css--;
-       RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
+       RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
+
+       wake_up_all(&css->cgroup->offline_waitq);
 }
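
The wake_up_all() added to offline_css() pairs with code elsewhere that must sleep until a css has finished going offline (presumably the subtree_control write path, which can disable controllers).  The generic waitqueue pairing, with a hypothetical offline_condition():

	/* waiter side (sketch only) */
	wait_event(cgrp->offline_waitq, offline_condition(cgrp));

	/* waker side, as added in offline_css() above */
	wake_up_all(&cgrp->offline_waitq);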
 
 /**
@@ -3645,31 +4191,34 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
        if (IS_ERR(css))
                return PTR_ERR(css);
 
+       init_and_link_css(css, ss, cgrp);
+
        err = percpu_ref_init(&css->refcnt, css_release);
        if (err)
                goto err_free_css;
 
-       init_css(css, ss, cgrp);
+       err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+       if (err < 0)
+               goto err_free_percpu_ref;
+       css->id = err;
 
        err = cgroup_populate_dir(cgrp, 1 << ss->id);
        if (err)
-               goto err_free_percpu_ref;
+               goto err_free_id;
+
+       /* @css is ready to be brought online now, make it visible */
+       cgroup_idr_replace(&ss->css_idr, css, css->id);
 
        err = online_css(css);
        if (err)
                goto err_clear_dir;
 
-       cgroup_get(cgrp);
-       css_get(css->parent);
-
-       cgrp->subsys_mask |= 1 << ss->id;
-
        if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
            parent->parent) {
-               pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
-                          current->comm, current->pid, ss->name);
+               pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
+                       current->comm, current->pid, ss->name);
                if (!strcmp(ss->name, "memory"))
-                       pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
+                       pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
                ss->warned_broken_hierarchy = true;
        }
 
@@ -3677,10 +4226,12 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
 
 err_clear_dir:
        cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+err_free_id:
+       cgroup_idr_remove(&ss->css_idr, css->id);
 err_free_percpu_ref:
        percpu_ref_cancel_init(&css->refcnt);
 err_free_css:
-       ss->css_free(css);
+       call_rcu(&css->rcu_head, css_free_rcu_fn);
        return err;
 }
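
create_css() now uses the same reserve-then-publish ID scheme as cgroup_create() below: the idr slot is allocated with a NULL pointer so a concurrent lookup can't observe a half-initialized css, and the real pointer is published only once setup is complete.  Reduced to the bare idr pattern (example_idr and @obj are hypothetical):

	int id;

	/* reserve an ID; idr_find(&example_idr, id) returns NULL for now */
	id = idr_alloc(&example_idr, NULL, 1, 0, GFP_KERNEL);
	if (id < 0)
		return id;
	obj->id = id;

	/* ... finish initializing @obj ... */

	/* publish: lookups of @id now return @obj */
	idr_replace(&example_idr, obj, id);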
 
@@ -3695,22 +4246,10 @@ static long cgroup_create(struct cgroup *parent, const char *name,
 {
        struct cgroup *cgrp;
        struct cgroup_root *root = parent->root;
-       int ssid, err;
+       int ssid, ret;
        struct cgroup_subsys *ss;
        struct kernfs_node *kn;
 
-       /*
-        * XXX: The default hierarchy isn't fully implemented yet.  Block
-        * !root cgroup creation on it for now.
-        */
-       if (root == &cgrp_dfl_root)
-               return -EINVAL;
-
-       /* allocate the cgroup and its ID, 0 is reserved for the root */
-       cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
-       if (!cgrp)
-               return -ENOMEM;
-
        mutex_lock(&cgroup_tree_mutex);
 
        /*
@@ -3721,25 +4260,32 @@ static long cgroup_create(struct cgroup *parent, const char *name,
         * don't get nasty surprises if we ever grow another caller.
         */
        if (!cgroup_lock_live_group(parent)) {
-               err = -ENODEV;
-               goto err_unlock_tree;
+               ret = -ENODEV;
+               goto out_unlock_tree;
+       }
+
+       /* allocate the cgroup and its ID, 0 is reserved for the root */
+       cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
+       if (!cgrp) {
+               ret = -ENOMEM;
+               goto out_unlock;
        }
 
        /*
         * Temporarily set the pointer to NULL, so idr_find() won't return
         * a half-baked cgroup.
         */
-       cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
+       cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
        if (cgrp->id < 0) {
-               err = -ENOMEM;
-               goto err_unlock;
+               ret = -ENOMEM;
+               goto out_free_cgrp;
        }
 
        init_cgroup_housekeeping(cgrp);
 
        cgrp->parent = parent;
        cgrp->dummy_css.parent = &parent->dummy_css;
-       cgrp->root = parent->root;
+       cgrp->root = root;
 
        if (notify_on_release(parent))
                set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -3750,8 +4296,8 @@ static long cgroup_create(struct cgroup *parent, const char *name,
        /* create the directory */
        kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
        if (IS_ERR(kn)) {
-               err = PTR_ERR(kn);
-               goto err_free_id;
+               ret = PTR_ERR(kn);
+               goto out_free_id;
        }
        cgrp->kn = kn;
 
@@ -3772,46 +4318,50 @@ static long cgroup_create(struct cgroup *parent, const char *name,
         * @cgrp is now fully operational.  If something fails after this
         * point, it'll be released via the normal destruction path.
         */
-       idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
+       cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
 
-       err = cgroup_kn_set_ugid(kn);
-       if (err)
-               goto err_destroy;
+       ret = cgroup_kn_set_ugid(kn);
+       if (ret)
+               goto out_destroy;
 
-       err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
-       if (err)
-               goto err_destroy;
+       ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
+       if (ret)
+               goto out_destroy;
 
        /* let's create and online css's */
        for_each_subsys(ss, ssid) {
-               if (root->cgrp.subsys_mask & (1 << ssid)) {
-                       err = create_css(cgrp, ss);
-                       if (err)
-                               goto err_destroy;
+               if (parent->child_subsys_mask & (1 << ssid)) {
+                       ret = create_css(cgrp, ss);
+                       if (ret)
+                               goto out_destroy;
                }
        }
 
-       kernfs_activate(kn);
+       /*
+        * On the default hierarchy, a child doesn't automatically inherit
+        * child_subsys_mask from the parent.  Each is configured manually.
+        */
+       if (!cgroup_on_dfl(cgrp))
+               cgrp->child_subsys_mask = parent->child_subsys_mask;
 
-       mutex_unlock(&cgroup_mutex);
-       mutex_unlock(&cgroup_tree_mutex);
+       kernfs_activate(kn);
 
-       return 0;
+       ret = 0;
+       goto out_unlock;
 
-err_free_id:
-       idr_remove(&root->cgroup_idr, cgrp->id);
-err_unlock:
+out_free_id:
+       cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_free_cgrp:
+       kfree(cgrp);
+out_unlock:
        mutex_unlock(&cgroup_mutex);
-err_unlock_tree:
+out_unlock_tree:
        mutex_unlock(&cgroup_tree_mutex);
-       kfree(cgrp);
-       return err;
+       return ret;
 
-err_destroy:
+out_destroy:
        cgroup_destroy_locked(cgrp);
-       mutex_unlock(&cgroup_mutex);
-       mutex_unlock(&cgroup_tree_mutex);
-       return err;
+       goto out_unlock;
 }
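
The reorganization gives cgroup_create() a single exit ladder: success sets ret and falls into the same out_unlock path that every failure takes, and the destroy path jumps back into it.  Stripped of the cgroup specifics, the idiom looks like this (hypothetical example_create()):

static int example_create(void)
{
	int ret;

	mutex_lock(&example_mutex);

	ret = example_prepare();
	if (ret)
		goto out_unlock;

	ret = example_commit();
	if (ret)
		goto out_destroy;

	ret = 0;
out_unlock:
	mutex_unlock(&example_mutex);
	return ret;

out_destroy:
	example_undo();
	goto out_unlock;
}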
 
 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
@@ -3838,7 +4388,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 
 /*
  * This is called when the refcnt of a css is confirmed to be killed.
- * css_tryget() is now guaranteed to fail.
+ * css_tryget_online() is now guaranteed to fail.
  */
 static void css_killed_work_fn(struct work_struct *work)
 {
@@ -3850,8 +4400,8 @@ static void css_killed_work_fn(struct work_struct *work)
        mutex_lock(&cgroup_mutex);
 
        /*
-        * css_tryget() is guaranteed to fail now.  Tell subsystems to
-        * initate destruction.
+        * css_tryget_online() is guaranteed to fail now.  Tell subsystems
+        * to initiate destruction.
         */
        offline_css(css);
 
@@ -3886,7 +4436,16 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
        queue_work(cgroup_destroy_wq, &css->destroy_work);
 }
 
-static void __kill_css(struct cgroup_subsys_state *css)
+/**
+ * kill_css - destroy a css
+ * @css: css to destroy
+ *
+ * This function initiates destruction of @css by removing cgroup interface
+ * files and putting its base reference.  ->css_offline() will be invoked
+ * asynchronously once css_tryget_online() is guaranteed to fail and when
+ * the reference count reaches zero, @css will be released.
+ */
+static void kill_css(struct cgroup_subsys_state *css)
 {
        lockdep_assert_held(&cgroup_tree_mutex);
 
@@ -3905,7 +4464,7 @@ static void __kill_css(struct cgroup_subsys_state *css)
        /*
         * cgroup core guarantees that, by the time ->css_offline() is
         * invoked, no new css reference will be given out via
-        * css_tryget().  We can't simply call percpu_ref_kill() and
+        * css_tryget_online().  We can't simply call percpu_ref_kill() and
         * proceed to offlining css's because percpu_ref_kill() doesn't
         * guarantee that the ref is seen as killed on all CPUs on return.
         *
@@ -3915,37 +4474,15 @@ static void __kill_css(struct cgroup_subsys_state *css)
        percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
 }
 
-/**
- * kill_css - destroy a css
- * @css: css to destroy
- *
- * This function initiates destruction of @css by removing cgroup interface
- * files and putting its base reference.  ->css_offline() will be invoked
- * asynchronously once css_tryget() is guaranteed to fail and when the
- * reference count reaches zero, @css will be released.
- */
-static void kill_css(struct cgroup_subsys_state *css)
-{
-       struct cgroup *cgrp = css->cgroup;
-
-       lockdep_assert_held(&cgroup_tree_mutex);
-
-       /* if already killed, noop */
-       if (cgrp->subsys_mask & (1 << css->ss->id)) {
-               cgrp->subsys_mask &= ~(1 << css->ss->id);
-               __kill_css(css);
-       }
-}
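
The kill above leans on percpu_ref_kill_and_confirm(), whose confirmation callback runs only after the killed state is visible on all CPUs (an RCU grace period later), at which point no new tryget can succeed; note that the kill itself also drops the refcount's initial reference.  A generic sketch of the pattern (struct example and its members are hypothetical):

struct example {
	struct percpu_ref refcnt;
	struct work_struct destroy_work;
};

static void example_confirm_kill(struct percpu_ref *ref)
{
	struct example *ex = container_of(ref, struct example, refcnt);

	/* new trygets on @ref are now guaranteed to fail */
	schedule_work(&ex->destroy_work);	/* finish teardown in process context */
}

	/* drops the initial ref and later invokes example_confirm_kill() */
	percpu_ref_kill_and_confirm(&ex->refcnt, example_confirm_kill);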
-
 /**
  * cgroup_destroy_locked - the first stage of cgroup destruction
  * @cgrp: cgroup to be destroyed
  *
  * css's make use of percpu refcnts whose killing latency shouldn't be
  * exposed to userland and are RCU protected.  Also, cgroup core needs to
- * guarantee that css_tryget() won't succeed by the time ->css_offline() is
- * invoked.  To satisfy all the requirements, destruction is implemented in
- * the following two steps.
+ * guarantee that css_tryget_online() won't succeed by the time
+ * ->css_offline() is invoked.  To satisfy all the requirements,
+ * destruction is implemented in the following two steps.
  *
  * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
  *     userland visible parts and start killing the percpu refcnts of
@@ -4039,9 +4576,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
        /*
         * There are two control paths which try to determine cgroup from
         * dentry without going through kernfs - cgroupstats_build() and
-        * css_tryget_from_dir().  Those are supported by RCU protecting
-        * clearing of cgrp->kn->priv backpointer, which should happen
-        * after all files under it have been removed.
+        * css_tryget_online_from_dir().  Those are supported by RCU
+        * protecting clearing of cgrp->kn->priv backpointer, which should
+        * happen after all files under it have been removed.
         */
        kernfs_remove(cgrp->kn);        /* @cgrp has an extra ref on its kn */
        RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
@@ -4053,7 +4590,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 
 /**
  * cgroup_destroy_css_killed - the second step of cgroup destruction
- * @work: cgroup->destroy_free_work
+ * @cgrp: the cgroup whose csses have just finished offlining
  *
  * This function is invoked from a work item for a cgroup which is being
  * destroyed after all css's are offlined and performs the rest of
@@ -4116,7 +4653,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
        .rename                 = cgroup_rename,
 };
 
-static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
+static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 {
        struct cgroup_subsys_state *css;
 
@@ -4125,6 +4662,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
        mutex_lock(&cgroup_tree_mutex);
        mutex_lock(&cgroup_mutex);
 
+       idr_init(&ss->css_idr);
        INIT_LIST_HEAD(&ss->cfts);
 
        /* Create the root cgroup state for this subsystem */
@@ -4132,7 +4670,14 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
        css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
        /* We don't handle early failures gracefully */
        BUG_ON(IS_ERR(css));
-       init_css(css, ss, &cgrp_dfl_root.cgrp);
+       init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
+       if (early) {
+               /* idr_alloc() can't be called safely during early init */
+               css->id = 1;
+       } else {
+               css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
+               BUG_ON(css->id < 0);
+       }
 
        /* Update the init_css_set to contain a subsys
         * pointer to this state - since the subsystem is
@@ -4149,7 +4694,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 
        BUG_ON(online_css(css));
 
-       cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
+       cgrp_dfl_root.subsys_mask |= 1 << ss->id;
 
        mutex_unlock(&cgroup_mutex);
        mutex_unlock(&cgroup_tree_mutex);
@@ -4183,7 +4728,7 @@ int __init cgroup_init_early(void)
                ss->name = cgroup_subsys_name[i];
 
                if (ss->early_init)
-                       cgroup_init_subsys(ss);
+                       cgroup_init_subsys(ss, true);
        }
        return 0;
 }
@@ -4215,8 +4760,19 @@ int __init cgroup_init(void)
        mutex_unlock(&cgroup_tree_mutex);
 
        for_each_subsys(ss, ssid) {
-               if (!ss->early_init)
-                       cgroup_init_subsys(ss);
+               if (ss->early_init) {
+                       struct cgroup_subsys_state *css =
+                               init_css_set.subsys[ss->id];
+
+                       css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
+                                                  GFP_KERNEL);
+                       BUG_ON(css->id < 0);
+               } else {
+                       cgroup_init_subsys(ss, false);
+               }
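
A note on the cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL) call above: idr_alloc() allocates from the half-open range [start, end), so [1, 2) can only ever yield ID 1, which is exactly the ID that cgroup_init_subsys() hand-assigned during early init.  The call merely records the already-chosen ID in the idr:

	/* [1, 2) means "exactly 1": returns 1, or -ENOSPC if already taken */
	id = idr_alloc(&idr, ptr, 1, 2, GFP_KERNEL);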
+
+               list_add_tail(&init_css_set.e_cset_node[ssid],
+                             &cgrp_dfl_root.cgrp.e_csets[ssid]);
 
                /*
                 * cftype registration needs kmalloc and can't be done
@@ -4306,7 +4862,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 
                seq_printf(m, "%d:", root->hierarchy_id);
                for_each_subsys(ss, ssid)
-                       if (root->cgrp.subsys_mask & (1 << ssid))
+                       if (root->subsys_mask & (1 << ssid))
                                seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
                if (strlen(root->name))
                        seq_printf(m, "%sname=%s", count ? "," : "",
@@ -4619,7 +5175,7 @@ static int __init cgroup_disable(char *str)
 __setup("cgroup_disable=", cgroup_disable);
 
 /**
- * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
+ * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
  * @dentry: directory dentry of interest
  * @ss: subsystem of interest
  *
@@ -4627,8 +5183,8 @@ __setup("cgroup_disable=", cgroup_disable);
  * to get the corresponding css and return it.  If such css doesn't exist
  * or can't be pinned, an ERR_PTR value is returned.
  */
-struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
-                                               struct cgroup_subsys *ss)
+struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
+                                                      struct cgroup_subsys *ss)
 {
        struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
        struct cgroup_subsys_state *css = NULL;
@@ -4650,7 +5206,7 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
        if (cgrp)
                css = cgroup_css(cgrp, ss);
 
-       if (!css || !css_tryget(css))
+       if (!css || !css_tryget_online(css))
                css = ERR_PTR(-ENOENT);
 
        rcu_read_unlock();
@@ -4667,14 +5223,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
  */
 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
 {
-       struct cgroup *cgrp;
-
-       cgroup_assert_mutexes_or_rcu_locked();
-
-       cgrp = idr_find(&ss->root->cgroup_idr, id);
-       if (cgrp)
-               return cgroup_css(cgrp, ss);
-       return NULL;
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       return idr_find(&ss->css_idr, id);
 }
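
css_from_id() now resolves the ID directly through the per-subsystem css_idr and is only meaningful under rcu_read_lock(); a caller that wants to use the css after the read-side section must also pin it.  A hypothetical caller sketch:

	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, ss);
	/* pin before leaving the RCU section; fails if @css is going away */
	if (css && !css_tryget_online(css))
		css = NULL;
	rcu_read_unlock();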
 
 #ifdef CONFIG_CGROUP_DEBUG