cgroup: make hierarchy iterators deal with cgroup_subsys_state instead of cgroup

[deliverable/linux.git] / kernel / cgroup.c
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9b16d75bec633eadd2a9d7f754c3e568b998bdf6..91eac33fac86f7d2a78332bdca6c130ce555d805 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -81,7 +81,7 @@
   */
  #ifdef CONFIG_PROVE_RCU
  DEFINE_MUTEX(cgroup_mutex);
-EXPORT_SYMBOL_GPL(cgroup_mutex);       /* only for task_subsys_state_check() */
+EXPORT_SYMBOL_GPL(cgroup_mutex);       /* only for lockdep */
  #else
  static DEFINE_MUTEX(cgroup_mutex);
  #endif
@@ -215,10 +215,12 @@ static u64 cgroup_serial_nr_next = 1;
   */
  static int need_forkexit_callback __read_mostly;
  
+static struct cftype cgroup_base_files[];
+
  static void cgroup_offline_fn(struct work_struct *work);
  static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-                             struct cftype cfts[], bool is_add);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+                             bool is_add);
  
  /* convenient tests for these bits */
  static inline bool cgroup_is_dead(const struct cgroup *cgrp)
@@ -464,7 +466,7 @@ static inline void put_css_set_taskexit(struct css_set *cset)
   * @new_cgrp: cgroup that's being entered by the task
   * @template: desired set of css pointers in css_set (pre-calculated)
   *
- * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
   * which "new_cgrp" belongs to, for which it should match "new_cgrp".
   */
  static bool compare_css_sets(struct css_set *cset,
@@ -804,8 +806,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
  static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
-static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
-                              unsigned long subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
  static const struct inode_operations cgroup_dir_inode_operations;
  static const struct file_operations proc_cgroupstats_operations;
  
@@ -852,8 +853,11 @@ static void cgroup_free_fn(struct work_struct *work)
         /*
          * Release the subsystem state objects.
          */
-       for_each_root_subsys(cgrp->root, ss)
-               ss->css_free(cgrp);
+       for_each_root_subsys(cgrp->root, ss) {
+               struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+               ss->css_free(css);
+       }
  
         cgrp->root->number_of_cgroups--;
         mutex_unlock(&cgroup_mutex);
@@ -865,8 +869,6 @@ static void cgroup_free_fn(struct work_struct *work)
          */
         dput(cgrp->parent->dentry);
  
-       ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
-
         /*
          * Drop the active superblock reference that we took when we
          * created the cgroup. This will free cgrp->root, if we are
@@ -957,26 +959,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
  }
  
  /**
- * cgroup_clear_dir - selective removal of base and subsystem files
+ * cgroup_clear_dir - remove subsys files in a cgroup directory
   * @cgrp: target cgroup
- * @base_files: true if the base files should be removed
   * @subsys_mask: mask of the subsystem ids whose files should be removed
   */
-static void cgroup_clear_dir(struct cgroup *cgrp, bool base_files,
-                            unsigned long subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
  {
         struct cgroup_subsys *ss;
+       int i;
  
-       for_each_root_subsys(cgrp->root, ss) {
+       for_each_subsys(ss, i) {
                 struct cftype_set *set;
-               if (!test_bit(ss->subsys_id, &subsys_mask))
+
+               if (!test_bit(i, &subsys_mask))
                         continue;
                 list_for_each_entry(set, &ss->cftsets, node)
-                       cgroup_addrm_files(cgrp, NULL, set->cfts, false);
-       }
-       if (base_files) {
-               while (!list_empty(&cgrp->files))
-                       cgroup_rm_file(cgrp, NULL);
+                       cgroup_addrm_files(cgrp, set->cfts, false);
         }
  }
  
@@ -1006,32 +1004,47 @@ static int rebind_subsystems(struct cgroupfs_root *root,
  {
         struct cgroup *cgrp = &root->top_cgroup;
         struct cgroup_subsys *ss;
-       int i;
+       unsigned long pinned = 0;
+       int i, ret;
  
         BUG_ON(!mutex_is_locked(&cgroup_mutex));
         BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
  
         /* Check that any added subsystems are currently free */
         for_each_subsys(ss, i) {
-               unsigned long bit = 1UL << i;
-
-               if (!(bit & added_mask))
+               if (!(added_mask & (1 << i)))
                         continue;
  
+               /* is the subsystem mounted elsewhere? */
                 if (ss->root != &cgroup_dummy_root) {
-                       /* Subsystem isn't free */
-                       return -EBUSY;
+                       ret = -EBUSY;
+                       goto out_put;
+               }
+
+               /* pin the module */
+               if (!try_module_get(ss->module)) {
+                       ret = -ENOENT;
+                       goto out_put;
                 }
+               pinned |= 1 << i;
         }
  
-       /* Currently we don't handle adding/removing subsystems when
-        * any child cgroups exist. This is theoretically supportable
-        * but involves complex error handling, so it's being left until
-        * later */
-       if (root->number_of_cgroups > 1)
-               return -EBUSY;
+       /* subsys could be missing if unloaded between parsing and here */
+       if (added_mask != pinned) {
+               ret = -ENOENT;
+               goto out_put;
+       }
+
+       ret = cgroup_populate_dir(cgrp, added_mask);
+       if (ret)
+               goto out_put;
+
+       /*
+        * Nothing can fail from this point on.  Remove files for the
+        * removed subsystems and rebind each subsystem.
+        */
+       cgroup_clear_dir(cgrp, removed_mask);
  
-       /* Process each subsystem */
         for_each_subsys(ss, i) {
                 unsigned long bit = 1UL << i;
  
@@ -1046,7 +1059,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                         list_move(&ss->sibling, &root->subsys_list);
                         ss->root = root;
                         if (ss->bind)
-                               ss->bind(cgrp);
+                               ss->bind(cgrp->subsys[i]);
  
                         /* refcount was already taken, and we're keeping it */
                         root->subsys_mask |= bit;
@@ -1056,7 +1069,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                         BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
  
                         if (ss->bind)
-                               ss->bind(cgroup_dummy_top);
+                               ss->bind(cgroup_dummy_top->subsys[i]);
                         cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
                         cgrp->subsys[i] = NULL;
                         cgroup_subsys[i]->root = &cgroup_dummy_root;
@@ -1065,20 +1078,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                         /* subsystem is now free - drop reference on module */
                         module_put(ss->module);
                         root->subsys_mask &= ~bit;
-               } else if (bit & root->subsys_mask) {
-                       /* Subsystem state should already exist */
-                       BUG_ON(!cgrp->subsys[i]);
-                       /*
-                        * a refcount was taken, but we already had one, so
-                        * drop the extra reference.
-                        */
-                       module_put(ss->module);
-#ifdef CONFIG_MODULE_UNLOAD
-                       BUG_ON(ss->module && !module_refcount(ss->module));
-#endif
-               } else {
-                       /* Subsystem state shouldn't exist */
-                       BUG_ON(cgrp->subsys[i]);
                 }
         }
  
@@ -1089,6 +1088,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
         root->flags |= CGRP_ROOT_SUBSYS_BOUND;
  
         return 0;
+
+out_put:
+       for_each_subsys(ss, i)
+               if (pinned & (1UL << i))
+                       module_put(ss->module);
+       return ret;
  }
  
  static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
@@ -1139,7 +1144,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
         char *token, *o = data;
         bool all_ss = false, one_ss = false;
         unsigned long mask = (unsigned long)-1;
-       bool module_pin_failed = false;
         struct cgroup_subsys *ss;
         int i;
  
@@ -1282,52 +1286,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
         if (!opts->subsys_mask && !opts->name)
                 return -EINVAL;
  
-       /*
-        * Grab references on all the modules we'll need, so the subsystems
-        * don't dance around before rebind_subsystems attaches them. This may
-        * take duplicate reference counts on a subsystem that's already used,
-        * but rebind_subsystems handles this case.
-        */
-       for_each_subsys(ss, i) {
-               if (!(opts->subsys_mask & (1UL << i)))
-                       continue;
-               if (!try_module_get(cgroup_subsys[i]->module)) {
-                       module_pin_failed = true;
-                       break;
-               }
-       }
-       if (module_pin_failed) {
-               /*
-                * oops, one of the modules was going away. this means that we
-                * raced with a module_delete call, and to the user this is
-                * essentially a "subsystem doesn't exist" case.
-                */
-               for (i--; i >= 0; i--) {
-                       /* drop refcounts only on the ones we took */
-                       unsigned long bit = 1UL << i;
-
-                       if (!(bit & opts->subsys_mask))
-                               continue;
-                       module_put(cgroup_subsys[i]->module);
-               }
-               return -ENOENT;
-       }
-
         return 0;
  }
  
-static void drop_parsed_module_refcounts(unsigned long subsys_mask)
-{
-       struct cgroup_subsys *ss;
-       int i;
-
-       mutex_lock(&cgroup_mutex);
-       for_each_subsys(ss, i)
-               if (subsys_mask & (1UL << i))
-                       module_put(cgroup_subsys[i]->module);
-       mutex_unlock(&cgroup_mutex);
-}
-
  static int cgroup_remount(struct super_block *sb, int *flags, char *data)
  {
         int ret = 0;
@@ -1367,22 +1328,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
                 goto out_unlock;
         }
  
-       /*
-        * Clear out the files of subsystems that should be removed, do
-        * this before rebind_subsystems, since rebind_subsystems may
-        * change this hierarchy's subsys_list.
-        */
-       cgroup_clear_dir(cgrp, false, removed_mask);
-
-       ret = rebind_subsystems(root, added_mask, removed_mask);
-       if (ret) {
-               /* rebind_subsystems failed, re-populate the removed files */
-               cgroup_populate_dir(cgrp, false, removed_mask);
+       /* remounting is not allowed for populated hierarchies */
+       if (root->number_of_cgroups > 1) {
+               ret = -EBUSY;
                 goto out_unlock;
         }
  
-       /* re-populate subsystem files */
-       cgroup_populate_dir(cgrp, false, added_mask);
+       ret = rebind_subsystems(root, added_mask, removed_mask);
+       if (ret)
+               goto out_unlock;
  
         if (opts.release_agent)
                 strcpy(root->release_agent_path, opts.release_agent);
@@ -1392,8 +1346,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
         mutex_unlock(&cgroup_root_mutex);
         mutex_unlock(&cgroup_mutex);
         mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
-       if (ret)
-               drop_parsed_module_refcounts(opts.subsys_mask);
         return ret;
  }
  
@@ -1413,6 +1365,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
         INIT_LIST_HEAD(&cgrp->release_list);
         INIT_LIST_HEAD(&cgrp->pidlists);
         mutex_init(&cgrp->pidlist_mutex);
+       cgrp->dummy_css.cgroup = cgrp;
         INIT_LIST_HEAD(&cgrp->event_list);
         spin_lock_init(&cgrp->event_list_lock);
         simple_xattrs_init(&cgrp->xattrs);
@@ -1428,6 +1381,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
         cgrp->root = root;
         RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
         init_cgroup_housekeeping(cgrp);
+       idr_init(&root->cgroup_idr);
  }
  
  static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
@@ -1500,7 +1454,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
          */
         root->subsys_mask = opts->subsys_mask;
         root->flags = opts->flags;
-       ida_init(&root->cgroup_ida);
         if (opts->release_agent)
                 strcpy(root->release_agent_path, opts->release_agent);
         if (opts->name)
@@ -1516,7 +1469,7 @@ static void cgroup_free_root(struct cgroupfs_root *root)
                 /* hierarhcy ID shoulid already have been released */
                 WARN_ON_ONCE(root->hierarchy_id);
  
-               ida_destroy(&root->cgroup_ida);
+               idr_destroy(&root->cgroup_idr);
                 kfree(root);
         }
  }
@@ -1581,7 +1534,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
         int ret = 0;
         struct super_block *sb;
         struct cgroupfs_root *new_root;
+       struct list_head tmp_links;
         struct inode *inode;
+       const struct cred *cred;
  
         /* First find the desired set of subsystems */
         mutex_lock(&cgroup_mutex);
@@ -1597,7 +1552,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
         new_root = cgroup_root_from_opts(&opts);
         if (IS_ERR(new_root)) {
                 ret = PTR_ERR(new_root);
-               goto drop_modules;
+               goto out_err;
         }
         opts.new_root = new_root;
  
@@ -1606,17 +1561,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
         if (IS_ERR(sb)) {
                 ret = PTR_ERR(sb);
                 cgroup_free_root(opts.new_root);
-               goto drop_modules;
+               goto out_err;
         }
  
         root = sb->s_fs_info;
         BUG_ON(!root);
         if (root == opts.new_root) {
                 /* We used the new root structure, so this is a new hierarchy */
-               struct list_head tmp_links;
                 struct cgroup *root_cgrp = &root->top_cgroup;
                 struct cgroupfs_root *existing_root;
-               const struct cred *cred;
                 int i;
                 struct css_set *cset;
  
@@ -1631,6 +1584,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 mutex_lock(&cgroup_mutex);
                 mutex_lock(&cgroup_root_mutex);
  
+               ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
+               if (ret < 0)
+                       goto unlock_drop;       /* ret holds the errno */
+               root_cgrp->id = ret;
+
                 /* Check for name clashes with existing mounts */
                 ret = -EBUSY;
                 if (strlen(root->name))
@@ -1654,26 +1612,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 if (ret)
                         goto unlock_drop;
  
+               sb->s_root->d_fsdata = root_cgrp;
+               root_cgrp->dentry = sb->s_root;
+
+               /*
+                * We're inside get_sb() and will call lookup_one_len() to
+                * create the root files, which doesn't work if SELinux is
+                * in use.  The following cred dancing somehow works around
+                * it.  See 2ce9738ba ("cgroupfs: use init_cred when
+                * populating new cgroupfs mount") for more details.
+                */
+               cred = override_creds(&init_cred);
+
+               ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+               if (ret)
+                       goto rm_base_files;
+
                 ret = rebind_subsystems(root, root->subsys_mask, 0);
-               if (ret == -EBUSY) {
-                       free_cgrp_cset_links(&tmp_links);
-                       goto unlock_drop;
-               }
+               if (ret)
+                       goto rm_base_files;
+
+               revert_creds(cred);
+
                 /*
                  * There must be no failure case after here, since rebinding
                  * takes care of subsystems' refcounts, which are explicitly
                  * dropped in the failure exit path.
                  */
  
-               /* EBUSY should be the only error here */
-               BUG_ON(ret);
-
                 list_add(&root->root_list, &cgroup_roots);
                 cgroup_root_count++;
  
-               sb->s_root->d_fsdata = root_cgrp;
-               root->top_cgroup.dentry = sb->s_root;
-
                 /* Link the top cgroup in this hierarchy into all
                  * the css_set objects */
                 write_lock(&css_set_lock);
@@ -1686,9 +1655,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 BUG_ON(!list_empty(&root_cgrp->children));
                 BUG_ON(root->number_of_cgroups != 1);
  
-               cred = override_creds(&init_cred);
-               cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
-               revert_creds(cred);
                 mutex_unlock(&cgroup_root_mutex);
                 mutex_unlock(&cgroup_mutex);
                 mutex_unlock(&inode->i_mutex);
@@ -1708,15 +1674,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                                 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
                         }
                 }
-
-               /* no subsys rebinding, so refcounts don't change */
-               drop_parsed_module_refcounts(opts.subsys_mask);
         }
  
         kfree(opts.release_agent);
         kfree(opts.name);
         return dget(sb->s_root);
  
+ rm_base_files:
+       free_cgrp_cset_links(&tmp_links);
+       cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
+       revert_creds(cred);
   unlock_drop:
         cgroup_exit_root_id(root);
         mutex_unlock(&cgroup_root_mutex);
@@ -1724,8 +1691,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
         mutex_unlock(&inode->i_mutex);
   drop_new_super:
         deactivate_locked_super(sb);
- drop_modules:
-       drop_parsed_module_refcounts(opts.subsys_mask);
   out_err:
         kfree(opts.release_agent);
         kfree(opts.name);
@@ -1743,6 +1708,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
         BUG_ON(root->number_of_cgroups != 1);
         BUG_ON(!list_empty(&cgrp->children));
  
+       mutex_lock(&cgrp->dentry->d_inode->i_mutex);
         mutex_lock(&cgroup_mutex);
         mutex_lock(&cgroup_root_mutex);
  
@@ -1775,6 +1741,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
  
         mutex_unlock(&cgroup_root_mutex);
         mutex_unlock(&cgroup_mutex);
+       mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
  
         simple_xattrs_free(&cgrp->xattrs);
  
@@ -1842,36 +1809,43 @@ out:
  EXPORT_SYMBOL_GPL(cgroup_path);
  
  /**
- * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy
+ * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
   * @task: target task
- * @hierarchy_id: the hierarchy to look up @task's cgroup from
   * @buf: the buffer to write the path into
   * @buflen: the length of the buffer
   *
- * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and
- * copy its path into @buf.  This function grabs cgroup_mutex and shouldn't
- * be used inside locks used by cgroup controller callbacks.
+ * Determine @task's cgroup on the first (the one with the lowest non-zero
+ * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
+ * function grabs cgroup_mutex and shouldn't be used inside locks used by
+ * cgroup controller callbacks.
+ *
+ * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short.
   */
-int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id,
-                                   char *buf, size_t buflen)
+int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
  {
         struct cgroupfs_root *root;
-       struct cgroup *cgrp = NULL;
-       int ret = -ENOENT;
+       struct cgroup *cgrp;
+       int hierarchy_id = 1, ret = 0;
+
+       if (buflen < 2)
+               return -ENAMETOOLONG;
  
         mutex_lock(&cgroup_mutex);
  
-       root = idr_find(&cgroup_hierarchy_idr, hierarchy_id);
+       root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
+
         if (root) {
                 cgrp = task_cgroup_from_root(task, root);
                 ret = cgroup_path(cgrp, buf, buflen);
+       } else {
+               /* if no hierarchy exists, everyone is in "/" */
+               memcpy(buf, "/", 2);
         }
  
         mutex_unlock(&cgroup_mutex);
-
         return ret;
  }
-EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy);
+EXPORT_SYMBOL_GPL(task_cgroup_path);
  
  /*
   * Control Group taskset
@@ -1879,7 +1853,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy);
  struct task_and_cgroup {
         struct task_struct      *task;
         struct cgroup           *cgrp;
-       struct css_set          *cg;
+       struct css_set          *cset;
  };
  
  struct cgroup_taskset {
@@ -2079,8 +2053,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
          * step 1: check that we can legitimately attach to the cgroup.
          */
         for_each_root_subsys(root, ss) {
+               struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
                 if (ss->can_attach) {
-                       retval = ss->can_attach(cgrp, &tset);
+                       retval = ss->can_attach(css, &tset);
                         if (retval) {
                                 failed_ss = ss;
                                 goto out_cancel_attach;
@@ -2097,8 +2073,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
  
                 tc = flex_array_get(group, i);
                 old_cset = task_css_set(tc->task);
-               tc->cg = find_css_set(old_cset, cgrp);
-               if (!tc->cg) {
+               tc->cset = find_css_set(old_cset, cgrp);
+               if (!tc->cset) {
                         retval = -ENOMEM;
                         goto out_put_css_set_refs;
                 }
@@ -2111,7 +2087,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
          */
         for (i = 0; i < group_size; i++) {
                 tc = flex_array_get(group, i);
-               cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
+               cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
         }
         /* nothing is sensitive to fork() after this point. */
  
@@ -2119,8 +2095,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
          * step 4: do subsystem attach callbacks.
          */
         for_each_root_subsys(root, ss) {
+               struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
                 if (ss->attach)
-                       ss->attach(cgrp, &tset);
+                       ss->attach(css, &tset);
         }
  
         /*
@@ -2131,18 +2109,20 @@ out_put_css_set_refs:
         if (retval) {
                 for (i = 0; i < group_size; i++) {
                         tc = flex_array_get(group, i);
-                       if (!tc->cg)
+                       if (!tc->cset)
                                 break;
-                       put_css_set(tc->cg);
+                       put_css_set(tc->cset);
                 }
         }
  out_cancel_attach:
         if (retval) {
                 for_each_root_subsys(root, ss) {
+                       struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
                         if (ss == failed_ss)
                                 break;
                         if (ss->cancel_attach)
-                               ss->cancel_attach(cgrp, &tset);
+                               ss->cancel_attach(css, &tset);
                 }
         }
  out_free_group_list:
@@ -2243,9 +2223,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
  
         mutex_lock(&cgroup_mutex);
         for_each_active_root(root) {
-               struct cgroup *from_cg = task_cgroup_from_root(from, root);
+               struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
  
-               retval = cgroup_attach_task(from_cg, tsk, false);
+               retval = cgroup_attach_task(from_cgrp, tsk, false);
                 if (retval)
                         break;
         }
@@ -2255,34 +2235,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
  }
  EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
  
-static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
+static int cgroup_tasks_write(struct cgroup_subsys_state *css,
+                             struct cftype *cft, u64 pid)
  {
-       return attach_task_by_pid(cgrp, pid, false);
+       return attach_task_by_pid(css->cgroup, pid, false);
  }
  
-static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+static int cgroup_procs_write(struct cgroup_subsys_state *css,
+                             struct cftype *cft, u64 tgid)
  {
-       return attach_task_by_pid(cgrp, tgid, true);
+       return attach_task_by_pid(css->cgroup, tgid, true);
  }
  
-static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
-                                     const char *buffer)
+static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
+                                     struct cftype *cft, const char *buffer)
  {
-       BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+       BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
         if (strlen(buffer) >= PATH_MAX)
                 return -EINVAL;
-       if (!cgroup_lock_live_group(cgrp))
+       if (!cgroup_lock_live_group(css->cgroup))
                 return -ENODEV;
         mutex_lock(&cgroup_root_mutex);
-       strcpy(cgrp->root->release_agent_path, buffer);
+       strcpy(css->cgroup->root->release_agent_path, buffer);
         mutex_unlock(&cgroup_root_mutex);
         mutex_unlock(&cgroup_mutex);
         return 0;
  }
  
-static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
-                                    struct seq_file *seq)
+static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
+                                    struct cftype *cft, struct seq_file *seq)
  {
+       struct cgroup *cgrp = css->cgroup;
+
         if (!cgroup_lock_live_group(cgrp))
                 return -ENODEV;
         seq_puts(seq, cgrp->root->release_agent_path);
@@ -2291,20 +2275,31 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
         return 0;
  }
  
-static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
-                                    struct seq_file *seq)
+static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
+                                    struct cftype *cft, struct seq_file *seq)
  {
-       seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+       seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
         return 0;
  }
  
+/* return the css for the given cgroup file */
+static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe)
+{
+       struct cftype *cft = cfe->type;
+       struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
+
+       if (cft->ss)
+               return cgrp->subsys[cft->ss->subsys_id];
+       return &cgrp->dummy_css;
+}
+
  /* A buffer size big enough for numbers or short strings */
  #define CGROUP_LOCAL_BUFFER_SIZE 64
  
-static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
-                               struct file *file,
-                               const char __user *userbuf,
-                               size_t nbytes, loff_t *unused_ppos)
+static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
+                               struct cftype *cft, struct file *file,
+                               const char __user *userbuf, size_t nbytes,
+                               loff_t *unused_ppos)
  {
         char buffer[CGROUP_LOCAL_BUFFER_SIZE];
         int retval = 0;
@@ -2322,22 +2317,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
                 u64 val = simple_strtoull(strstrip(buffer), &end, 0);
                 if (*end)
                         return -EINVAL;
-               retval = cft->write_u64(cgrp, cft, val);
+               retval = cft->write_u64(css, cft, val);
         } else {
                 s64 val = simple_strtoll(strstrip(buffer), &end, 0);
                 if (*end)
                         return -EINVAL;
-               retval = cft->write_s64(cgrp, cft, val);
+               retval = cft->write_s64(css, cft, val);
         }
         if (!retval)
                 retval = nbytes;
         return retval;
  }
  
-static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
-                                  struct file *file,
-                                  const char __user *userbuf,
-                                  size_t nbytes, loff_t *unused_ppos)
+static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
+                                  struct cftype *cft, struct file *file,
+                                  const char __user *userbuf, size_t nbytes,
+                                  loff_t *unused_ppos)
  {
         char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
         int retval = 0;
@@ -2360,7 +2355,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
         }
  
         buffer[nbytes] = 0;     /* nul-terminate */
-       retval = cft->write_string(cgrp, cft, strstrip(buffer));
+       retval = cft->write_string(css, cft, strstrip(buffer));
         if (!retval)
                 retval = nbytes;
  out:
@@ -2370,65 +2365,60 @@ out:
  }
  
  static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
-                                               size_t nbytes, loff_t *ppos)
+                                size_t nbytes, loff_t *ppos)
  {
+       struct cfent *cfe = __d_cfe(file->f_dentry);
         struct cftype *cft = __d_cft(file->f_dentry);
-       struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+       struct cgroup_subsys_state *css = cgroup_file_css(cfe);
  
-       if (cgroup_is_dead(cgrp))
-               return -ENODEV;
         if (cft->write)
-               return cft->write(cgrp, cft, file, buf, nbytes, ppos);
+               return cft->write(css, cft, file, buf, nbytes, ppos);
         if (cft->write_u64 || cft->write_s64)
-               return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
+               return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
         if (cft->write_string)
-               return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
+               return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
         if (cft->trigger) {
-               int ret = cft->trigger(cgrp, (unsigned int)cft->private);
+               int ret = cft->trigger(css, (unsigned int)cft->private);
                 return ret ? ret : nbytes;
         }
         return -EINVAL;
  }
  
-static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
-                              struct file *file,
-                              char __user *buf, size_t nbytes,
-                              loff_t *ppos)
+static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
+                              struct cftype *cft, struct file *file,
+                              char __user *buf, size_t nbytes, loff_t *ppos)
  {
         char tmp[CGROUP_LOCAL_BUFFER_SIZE];
-       u64 val = cft->read_u64(cgrp, cft);
+       u64 val = cft->read_u64(css, cft);
         int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
  
         return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
  }
  
-static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
-                              struct file *file,
-                              char __user *buf, size_t nbytes,
-                              loff_t *ppos)
+static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
+                              struct cftype *cft, struct file *file,
+                              char __user *buf, size_t nbytes, loff_t *ppos)
  {
         char tmp[CGROUP_LOCAL_BUFFER_SIZE];
-       s64 val = cft->read_s64(cgrp, cft);
+       s64 val = cft->read_s64(css, cft);
         int len = sprintf(tmp, "%lld\n", (long long) val);
  
         return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
  }
  
  static ssize_t cgroup_file_read(struct file *file, char __user *buf,
-                                  size_t nbytes, loff_t *ppos)
+                               size_t nbytes, loff_t *ppos)
  {
+       struct cfent *cfe = __d_cfe(file->f_dentry);
         struct cftype *cft = __d_cft(file->f_dentry);
-       struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-
-       if (cgroup_is_dead(cgrp))
-               return -ENODEV;
+       struct cgroup_subsys_state *css = cgroup_file_css(cfe);
  
         if (cft->read)
-               return cft->read(cgrp, cft, file, buf, nbytes, ppos);
+               return cft->read(css, cft, file, buf, nbytes, ppos);
         if (cft->read_u64)
-               return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
+               return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
         if (cft->read_s64)
-               return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
+               return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
         return -EINVAL;
  }
  
@@ -2437,11 +2427,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
   * supports string->u64 maps, but can be extended in future.
   */
  
-struct cgroup_seqfile_state {
-       struct cftype *cft;
-       struct cgroup *cgroup;
-};
-
  static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
  {
         struct seq_file *sf = cb->state;
@@ -2450,69 +2435,84 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
  
  static int cgroup_seqfile_show(struct seq_file *m, void *arg)
  {
-       struct cgroup_seqfile_state *state = m->private;
-       struct cftype *cft = state->cft;
+       struct cfent *cfe = m->private;
+       struct cftype *cft = cfe->type;
+       struct cgroup_subsys_state *css = cgroup_file_css(cfe);
+
         if (cft->read_map) {
                 struct cgroup_map_cb cb = {
                         .fill = cgroup_map_add,
                         .state = m,
                 };
-               return cft->read_map(state->cgroup, cft, &cb);
+               return cft->read_map(css, cft, &cb);
         }
-       return cft->read_seq_string(state->cgroup, cft, m);
+       return cft->read_seq_string(css, cft, m);
  }
  
+/*
+ * Release a seq_file based cgroup file.  Drops the css reference pinned by
+ * cgroup_file_open() before handing the file over to single_release().
+ */
  static int cgroup_seqfile_release(struct inode *inode, struct file *file)
  {
-       struct seq_file *seq = file->private_data;
-       kfree(seq->private);
+       struct cfent *cfe = __d_cfe(file->f_dentry);
+       struct cgroup_subsys_state *css = cgroup_file_css(cfe);
+
+       if (css->ss)
+               css_put(css);
         return single_release(inode, file);
  }
  
  static const struct file_operations cgroup_seqfile_operations = {
         .read = seq_read,
         .write = cgroup_file_write,
         .llseek = seq_lseek,
         .release = cgroup_seqfile_release,
  };
  
  static int cgroup_file_open(struct inode *inode, struct file *file)
  {
+       struct cfent *cfe = __d_cfe(file->f_dentry);
+       struct cftype *cft = __d_cft(file->f_dentry);
+       struct cgroup_subsys_state *css = cgroup_file_css(cfe);
         int err;
-       struct cftype *cft;
  
         err = generic_file_open(inode, file);
         if (err)
                 return err;
-       cft = __d_cft(file->f_dentry);
-
-       if (cft->read_map || cft->read_seq_string) {
-               struct cgroup_seqfile_state *state;
  
-               state = kzalloc(sizeof(*state), GFP_USER);
-               if (!state)
-                       return -ENOMEM;
+       /*
+        * If the file belongs to a subsystem, pin the css.  Will be
+        * unpinned either on open failure or release.  This ensures that
+        * @css stays alive for all file operations.
+        */
+       if (css->ss && !css_tryget(css))
+               return -ENODEV;
  
-               state->cft = cft;
-               state->cgroup = __d_cgrp(file->f_dentry->d_parent);
+       if (cft->read_map || cft->read_seq_string) {
                 file->f_op = &cgroup_seqfile_operations;
-               err = single_open(file, cgroup_seqfile_show, state);
-               if (err < 0)
-                       kfree(state);
-       } else if (cft->open)
+               err = single_open(file, cgroup_seqfile_show, cfe);
+       } else if (cft->open) {
                 err = cft->open(inode, file);
-       else
-               err = 0;
+       }
  
+       if (css->ss && err)
+               css_put(css);
         return err;
  }
  
  static int cgroup_file_release(struct inode *inode, struct file *file)
  {
+       struct cfent *cfe = __d_cfe(file->f_dentry);
         struct cftype *cft = __d_cft(file->f_dentry);
+       struct cgroup_subsys_state *css = cgroup_file_css(cfe);
+       int ret = 0;
+
         if (cft->release)
-               return cft->release(inode, file);
-       return 0;
+               ret = cft->release(inode, file);
+       if (css->ss)
+               css_put(css);
+       return ret;
  }
  
  /*
@@ -2734,8 +2720,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
         return mode;
  }
  
-static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-                          struct cftype *cft)
+static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
  {
         struct dentry *dir = cgrp->dentry;
         struct cgroup *parent = __d_cgrp(dir);
@@ -2745,8 +2730,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
         umode_t mode;
         char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
  
-       if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
-               strcpy(name, subsys->name);
+       if (cft->ss && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
+               strcpy(name, cft->ss->name);
                 strcat(name, ".");
         }
         strcat(name, cft->name);
@@ -2783,17 +2768,16 @@ out:
  /**
   * cgroup_addrm_files - add or remove files to a cgroup directory
   * @cgrp: the target cgroup
- * @subsys: the subsystem of files to be added
   * @cfts: array of cftypes to be added
   * @is_add: whether to add or remove
   *
   * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
- * All @cfts should belong to @subsys.  For removals, this function never
- * fails.  If addition fails, this function doesn't remove files already
- * added.  The caller is responsible for cleaning up.
+ * For removals, this function never fails.  If addition fails, this
+ * function doesn't remove files already added.  The caller is responsible
+ * for cleaning up.
   */
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-                             struct cftype cfts[], bool is_add)
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+                             bool is_add)
  {
         struct cftype *cft;
         int ret;
@@ -2811,7 +2795,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
                         continue;
  
                 if (is_add) {
-                       ret = cgroup_add_file(cgrp, subsys, cft);
+                       ret = cgroup_add_file(cgrp, cft);
                         if (ret) {
                                 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
                                         cft->name, ret);
@@ -2830,28 +2814,47 @@ static void cgroup_cfts_prepare(void)
         /*
          * Thanks to the entanglement with vfs inode locking, we can't walk
          * the existing cgroups under cgroup_mutex and create files.
-        * Instead, we use cgroup_for_each_descendant_pre() and drop RCU
-        * read lock before calling cgroup_addrm_files().
+        * Instead, we use css_for_each_descendant_pre() and drop RCU read
+        * lock before calling cgroup_addrm_files().
          */
         mutex_lock(&cgroup_mutex);
  }
  
-static void cgroup_cfts_commit(struct cgroup_subsys *ss,
-                              struct cftype *cfts, bool is_add)
+/*
+ * Apply the addition or removal of @cfts to the already existing cgroups
+ * of the subsystem recorded in @cfts[0].ss.  Must be called with
+ * cgroup_mutex held, which is released before returning.  %NULL @cfts
+ * aborts the operation.  Returns 0 on success or abort, otherwise the
+ * error returned by cgroup_addrm_files().
+ */
+static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
         __releases(&cgroup_mutex)
  {
         LIST_HEAD(pending);
-       struct cgroup *cgrp, *root = &ss->root->top_cgroup;
-       struct super_block *sb = ss->root->sb;
+       struct cgroup_subsys *ss;
+       struct cgroup *root;
+       struct super_block *sb;
         struct dentry *prev = NULL;
         struct inode *inode;
+       struct cgroup_subsys_state *css;
         u64 update_before;
+       int ret = 0;
  
-       /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
-       if (!cfts || ss->root == &cgroup_dummy_root ||
+       /* %NULL @cfts indicates abort; bail out before dereferencing it */
+       if (!cfts) {
+               mutex_unlock(&cgroup_mutex);
+               return 0;
+       }
+
+       ss = cfts[0].ss;
+       root = &ss->root->top_cgroup;
+       sb = ss->root->sb;
+
+       /* don't bother if @ss isn't attached */
+       if (ss->root == &cgroup_dummy_root ||
             !atomic_inc_not_zero(&sb->s_active)) {
                 mutex_unlock(&cgroup_mutex);
-               return;
+               return 0;
         }
  
         /*
@@ -2867,13 +2853,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
         inode = root->dentry->d_inode;
         mutex_lock(&inode->i_mutex);
         mutex_lock(&cgroup_mutex);
-       cgroup_addrm_files(root, ss, cfts, is_add);
+       ret = cgroup_addrm_files(root, cfts, is_add);
         mutex_unlock(&cgroup_mutex);
         mutex_unlock(&inode->i_mutex);
  
+       if (ret)
+               goto out_deact;
+
         /* add/rm files for all cgroups created before */
         rcu_read_lock();
-       cgroup_for_each_descendant_pre(cgrp, root) {
+       css_for_each_descendant_pre(css, cgroup_css(root, ss->subsys_id)) {
+               struct cgroup *cgrp = css->cgroup;
+
                 if (cgroup_is_dead(cgrp))
                         continue;
  
@@ -2887,15 +2878,19 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
                 mutex_lock(&inode->i_mutex);
                 mutex_lock(&cgroup_mutex);
                 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
-                       cgroup_addrm_files(cgrp, ss, cfts, is_add);
+                       ret = cgroup_addrm_files(cgrp, cfts, is_add);
                 mutex_unlock(&cgroup_mutex);
                 mutex_unlock(&inode->i_mutex);
  
                 rcu_read_lock();
+               if (ret)
+                       break;
         }
         rcu_read_unlock();
         dput(prev);
+out_deact:
         deactivate_super(sb);
+       return ret;
  }
  
  /**
@@ -2915,49 +2910,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
  int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
  {
         struct cftype_set *set;
+       struct cftype *cft;
+       int ret;
  
         set = kzalloc(sizeof(*set), GFP_KERNEL);
         if (!set)
                 return -ENOMEM;
  
+       for (cft = cfts; cft->name[0] != '\0'; cft++)
+               cft->ss = ss;
+
         cgroup_cfts_prepare();
         set->cfts = cfts;
         list_add_tail(&set->node, &ss->cftsets);
-       cgroup_cfts_commit(ss, cfts, true);
-
-       return 0;
+       ret = cgroup_cfts_commit(cfts, true);
+       if (ret)
+               cgroup_rm_cftypes(cfts);
+       return ret;
  }
  EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
  
  /**
   * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
- * @ss: target cgroup subsystem
   * @cfts: zero-length name terminated array of cftypes
   *
- * Unregister @cfts from @ss.  Files described by @cfts are removed from
- * all existing cgroups to which @ss is attached and all future cgroups
- * won't have them either.  This function can be called anytime whether @ss
- * is attached or not.
+ * Unregister @cfts.  Files described by @cfts are removed from all
+ * existing cgroups and all future cgroups won't have them either.  This
+ * function can be called anytime whether @cfts' subsys is attached or not.
   *
   * Returns 0 on successful unregistration, -ENOENT if @cfts is not
- * registered with @ss.
+ * registered.
   */
-int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+int cgroup_rm_cftypes(struct cftype *cfts)
  {
         struct cftype_set *set;
  
+       if (!cfts || !cfts[0].ss)
+               return -ENOENT;
+
         cgroup_cfts_prepare();
  
-       list_for_each_entry(set, &ss->cftsets, node) {
+       list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
                 if (set->cfts == cfts) {
                         list_del(&set->node);
                         kfree(set);
-                       cgroup_cfts_commit(ss, cfts, false);
+                       cgroup_cfts_commit(cfts, false);
                         return 0;
                 }
         }
  
-       cgroup_cfts_commit(ss, NULL, false);
+       cgroup_cfts_commit(NULL, false);
         return -ENOENT;
  }
  
@@ -3038,16 +3040,21 @@ static void cgroup_enable_task_cg_lists(void)
  }
  
  /**
- * cgroup_next_sibling - find the next sibling of a given cgroup
- * @pos: the current cgroup
+ * css_next_child - find the next child of a given css
+ * @pos_css: the current position (%NULL to initiate traversal)
+ * @parent_css: css whose children to walk
   *
- * This function returns the next sibling of @pos and should be called
- * under RCU read lock.  The only requirement is that @pos is accessible.
- * The next sibling is guaranteed to be returned regardless of @pos's
- * state.
+ * This function returns the next child of @parent_css and should be called
+ * under RCU read lock.  The only requirement is that @parent_css and
+ * @pos_css are accessible.  The next sibling is guaranteed to be returned
+ * regardless of their states.
   */
-struct cgroup *cgroup_next_sibling(struct cgroup *pos)
+struct cgroup_subsys_state *
+css_next_child(struct cgroup_subsys_state *pos_css,
+              struct cgroup_subsys_state *parent_css)
  {
+       struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
+       struct cgroup *cgrp = parent_css->cgroup;
         struct cgroup *next;
  
         WARN_ON_ONCE(!rcu_read_lock_held());
@@ -3062,78 +3069,83 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos)
          * safe to dereference from this RCU critical section.  If
          * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
          * to be visible as %true here.
+        *
+        * If @pos is dead, its next pointer can't be dereferenced;
+        * however, as each cgroup is given a monotonically increasing
+        * unique serial number and always appended to the sibling list,
+        * the next one can be found by walking the parent's children until
+        * we see a cgroup with higher serial number than @pos's.  While
+        * this path can be slower, it's taken only when either the current
+        * cgroup is removed or iteration and removal race.
          */
-       if (likely(!cgroup_is_dead(pos))) {
+       if (!pos) {
+               next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
+       } else if (likely(!cgroup_is_dead(pos))) {
                 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
-               if (&next->sibling != &pos->parent->children)
-                       return next;
-               return NULL;
+       } else {
+               list_for_each_entry_rcu(next, &cgrp->children, sibling)
+                       if (next->serial_nr > pos->serial_nr)
+                               break;
         }
  
-       /*
-        * Can't dereference the next pointer.  Each cgroup is given a
-        * monotonically increasing unique serial number and always
-        * appended to the sibling list, so the next one can be found by
-        * walking the parent's children until we see a cgroup with higher
-        * serial number than @pos's.
-        *
-        * While this path can be slow, it's taken only when either the
-        * current cgroup is removed or iteration and removal race.
-        */
-       list_for_each_entry_rcu(next, &pos->parent->children, sibling)
-               if (next->serial_nr > pos->serial_nr)
-                       return next;
-       return NULL;
+       if (&next->sibling == &cgrp->children)
+               return NULL;
+
+       if (parent_css->ss)
+               return cgroup_css(next, parent_css->ss->subsys_id);
+       else
+               return &next->dummy_css;
  }
-EXPORT_SYMBOL_GPL(cgroup_next_sibling);
+EXPORT_SYMBOL_GPL(css_next_child);
  
  /**
- * cgroup_next_descendant_pre - find the next descendant for pre-order walk
+ * css_next_descendant_pre - find the next descendant for pre-order walk
   * @pos: the current position (%NULL to initiate traversal)
- * @cgroup: cgroup whose descendants to walk
+ * @root: css whose descendants to walk
   *
- * To be used by cgroup_for_each_descendant_pre().  Find the next
- * descendant to visit for pre-order traversal of @cgroup's descendants.
+ * To be used by css_for_each_descendant_pre().  Find the next descendant
+ * to visit for pre-order traversal of @root's descendants.
   *
   * While this function requires RCU read locking, it doesn't require the
   * whole traversal to be contained in a single RCU critical section.  This
   * function will return the correct next descendant as long as both @pos
- * and @cgroup are accessible and @pos is a descendant of @cgroup.
+ * and @root are accessible and @pos is a descendant of @root.
   */
-struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
-                                         struct cgroup *cgroup)
+struct cgroup_subsys_state *
+css_next_descendant_pre(struct cgroup_subsys_state *pos,
+                       struct cgroup_subsys_state *root)
  {
-       struct cgroup *next;
+       struct cgroup_subsys_state *next;
  
         WARN_ON_ONCE(!rcu_read_lock_held());
  
-       /* if first iteration, pretend we just visited @cgroup */
+       /* if first iteration, pretend we just visited @root */
         if (!pos)
-               pos = cgroup;
+               pos = root;
  
         /* visit the first child if exists */
-       next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
+       next = css_next_child(NULL, pos);
         if (next)
                 return next;
  
         /* no child, visit my or the closest ancestor's next sibling */
-       while (pos != cgroup) {
-               next = cgroup_next_sibling(pos);
+       while (pos != root) {
+               next = css_next_child(pos, css_parent(pos));
                 if (next)
                         return next;
-               pos = pos->parent;
+               pos = css_parent(pos);
         }
  
         return NULL;
  }
-EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+EXPORT_SYMBOL_GPL(css_next_descendant_pre);
  
  /**
- * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
- * @pos: cgroup of interest
+ * css_rightmost_descendant - return the rightmost descendant of a css
+ * @pos: css of interest
   *
- * Return the rightmost descendant of @pos.  If there's no descendant,
- * @pos is returned.  This can be used during pre-order traversal to skip
+ * Return the rightmost descendant of @pos.  If there's no descendant, @pos
+ * is returned.  This can be used during pre-order traversal to skip
   * subtree of @pos.
   *
   * While this function requires RCU read locking, it doesn't require the
@@ -3141,9 +3153,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
   * function will return the correct rightmost descendant as long as @pos is
   * accessible.
   */
-struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
+struct cgroup_subsys_state *
+css_rightmost_descendant(struct cgroup_subsys_state *pos)
  {
-       struct cgroup *last, *tmp;
+       struct cgroup_subsys_state *last, *tmp;
  
         WARN_ON_ONCE(!rcu_read_lock_held());
  
@@ -3151,63 +3164,64 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
                 last = pos;
                 /* ->prev isn't RCU safe, walk ->next till the end */
                 pos = NULL;
-               list_for_each_entry_rcu(tmp, &last->children, sibling)
+               css_for_each_child(tmp, last)
                         pos = tmp;
         } while (pos);
  
         return last;
  }
-EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
+EXPORT_SYMBOL_GPL(css_rightmost_descendant);
  
-static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
+static struct cgroup_subsys_state *
+css_leftmost_descendant(struct cgroup_subsys_state *pos)
  {
-       struct cgroup *last;
+       struct cgroup_subsys_state *last;
  
         do {
                 last = pos;
-               pos = list_first_or_null_rcu(&pos->children, struct cgroup,
-                                            sibling);
+               pos = css_next_child(NULL, pos);
         } while (pos);
  
         return last;
  }
  
  /**
- * cgroup_next_descendant_post - find the next descendant for post-order walk
+ * css_next_descendant_post - find the next descendant for post-order walk
   * @pos: the current position (%NULL to initiate traversal)
- * @cgroup: cgroup whose descendants to walk
+ * @root: css whose descendants to walk
   *
- * To be used by cgroup_for_each_descendant_post().  Find the next
- * descendant to visit for post-order traversal of @cgroup's descendants.
+ * To be used by css_for_each_descendant_post().  Find the next descendant
+ * to visit for post-order traversal of @root's descendants.
   *
   * While this function requires RCU read locking, it doesn't require the
   * whole traversal to be contained in a single RCU critical section.  This
   * function will return the correct next descendant as long as both @pos
- * and @cgroup are accessible and @pos is a descendant of @cgroup.
+ * and @root are accessible and @pos is a descendant of @root.
   */
-struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
-                                          struct cgroup *cgroup)
+struct cgroup_subsys_state *
+css_next_descendant_post(struct cgroup_subsys_state *pos,
+                        struct cgroup_subsys_state *root)
  {
-       struct cgroup *next;
+       struct cgroup_subsys_state *next;
  
         WARN_ON_ONCE(!rcu_read_lock_held());
  
         /* if first iteration, visit the leftmost descendant */
         if (!pos) {
-               next = cgroup_leftmost_descendant(cgroup);
-               return next != cgroup ? next : NULL;
+               next = css_leftmost_descendant(root);
+               return next != root ? next : NULL;
         }
  
         /* if there's an unvisited sibling, visit its leftmost descendant */
-       next = cgroup_next_sibling(pos);
+       next = css_next_child(pos, css_parent(pos));
         if (next)
-               return cgroup_leftmost_descendant(next);
+               return css_leftmost_descendant(next);
  
         /* no sibling left, visit parent */
-       next = pos->parent;
-       return next != cgroup ? next : NULL;
+       next = css_parent(pos);
+       return next != root ? next : NULL;
  }
-EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
+EXPORT_SYMBOL_GPL(css_next_descendant_post);
  
  void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
         __acquires(css_set_lock)
@@ -3354,8 +3368,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
          * guarantees forward progress and that we don't miss any tasks.
          */
         heap->size = 0;
-       cgroup_iter_start(scan->cg, &it);
-       while ((p = cgroup_iter_next(scan->cg, &it))) {
+       cgroup_iter_start(scan->cgrp, &it);
+       while ((p = cgroup_iter_next(scan->cgrp, &it))) {
                 /*
                  * Only affect tasks that qualify per the caller's callback,
                  * if he provided one
@@ -3388,7 +3402,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
                  * the heap and wasn't inserted
                  */
         }
-       cgroup_iter_end(scan->cg, &it);
+       cgroup_iter_end(scan->cgrp, &it);
  
         if (heap->size) {
                 for (i = 0; i < heap->size; i++) {
@@ -3434,7 +3448,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
  {
         struct cgroup_scanner scan;
  
-       scan.cg = from;
+       scan.cgrp = from;
         scan.test_task = NULL; /* select all tasks in cgroup */
         scan.process_task = cgroup_transfer_one_task;
         scan.heap = NULL;
@@ -3482,7 +3496,7 @@ struct cgroup_pidlist {
         /* pointer to the cgroup we belong to, for list removal purposes */
         struct cgroup *owner;
         /* protects the other fields */
-       struct rw_semaphore mutex;
+       struct rw_semaphore rwsem;
  };
  
  /*
@@ -3555,7 +3569,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
         struct pid_namespace *ns = task_active_pid_ns(current);
  
         /*
-        * We can't drop the pidlist_mutex before taking the l->mutex in case
+        * We can't drop the pidlist_mutex before taking the l->rwsem in case
          * the last ref-holder is trying to remove l from the list at the same
          * time. Holding the pidlist_mutex precludes somebody taking whichever
          * list we find out from under us - compare release_pid_array().
@@ -3564,7 +3578,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
         list_for_each_entry(l, &cgrp->pidlists, links) {
                 if (l->key.type == type && l->key.ns == ns) {
                         /* make sure l doesn't vanish out from under us */
-                       down_write(&l->mutex);
+                       down_write(&l->rwsem);
                         mutex_unlock(&cgrp->pidlist_mutex);
                         return l;
                 }
@@ -3575,8 +3589,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
                 mutex_unlock(&cgrp->pidlist_mutex);
                 return l;
         }
-       init_rwsem(&l->mutex);
-       down_write(&l->mutex);
+       init_rwsem(&l->rwsem);
+       down_write(&l->rwsem);
         l->key.type = type;
         l->key.ns = get_pid_ns(ns);
         l->owner = cgrp;
@@ -3637,7 +3651,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
         l->list = array;
         l->length = length;
         l->use_count++;
-       up_write(&l->mutex);
+       up_write(&l->rwsem);
         *lp = l;
         return 0;
  }
@@ -3715,7 +3729,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
         int index = 0, pid = *pos;
         int *iter;
  
-       down_read(&l->mutex);
+       down_read(&l->rwsem);
         if (pid) {
                 int end = l->length;
  
@@ -3742,7 +3756,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
  static void cgroup_pidlist_stop(struct seq_file *s, void *v)
  {
         struct cgroup_pidlist *l = s->private;
-       up_read(&l->mutex);
+       up_read(&l->rwsem);
  }
  
  static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
@@ -3788,7 +3802,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
          * pidlist_mutex, we have to take pidlist_mutex first.
          */
         mutex_lock(&l->owner->pidlist_mutex);
-       down_write(&l->mutex);
+       down_write(&l->rwsem);
         BUG_ON(!l->use_count);
         if (!--l->use_count) {
                 /* we're the last user if refcount is 0; remove and free */
@@ -3796,12 +3810,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
                 mutex_unlock(&l->owner->pidlist_mutex);
                 pidlist_free(l->list);
                 put_pid_ns(l->key.ns);
-               up_write(&l->mutex);
+               up_write(&l->rwsem);
                 kfree(l);
                 return;
         }
         mutex_unlock(&l->owner->pidlist_mutex);
-       up_write(&l->mutex);
+       up_write(&l->rwsem);
  }
  
  static int cgroup_pidlist_release(struct inode *inode, struct file *file)
@@ -3865,21 +3879,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file)
         return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
  }
  
-static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
-                                           struct cftype *cft)
+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
+                                        struct cftype *cft)
  {
-       return notify_on_release(cgrp);
+       return notify_on_release(css->cgroup);
  }
  
-static int cgroup_write_notify_on_release(struct cgroup *cgrp,
-                                         struct cftype *cft,
-                                         u64 val)
+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
+                                         struct cftype *cft, u64 val)
  {
-       clear_bit(CGRP_RELEASABLE, &cgrp->flags);
+       clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
         if (val)
-               set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+               set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
         else
-               clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+               clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
         return 0;
  }
  
@@ -3977,14 +3990,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file,
   * Input must be in format '<event_fd> <control_fd> <args>'.
   * Interpretation of args is defined by control file implementation.
   */
-static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
-                                     const char *buffer)
+static int cgroup_write_event_control(struct cgroup_subsys_state *css,
+                                     struct cftype *cft, const char *buffer)
  {
-       struct cgroup_event *event = NULL;
+       struct cgroup *cgrp = css->cgroup;
+       struct cgroup_event *event;
         struct cgroup *cgrp_cfile;
         unsigned int efd, cfd;
-       struct file *efile = NULL;
-       struct file *cfile = NULL;
+       struct file *efile;
+       struct file *cfile;
         char *endp;
         int ret;
  
@@ -4010,31 +4024,31 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
         efile = eventfd_fget(efd);
         if (IS_ERR(efile)) {
                 ret = PTR_ERR(efile);
-               goto fail;
+               goto out_kfree;
         }
  
         event->eventfd = eventfd_ctx_fileget(efile);
         if (IS_ERR(event->eventfd)) {
                 ret = PTR_ERR(event->eventfd);
-               goto fail;
+               goto out_put_efile;
         }
  
         cfile = fget(cfd);
         if (!cfile) {
                 ret = -EBADF;
-               goto fail;
+               goto out_put_eventfd;
         }
  
         /* the process need read permission on control file */
         /* AV: shouldn't we check that it's been opened for read instead? */
         ret = inode_permission(file_inode(cfile), MAY_READ);
         if (ret < 0)
-               goto fail;
+               goto out_put_cfile;
  
         event->cft = __file_cft(cfile);
         if (IS_ERR(event->cft)) {
                 ret = PTR_ERR(event->cft);
-               goto fail;
+               goto out_put_cfile;
         }
  
         /*
@@ -4044,18 +4058,18 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
         cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
         if (cgrp_cfile != cgrp) {
                 ret = -EINVAL;
-               goto fail;
+               goto out_put_cfile;
         }
  
         if (!event->cft->register_event || !event->cft->unregister_event) {
                 ret = -EINVAL;
-               goto fail;
+               goto out_put_cfile;
         }
  
         ret = event->cft->register_event(cgrp, event->cft,
                         event->eventfd, buffer);
         if (ret)
-               goto fail;
+               goto out_put_cfile;
  
         efile->f_op->poll(efile, &event->pt);
  
@@ -4075,35 +4089,31 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
  
         return 0;
  
-fail:
-       if (cfile)
-               fput(cfile);
-
-       if (event && event->eventfd && !IS_ERR(event->eventfd))
-               eventfd_ctx_put(event->eventfd);
-
-       if (!IS_ERR_OR_NULL(efile))
-               fput(efile);
-
+out_put_cfile:
+       fput(cfile);
+out_put_eventfd:
+       eventfd_ctx_put(event->eventfd);
+out_put_efile:
+       fput(efile);
+out_kfree:
         kfree(event);
  
         return ret;
  }
  
-static u64 cgroup_clone_children_read(struct cgroup *cgrp,
-                                   struct cftype *cft)
+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
+                                     struct cftype *cft)
  {
-       return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
+       return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
  }
  
-static int cgroup_clone_children_write(struct cgroup *cgrp,
-                                    struct cftype *cft,
-                                    u64 val)
+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
+                                      struct cftype *cft, u64 val)
  {
         if (val)
-               set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
+               set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
         else
-               clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
+               clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
         return 0;
  }
  
@@ -4162,31 +4172,29 @@ static struct cftype cgroup_base_files[] = {
  };
  
  /**
- * cgroup_populate_dir - selectively creation of files in a directory
+ * cgroup_populate_dir - create subsys files in a cgroup directory
   * @cgrp: target cgroup
- * @base_files: true if the base files should be added
   * @subsys_mask: mask of the subsystem ids whose files should be added
+ *
+ * On failure, no file is added.
   */
-static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
-                              unsigned long subsys_mask)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
  {
-       int err;
         struct cgroup_subsys *ss;
-
-       if (base_files) {
-               err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
-               if (err < 0)
-                       return err;
-       }
+       int i, ret = 0;
  
         /* process cftsets of each subsystem */
-       for_each_root_subsys(cgrp->root, ss) {
+       for_each_subsys(ss, i) {
                 struct cftype_set *set;
-               if (!test_bit(ss->subsys_id, &subsys_mask))
+
+               if (!test_bit(i, &subsys_mask))
                         continue;
  
-               list_for_each_entry(set, &ss->cftsets, node)
-                       cgroup_addrm_files(cgrp, ss, set->cfts, true);
+               list_for_each_entry(set, &ss->cftsets, node) {
+                       ret = cgroup_addrm_files(cgrp, set->cfts, true);
+                       if (ret < 0)
+                               goto err;
+               }
         }
  
         /* This cgroup is ready now */
@@ -4204,6 +4212,9 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
         }
  
         return 0;
+err:
+       cgroup_clear_dir(cgrp, subsys_mask);
+       return ret;
  }
  
  static void css_dput_fn(struct work_struct *work)
@@ -4227,6 +4238,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
                                struct cgroup *cgrp)
  {
         css->cgroup = cgrp;
+       css->ss = ss;
         css->flags = 0;
         css->id = NULL;
         if (cgrp == cgroup_dummy_top)
@@ -4243,23 +4255,23 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
         INIT_WORK(&css->dput_work, css_dput_fn);
  }
  
-/* invoke ->post_create() on a new CSS and mark it online if successful */
+/* invoke ->css_online() on a new CSS and mark it online if successful */
  static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
  {
+       struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
         int ret = 0;
  
         lockdep_assert_held(&cgroup_mutex);
  
         if (ss->css_online)
-               ret = ss->css_online(cgrp);
+               ret = ss->css_online(css);
         if (!ret)
-               cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
+               css->flags |= CSS_ONLINE;
         return ret;
  }
  
-/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */
+/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
  static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
-       __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
  {
         struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
  
@@ -4269,9 +4281,9 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
                 return;
  
         if (ss->css_offline)
-               ss->css_offline(cgrp);
+               ss->css_offline(css);
  
-       cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
+       css->flags &= ~CSS_ONLINE;
  }
  
  /*
@@ -4302,7 +4314,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                 goto err_free_cgrp;
         rcu_assign_pointer(cgrp->name, name);
  
-       cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
+       /*
+        * Temporarily set the pointer to NULL, so idr_find() won't return
+        * a half-baked cgroup.
+        */
+       cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
         if (cgrp->id < 0)
                 goto err_free_name;
  
@@ -4342,15 +4358,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
         for_each_root_subsys(root, ss) {
                 struct cgroup_subsys_state *css;
  
-               css = ss->css_alloc(cgrp);
+               css = ss->css_alloc(parent->subsys[ss->subsys_id]);
                 if (IS_ERR(css)) {
                         err = PTR_ERR(css);
                         goto err_free_all;
                 }
  
                 err = percpu_ref_init(&css->refcnt, css_release);
-               if (err)
+               if (err) {
+                       ss->css_free(css);
                         goto err_free_all;
+               }
  
                 init_cgroup_css(css, ss, cgrp);
  
@@ -4400,7 +4418,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                 }
         }
  
-       err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
+       idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
+
+       err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
+       if (err)
+               goto err_destroy;
+
+       err = cgroup_populate_dir(cgrp, root->subsys_mask);
         if (err)
                 goto err_destroy;
  
@@ -4415,14 +4439,14 @@ err_free_all:
  
                 if (css) {
                         percpu_ref_cancel_init(&css->refcnt);
-                       ss->css_free(cgrp);
+                       ss->css_free(css);
                 }
         }
         mutex_unlock(&cgroup_mutex);
         /* Release the reference count that we took on the superblock */
         deactivate_super(sb);
  err_free_id:
-       ida_simple_remove(&root->cgroup_ida, cgrp->id);
+       idr_remove(&root->cgroup_idr, cgrp->id);
  err_free_name:
         kfree(rcu_dereference_raw(cgrp->name));
  err_free_cgrp:
@@ -4540,9 +4564,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         /*
          * Mark @cgrp dead.  This prevents further task migration and child
          * creation by disabling cgroup_lock_live_group().  Note that
-        * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to
+        * CGRP_DEAD assertion is depended upon by css_next_child() to
          * resume iteration after dropping RCU read lock.  See
-        * cgroup_next_sibling() for details.
+        * css_next_child() for details.
          */
         set_bit(CGRP_DEAD, &cgrp->flags);
  
@@ -4556,7 +4580,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
          * Clear and remove @cgrp directory.  The removal puts the base ref
          * but we aren't quite done with @cgrp yet, so hold onto it.
          */
-       cgroup_clear_dir(cgrp, true, cgrp->root->subsys_mask);
+       cgroup_clear_dir(cgrp, cgrp->root->subsys_mask);
+       cgroup_addrm_files(cgrp, cgroup_base_files, false);
         dget(d);
         cgroup_d_remove_dir(d);
  
@@ -4614,6 +4639,14 @@ static void cgroup_offline_fn(struct work_struct *work)
         /* delete this cgroup from parent->children */
         list_del_rcu(&cgrp->sibling);
  
+       /*
+        * We should remove the cgroup object from idr before its grace
+        * period starts, so we won't be looking up a cgroup while the
+        * cgroup is being freed.
+        */
+       idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+       cgrp->id = -1;
+
         dput(d);
  
         set_bit(CGRP_RELEASABLE, &parent->flags);
@@ -4642,6 +4675,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
          * deregistration.
          */
         if (ss->base_cftypes) {
+               struct cftype *cft;
+
+               for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
+                       cft->ss = ss;
+
                 ss->base_cftset.cfts = ss->base_cftypes;
                 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
         }
@@ -4661,7 +4699,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
         /* Create the top cgroup state for this subsystem */
         list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
         ss->root = &cgroup_dummy_root;
-       css = ss->css_alloc(cgroup_dummy_top);
+       css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]);
         /* We don't handle early failures gracefully */
         BUG_ON(IS_ERR(css));
         init_cgroup_css(css, ss, cgroup_dummy_top);
@@ -4740,7 +4778,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
          * struct, so this can happen first (i.e. before the dummy root
          * attachment).
          */
-       css = ss->css_alloc(cgroup_dummy_top);
+       css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]);
         if (IS_ERR(css)) {
                 /* failure case - need to deassign the cgroup_subsys[] slot. */
                 cgroup_subsys[ss->subsys_id] = NULL;
@@ -4815,7 +4853,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
  
         /*
          * we shouldn't be called if the subsystem is in use, and the use of
-        * try_module_get in parse_cgroupfs_options should ensure that it
+        * try_module_get() in rebind_subsystems() should ensure that it
          * doesn't start being used while we're killing it off.
          */
         BUG_ON(ss->root != &cgroup_dummy_root);
@@ -4856,7 +4894,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
          * the cgrp->subsys pointer to find their state. note that this
          * also takes care of freeing the css_id.
          */
-       ss->css_free(cgroup_dummy_top);
+       ss->css_free(cgroup_dummy_top->subsys[ss->subsys_id]);
         cgroup_dummy_top->subsys[ss->subsys_id] = NULL;
  
         mutex_unlock(&cgroup_mutex);
@@ -4939,6 +4977,10 @@ int __init cgroup_init(void)
  
         BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
  
+       err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
+                       0, 1, GFP_KERNEL);
+       BUG_ON(err < 0);
+
         mutex_unlock(&cgroup_root_mutex);
         mutex_unlock(&cgroup_mutex);
  
@@ -5208,10 +5250,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
                  */
                 for_each_builtin_subsys(ss, i) {
                         if (ss->exit) {
-                               struct cgroup *old_cgrp = cset->subsys[i]->cgroup;
-                               struct cgroup *cgrp = task_cgroup(tsk, i);
+                               struct cgroup_subsys_state *old_css = cset->subsys[i];
+                               struct cgroup_subsys_state *css = task_css(tsk, i);
  
-                               ss->exit(cgrp, old_cgrp, tsk);
+                               ss->exit(css, old_css, tsk);
                         }
                 }
         }
@@ -5545,7 +5587,8 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
  }
  
  #ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
  {
         struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
  
@@ -5555,22 +5598,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
         return css;
  }
  
-static void debug_css_free(struct cgroup *cgrp)
+static void debug_css_free(struct cgroup_subsys_state *css)
  {
-       kfree(cgrp->subsys[debug_subsys_id]);
+       kfree(css);
  }
  
-static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft)
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+                               struct cftype *cft)
  {
-       return cgroup_task_count(cgrp);
+       return cgroup_task_count(css->cgroup);
  }
  
-static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft)
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+                               struct cftype *cft)
  {
         return (u64)(unsigned long)current->cgroups;
  }
  
-static u64 current_css_set_refcount_read(struct cgroup *cgrp,
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
                                          struct cftype *cft)
  {
         u64 count;
@@ -5581,7 +5626,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp,
         return count;
  }
  
-static int current_css_set_cg_links_read(struct cgroup *cgrp,
+static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
                                          struct cftype *cft,
                                          struct seq_file *seq)
  {
@@ -5608,14 +5653,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp,
  }
  
  #define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct cgroup *cgrp,
-                                struct cftype *cft,
-                                struct seq_file *seq)
+static int cgroup_css_links_read(struct cgroup_subsys_state *css,
+                                struct cftype *cft, struct seq_file *seq)
  {
         struct cgrp_cset_link *link;
  
         read_lock(&css_set_lock);
-       list_for_each_entry(link, &cgrp->cset_links, cset_link) {
+       list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
                 struct css_set *cset = link->cset;
                 struct task_struct *task;
                 int count = 0;
@@ -5634,9 +5678,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp,
         return 0;
  }
  
-static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
  {
-       return test_bit(CGRP_RELEASABLE, &cgrp->flags);
+       return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
  }
  
  static struct cftype debug_files[] =  {