Merge tag 'disintegrate-fbdev-20121220' of git://git.infradead.org/users/dhowells...

[deliverable/linux.git] / block / cfq-iosched.c
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c

index bc076f45ca46d6055b04060c71acd264a2bd368c..4f0ade74cfd04a1c48f22218a6c4369517efa88b 100644 (file)
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -85,7 +85,6 @@ struct cfq_rb_root {
         struct rb_root rb;
         struct rb_node *left;
         unsigned count;
-       unsigned total_weight;
         u64 min_vdisktime;
         struct cfq_ttime ttime;
  };
@@ -223,10 +222,45 @@ struct cfq_group {
  
         /* group service_tree key */
         u64 vdisktime;
+
+       /*
+        * The number of active cfqgs and sum of their weights under this
+        * cfqg.  This covers this cfqg's leaf_weight and all children's
+        * weights, but does not cover weights of further descendants.
+        *
+        * If a cfqg is on the service tree, it's active.  An active cfqg
+        * also activates its parent and contributes to the children_weight
+        * of the parent.
+        */
+       int nr_active;
+       unsigned int children_weight;
+
+       /*
+        * vfraction is the fraction of vdisktime that the tasks in this
+        * cfqg are entitled to.  This is determined by compounding the
+        * ratios walking up from this cfqg to the root.
+        *
+        * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
+        * vfractions on a service tree is approximately 1.  The sum may
+        * deviate a bit due to rounding errors and fluctuations caused by
+        * cfqgs entering and leaving the service tree.
+        */
+       unsigned int vfraction;
+
+       /*
+        * There are two weights - (internal) weight is the weight of this
+        * cfqg against the sibling cfqgs.  leaf_weight is the wight of
+        * this cfqg against the child cfqgs.  For the root cfqg, both
+        * weights are kept in sync for backward compatibility.
+        */
         unsigned int weight;
         unsigned int new_weight;
         unsigned int dev_weight;
  
+       unsigned int leaf_weight;
+       unsigned int new_leaf_weight;
+       unsigned int dev_leaf_weight;
+
         /* number of cfqq currently on this group */
         int nr_cfqq;
  
@@ -255,7 +289,8 @@ struct cfq_group {
         /* number of requests that are on the dispatch list or inside driver */
         int dispatched;
         struct cfq_ttime ttime;
-       struct cfqg_stats stats;
+       struct cfqg_stats stats;        /* stats for this cfqg */
+       struct cfqg_stats dead_stats;   /* stats pushed from dead children */
  };
  
  struct cfq_io_cq {
@@ -502,7 +537,7 @@ static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
  {
         struct cfqg_stats *stats = &cfqg->stats;
  
-       if (blkg_rwstat_sum(&stats->queued))
+       if (blkg_rwstat_total(&stats->queued))
                 return;
  
         /*
@@ -546,7 +581,7 @@ static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
         struct cfqg_stats *stats = &cfqg->stats;
  
         blkg_stat_add(&stats->avg_queue_size_sum,
-                     blkg_rwstat_sum(&stats->queued));
+                     blkg_rwstat_total(&stats->queued));
         blkg_stat_add(&stats->avg_queue_size_samples, 1);
         cfqg_stats_update_group_wait_time(stats);
  }
@@ -572,6 +607,13 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
         return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
  }
  
+static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
+{
+       struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
+
+       return pblkg ? blkg_to_cfqg(pblkg) : NULL;
+}
+
  static inline void cfqg_get(struct cfq_group *cfqg)
  {
         return blkg_get(cfqg_to_blkg(cfqg));
@@ -647,11 +689,9 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
                                 io_start_time - start_time);
  }
  
-static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
+/* @stats = 0 */
+static void cfqg_stats_reset(struct cfqg_stats *stats)
  {
-       struct cfq_group *cfqg = blkg_to_cfqg(blkg);
-       struct cfqg_stats *stats = &cfqg->stats;
-
         /* queued stats shouldn't be cleared */
         blkg_rwstat_reset(&stats->service_bytes);
         blkg_rwstat_reset(&stats->serviced);
@@ -670,8 +710,50 @@ static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
  #endif
  }
  
+/* @to += @from */
+static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from)
+{
+       /* queued stats shouldn't be cleared */
+       blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
+       blkg_rwstat_merge(&to->serviced, &from->serviced);
+       blkg_rwstat_merge(&to->merged, &from->merged);
+       blkg_rwstat_merge(&to->service_time, &from->service_time);
+       blkg_rwstat_merge(&to->wait_time, &from->wait_time);
+       blkg_stat_merge(&from->time, &from->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+       blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
+       blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+       blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
+       blkg_stat_merge(&to->dequeue, &from->dequeue);
+       blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
+       blkg_stat_merge(&to->idle_time, &from->idle_time);
+       blkg_stat_merge(&to->empty_time, &from->empty_time);
+#endif
+}
+
+/*
+ * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors'
+ * recursive stats can still account for the amount used by this cfqg after
+ * it's gone.
+ */
+static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
+{
+       struct cfq_group *parent = cfqg_parent(cfqg);
+
+       lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
+
+       if (unlikely(!parent))
+               return;
+
+       cfqg_stats_merge(&parent->dead_stats, &cfqg->stats);
+       cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
+       cfqg_stats_reset(&cfqg->stats);
+       cfqg_stats_reset(&cfqg->dead_stats);
+}
+
  #else  /* CONFIG_CFQ_GROUP_IOSCHED */
  
+static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
  static inline void cfqg_get(struct cfq_group *cfqg) { }
  static inline void cfqg_put(struct cfq_group *cfqg) { }
  
@@ -851,13 +933,27 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
         return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
  }
  
-static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
+/**
+ * cfqg_scale_charge - scale disk time charge according to cfqg weight
+ * @charge: disk time being charged
+ * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
+ *
+ * Scale @charge according to @vfraction, which is in range (0, 1].  The
+ * scaling is inversely proportional.
+ *
+ * scaled = charge / vfraction
+ *
+ * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
+ */
+static inline u64 cfqg_scale_charge(unsigned long charge,
+                                   unsigned int vfraction)
  {
-       u64 d = delta << CFQ_SERVICE_SHIFT;
+       u64 c = charge << CFQ_SERVICE_SHIFT;    /* make it fixed point */
  
-       d = d * CFQ_WEIGHT_DEFAULT;
-       do_div(d, cfqg->weight);
-       return d;
+       /* charge / vfraction */
+       c <<= CFQ_SERVICE_SHIFT;
+       do_div(c, vfraction);
+       return c;
  }
  
  static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
@@ -913,9 +1009,7 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
  static inline unsigned
  cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
  {
-       struct cfq_rb_root *st = &cfqd->grp_service_tree;
-
-       return cfqd->cfq_target_latency * cfqg->weight / st->total_weight;
+       return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
  }
  
  static inline unsigned
@@ -1182,20 +1276,61 @@ static void
  cfq_update_group_weight(struct cfq_group *cfqg)
  {
         BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
+
         if (cfqg->new_weight) {
                 cfqg->weight = cfqg->new_weight;
                 cfqg->new_weight = 0;
         }
+
+       if (cfqg->new_leaf_weight) {
+               cfqg->leaf_weight = cfqg->new_leaf_weight;
+               cfqg->new_leaf_weight = 0;
+       }
  }
  
  static void
  cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
  {
+       unsigned int vfr = 1 << CFQ_SERVICE_SHIFT;      /* start with 1 */
+       struct cfq_group *pos = cfqg;
+       struct cfq_group *parent;
+       bool propagate;
+
+       /* add to the service tree */
         BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
  
         cfq_update_group_weight(cfqg);
         __cfq_group_service_tree_add(st, cfqg);
-       st->total_weight += cfqg->weight;
+
+       /*
+        * Activate @cfqg and calculate the portion of vfraction @cfqg is
+        * entitled to.  vfraction is calculated by walking the tree
+        * towards the root calculating the fraction it has at each level.
+        * The compounded ratio is how much vfraction @cfqg owns.
+        *
+        * Start with the proportion tasks in this cfqg has against active
+        * children cfqgs - its leaf_weight against children_weight.
+        */
+       propagate = !pos->nr_active++;
+       pos->children_weight += pos->leaf_weight;
+       vfr = vfr * pos->leaf_weight / pos->children_weight;
+
+       /*
+        * Compound ->weight walking up the tree.  Both activation and
+        * vfraction calculation are done in the same loop.  Propagation
+        * stops once an already activated node is met.  vfraction
+        * calculation should always continue to the root.
+        */
+       while ((parent = cfqg_parent(pos))) {
+               if (propagate) {
+                       propagate = !parent->nr_active++;
+                       parent->children_weight += pos->weight;
+               }
+               vfr = vfr * pos->weight / parent->children_weight;
+               pos = parent;
+       }
+
+       cfqg->vfraction = max_t(unsigned, vfr, 1);
  }
  
  static void
@@ -1226,7 +1361,32 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
  static void
  cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
  {
-       st->total_weight -= cfqg->weight;
+       struct cfq_group *pos = cfqg;
+       bool propagate;
+
+       /*
+        * Undo activation from cfq_group_service_tree_add().  Deactivate
+        * @cfqg and propagate deactivation upwards.
+        */
+       propagate = !--pos->nr_active;
+       pos->children_weight -= pos->leaf_weight;
+
+       while (propagate) {
+               struct cfq_group *parent = cfqg_parent(pos);
+
+               /* @pos has 0 nr_active at this point */
+               WARN_ON_ONCE(pos->children_weight);
+               pos->vfraction = 0;
+
+               if (!parent)
+                       break;
+
+               propagate = !--parent->nr_active;
+               parent->children_weight -= pos->weight;
+               pos = parent;
+       }
+
+       /* remove from the service tree */
         if (!RB_EMPTY_NODE(&cfqg->rb_node))
                 cfq_rb_erase(&cfqg->rb_node, st);
  }
@@ -1288,6 +1448,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
         unsigned int used_sl, charge, unaccounted_sl = 0;
         int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
                         - cfqg->service_tree_idle.count;
+       unsigned int vfr;
  
         BUG_ON(nr_sync < 0);
         used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
@@ -1297,10 +1458,15 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
         else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
                 charge = cfqq->allocated_slice;
  
-       /* Can't update vdisktime while group is on service tree */
+       /*
+        * Can't update vdisktime while on service tree and cfqg->vfraction
+        * is valid only while on it.  Cache vfr, leave the service tree,
+        * update vdisktime and go back on.  The re-addition to the tree
+        * will also update the weights as necessary.
+        */
+       vfr = cfqg->vfraction;
         cfq_group_service_tree_del(st, cfqg);
-       cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
-       /* If a new weight was requested, update now, off tree */
+       cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
         cfq_group_service_tree_add(st, cfqg);
  
         /* This group is being expired. Save the context */
@@ -1348,6 +1514,52 @@ static void cfq_pd_init(struct blkcg_gq *blkg)
  
         cfq_init_cfqg_base(cfqg);
         cfqg->weight = blkg->blkcg->cfq_weight;
+       cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
+}
+
+static void cfq_pd_offline(struct blkcg_gq *blkg)
+{
+       /*
+        * @blkg is going offline and will be ignored by
+        * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
+        * that they don't get lost.  If IOs complete after this point, the
+        * stats for them will be lost.  Oh well...
+        */
+       cfqg_stats_xfer_dead(blkg_to_cfqg(blkg));
+}
+
+/* offset delta from cfqg->stats to cfqg->dead_stats */
+static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
+                                       offsetof(struct cfq_group, stats);
+
+/* to be used by recursive prfill, sums live and dead stats recursively */
+static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+{
+       u64 sum = 0;
+
+       sum += blkg_stat_recursive_sum(pd, off);
+       sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
+       return sum;
+}
+
+/* to be used by recursive prfill, sums live and dead rwstats recursively */
+static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
+                                                      int off)
+{
+       struct blkg_rwstat a, b;
+
+       a = blkg_rwstat_recursive_sum(pd, off);
+       b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
+       blkg_rwstat_merge(&a, &b);
+       return a;
+}
+
+static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
+{
+       struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+
+       cfqg_stats_reset(&cfqg->stats);
+       cfqg_stats_reset(&cfqg->dead_stats);
  }
  
  /*
@@ -1404,6 +1616,26 @@ static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
         return 0;
  }
  
+static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
+                                         struct blkg_policy_data *pd, int off)
+{
+       struct cfq_group *cfqg = pd_to_cfqg(pd);
+
+       if (!cfqg->dev_leaf_weight)
+               return 0;
+       return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
+}
+
+static int cfqg_print_leaf_weight_device(struct cgroup *cgrp,
+                                        struct cftype *cft,
+                                        struct seq_file *sf)
+{
+       blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
+                         cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0,
+                         false);
+       return 0;
+}
+
  static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
                             struct seq_file *sf)
  {
@@ -1411,8 +1643,16 @@ static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
         return 0;
  }
  
-static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
-                                 const char *buf)
+static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft,
+                                struct seq_file *sf)
+{
+       seq_printf(sf, "%u\n",
+                  cgroup_to_blkcg(cgrp)->cfq_leaf_weight);
+       return 0;
+}
+
+static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
+                                   const char *buf, bool is_leaf_weight)
  {
         struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
         struct blkg_conf_ctx ctx;
@@ -1426,8 +1666,13 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
         ret = -EINVAL;
         cfqg = blkg_to_cfqg(ctx.blkg);
         if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
-               cfqg->dev_weight = ctx.v;
-               cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;
+               if (!is_leaf_weight) {
+                       cfqg->dev_weight = ctx.v;
+                       cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
+               } else {
+                       cfqg->dev_leaf_weight = ctx.v;
+                       cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
+               }
                 ret = 0;
         }
  
@@ -1435,29 +1680,63 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
         return ret;
  }
  
-static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
+                                 const char *buf)
+{
+       return __cfqg_set_weight_device(cgrp, cft, buf, false);
+}
+
+static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft,
+                                      const char *buf)
+{
+       return __cfqg_set_weight_device(cgrp, cft, buf, true);
+}
+
+static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
+                           bool is_leaf_weight)
  {
         struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
         struct blkcg_gq *blkg;
-       struct hlist_node *n;
  
         if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
                 return -EINVAL;
  
         spin_lock_irq(&blkcg->lock);
-       blkcg->cfq_weight = (unsigned int)val;
  
-       hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+       if (!is_leaf_weight)
+               blkcg->cfq_weight = val;
+       else
+               blkcg->cfq_leaf_weight = val;
+
+       hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
                 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
  
-               if (cfqg && !cfqg->dev_weight)
-                       cfqg->new_weight = blkcg->cfq_weight;
+               if (!cfqg)
+                       continue;
+
+               if (!is_leaf_weight) {
+                       if (!cfqg->dev_weight)
+                               cfqg->new_weight = blkcg->cfq_weight;
+               } else {
+                       if (!cfqg->dev_leaf_weight)
+                               cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
+               }
         }
  
         spin_unlock_irq(&blkcg->lock);
         return 0;
  }
  
+static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+       return __cfq_set_weight(cgrp, cft, val, false);
+}
+
+static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+       return __cfq_set_weight(cgrp, cft, val, true);
+}
+
  static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
                            struct seq_file *sf)
  {
@@ -1478,6 +1757,42 @@ static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
         return 0;
  }
  
+static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
+                                     struct blkg_policy_data *pd, int off)
+{
+       u64 sum = cfqg_stat_pd_recursive_sum(pd, off);
+
+       return __blkg_prfill_u64(sf, pd, sum);
+}
+
+static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
+                                       struct blkg_policy_data *pd, int off)
+{
+       struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off);
+
+       return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+
+static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft,
+                                    struct seq_file *sf)
+{
+       struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+       blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive,
+                         &blkcg_policy_cfq, cft->private, false);
+       return 0;
+}
+
+static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft,
+                                      struct seq_file *sf)
+{
+       struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+       blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive,
+                         &blkcg_policy_cfq, cft->private, true);
+       return 0;
+}
+
  #ifdef CONFIG_DEBUG_BLK_CGROUP
  static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
                                       struct blkg_policy_data *pd, int off)
@@ -1507,17 +1822,49 @@ static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
  #endif /* CONFIG_DEBUG_BLK_CGROUP */
  
  static struct cftype cfq_blkcg_files[] = {
+       /* on root, weight is mapped to leaf_weight */
+       {
+               .name = "weight_device",
+               .flags = CFTYPE_ONLY_ON_ROOT,
+               .read_seq_string = cfqg_print_leaf_weight_device,
+               .write_string = cfqg_set_leaf_weight_device,
+               .max_write_len = 256,
+       },
+       {
+               .name = "weight",
+               .flags = CFTYPE_ONLY_ON_ROOT,
+               .read_seq_string = cfq_print_leaf_weight,
+               .write_u64 = cfq_set_leaf_weight,
+       },
+
+       /* no such mapping necessary for !roots */
         {
                 .name = "weight_device",
+               .flags = CFTYPE_NOT_ON_ROOT,
                 .read_seq_string = cfqg_print_weight_device,
                 .write_string = cfqg_set_weight_device,
                 .max_write_len = 256,
         },
         {
                 .name = "weight",
+               .flags = CFTYPE_NOT_ON_ROOT,
                 .read_seq_string = cfq_print_weight,
                 .write_u64 = cfq_set_weight,
         },
+
+       {
+               .name = "leaf_weight_device",
+               .read_seq_string = cfqg_print_leaf_weight_device,
+               .write_string = cfqg_set_leaf_weight_device,
+               .max_write_len = 256,
+       },
+       {
+               .name = "leaf_weight",
+               .read_seq_string = cfq_print_leaf_weight,
+               .write_u64 = cfq_set_leaf_weight,
+       },
+
+       /* statistics, covers only the tasks in the cfqg */
         {
                 .name = "time",
                 .private = offsetof(struct cfq_group, stats.time),
@@ -1558,6 +1905,48 @@ static struct cftype cfq_blkcg_files[] = {
                 .private = offsetof(struct cfq_group, stats.queued),
                 .read_seq_string = cfqg_print_rwstat,
         },
+
+       /* the same statictics which cover the cfqg and its descendants */
+       {
+               .name = "time_recursive",
+               .private = offsetof(struct cfq_group, stats.time),
+               .read_seq_string = cfqg_print_stat_recursive,
+       },
+       {
+               .name = "sectors_recursive",
+               .private = offsetof(struct cfq_group, stats.sectors),
+               .read_seq_string = cfqg_print_stat_recursive,
+       },
+       {
+               .name = "io_service_bytes_recursive",
+               .private = offsetof(struct cfq_group, stats.service_bytes),
+               .read_seq_string = cfqg_print_rwstat_recursive,
+       },
+       {
+               .name = "io_serviced_recursive",
+               .private = offsetof(struct cfq_group, stats.serviced),
+               .read_seq_string = cfqg_print_rwstat_recursive,
+       },
+       {
+               .name = "io_service_time_recursive",
+               .private = offsetof(struct cfq_group, stats.service_time),
+               .read_seq_string = cfqg_print_rwstat_recursive,
+       },
+       {
+               .name = "io_wait_time_recursive",
+               .private = offsetof(struct cfq_group, stats.wait_time),
+               .read_seq_string = cfqg_print_rwstat_recursive,
+       },
+       {
+               .name = "io_merged_recursive",
+               .private = offsetof(struct cfq_group, stats.merged),
+               .read_seq_string = cfqg_print_rwstat_recursive,
+       },
+       {
+               .name = "io_queued_recursive",
+               .private = offsetof(struct cfq_group, stats.queued),
+               .read_seq_string = cfqg_print_rwstat_recursive,
+       },
  #ifdef CONFIG_DEBUG_BLK_CGROUP
         {
                 .name = "avg_queue_size",
@@ -3204,6 +3593,8 @@ retry:
                         spin_lock_irq(cfqd->queue->queue_lock);
                         if (new_cfqq)
                                 goto retry;
+                       else
+                               return &cfqd->oom_cfqq;
                 } else {
                         cfqq = kmem_cache_alloc_node(cfq_pool,
                                         gfp_mask | __GFP_ZERO,
@@ -3992,6 +4383,7 @@ static int cfq_init_queue(struct request_queue *q)
         cfq_init_cfqg_base(cfqd->root_group);
  #endif
         cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
+       cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
  
         /*
          * Not strictly needed (since RB_ROOT just clears the node and we
@@ -4176,6 +4568,7 @@ static struct blkcg_policy blkcg_policy_cfq = {
         .cftypes                = cfq_blkcg_files,
  
         .pd_init_fn             = cfq_pd_init,
+       .pd_offline_fn          = cfq_pd_offline,
         .pd_reset_stats_fn      = cfq_pd_reset_stats,
  };
  #endif