struct rb_root rb;
struct rb_node *left;
unsigned count;
- unsigned total_weight;
u64 min_vdisktime;
struct cfq_ttime ttime;
};
/* group service_tree key */
u64 vdisktime;
+
+ /*
+ * The number of active cfqgs and sum of their weights under this
+ * cfqg. This covers this cfqg's leaf_weight and all children's
+ * weights, but does not cover weights of further descendants.
+ *
+ * If a cfqg is on the service tree, it's active. An active cfqg
+ * also activates its parent and contributes to the children_weight
+ * of the parent.
+ */
+ int nr_active;
+ unsigned int children_weight;
+
+ /*
+ * vfraction is the fraction of vdisktime that the tasks in this
+ * cfqg are entitled to. This is determined by compounding the
+ * ratios walking up from this cfqg to the root.
+ *
+ * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
+ * vfractions on a service tree is approximately 1. The sum may
+ * deviate a bit due to rounding errors and fluctuations caused by
+ * cfqgs entering and leaving the service tree.
+ */
+ unsigned int vfraction;
+
+ /*
+ * There are two weights - (internal) weight is the weight of this
+ * cfqg against the sibling cfqgs. leaf_weight is the weight of
+ * this cfqg against the child cfqgs. For the root cfqg, both
+ * weights are kept in sync for backward compatibility.
+ */
unsigned int weight;
unsigned int new_weight;
unsigned int dev_weight;
+ unsigned int leaf_weight;
+ unsigned int new_leaf_weight;
+ unsigned int dev_leaf_weight;
+
/* number of cfqq currently on this group */
int nr_cfqq;
/* number of requests that are on the dispatch list or inside driver */
int dispatched;
struct cfq_ttime ttime;
- struct cfqg_stats stats;
+ struct cfqg_stats stats; /* stats for this cfqg */
+ struct cfqg_stats dead_stats; /* stats pushed from dead children */
};
struct cfq_io_cq {
{
struct cfqg_stats *stats = &cfqg->stats;
- if (blkg_rwstat_sum(&stats->queued))
+ if (blkg_rwstat_total(&stats->queued))
return;
/*
struct cfqg_stats *stats = &cfqg->stats;
blkg_stat_add(&stats->avg_queue_size_sum,
- blkg_rwstat_sum(&stats->queued));
+ blkg_rwstat_total(&stats->queued));
blkg_stat_add(&stats->avg_queue_size_samples, 1);
cfqg_stats_update_group_wait_time(stats);
}
return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
}
+/*
+ * Return the cfqg sitting on the parent of @cfqg's blkg, or NULL when
+ * that blkg has no parent.
+ */
+static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
+{
+	struct blkcg_gq *parent_blkg = cfqg_to_blkg(cfqg)->parent;
+
+	if (!parent_blkg)
+		return NULL;
+
+	return blkg_to_cfqg(parent_blkg);
+}
+
static inline void cfqg_get(struct cfq_group *cfqg)
{
return blkg_get(cfqg_to_blkg(cfqg));
io_start_time - start_time);
}
-static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
+/* @stats = 0 */
+static void cfqg_stats_reset(struct cfqg_stats *stats)
{
- struct cfq_group *cfqg = blkg_to_cfqg(blkg);
- struct cfqg_stats *stats = &cfqg->stats;
-
/* queued stats shouldn't be cleared */
blkg_rwstat_reset(&stats->service_bytes);
blkg_rwstat_reset(&stats->serviced);
#endif
}
+/* @to += @from */
+static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from)
+{
+	/*
+	 * queued stats are not merged here; cfqg_stats_reset() doesn't
+	 * clear them either.
+	 *
+	 * NOTE(review): ->sectors, which backs "sectors_recursive", is not
+	 * merged here either - confirm whether that is intentional.
+	 */
+	blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
+	blkg_rwstat_merge(&to->serviced, &from->serviced);
+	blkg_rwstat_merge(&to->merged, &from->merged);
+	blkg_rwstat_merge(&to->service_time, &from->service_time);
+	blkg_rwstat_merge(&to->wait_time, &from->wait_time);
+	/* destination must be @to; merging @from into itself loses the time */
+	blkg_stat_merge(&to->time, &from->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
+	blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+	blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
+	blkg_stat_merge(&to->dequeue, &from->dequeue);
+	blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
+	blkg_stat_merge(&to->idle_time, &from->idle_time);
+	blkg_stat_merge(&to->empty_time, &from->empty_time);
+#endif
+}
+
+/*
+ * Hand everything @cfqg has accumulated - both its own stats and the
+ * dead_stats it inherited earlier - over to the parent's dead_stats, then
+ * zero the sources.  This keeps the ancestors' recursive stats accounting
+ * for @cfqg's usage after @cfqg is gone.  Does nothing when @cfqg has no
+ * parent.
+ */
+static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
+{
+	struct cfq_group *parent = cfqg_parent(cfqg);
+	struct cfqg_stats *sink;
+
+	lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
+
+	if (unlikely(!parent))
+		return;
+
+	sink = &parent->dead_stats;
+
+	cfqg_stats_merge(sink, &cfqg->stats);
+	cfqg_stats_merge(sink, &cfqg->dead_stats);
+	cfqg_stats_reset(&cfqg->stats);
+	cfqg_stats_reset(&cfqg->dead_stats);
+}
+
#else /* CONFIG_CFQ_GROUP_IOSCHED */
+static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } /* !CONFIG_CFQ_GROUP_IOSCHED stub: always NULL */
static inline void cfqg_get(struct cfq_group *cfqg) { }
static inline void cfqg_put(struct cfq_group *cfqg) { }
return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}
-static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
+/**
+ * cfqg_scale_charge - scale disk time charge according to cfqg weight
+ * @charge: disk time being charged
+ * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
+ *
+ * Scale @charge according to @vfraction, which is in range (0, 1].  The
+ * scaling is inversely proportional.
+ *
+ * scaled = charge / vfraction
+ *
+ * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.  @vfraction is
+ * used as a divisor and therefore must not be 0.
+ */
+static inline u64 cfqg_scale_charge(unsigned long charge,
+				    unsigned int vfraction)
{
- u64 d = delta << CFQ_SERVICE_SHIFT;
+	/*
+	 * make it fixed point; widen to u64 before shifting so that the
+	 * shift isn't carried out in a 32bit unsigned long
+	 */
+	u64 c = (u64)charge << CFQ_SERVICE_SHIFT;
- d = d * CFQ_WEIGHT_DEFAULT;
- do_div(d, cfqg->weight);
- return d;
+	/* charge / vfraction */
+	c <<= CFQ_SERVICE_SHIFT;
+	do_div(c, vfraction);
+	return c;
}
static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
static inline unsigned
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
- struct cfq_rb_root *st = &cfqd->grp_service_tree;
-
- return cfqd->cfq_target_latency * cfqg->weight / st->total_weight;
+	/*
+	 * Target latency multiplied by @cfqg's fixed-point vfraction, with
+	 * the fixed-point scale shifted back out.
+	 */
+	return (cfqd->cfq_target_latency * cfqg->vfraction) >>
+		CFQ_SERVICE_SHIFT;
}
static inline unsigned
cfq_update_group_weight(struct cfq_group *cfqg)
{
BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
+
if (cfqg->new_weight) {
cfqg->weight = cfqg->new_weight;
cfqg->new_weight = 0;
}
+
+ if (cfqg->new_leaf_weight) {
+ cfqg->leaf_weight = cfqg->new_leaf_weight;
+ cfqg->new_leaf_weight = 0;
+ }
}
static void
cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
+ unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */
+ struct cfq_group *pos = cfqg;
+ struct cfq_group *parent;
+ bool propagate;
+
+ /*
+ * Add @cfqg to the service tree @st.  @cfqg must not be on a tree
+ * yet and any pending weight update is applied before insertion.
+ */
BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
cfq_update_group_weight(cfqg);
__cfq_group_service_tree_add(st, cfqg);
- st->total_weight += cfqg->weight;
+
+ /*
+ * Activate @cfqg and calculate the portion of vfraction @cfqg is
+ * entitled to. vfraction is calculated by walking the tree
+ * towards the root calculating the fraction it has at each level.
+ * The compounded ratio is how much vfraction @cfqg owns.
+ *
+ * Start with the proportion tasks in this cfqg has against active
+ * children cfqgs - its leaf_weight against children_weight.
+ *
+ * @propagate is set when nr_active was 0 before this activation,
+ * i.e. when the ancestors need to be activated in the loop below.
+ */
+ propagate = !pos->nr_active++;
+ pos->children_weight += pos->leaf_weight;
+ vfr = vfr * pos->leaf_weight / pos->children_weight;
+
+ /*
+ * Compound ->weight walking up the tree. Both activation and
+ * vfraction calculation are done in the same loop. Propagation
+ * stops once an already activated node is met. vfraction
+ * calculation should always continue to the root.
+ *
+ * While propagating, @pos's weight is added to its parent's
+ * children_weight before the ratio at that level is taken.
+ */
+ while ((parent = cfqg_parent(pos))) {
+ if (propagate) {
+ propagate = !parent->nr_active++;
+ parent->children_weight += pos->weight;
+ }
+ vfr = vfr * pos->weight / parent->children_weight;
+ pos = parent;
+ }
+
+ cfqg->vfraction = max_t(unsigned, vfr, 1); /* clamp, never store 0 */
}
static void
static void
cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
- st->total_weight -= cfqg->weight;
+ struct cfq_group *pos = cfqg;
+ bool propagate;
+
+ /*
+ * Undo activation from cfq_group_service_tree_add(). Deactivate
+ * @cfqg and propagate deactivation upwards.
+ *
+ * The walk goes on only while the node just handled dropped to 0
+ * nr_active; every node that becomes inactive this way gets its
+ * vfraction cleared.
+ */
+ propagate = !--pos->nr_active;
+ pos->children_weight -= pos->leaf_weight;
+
+ while (propagate) {
+ struct cfq_group *parent = cfqg_parent(pos);
+
+ /* @pos has 0 nr_active at this point */
+ WARN_ON_ONCE(pos->children_weight);
+ pos->vfraction = 0;
+
+ if (!parent)
+ break;
+
+ propagate = !--parent->nr_active;
+ parent->children_weight -= pos->weight;
+ pos = parent;
+ }
+
+ /* remove from the service tree, if @cfqg is linked on it */
if (!RB_EMPTY_NODE(&cfqg->rb_node))
cfq_rb_erase(&cfqg->rb_node, st);
}
unsigned int used_sl, charge, unaccounted_sl = 0;
int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
- cfqg->service_tree_idle.count;
+ unsigned int vfr;
BUG_ON(nr_sync < 0);
used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
charge = cfqq->allocated_slice;
- /* Can't update vdisktime while group is on service tree */
+ /*
+ * Can't update vdisktime while on service tree and cfqg->vfraction
+ * is valid only while on it. Cache vfr, leave the service tree,
+ * update vdisktime and go back on. The re-addition to the tree
+ * will also update the weights as necessary.
+ */
+ vfr = cfqg->vfraction;
cfq_group_service_tree_del(st, cfqg);
- cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
- /* If a new weight was requested, update now, off tree */
+ cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
cfq_group_service_tree_add(st, cfqg);
/* This group is being expired. Save the context */
cfq_init_cfqg_base(cfqg);
cfqg->weight = blkg->blkcg->cfq_weight;
+ cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
+}
+
+/* ->pd_offline_fn of the cfq blkcg policy: salvage @blkg's stats */
+static void cfq_pd_offline(struct blkcg_gq *blkg)
+{
+	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+
+	/*
+	 * Once offline, @blkg is ignored by blkg_[rw]stat_recursive_sum().
+	 * Move its stats to the parent now so that they aren't lost.  Stats
+	 * for IOs completing after this point are lost.  Oh well...
+	 */
+	cfqg_stats_xfer_dead(cfqg);
+}
+
+/*
+ * offset delta from cfqg->stats to cfqg->dead_stats; adding it to the
+ * offset of a field inside ->stats gives the offset of the same field
+ * inside ->dead_stats
+ */
+static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
+ offsetof(struct cfq_group, stats);
+
+/*
+ * Helper for the recursive prfill callbacks: recursive sum of the blkg_stat
+ * at @off plus the recursive sum of its counterpart in dead_stats.
+ */
+static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+{
+	u64 live = blkg_stat_recursive_sum(pd, off);
+	u64 dead = blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
+
+	return live + dead;
+}
+
+/*
+ * Helper for the recursive prfill callbacks: recursive sum of the
+ * blkg_rwstat at @off merged with the recursive sum of its counterpart in
+ * dead_stats.
+ */
+static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
+							int off)
+{
+	struct blkg_rwstat live = blkg_rwstat_recursive_sum(pd, off);
+	struct blkg_rwstat dead = blkg_rwstat_recursive_sum(pd,
+						off + dead_stats_off_delta);
+
+	blkg_rwstat_merge(&live, &dead);
+	return live;
+}
+
+/* ->pd_reset_stats_fn: reset both the live stats and the dead_stats */
+static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
+{
+	struct cfq_group *grp = blkg_to_cfqg(blkg);
+	struct cfqg_stats *live = &grp->stats;
+	struct cfqg_stats *dead = &grp->dead_stats;
+
+	cfqg_stats_reset(live);
+	cfqg_stats_reset(dead);
}
/*
return 0;
}
+/*
+ * prfill callback printing the per-device leaf weight of @pd's cfqg.
+ * Nothing is printed, and 0 is returned, when no per-device leaf weight
+ * is set.
+ */
+static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
+					  struct blkg_policy_data *pd, int off)
+{
+	unsigned int dev_leaf_weight = pd_to_cfqg(pd)->dev_leaf_weight;
+
+	if (dev_leaf_weight)
+		return __blkg_prfill_u64(sf, pd, dev_leaf_weight);
+
+	return 0;
+}
+
+/* seq_file read handler listing the per-device leaf weights of @cgrp */
+static int cfqg_print_leaf_weight_device(struct cgroup *cgrp,
+					 struct cftype *cft,
+					 struct seq_file *sf)
+{
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_leaf_weight_device,
+			  &blkcg_policy_cfq, 0, false);
+	return 0;
+}
+
static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
struct seq_file *sf)
{
return 0;
}
-static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
- const char *buf)
+/* seq_file read handler printing the blkcg-wide cfq_leaf_weight of @cgrp */
+static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft,
+				 struct seq_file *sf)
+{
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+	seq_printf(sf, "%u\n", blkcg->cfq_leaf_weight);
+	return 0;
+}
+
+static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
+ const char *buf, bool is_leaf_weight)
{
struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
struct blkg_conf_ctx ctx;
ret = -EINVAL;
cfqg = blkg_to_cfqg(ctx.blkg);
if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
- cfqg->dev_weight = ctx.v;
- cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;
+ if (!is_leaf_weight) {
+ cfqg->dev_weight = ctx.v;
+ cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
+ } else {
+ cfqg->dev_leaf_weight = ctx.v;
+ cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
+ }
ret = 0;
}
return ret;
}
-static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+/* write handler configuring the per-device (internal) weight */
+static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
+				  const char *buf)
+{
+	const bool is_leaf_weight = false;
+
+	return __cfqg_set_weight_device(cgrp, cft, buf, is_leaf_weight);
+}
+
+/* write handler configuring the per-device leaf weight */
+static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft,
+				       const char *buf)
+{
+	const bool is_leaf_weight = true;
+
+	return __cfqg_set_weight_device(cgrp, cft, buf, is_leaf_weight);
+}
+
+static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
+ bool is_leaf_weight) /* true: set leaf_weight, false: set weight */
{
struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
struct blkcg_gq *blkg;
- struct hlist_node *n;
if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
return -EINVAL;
spin_lock_irq(&blkcg->lock);
- blkcg->cfq_weight = (unsigned int)val;
- hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+ if (!is_leaf_weight)
+ blkcg->cfq_weight = val; /* @val was range-checked above */
+ else
+ blkcg->cfq_leaf_weight = val;
+
+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { /* each blkg on blkcg->blkg_list */
struct cfq_group *cfqg = blkg_to_cfqg(blkg);
- if (cfqg && !cfqg->dev_weight)
- cfqg->new_weight = blkcg->cfq_weight;
+ if (!cfqg)
+ continue; /* no cfqg for this blkg, nothing to update */
+
+ if (!is_leaf_weight) {
+ if (!cfqg->dev_weight) /* a per-device weight is left untouched */
+ cfqg->new_weight = blkcg->cfq_weight; /* applied by cfq_update_group_weight() */
+ } else {
+ if (!cfqg->dev_leaf_weight) /* a per-device leaf weight is left untouched */
+ cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
+ }
}
spin_unlock_irq(&blkcg->lock);
return 0;
}
+/* write handler setting the blkcg-wide (internal) weight */
+static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	const bool is_leaf_weight = false;
+
+	return __cfq_set_weight(cgrp, cft, val, is_leaf_weight);
+}
+
+/* write handler setting the blkcg-wide leaf weight */
+static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	const bool is_leaf_weight = true;
+
+	return __cfq_set_weight(cgrp, cft, val, is_leaf_weight);
+}
+
static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
struct seq_file *sf)
{
return 0;
}
+/* prfill callback printing the live + dead recursive sum of a blkg_stat */
+static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	return __blkg_prfill_u64(sf, pd, cfqg_stat_pd_recursive_sum(pd, off));
+}
+
+/* prfill callback printing the live + dead recursive sum of a blkg_rwstat */
+static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
+					struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat total;
+
+	total = cfqg_rwstat_pd_recursive_sum(pd, off);
+	return __blkg_prfill_rwstat(sf, pd, &total);
+}
+
+/*
+ * seq_file read handler for the recursive blkg_stat files; the stat to
+ * show is selected by the offset stored in @cft->private.
+ */
+static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft,
+				     struct seq_file *sf)
+{
+	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), cfqg_prfill_stat_recursive,
+			  &blkcg_policy_cfq, cft->private, false);
+	return 0;
+}
+
+/*
+ * seq_file read handler for the recursive blkg_rwstat files; the stat to
+ * show is selected by the offset stored in @cft->private.
+ */
+static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft,
+				       struct seq_file *sf)
+{
+	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
+			  cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
+			  cft->private, true);
+	return 0;
+}
+
#ifdef CONFIG_DEBUG_BLK_CGROUP
static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
#endif /* CONFIG_DEBUG_BLK_CGROUP */
static struct cftype cfq_blkcg_files[] = {
+ /* on root, weight is mapped to leaf_weight */
+ {
+ .name = "weight_device",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .read_seq_string = cfqg_print_leaf_weight_device,
+ .write_string = cfqg_set_leaf_weight_device,
+ .max_write_len = 256,
+ },
+ {
+ .name = "weight",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .read_seq_string = cfq_print_leaf_weight,
+ .write_u64 = cfq_set_leaf_weight,
+ },
+
+ /* no such mapping necessary for !roots */
{
.name = "weight_device",
+ .flags = CFTYPE_NOT_ON_ROOT,
.read_seq_string = cfqg_print_weight_device,
.write_string = cfqg_set_weight_device,
.max_write_len = 256,
},
{
.name = "weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
.read_seq_string = cfq_print_weight,
.write_u64 = cfq_set_weight,
},
+
+ {
+ .name = "leaf_weight_device",
+ .read_seq_string = cfqg_print_leaf_weight_device,
+ .write_string = cfqg_set_leaf_weight_device,
+ .max_write_len = 256,
+ },
+ {
+ .name = "leaf_weight",
+ .read_seq_string = cfq_print_leaf_weight,
+ .write_u64 = cfq_set_leaf_weight,
+ },
+
+ /* statistics, covers only the tasks in the cfqg */
{
.name = "time",
.private = offsetof(struct cfq_group, stats.time),
.private = offsetof(struct cfq_group, stats.queued),
.read_seq_string = cfqg_print_rwstat,
},
+
+ /* the same statistics which cover the cfqg and its descendants */
+ {
+ .name = "time_recursive",
+ .private = offsetof(struct cfq_group, stats.time),
+ .read_seq_string = cfqg_print_stat_recursive,
+ },
+ {
+ .name = "sectors_recursive",
+ .private = offsetof(struct cfq_group, stats.sectors),
+ .read_seq_string = cfqg_print_stat_recursive,
+ },
+ {
+ .name = "io_service_bytes_recursive",
+ .private = offsetof(struct cfq_group, stats.service_bytes),
+ .read_seq_string = cfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "io_serviced_recursive",
+ .private = offsetof(struct cfq_group, stats.serviced),
+ .read_seq_string = cfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "io_service_time_recursive",
+ .private = offsetof(struct cfq_group, stats.service_time),
+ .read_seq_string = cfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "io_wait_time_recursive",
+ .private = offsetof(struct cfq_group, stats.wait_time),
+ .read_seq_string = cfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "io_merged_recursive",
+ .private = offsetof(struct cfq_group, stats.merged),
+ .read_seq_string = cfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "io_queued_recursive",
+ .private = offsetof(struct cfq_group, stats.queued),
+ .read_seq_string = cfqg_print_rwstat_recursive,
+ },
#ifdef CONFIG_DEBUG_BLK_CGROUP
{
.name = "avg_queue_size",
spin_lock_irq(cfqd->queue->queue_lock);
if (new_cfqq)
goto retry;
+ else
+ return &cfqd->oom_cfqq;
} else {
cfqq = kmem_cache_alloc_node(cfq_pool,
gfp_mask | __GFP_ZERO,
cfq_init_cfqg_base(cfqd->root_group);
#endif
cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
+ cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
/*
* Not strictly needed (since RB_ROOT just clears the node and we
.cftypes = cfq_blkcg_files,
.pd_init_fn = cfq_pd_init,
+ .pd_offline_fn = cfq_pd_offline,
.pd_reset_stats_fn = cfq_pd_reset_stats,
};
#endif