}
}
-static int grow_buffers(struct stripe_head *sh)
+static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
{
int i;
int num = sh->raid_conf->pool_size;
for (i = 0; i < num; i++) {
struct page *page;
- if (!(page = alloc_page(GFP_KERNEL))) {
+ if (!(page = alloc_page(gfp))) {
return 1;
}
sh->dev[i].page = page;
*(conf->hash_locks + hash));
sh = __find_stripe(conf, sector, conf->generation - previous);
if (!sh) {
- if (!conf->inactive_blocked)
+ if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
sh = get_free_stripe(conf, hash);
+ if (!sh && llist_empty(&conf->released_stripes) &&
+ !test_bit(R5_DID_ALLOC, &conf->cache_state))
+ set_bit(R5_ALLOC_MORE,
+ &conf->cache_state);
+ }
if (noblock && sh == NULL)
break;
if (!sh) {
- conf->inactive_blocked = 1;
+ set_bit(R5_INACTIVE_BLOCKED,
+ &conf->cache_state);
wait_event_lock_irq(
conf->wait_for_stripe,
!list_empty(conf->inactive_list + hash) &&
(atomic_read(&conf->active_stripes)
< (conf->max_nr_stripes * 3 / 4)
- || !conf->inactive_blocked),
+ || !test_bit(R5_INACTIVE_BLOCKED,
+ &conf->cache_state)),
*(conf->hash_locks + hash));
- conf->inactive_blocked = 0;
+ clear_bit(R5_INACTIVE_BLOCKED,
+ &conf->cache_state);
} else {
init_stripe(sh, sector, previous);
atomic_inc(&sh->count);
pr_debug("skip op %ld on disc %d for sector %llu\n",
bi->bi_rw, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ if (sh->batch_head)
+ set_bit(STRIPE_BATCH_ERR,
+ &sh->batch_head->state);
set_bit(STRIPE_HANDLE, &sh->state);
}
* destination buffer is recorded in srcs[count] and the Q destination
* is recorded in srcs[count+1]].
*/
-static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
+static int set_syndrome_sources(struct page **srcs,
+ struct stripe_head *sh,
+ int srctype)
{
int disks = sh->disks;
int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
i = d0_idx;
do {
int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+ struct r5dev *dev = &sh->dev[i];
- srcs[slot] = sh->dev[i].page;
+ if (i == sh->qd_idx || i == sh->pd_idx ||
+ (srctype == SYNDROME_SRC_ALL) ||
+ (srctype == SYNDROME_SRC_WANT_DRAIN &&
+ test_bit(R5_Wantdrain, &dev->flags)) ||
+ (srctype == SYNDROME_SRC_WRITTEN &&
+ dev->written))
+ srcs[slot] = sh->dev[i].page;
i = raid6_next_disk(i, disks);
} while (i != d0_idx);
atomic_inc(&sh->count);
if (target == qd_idx) {
- count = set_syndrome_sources(blocks, sh);
+ count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
blocks[count] = NULL; /* regenerating p is not necessary */
BUG_ON(blocks[count+1] != dest); /* q should already be set */
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
&submit);
- count = set_syndrome_sources(blocks, sh);
+ count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
init_async_submit(&submit, ASYNC_TX_FENCE, tx,
ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
}
static struct dma_async_tx_descriptor *
-ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
- struct dma_async_tx_descriptor *tx)
+ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx)
{
int disks = sh->disks;
struct page **xor_srcs = to_addr_page(percpu, 0);
return tx;
}
+static struct dma_async_tx_descriptor *
+ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx)
+{
+ struct page **blocks = to_addr_page(percpu, 0);
+ int count;
+ struct async_submit_ctl submit;
+
+ pr_debug("%s: stripe %llu\n", __func__,
+ (unsigned long long)sh->sector);
+
+ count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
+
+ init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
+ ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
+ tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+
+ return tx;
+}
+
static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
int count, i, j = 0;
struct stripe_head *head_sh = sh;
int last_stripe;
+ int synflags;
+ unsigned long txflags;
pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
again:
blocks = to_addr_page(percpu, j);
- count = set_syndrome_sources(blocks, sh);
+
+ if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+ synflags = SYNDROME_SRC_WRITTEN;
+ txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
+ } else {
+ synflags = SYNDROME_SRC_ALL;
+ txflags = ASYNC_TX_ACK;
+ }
+
+ count = set_syndrome_sources(blocks, sh, synflags);
last_stripe = !head_sh->batch_head ||
list_first_entry(&sh->batch_list,
struct stripe_head, batch_list) == head_sh;
if (last_stripe) {
atomic_inc(&head_sh->count);
- init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+ init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
head_sh, to_addr_conv(sh, percpu, j));
} else
init_async_submit(&submit, 0, tx, NULL, NULL,
(unsigned long long)sh->sector, checkp);
BUG_ON(sh->batch_head);
- count = set_syndrome_sources(srcs, sh);
+ count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
if (!checkp)
srcs[count] = NULL;
async_tx_ack(tx);
}
- if (test_bit(STRIPE_OP_PREXOR, &ops_request))
- tx = ops_run_prexor(sh, percpu, tx);
+ if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
+ if (level < 6)
+ tx = ops_run_prexor5(sh, percpu, tx);
+ else
+ tx = ops_run_prexor6(sh, percpu, tx);
+ }
if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
tx = ops_run_biodrain(sh, tx);
put_cpu();
}
-static int grow_one_stripe(struct r5conf *conf, int hash)
+static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
{
struct stripe_head *sh;
- sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
+ sh = kmem_cache_zalloc(conf->slab_cache, gfp);
if (!sh)
return 0;
spin_lock_init(&sh->stripe_lock);
- if (grow_buffers(sh)) {
+ if (grow_buffers(sh, gfp)) {
shrink_buffers(sh);
kmem_cache_free(conf->slab_cache, sh);
return 0;
}
- sh->hash_lock_index = hash;
+ sh->hash_lock_index =
+ conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
/* we just created an active stripe so... */
atomic_set(&sh->count, 1);
atomic_inc(&conf->active_stripes);
INIT_LIST_HEAD(&sh->batch_list);
sh->batch_head = NULL;
release_stripe(sh);
+ conf->max_nr_stripes++;
return 1;
}
{
struct kmem_cache *sc;
int devs = max(conf->raid_disks, conf->previous_raid_disks);
- int hash;
if (conf->mddev->gendisk)
sprintf(conf->cache_name[0],
return 1;
conf->slab_cache = sc;
conf->pool_size = devs;
- hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
- while (num--) {
- if (!grow_one_stripe(conf, hash))
+ while (num--)
+ if (!grow_one_stripe(conf, GFP_KERNEL))
return 1;
- conf->max_nr_stripes++;
- hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
- }
+
return 0;
}
return err;
}
-static int drop_one_stripe(struct r5conf *conf, int hash)
+static int drop_one_stripe(struct r5conf *conf)
{
struct stripe_head *sh;
+ int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
spin_lock_irq(conf->hash_locks + hash);
sh = get_free_stripe(conf, hash);
shrink_buffers(sh);
kmem_cache_free(conf->slab_cache, sh);
atomic_dec(&conf->active_stripes);
+ conf->max_nr_stripes--;
return 1;
}
static void shrink_stripes(struct r5conf *conf)
{
- int hash;
- for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
- while (drop_one_stripe(conf, hash))
- ;
+ while (conf->max_nr_stripes &&
+ drop_one_stripe(conf))
+ ;
if (conf->slab_cache)
kmem_cache_destroy(conf->slab_cache);
}
rdev_dec_pending(rdev, conf->mddev);
+ if (sh->batch_head && !uptodate)
+ set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
+
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
int rcw, int expand)
{
- int i, pd_idx = sh->pd_idx, disks = sh->disks;
+ int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
struct r5conf *conf = sh->raid_conf;
int level = conf->level;
if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
atomic_inc(&conf->pending_full_writes);
} else {
- BUG_ON(level == 6);
BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
+ BUG_ON(level == 6 &&
+ (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
+ test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (i == pd_idx)
+ if (i == pd_idx || i == qd_idx)
continue;
if (dev->towrite &&
struct stripe_head, batch_list);
list_del_init(&sh->batch_list);
- sh->state = head_sh->state & (~((1 << STRIPE_ACTIVE) |
- (1 << STRIPE_PREREAD_ACTIVE)));
+ set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+ head_sh->state & ~((1 << STRIPE_ACTIVE) |
+ (1 << STRIPE_PREREAD_ACTIVE) |
+ STRIPE_EXPAND_SYNC_FLAG));
sh->check_state = head_sh->check_state;
sh->reconstruct_state = head_sh->reconstruct_state;
for (i = 0; i < sh->disks; i++) {
spin_lock_irq(&sh->stripe_lock);
sh->batch_head = NULL;
spin_unlock_irq(&sh->stripe_lock);
+ if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
+ set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
}
head_sh->batch_head = NULL;
spin_unlock_irq(&head_sh->stripe_lock);
wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
+ if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
+ set_bit(STRIPE_HANDLE, &head_sh->state);
}
static void handle_stripe_dirtying(struct r5conf *conf,
int rmw = 0, rcw = 0, i;
sector_t recovery_cp = conf->mddev->recovery_cp;
- /* RAID6 requires 'rcw' in current implementation.
- * Otherwise, check whether resync is now happening or should start.
+ /* Check whether resync is now happening or should start.
* If yes, then the array is dirty (after unclean shutdown or
* initial creation), so parity in some stripes might be inconsistent.
* In this case, we need to always do reconstruct-write, to ensure
* that in case of drive failure or read-error correction, we
* generate correct data from the parity.
*/
- if (conf->max_degraded == 2 ||
+ if (conf->rmw_level == PARITY_DISABLE_RMW ||
(recovery_cp < MaxSector && sh->sector >= recovery_cp &&
s->failed == 0)) {
/* Calculate the real rcw later - for now make it
* look like rcw is cheaper
*/
rcw = 1; rmw = 2;
- pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
- conf->max_degraded, (unsigned long long)recovery_cp,
+ pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
+ conf->rmw_level, (unsigned long long)recovery_cp,
(unsigned long long)sh->sector);
} else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */
struct r5dev *dev = &sh->dev[i];
- if ((dev->towrite || i == sh->pd_idx) &&
+ if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
rmw += 2*disks; /* cannot read it */
}
/* Would I have to read this buffer for reconstruct_write */
- if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+ if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+ i != sh->pd_idx && i != sh->qd_idx &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
pr_debug("for sector %llu, rmw=%d rcw=%d\n",
(unsigned long long)sh->sector, rmw, rcw);
set_bit(STRIPE_HANDLE, &sh->state);
- if (rmw < rcw && rmw > 0) {
+ if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
if (conf->mddev->queue)
blk_add_trace_msg(conf->mddev->queue,
(unsigned long long)sh->sector, rmw);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if ((dev->towrite || i == sh->pd_idx) &&
+ if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags)) &&
}
}
}
- if (rcw <= rmw && rcw > 0) {
+ if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
/* want reconstruct write, but need to get some data */
int qread =0;
rcw = 0;
memset(s, 0, sizeof(*s));
- s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
- s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+ s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
+ s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
s->failed_num[0] = -1;
s->failed_num[1] = -1;
return 0;
}
+static void check_break_stripe_batch_list(struct stripe_head *sh)
+{
+ struct stripe_head *head_sh, *next;
+ int i;
+
+ if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+ return;
+
+ head_sh = sh;
+ do {
+ sh = list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list);
+ BUG_ON(sh == head_sh);
+ } while (!test_bit(STRIPE_DEGRADED, &sh->state));
+
+ while (sh != head_sh) {
+ next = list_first_entry(&sh->batch_list,
+ struct stripe_head, batch_list);
+ list_del_init(&sh->batch_list);
+
+ set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+ head_sh->state & ~((1 << STRIPE_ACTIVE) |
+ (1 << STRIPE_PREREAD_ACTIVE) |
+ (1 << STRIPE_DEGRADED) |
+ STRIPE_EXPAND_SYNC_FLAG));
+ sh->check_state = head_sh->check_state;
+ sh->reconstruct_state = head_sh->reconstruct_state;
+ for (i = 0; i < sh->disks; i++)
+ sh->dev[i].flags = head_sh->dev[i].flags &
+ (~((1 << R5_WriteError) | (1 << R5_Overlap)));
+
+ spin_lock_irq(&sh->stripe_lock);
+ sh->batch_head = NULL;
+ spin_unlock_irq(&sh->stripe_lock);
+
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+
+ sh = next;
+ }
+}
+
static void handle_stripe(struct stripe_head *sh)
{
struct stripe_head_state s;
return;
}
- if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+ check_break_stripe_batch_list(sh);
+
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
spin_lock(&sh->stripe_lock);
/* Cannot process 'sync' concurrently with 'discard' */
if (!test_bit(STRIPE_DISCARD, &sh->state) &&
* how busy the stripe_cache is
*/
- if (conf->inactive_blocked)
+ if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
return 1;
if (conf->quiesce)
return 1;
unsigned int chunk_sectors = mddev->chunk_sectors;
unsigned int bio_sectors = bvm->bi_size >> 9;
- if ((bvm->bi_rw & 1) == WRITE)
- return biovec->bv_len; /* always allow writes to be mergeable */
+ /*
+ * always allow writes to be mergeable, read as well if array
+ * is degraded as we'll go through stripe cache anyway.
+ */
+ if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
+ return biovec->bv_len;
if (mddev->new_chunk_sectors < mddev->chunk_sectors)
chunk_sectors = mddev->new_chunk_sectors;
md_write_start(mddev, bi);
- if (rw == READ &&
+ /*
+ * If array is degraded, better not do chunk aligned read because
+ * later we might have to read it again in order to reconstruct
+ * data on failed drives.
+ */
+ if (rw == READ && mddev->degraded == 0 &&
mddev->reshape_position == MaxSector &&
chunk_aligned_read(mddev,bi))
return;
int batch_size, released;
released = release_stripe_list(conf, conf->temp_inactive_list);
+ if (released)
+ clear_bit(R5_DID_ALLOC, &conf->cache_state);
if (
!list_empty(&conf->bitmap_list)) {
pr_debug("%d stripes handled\n", handled);
spin_unlock_irq(&conf->device_lock);
+ if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+ grow_one_stripe(conf, __GFP_NOWARN);
+ /* Set flag even if allocation failed. This helps
+ * slow down allocation requests when mem is short
+ */
+ set_bit(R5_DID_ALLOC, &conf->cache_state);
+ }
async_tx_issue_pending_all();
blk_finish_plug(&plug);
spin_lock(&mddev->lock);
conf = mddev->private;
if (conf)
- ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+ ret = sprintf(page, "%d\n", conf->min_nr_stripes);
spin_unlock(&mddev->lock);
return ret;
}
{
struct r5conf *conf = mddev->private;
int err;
- int hash;
if (size <= 16 || size > 32768)
return -EINVAL;
- hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
- while (size < conf->max_nr_stripes) {
- if (drop_one_stripe(conf, hash))
- conf->max_nr_stripes--;
- else
- break;
- hash--;
- if (hash < 0)
- hash = NR_STRIPE_HASH_LOCKS - 1;
- }
+
+ conf->min_nr_stripes = size;
+ while (size < conf->max_nr_stripes &&
+ drop_one_stripe(conf))
+ ;
+
+
err = md_allow_write(mddev);
if (err)
return err;
- hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
- while (size > conf->max_nr_stripes) {
- if (grow_one_stripe(conf, hash))
- conf->max_nr_stripes++;
- else break;
- hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
- }
+
+ while (size > conf->max_nr_stripes)
+ if (!grow_one_stripe(conf, GFP_KERNEL))
+ break;
+
return 0;
}
EXPORT_SYMBOL(raid5_set_cache_size);
raid5_show_stripe_cache_size,
raid5_store_stripe_cache_size);
+static ssize_t
+raid5_show_rmw_level(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf = mddev->private;
+ if (conf)
+ return sprintf(page, "%d\n", conf->rmw_level);
+ else
+ return 0;
+}
+
+static ssize_t
+raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len)
+{
+ struct r5conf *conf = mddev->private;
+ unsigned long new;
+
+ if (!conf)
+ return -ENODEV;
+
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+
+ if (kstrtoul(page, 10, &new))
+ return -EINVAL;
+
+ if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
+ return -EINVAL;
+
+ if (new != PARITY_DISABLE_RMW &&
+ new != PARITY_ENABLE_RMW &&
+ new != PARITY_PREFER_RMW)
+ return -EINVAL;
+
+ conf->rmw_level = new;
+ return len;
+}
+
+static struct md_sysfs_entry
+raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
+ raid5_show_rmw_level,
+ raid5_store_rmw_level);
+
+
static ssize_t
raid5_show_preread_threshold(struct mddev *mddev, char *page)
{
conf = mddev->private;
if (!conf)
err = -ENODEV;
- else if (new > conf->max_nr_stripes)
+ else if (new > conf->min_nr_stripes)
err = -EINVAL;
else
conf->bypass_threshold = new;
&raid5_preread_bypass_threshold.attr,
&raid5_group_thread_cnt.attr,
&raid5_skip_copy.attr,
+ &raid5_rmw_level.attr,
NULL,
};
static struct attribute_group raid5_attrs_group = {
static void free_conf(struct r5conf *conf)
{
+ if (conf->shrinker.seeks)
+ unregister_shrinker(&conf->shrinker);
free_thread_groups(conf);
shrink_stripes(conf);
raid5_free_percpu(conf);
return err;
}
+static unsigned long raid5_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+ int ret = 0;
+ while (ret < sc->nr_to_scan) {
+ if (drop_one_stripe(conf) == 0)
+ return SHRINK_STOP;
+ ret++;
+ }
+ return ret;
+}
+
+static unsigned long raid5_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+
+ if (conf->max_nr_stripes < conf->min_nr_stripes)
+ /* unlikely, but not impossible */
+ return 0;
+ return conf->max_nr_stripes - conf->min_nr_stripes;
+}
+
static struct r5conf *setup_conf(struct mddev *mddev)
{
struct r5conf *conf;
}
conf->level = mddev->new_level;
- if (conf->level == 6)
+ if (conf->level == 6) {
conf->max_degraded = 2;
- else
+ if (raid6_call.xor_syndrome)
+ conf->rmw_level = PARITY_ENABLE_RMW;
+ else
+ conf->rmw_level = PARITY_DISABLE_RMW;
+ } else {
conf->max_degraded = 1;
+ conf->rmw_level = PARITY_ENABLE_RMW;
+ }
conf->algorithm = mddev->new_layout;
conf->reshape_progress = mddev->reshape_position;
if (conf->reshape_progress != MaxSector) {
conf->prev_algo = mddev->layout;
}
- memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
+ conf->min_nr_stripes = NR_STRIPES;
+ memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
- if (grow_stripes(conf, NR_STRIPES)) {
+ if (grow_stripes(conf, conf->min_nr_stripes)) {
printk(KERN_ERR
"md/raid:%s: couldn't allocate %dkB for buffers\n",
mdname(mddev), memory);
} else
printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
mdname(mddev), memory);
+ /*
+ * Losing a stripe head costs more than the time to refill it,
+ * it reduces the queue depth and so can hurt throughput.
+ * So set it rather large, scaled by number of devices.
+ */
+ conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
+ conf->shrinker.scan_objects = raid5_cache_scan;
+ conf->shrinker.count_objects = raid5_cache_count;
+ conf->shrinker.batch = 128;
+ conf->shrinker.flags = 0;
+ register_shrinker(&conf->shrinker);
sprintf(pers_name, "raid%d", mddev->new_level);
conf->thread = md_register_thread(raid5d, mddev, pers_name);
*/
struct r5conf *conf = mddev->private;
if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
- > conf->max_nr_stripes ||
+ > conf->min_nr_stripes ||
((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
- > conf->max_nr_stripes) {
+ > conf->min_nr_stripes) {
printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n",
mdname(mddev),
((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)