Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 717189e742434a68d36e420b43e32183fff08324..77dfd720aaa00ebc55d14234cd40b6e9b65bae5f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -497,7 +497,7 @@ static void shrink_buffers(struct stripe_head *sh)
        }
 }
 
-static int grow_buffers(struct stripe_head *sh)
+static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 {
        int i;
        int num = sh->raid_conf->pool_size;
@@ -505,7 +505,7 @@ static int grow_buffers(struct stripe_head *sh)
        for (i = 0; i < num; i++) {
                struct page *page;
 
-               if (!(page = alloc_page(GFP_KERNEL))) {
+               if (!(page = alloc_page(gfp))) {
                        return 1;
                }
                sh->dev[i].page = page;
@@ -672,20 +672,28 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                                    *(conf->hash_locks + hash));
                sh = __find_stripe(conf, sector, conf->generation - previous);
                if (!sh) {
-                       if (!conf->inactive_blocked)
+                       if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
                                sh = get_free_stripe(conf, hash);
+                               if (!sh && llist_empty(&conf->released_stripes) &&
+                                   !test_bit(R5_DID_ALLOC, &conf->cache_state))
+                                       set_bit(R5_ALLOC_MORE,
+                                               &conf->cache_state);
+                       }
                        if (noblock && sh == NULL)
                                break;
                        if (!sh) {
-                               conf->inactive_blocked = 1;
+                               set_bit(R5_INACTIVE_BLOCKED,
+                                       &conf->cache_state);
                                wait_event_lock_irq(
                                        conf->wait_for_stripe,
                                        !list_empty(conf->inactive_list + hash) &&
                                        (atomic_read(&conf->active_stripes)
                                         < (conf->max_nr_stripes * 3 / 4)
-                                        || !conf->inactive_blocked),
+                                        || !test_bit(R5_INACTIVE_BLOCKED,
+                                                     &conf->cache_state)),
                                        *(conf->hash_locks + hash));
-                               conf->inactive_blocked = 0;
+                               clear_bit(R5_INACTIVE_BLOCKED,
+                                         &conf->cache_state);
                        } else {
                                init_stripe(sh, sector, previous);
                                atomic_inc(&sh->count);
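
[Note: the hunk above replaces the plain int conf->inactive_blocked with an atomic bit in conf->cache_state, so several cache conditions (R5_INACTIVE_BLOCKED, R5_ALLOC_MORE, R5_DID_ALLOC) share one word and can be set and tested atomically from several contexts. A minimal userspace sketch of that handshake, using C11 atomics to stand in for the kernel's set_bit/test_bit/clear_bit; the bit numbers are placeholders, not the raid5.h values:

#include <stdatomic.h>
#include <stdio.h>

enum { R5_INACTIVE_BLOCKED, R5_ALLOC_MORE, R5_DID_ALLOC };

static atomic_ulong cache_state;

static void set_flag(int nr)   { atomic_fetch_or(&cache_state, 1UL << nr); }
static void clear_flag(int nr) { atomic_fetch_and(&cache_state, ~(1UL << nr)); }
static int  test_flag(int nr)  { return (atomic_load(&cache_state) >> nr) & 1; }

int main(void)
{
        /* get_active_stripe(): no free stripe, none being released,
         * none allocated since the last shortage -> ask raid5d for more. */
        set_flag(R5_ALLOC_MORE);
        set_flag(R5_INACTIVE_BLOCKED);  /* and block until one is freed */
        printf("blocked=%d alloc_more=%d\n",
               test_flag(R5_INACTIVE_BLOCKED), test_flag(R5_ALLOC_MORE));
        /* woken up with a stripe available */
        clear_flag(R5_INACTIVE_BLOCKED);
        return 0;
}
]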
@@ -1070,6 +1078,9 @@ again:
                        pr_debug("skip op %ld on disc %d for sector %llu\n",
                                bi->bi_rw, i, (unsigned long long)sh->sector);
                        clear_bit(R5_LOCKED, &sh->dev[i].flags);
+                       if (sh->batch_head)
+                               set_bit(STRIPE_BATCH_ERR,
+                                       &sh->batch_head->state);
                        set_bit(STRIPE_HANDLE, &sh->state);
                }
 
@@ -1314,7 +1325,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
  * destination buffer is recorded in srcs[count] and the Q destination
  * is recorded in srcs[count+1].
  */
-static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
+static int set_syndrome_sources(struct page **srcs,
+                               struct stripe_head *sh,
+                               int srctype)
 {
        int disks = sh->disks;
        int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
@@ -1329,8 +1342,15 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
        i = d0_idx;
        do {
                int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+               struct r5dev *dev = &sh->dev[i];
 
-               srcs[slot] = sh->dev[i].page;
+               if (i == sh->qd_idx || i == sh->pd_idx ||
+                   (srctype == SYNDROME_SRC_ALL) ||
+                   (srctype == SYNDROME_SRC_WANT_DRAIN &&
+                    test_bit(R5_Wantdrain, &dev->flags)) ||
+                   (srctype == SYNDROME_SRC_WRITTEN &&
+                    dev->written))
+                       srcs[slot] = sh->dev[i].page;
                i = raid6_next_disk(i, disks);
        } while (i != d0_idx);
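
[Note: with the new srctype argument, set_syndrome_sources() can populate the source list three ways: all blocks (SYNDROME_SRC_ALL, the old behaviour), only blocks marked R5_Wantdrain (SYNDROME_SRC_WANT_DRAIN, used by the prexor6 path below), or only blocks with written data (SYNDROME_SRC_WRITTEN, used by reconstruct after a prexor); unselected slots stay NULL from the earlier initialisation loop. A runnable restatement of the selection predicate, with r5dev reduced to the two fields the test reads — mini_dev and its field names are illustrative stand-ins:

#include <stdio.h>

enum { SYNDROME_SRC_ALL, SYNDROME_SRC_WANT_DRAIN, SYNDROME_SRC_WRITTEN };

struct mini_dev { int want_drain; int written; }; /* stand-in for r5dev */

static int use_as_source(int i, int pd_idx, int qd_idx,
                         const struct mini_dev *dev, int srctype)
{
        if (i == pd_idx || i == qd_idx)
                return 1;               /* P and Q are always included */
        switch (srctype) {
        case SYNDROME_SRC_ALL:          return 1;
        case SYNDROME_SRC_WANT_DRAIN:   return dev->want_drain;
        case SYNDROME_SRC_WRITTEN:      return dev->written;
        }
        return 0;
}

int main(void)
{
        struct mini_dev devs[4] = { {1, 0}, {0, 0}, {0, 1}, {0, 0} };
        int pd_idx = 4, qd_idx = 5;     /* parity slots past the data */

        for (int i = 0; i < 4; i++)
                printf("disk %d: drain-src=%d written-src=%d\n", i,
                       use_as_source(i, pd_idx, qd_idx, &devs[i],
                                     SYNDROME_SRC_WANT_DRAIN),
                       use_as_source(i, pd_idx, qd_idx, &devs[i],
                                     SYNDROME_SRC_WRITTEN));
        return 0;
}
]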
 
@@ -1370,7 +1390,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
        atomic_inc(&sh->count);
 
        if (target == qd_idx) {
-               count = set_syndrome_sources(blocks, sh);
+               count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
                blocks[count] = NULL; /* regenerating p is not necessary */
                BUG_ON(blocks[count+1] != dest); /* q should already be set */
                init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
@@ -1478,7 +1498,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
                        tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
                                       &submit);
 
-                       count = set_syndrome_sources(blocks, sh);
+                       count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
                        init_async_submit(&submit, ASYNC_TX_FENCE, tx,
                                          ops_complete_compute, sh,
                                          to_addr_conv(sh, percpu, 0));
@@ -1512,8 +1532,8 @@ static void ops_complete_prexor(void *stripe_head_ref)
 }
 
 static struct dma_async_tx_descriptor *
-ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
-              struct dma_async_tx_descriptor *tx)
+ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
+               struct dma_async_tx_descriptor *tx)
 {
        int disks = sh->disks;
        struct page **xor_srcs = to_addr_page(percpu, 0);
@@ -1541,6 +1561,26 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
        return tx;
 }
 
+static struct dma_async_tx_descriptor *
+ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
+               struct dma_async_tx_descriptor *tx)
+{
+       struct page **blocks = to_addr_page(percpu, 0);
+       int count;
+       struct async_submit_ctl submit;
+
+       pr_debug("%s: stripe %llu\n", __func__,
+               (unsigned long long)sh->sector);
+
+       count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
+
+       init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
+                         ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
+       tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+
+       return tx;
+}
+
 static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
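
[Note: ops_run_prexor6() gives RAID-6 a read-modify-write path: it feeds only the to-be-drained sources (plus P and Q themselves) into async_gen_syndrome() with ASYNC_TX_PQ_XOR_DST, which xors into the existing parity buffers instead of overwriting them, so the old data's contribution is subtracted out of P and Q without touching the other disks. That is sound because both parities are linear over GF(2^8): P' = P ^ (Dold ^ Dnew) and Q' = Q ^ g^i*(Dold ^ Dnew). A toy userspace check of the Q identity for the combined drain/reconstruct pair, assuming nothing beyond the 0x11d reduction polynomial the Linux raid6 code uses; disk count and data bytes are arbitrary:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* GF(2^8) multiply, reduction polynomial x^8+x^4+x^3+x^2+1 (0x11d). */
static uint8_t gmul(uint8_t a, uint8_t b)
{
        uint8_t p = 0;

        while (b) {
                if (b & 1)
                        p ^= a;
                a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
                b >>= 1;
        }
        return p;
}

int main(void)
{
        uint8_t d[4] = { 7, 42, 99, 200 };      /* one byte per data disk */
        uint8_t coef[4], q = 0, c = 1;

        for (int i = 0; i < 4; i++) {
                coef[i] = c;            /* g^i with generator g = 2 */
                q ^= gmul(c, d[i]);
                c = gmul(c, 2);
        }

        /* rmw update of disk 2: xor out the old, xor in the new */
        uint8_t d_new = 123;
        uint8_t q_rmw = q ^ gmul(coef[2], (uint8_t)(d[2] ^ d_new));

        /* full reconstruct-write recomputation for comparison */
        d[2] = d_new;
        uint8_t q_rcw = 0;
        for (int i = 0; i < 4; i++)
                q_rcw ^= gmul(coef[i], d[i]);

        assert(q_rmw == q_rcw);
        printf("q_rmw == q_rcw == %#x\n", q_rmw);
        return 0;
}
]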
@@ -1743,6 +1783,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
        int count, i, j = 0;
        struct stripe_head *head_sh = sh;
        int last_stripe;
+       int synflags;
+       unsigned long txflags;
 
        pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
@@ -1762,14 +1804,23 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 
 again:
        blocks = to_addr_page(percpu, j);
-       count = set_syndrome_sources(blocks, sh);
+
+       if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+               synflags = SYNDROME_SRC_WRITTEN;
+               txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
+       } else {
+               synflags = SYNDROME_SRC_ALL;
+               txflags = ASYNC_TX_ACK;
+       }
+
+       count = set_syndrome_sources(blocks, sh, synflags);
        last_stripe = !head_sh->batch_head ||
                list_first_entry(&sh->batch_list,
                                 struct stripe_head, batch_list) == head_sh;
 
        if (last_stripe) {
                atomic_inc(&head_sh->count);
-               init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+               init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
                                  head_sh, to_addr_conv(sh, percpu, j));
        } else
                init_async_submit(&submit, 0, tx, NULL, NULL,
@@ -1840,7 +1891,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
                (unsigned long long)sh->sector, checkp);
 
        BUG_ON(sh->batch_head);
-       count = set_syndrome_sources(srcs, sh);
+       count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
        if (!checkp)
                srcs[count] = NULL;
 
@@ -1881,8 +1932,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
                        async_tx_ack(tx);
        }
 
-       if (test_bit(STRIPE_OP_PREXOR, &ops_request))
-               tx = ops_run_prexor(sh, percpu, tx);
+       if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
+               if (level < 6)
+                       tx = ops_run_prexor5(sh, percpu, tx);
+               else
+                       tx = ops_run_prexor6(sh, percpu, tx);
+       }
 
        if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
                tx = ops_run_biodrain(sh, tx);
@@ -1916,10 +1971,10 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
        put_cpu();
 }
 
-static int grow_one_stripe(struct r5conf *conf, int hash)
+static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 {
        struct stripe_head *sh;
-       sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
+       sh = kmem_cache_zalloc(conf->slab_cache, gfp);
        if (!sh)
                return 0;
 
@@ -1927,12 +1982,13 @@ static int grow_one_stripe(struct r5conf *conf, int hash)
 
        spin_lock_init(&sh->stripe_lock);
 
-       if (grow_buffers(sh)) {
+       if (grow_buffers(sh, gfp)) {
                shrink_buffers(sh);
                kmem_cache_free(conf->slab_cache, sh);
                return 0;
        }
-       sh->hash_lock_index = hash;
+       sh->hash_lock_index =
+               conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
        /* we just created an active stripe so... */
        atomic_set(&sh->count, 1);
        atomic_inc(&conf->active_stripes);
@@ -1942,6 +1998,7 @@ static int grow_one_stripe(struct r5conf *conf, int hash)
        INIT_LIST_HEAD(&sh->batch_list);
        sh->batch_head = NULL;
        release_stripe(sh);
+       conf->max_nr_stripes++;
        return 1;
 }
 
@@ -1949,7 +2006,6 @@ static int grow_stripes(struct r5conf *conf, int num)
 {
        struct kmem_cache *sc;
        int devs = max(conf->raid_disks, conf->previous_raid_disks);
-       int hash;
 
        if (conf->mddev->gendisk)
                sprintf(conf->cache_name[0],
@@ -1967,13 +2023,10 @@ static int grow_stripes(struct r5conf *conf, int num)
                return 1;
        conf->slab_cache = sc;
        conf->pool_size = devs;
-       hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
-       while (num--) {
-               if (!grow_one_stripe(conf, hash))
+       while (num--)
+               if (!grow_one_stripe(conf, GFP_KERNEL))
                        return 1;
-               conf->max_nr_stripes++;
-               hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
-       }
+
        return 0;
 }
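
[Note: grow_one_stripe() now derives its hash lock index from conf->max_nr_stripes and bumps that counter itself, so grow_stripes() and the sysfs resize path below no longer have to thread a hash cursor around; stripes still spread round-robin across the NR_STRIPE_HASH_LOCKS (8 in raid5.h) lock classes. A throwaway demo of the equivalence:

#include <stdio.h>

enum { NR_STRIPE_HASH_LOCKS = 8 };      /* matches raid5.h */

int main(void)
{
        int max_nr_stripes = 0;

        for (int n = 0; n < 12; n++) {
                int hash = max_nr_stripes % NR_STRIPE_HASH_LOCKS;

                max_nr_stripes++;       /* as grow_one_stripe() now does */
                printf("stripe %2d -> hash lock %d\n", n, hash);
        }
        return 0;
}
]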
 
@@ -2163,9 +2216,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        return err;
 }
 
-static int drop_one_stripe(struct r5conf *conf, int hash)
+static int drop_one_stripe(struct r5conf *conf)
 {
        struct stripe_head *sh;
+       int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
 
        spin_lock_irq(conf->hash_locks + hash);
        sh = get_free_stripe(conf, hash);
@@ -2176,15 +2230,15 @@ static int drop_one_stripe(struct r5conf *conf, int hash)
        shrink_buffers(sh);
        kmem_cache_free(conf->slab_cache, sh);
        atomic_dec(&conf->active_stripes);
+       conf->max_nr_stripes--;
        return 1;
 }
 
 static void shrink_stripes(struct r5conf *conf)
 {
-       int hash;
-       for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
-               while (drop_one_stripe(conf, hash))
-                       ;
+       while (conf->max_nr_stripes &&
+              drop_one_stripe(conf))
+               ;
 
        if (conf->slab_cache)
                kmem_cache_destroy(conf->slab_cache);
@@ -2380,6 +2434,9 @@ static void raid5_end_write_request(struct bio *bi, int error)
        }
        rdev_dec_pending(rdev, conf->mddev);
 
+       if (sh->batch_head && !uptodate)
+               set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
+
        if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
                clear_bit(R5_LOCKED, &sh->dev[i].flags);
        set_bit(STRIPE_HANDLE, &sh->state);
@@ -2764,7 +2821,7 @@ static void
 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                         int rcw, int expand)
 {
-       int i, pd_idx = sh->pd_idx, disks = sh->disks;
+       int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
        struct r5conf *conf = sh->raid_conf;
        int level = conf->level;
 
@@ -2800,13 +2857,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                        if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
                                atomic_inc(&conf->pending_full_writes);
        } else {
-               BUG_ON(level == 6);
                BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
                        test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
+               BUG_ON(level == 6 &&
+                       (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
+                          test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
 
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
-                       if (i == pd_idx)
+                       if (i == pd_idx || i == qd_idx)
                                continue;
 
                        if (dev->towrite &&
@@ -3434,8 +3493,10 @@ unhash:
                                      struct stripe_head, batch_list);
                list_del_init(&sh->batch_list);
 
-               sh->state = head_sh->state & (~((1 << STRIPE_ACTIVE) |
-                       (1 << STRIPE_PREREAD_ACTIVE)));
+               set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+                             head_sh->state & ~((1 << STRIPE_ACTIVE) |
+                                                (1 << STRIPE_PREREAD_ACTIVE) |
+                                                STRIPE_EXPAND_SYNC_FLAG));
                sh->check_state = head_sh->check_state;
                sh->reconstruct_state = head_sh->reconstruct_state;
                for (i = 0; i < sh->disks; i++) {
@@ -3447,6 +3508,8 @@ unhash:
                spin_lock_irq(&sh->stripe_lock);
                sh->batch_head = NULL;
                spin_unlock_irq(&sh->stripe_lock);
+               if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
+                       set_bit(STRIPE_HANDLE, &sh->state);
                release_stripe(sh);
        }
 
@@ -3454,6 +3517,8 @@ unhash:
        head_sh->batch_head = NULL;
        spin_unlock_irq(&head_sh->stripe_lock);
        wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
+       if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
+               set_bit(STRIPE_HANDLE, &head_sh->state);
 }
 
 static void handle_stripe_dirtying(struct r5conf *conf,
@@ -3464,28 +3529,27 @@ static void handle_stripe_dirtying(struct r5conf *conf,
        int rmw = 0, rcw = 0, i;
        sector_t recovery_cp = conf->mddev->recovery_cp;
 
-       /* RAID6 requires 'rcw' in current implementation.
-        * Otherwise, check whether resync is now happening or should start.
+       /* Check whether resync is now happening or should start.
         * If yes, then the array is dirty (after unclean shutdown or
         * initial creation), so parity in some stripes might be inconsistent.
         * In this case, we need to always do reconstruct-write, to ensure
         * that in case of drive failure or read-error correction, we
         * generate correct data from the parity.
         */
-       if (conf->max_degraded == 2 ||
+       if (conf->rmw_level == PARITY_DISABLE_RMW ||
            (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
             s->failed == 0)) {
                /* Calculate the real rcw later - for now make it
                 * look like rcw is cheaper
                 */
                rcw = 1; rmw = 2;
-               pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
-                        conf->max_degraded, (unsigned long long)recovery_cp,
+               pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
+                        conf->rmw_level, (unsigned long long)recovery_cp,
                         (unsigned long long)sh->sector);
        } else for (i = disks; i--; ) {
                /* would I have to read this buffer for read_modify_write */
                struct r5dev *dev = &sh->dev[i];
-               if ((dev->towrite || i == sh->pd_idx) &&
+               if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
                    !test_bit(R5_LOCKED, &dev->flags) &&
                    !(test_bit(R5_UPTODATE, &dev->flags) ||
                      test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3495,7 +3559,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                                rmw += 2*disks;  /* cannot read it */
                }
                /* Would I have to read this buffer for reconstruct_write */
-               if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+               if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+                   i != sh->pd_idx && i != sh->qd_idx &&
                    !test_bit(R5_LOCKED, &dev->flags) &&
                    !(test_bit(R5_UPTODATE, &dev->flags) ||
                    test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3508,7 +3573,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
        pr_debug("for sector %llu, rmw=%d rcw=%d\n",
                (unsigned long long)sh->sector, rmw, rcw);
        set_bit(STRIPE_HANDLE, &sh->state);
-       if (rmw < rcw && rmw > 0) {
+       if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
                /* prefer read-modify-write, but need to get some data */
                if (conf->mddev->queue)
                        blk_add_trace_msg(conf->mddev->queue,
@@ -3516,7 +3581,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                                          (unsigned long long)sh->sector, rmw);
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
-                       if ((dev->towrite || i == sh->pd_idx) &&
+                       if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
                            !test_bit(R5_LOCKED, &dev->flags) &&
                            !(test_bit(R5_UPTODATE, &dev->flags) ||
                            test_bit(R5_Wantcompute, &dev->flags)) &&
@@ -3535,7 +3600,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                        }
                }
        }
-       if (rcw <= rmw && rcw > 0) {
+       if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
                /* want reconstruct write, but need to get some data */
                int qread =0;
                rcw = 0;
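
[Note: a worked example for the new rmw/rcw arithmetic above: on a 6-device RAID-6 stripe (4 data disks plus P and Q) with one data chunk fully overwritten and nothing cached, rmw must read the old chunk plus both parities (3 reads, now that qd_idx is counted), and rcw must read the 3 untouched data chunks; the rmw == rcw tie is then settled by conf->rmw_level. A toy restatement of the counting, skipping the R5_Insync test and the 2*disks cannot-read penalty of the real loop:

#include <stdio.h>

int main(void)
{
        enum { DISKS = 6, PD = 4, QD = 5 };
        int towrite[DISKS]   = { 0, 0, 1, 0, 0, 0 };    /* chunk 2 dirtied */
        int overwrite[DISKS] = { 0, 0, 1, 0, 0, 0 };    /* ...entirely */
        int rmw = 0, rcw = 0;

        for (int i = 0; i < DISKS; i++) {
                if (towrite[i] || i == PD || i == QD)
                        rmw++;          /* old data/parity needed for rmw */
                if (!overwrite[i] && i != PD && i != QD)
                        rcw++;          /* clean data needed for rcw */
        }
        printf("rmw=%d rcw=%d\n", rmw, rcw); /* 3 and 3: rmw_level decides */
        return 0;
}
]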
@@ -3921,8 +3986,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
        memset(s, 0, sizeof(*s));
 
-       s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-       s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+       s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
+       s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
        s->failed_num[0] = -1;
        s->failed_num[1] = -1;
 
@@ -4124,6 +4189,48 @@ static int clear_batch_ready(struct stripe_head *sh)
        return 0;
 }
 
+static void check_break_stripe_batch_list(struct stripe_head *sh)
+{
+       struct stripe_head *head_sh, *next;
+       int i;
+
+       if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+               return;
+
+       head_sh = sh;
+       do {
+               sh = list_first_entry(&sh->batch_list,
+                                     struct stripe_head, batch_list);
+               BUG_ON(sh == head_sh);
+       } while (!test_bit(STRIPE_DEGRADED, &sh->state));
+
+       while (sh != head_sh) {
+               next = list_first_entry(&sh->batch_list,
+                                       struct stripe_head, batch_list);
+               list_del_init(&sh->batch_list);
+
+               set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+                             head_sh->state & ~((1 << STRIPE_ACTIVE) |
+                                                (1 << STRIPE_PREREAD_ACTIVE) |
+                                                (1 << STRIPE_DEGRADED) |
+                                                STRIPE_EXPAND_SYNC_FLAG));
+               sh->check_state = head_sh->check_state;
+               sh->reconstruct_state = head_sh->reconstruct_state;
+               for (i = 0; i < sh->disks; i++)
+                       sh->dev[i].flags = head_sh->dev[i].flags &
+                               (~((1 << R5_WriteError) | (1 << R5_Overlap)));
+
+               spin_lock_irq(&sh->stripe_lock);
+               sh->batch_head = NULL;
+               spin_unlock_irq(&sh->stripe_lock);
+
+               set_bit(STRIPE_HANDLE, &sh->state);
+               release_stripe(sh);
+
+               sh = next;
+       }
+}
+
 static void handle_stripe(struct stripe_head *sh)
 {
        struct stripe_head_state s;
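
[Note: check_break_stripe_batch_list() splits a batch at the first STRIPE_DEGRADED member and releases that stripe and everything after it for individual handling, copying the head's state into each via set_mask_bits() so every member keeps only its own STRIPE_EXPAND_SYNC_FLAG bits. A userspace model of that primitive, which in the kernel is a cmpxchg loop in linux/bitops.h:

#include <stdatomic.h>
#include <stdio.h>

/* new = (old & ~mask) | bits, retried until the exchange sticks */
static unsigned long set_mask_bits_model(atomic_ulong *v,
                                         unsigned long mask,
                                         unsigned long bits)
{
        unsigned long old = atomic_load(v), new;

        do {
                new = (old & ~mask) | bits;
        } while (!atomic_compare_exchange_weak(v, &old, new));
        return new;
}

int main(void)
{
        atomic_ulong state = 0xf0f0;

        /* overwrite the low byte, preserve everything outside the mask */
        set_mask_bits_model(&state, 0xff, 0x3c);
        printf("state=%#lx\n", (unsigned long)atomic_load(&state)); /* 0xf03c */
        return 0;
}
]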
@@ -4146,7 +4253,9 @@ static void handle_stripe(struct stripe_head *sh)
                return;
        }
 
-       if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+       check_break_stripe_batch_list(sh);
+
+       if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
                spin_lock(&sh->stripe_lock);
                /* Cannot process 'sync' concurrently with 'discard' */
                if (!test_bit(STRIPE_DISCARD, &sh->state) &&
@@ -4501,7 +4610,7 @@ static int raid5_congested(struct mddev *mddev, int bits)
         * how busy the stripe_cache is
         */
 
-       if (conf->inactive_blocked)
+       if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
                return 1;
        if (conf->quiesce)
                return 1;
@@ -4523,8 +4632,12 @@ static int raid5_mergeable_bvec(struct mddev *mddev,
        unsigned int chunk_sectors = mddev->chunk_sectors;
        unsigned int bio_sectors = bvm->bi_size >> 9;
 
-       if ((bvm->bi_rw & 1) == WRITE)
-               return biovec->bv_len; /* always allow writes to be mergeable */
+       /*
+        * always allow writes to be mergeable; reads as well if the
+        * array is degraded, as we'll go through the stripe cache anyway.
+        */
+       if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
+               return biovec->bv_len;
 
        if (mddev->new_chunk_sectors < mddev->chunk_sectors)
                chunk_sectors = mddev->new_chunk_sectors;
@@ -5001,7 +5114,12 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 
        md_write_start(mddev, bi);
 
-       if (rw == READ &&
+       /*
+        * If array is degraded, better not do chunk aligned read because
+        * later we might have to read it again in order to reconstruct
+        * data on failed drives.
+        */
+       if (rw == READ && mddev->degraded == 0 &&
             mddev->reshape_position == MaxSector &&
             chunk_aligned_read(mddev,bi))
                return;
@@ -5657,6 +5775,8 @@ static void raid5d(struct md_thread *thread)
                int batch_size, released;
 
                released = release_stripe_list(conf, conf->temp_inactive_list);
+               if (released)
+                       clear_bit(R5_DID_ALLOC, &conf->cache_state);
 
                if (
                    !list_empty(&conf->bitmap_list)) {
@@ -5695,6 +5815,13 @@ static void raid5d(struct md_thread *thread)
        pr_debug("%d stripes handled\n", handled);
 
        spin_unlock_irq(&conf->device_lock);
+       if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+               grow_one_stripe(conf, __GFP_NOWARN);
+               /* Set flag even if allocation failed.  This helps
+                * slow down allocation requests when memory is short.
+                */
+               set_bit(R5_DID_ALLOC, &conf->cache_state);
+       }
 
        async_tx_issue_pending_all();
        blk_finish_plug(&plug);
@@ -5710,7 +5837,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
        spin_lock(&mddev->lock);
        conf = mddev->private;
        if (conf)
-               ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+               ret = sprintf(page, "%d\n", conf->min_nr_stripes);
        spin_unlock(&mddev->lock);
        return ret;
 }
@@ -5720,30 +5847,24 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 {
        struct r5conf *conf = mddev->private;
        int err;
-       int hash;
 
        if (size <= 16 || size > 32768)
                return -EINVAL;
-       hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
-       while (size < conf->max_nr_stripes) {
-               if (drop_one_stripe(conf, hash))
-                       conf->max_nr_stripes--;
-               else
-                       break;
-               hash--;
-               if (hash < 0)
-                       hash = NR_STRIPE_HASH_LOCKS - 1;
-       }
+
+       conf->min_nr_stripes = size;
+       while (size < conf->max_nr_stripes &&
+              drop_one_stripe(conf))
+               ;
+
+
        err = md_allow_write(mddev);
        if (err)
                return err;
-       hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
-       while (size > conf->max_nr_stripes) {
-               if (grow_one_stripe(conf, hash))
-                       conf->max_nr_stripes++;
-               else break;
-               hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
-       }
+
+       while (size > conf->max_nr_stripes)
+               if (!grow_one_stripe(conf, GFP_KERNEL))
+                       break;
+
        return 0;
 }
 EXPORT_SYMBOL(raid5_set_cache_size);
@@ -5777,6 +5898,49 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
                                raid5_show_stripe_cache_size,
                                raid5_store_stripe_cache_size);
 
+static ssize_t
+raid5_show_rmw_level(struct mddev  *mddev, char *page)
+{
+       struct r5conf *conf = mddev->private;
+       if (conf)
+               return sprintf(page, "%d\n", conf->rmw_level);
+       else
+               return 0;
+}
+
+static ssize_t
+raid5_store_rmw_level(struct mddev  *mddev, const char *page, size_t len)
+{
+       struct r5conf *conf = mddev->private;
+       unsigned long new;
+
+       if (!conf)
+               return -ENODEV;
+
+       if (len >= PAGE_SIZE)
+               return -EINVAL;
+
+       if (kstrtoul(page, 10, &new))
+               return -EINVAL;
+
+       if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
+               return -EINVAL;
+
+       if (new != PARITY_DISABLE_RMW &&
+           new != PARITY_ENABLE_RMW &&
+           new != PARITY_PREFER_RMW)
+               return -EINVAL;
+
+       conf->rmw_level = new;
+       return len;
+}
+
+static struct md_sysfs_entry
+raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
+                        raid5_show_rmw_level,
+                        raid5_store_rmw_level);
+
+
 static ssize_t
 raid5_show_preread_threshold(struct mddev *mddev, char *page)
 {
@@ -5808,7 +5972,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
        conf = mddev->private;
        if (!conf)
                err = -ENODEV;
-       else if (new > conf->max_nr_stripes)
+       else if (new > conf->min_nr_stripes)
                err = -EINVAL;
        else
                conf->bypass_threshold = new;
@@ -5963,6 +6127,7 @@ static struct attribute *raid5_attrs[] =  {
        &raid5_preread_bypass_threshold.attr,
        &raid5_group_thread_cnt.attr,
        &raid5_skip_copy.attr,
+       &raid5_rmw_level.attr,
        NULL,
 };
 static struct attribute_group raid5_attrs_group = {
@@ -6088,6 +6253,8 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+       if (conf->shrinker.seeks)
+               unregister_shrinker(&conf->shrinker);
        free_thread_groups(conf);
        shrink_stripes(conf);
        raid5_free_percpu(conf);
@@ -6155,6 +6322,30 @@ static int raid5_alloc_percpu(struct r5conf *conf)
        return err;
 }
 
+static unsigned long raid5_cache_scan(struct shrinker *shrink,
+                                     struct shrink_control *sc)
+{
+       struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+       int ret = 0;
+       while (ret < sc->nr_to_scan) {
+               if (drop_one_stripe(conf) == 0)
+                       return SHRINK_STOP;
+               ret++;
+       }
+       return ret;
+}
+
+static unsigned long raid5_cache_count(struct shrinker *shrink,
+                                      struct shrink_control *sc)
+{
+       struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+
+       if (conf->max_nr_stripes < conf->min_nr_stripes)
+               /* unlikely, but not impossible */
+               return 0;
+       return conf->max_nr_stripes - conf->min_nr_stripes;
+}
+
 static struct r5conf *setup_conf(struct mddev *mddev)
 {
        struct r5conf *conf;
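
[Note: raid5_cache_scan() and raid5_cache_count() wire the now-growable stripe cache into the generic shrinker API: count_objects reports how many stripes sit above min_nr_stripes and are therefore fair game, while scan_objects drops up to nr_to_scan of them and returns the number freed, or SHRINK_STOP once drop_one_stripe() cannot make progress. A compact userspace model of that contract; sizes and the drop rule are simplified stand-ins:

#include <stdio.h>

#define SHRINK_STOP (~0UL)

static int max_nr_stripes = 256, min_nr_stripes = 256;

/* Stand-in for drop_one_stripe(): fails once nothing is droppable. */
static int drop_one(void)
{
        if (max_nr_stripes == 0)
                return 0;
        max_nr_stripes--;
        return 1;
}

static unsigned long cache_count(void)
{
        if (max_nr_stripes < min_nr_stripes) /* unlikely, but not impossible */
                return 0;
        return (unsigned long)(max_nr_stripes - min_nr_stripes);
}

static unsigned long cache_scan(unsigned long nr_to_scan)
{
        unsigned long ret = 0;

        while (ret < nr_to_scan) {
                if (!drop_one())
                        return SHRINK_STOP;
                ret++;
        }
        return ret;
}

int main(void)
{
        max_nr_stripes = 300;   /* raid5d grew the cache under load */
        printf("reclaimable: %lu\n", cache_count());    /* 44 */
        printf("freed: %lu\n", cache_scan(16));         /* 16 */
        printf("reclaimable: %lu\n", cache_count());    /* 28 */
        return 0;
}
]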
@@ -6288,10 +6479,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        }
 
        conf->level = mddev->new_level;
-       if (conf->level == 6)
+       if (conf->level == 6) {
                conf->max_degraded = 2;
-       else
+               if (raid6_call.xor_syndrome)
+                       conf->rmw_level = PARITY_ENABLE_RMW;
+               else
+                       conf->rmw_level = PARITY_DISABLE_RMW;
+       } else {
                conf->max_degraded = 1;
+               conf->rmw_level = PARITY_ENABLE_RMW;
+       }
        conf->algorithm = mddev->new_layout;
        conf->reshape_progress = mddev->reshape_position;
        if (conf->reshape_progress != MaxSector) {
@@ -6299,10 +6496,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
                conf->prev_algo = mddev->layout;
        }
 
-       memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
+       conf->min_nr_stripes = NR_STRIPES;
+       memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
                 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
        atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
-       if (grow_stripes(conf, NR_STRIPES)) {
+       if (grow_stripes(conf, conf->min_nr_stripes)) {
                printk(KERN_ERR
                       "md/raid:%s: couldn't allocate %dkB for buffers\n",
                       mdname(mddev), memory);
@@ -6310,6 +6508,17 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        } else
                printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
                       mdname(mddev), memory);
+       /*
+        * Losing a stripe head costs more than the time to refill it,
+        * it reduces the queue depth and so can hurt throughput.
+        * So set it rather large, scaled by number of devices.
+        */
+       conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
+       conf->shrinker.scan_objects = raid5_cache_scan;
+       conf->shrinker.count_objects = raid5_cache_count;
+       conf->shrinker.batch = 128;
+       conf->shrinker.flags = 0;
+       register_shrinker(&conf->shrinker);
 
        sprintf(pers_name, "raid%d", mddev->new_level);
        conf->thread = md_register_thread(raid5d, mddev, pers_name);
@@ -6951,9 +7160,9 @@ static int check_stripe_cache(struct mddev *mddev)
         */
        struct r5conf *conf = mddev->private;
        if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
-           > conf->max_nr_stripes ||
+           > conf->min_nr_stripes ||
            ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
-           > conf->max_nr_stripes) {
+           > conf->min_nr_stripes) {
                printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
                       mdname(mddev),
                       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)