Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 717189e742434a68d36e420b43e32183fff08324..77dfd720aaa00ebc55d14234cd40b6e9b65bae5f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -497,7 +497,7 @@ static void shrink_buffers(struct stripe_head *sh)
        }
 }
 
-static int grow_buffers(struct stripe_head *sh)
+static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 {
        int i;
        int num = sh->raid_conf->pool_size;
@@ -505,7 +505,7 @@ static int grow_buffers(struct stripe_head *sh)
        for (i = 0; i < num; i++) {
                struct page *page;
 
-               if (!(page = alloc_page(GFP_KERNEL))) {
+               if (!(page = alloc_page(gfp))) {
                        return 1;
                }
                sh->dev[i].page = page;
@@ -672,20 +672,28 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                                    *(conf->hash_locks + hash));
                sh = __find_stripe(conf, sector, conf->generation - previous);
                if (!sh) {
-                       if (!conf->inactive_blocked)
+                       if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
                                sh = get_free_stripe(conf, hash);
+                               if (!sh && llist_empty(&conf->released_stripes) &&
+                                   !test_bit(R5_DID_ALLOC, &conf->cache_state))
+                                       set_bit(R5_ALLOC_MORE,
+                                               &conf->cache_state);
+                       }
                        if (noblock && sh == NULL)
                                break;
                        if (!sh) {
-                               conf->inactive_blocked = 1;
+                               set_bit(R5_INACTIVE_BLOCKED,
+                                       &conf->cache_state);
                                wait_event_lock_irq(
                                        conf->wait_for_stripe,
                                        !list_empty(conf->inactive_list + hash) &&
                                        (atomic_read(&conf->active_stripes)
                                         < (conf->max_nr_stripes * 3 / 4)
-                                        || !conf->inactive_blocked),
+                                        || !test_bit(R5_INACTIVE_BLOCKED,
+                                                     &conf->cache_state)),
                                        *(conf->hash_locks + hash));
-                               conf->inactive_blocked = 0;
+                               clear_bit(R5_INACTIVE_BLOCKED,
+                                         &conf->cache_state);
                        } else {
                                init_stripe(sh, sector, previous);
                                atomic_inc(&sh->count);
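
[Note: the hunk above replaces the plain int conf->inactive_blocked with an atomic bit in conf->cache_state, so several cache conditions (R5_INACTIVE_BLOCKED, R5_ALLOC_MORE, R5_DID_ALLOC) share one word and can be set and tested atomically from several contexts. A minimal userspace sketch of that handshake, using C11 atomics to stand in for the kernel's set_bit/test_bit/clear_bit; the bit numbers are placeholders, not the raid5.h values:

#include <stdatomic.h>
#include <stdio.h>

enum { R5_INACTIVE_BLOCKED, R5_ALLOC_MORE, R5_DID_ALLOC };

static atomic_ulong cache_state;

static void set_flag(int nr)   { atomic_fetch_or(&cache_state, 1UL << nr); }
static void clear_flag(int nr) { atomic_fetch_and(&cache_state, ~(1UL << nr)); }
static int  test_flag(int nr)  { return (atomic_load(&cache_state) >> nr) & 1; }

int main(void)
{
        /* get_active_stripe(): no free stripe, none being released,
         * none allocated since the last shortage -> ask raid5d for more. */
        set_flag(R5_ALLOC_MORE);
        set_flag(R5_INACTIVE_BLOCKED);  /* and block until one is freed */
        printf("blocked=%d alloc_more=%d\n",
               test_flag(R5_INACTIVE_BLOCKED), test_flag(R5_ALLOC_MORE));
        /* woken up with a stripe available */
        clear_flag(R5_INACTIVE_BLOCKED);
        return 0;
}
]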
@@ -1070,6 +1078,9 @@ again:
                        pr_debug("skip op %ld on disc %d for sector %llu\n",
                                bi->bi_rw, i, (unsigned long long)sh->sector);
                        clear_bit(R5_LOCKED, &sh->dev[i].flags);
+                       if (sh->batch_head)
+                               set_bit(STRIPE_BATCH_ERR,
+                                       &sh->batch_head->state);
                        set_bit(STRIPE_HANDLE, &sh->state);
                }
 
@@ -1314,7 +1325,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
  * destination buffer is recorded in srcs[count] and the Q destination
  * is recorded in srcs[count+1].
  */
-static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
+static int set_syndrome_sources(struct page **srcs,
+                               struct stripe_head *sh,
+                               int srctype)
 {
        int disks = sh->disks;
        int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
@@ -1329,8 +1342,15 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
        i = d0_idx;
        do {
                int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+               struct r5dev *dev = &sh->dev[i];
 
-               srcs[slot] = sh->dev[i].page;
+               if (i == sh->qd_idx || i == sh->pd_idx ||
+                   (srctype == SYNDROME_SRC_ALL) ||
+                   (srctype == SYNDROME_SRC_WANT_DRAIN &&
+                    test_bit(R5_Wantdrain, &dev->flags)) ||
+                   (srctype == SYNDROME_SRC_WRITTEN &&
+                    dev->written))
+                       srcs[slot] = sh->dev[i].page;
                i = raid6_next_disk(i, disks);
        } while (i != d0_idx);
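
[Note: with the new srctype argument, set_syndrome_sources() can populate the source list three ways: all blocks (SYNDROME_SRC_ALL, the old behaviour), only blocks marked R5_Wantdrain (SYNDROME_SRC_WANT_DRAIN, used by the prexor6 path below), or only blocks with written data (SYNDROME_SRC_WRITTEN, used by reconstruct after a prexor); unselected slots stay NULL from the earlier initialisation loop. A runnable restatement of the selection predicate, with r5dev reduced to the two fields the test reads — mini_dev and its field names are illustrative stand-ins:

#include <stdio.h>

enum { SYNDROME_SRC_ALL, SYNDROME_SRC_WANT_DRAIN, SYNDROME_SRC_WRITTEN };

struct mini_dev { int want_drain; int written; }; /* stand-in for r5dev */

static int use_as_source(int i, int pd_idx, int qd_idx,
                         const struct mini_dev *dev, int srctype)
{
        if (i == pd_idx || i == qd_idx)
                return 1;               /* P and Q are always included */
        switch (srctype) {
        case SYNDROME_SRC_ALL:          return 1;
        case SYNDROME_SRC_WANT_DRAIN:   return dev->want_drain;
        case SYNDROME_SRC_WRITTEN:      return dev->written;
        }
        return 0;
}

int main(void)
{
        struct mini_dev devs[4] = { {1, 0}, {0, 0}, {0, 1}, {0, 0} };
        int pd_idx = 4, qd_idx = 5;     /* parity slots past the data */

        for (int i = 0; i < 4; i++)
                printf("disk %d: drain-src=%d written-src=%d\n", i,
                       use_as_source(i, pd_idx, qd_idx, &devs[i],
                                     SYNDROME_SRC_WANT_DRAIN),
                       use_as_source(i, pd_idx, qd_idx, &devs[i],
                                     SYNDROME_SRC_WRITTEN));
        return 0;
}
]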
 
@@ -1370,7 +1390,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
        atomic_inc(&sh->count);
 
        if (target == qd_idx) {
-               count = set_syndrome_sources(blocks, sh);
+               count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
                blocks[count] = NULL; /* regenerating p is not necessary */
                BUG_ON(blocks[count+1] != dest); /* q should already be set */
                init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
@@ -1478,7 +1498,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
                        tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
                                       &submit);
 
-                       count = set_syndrome_sources(blocks, sh);
+                       count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
                        init_async_submit(&submit, ASYNC_TX_FENCE, tx,
                                          ops_complete_compute, sh,
                                          to_addr_conv(sh, percpu, 0));
@@ -1512,8 +1532,8 @@ static void ops_complete_prexor(void *stripe_head_ref)
 }
 
 static struct dma_async_tx_descriptor *
-ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
-              struct dma_async_tx_descriptor *tx)
+ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
+               struct dma_async_tx_descriptor *tx)
 {
        int disks = sh->disks;
        struct page **xor_srcs = to_addr_page(percpu, 0);
@@ -1541,6 +1561,26 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
        return tx;
 }
 
+static struct dma_async_tx_descriptor *
+ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
+               struct dma_async_tx_descriptor *tx)
+{
+       struct page **blocks = to_addr_page(percpu, 0);
+       int count;
+       struct async_submit_ctl submit;
+
+       pr_debug("%s: stripe %llu\n", __func__,
+               (unsigned long long)sh->sector);
+
+       count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
+
+       init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
+                         ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
+       tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
+
+       return tx;
+}
+
 static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
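
[Note: ops_run_prexor6() gives RAID-6 a read-modify-write path: it feeds only the to-be-drained sources (plus P and Q themselves) into async_gen_syndrome() with ASYNC_TX_PQ_XOR_DST, which xors into the existing parity buffers instead of overwriting them, so the old data's contribution is subtracted out of P and Q without touching the other disks. That is sound because both parities are linear over GF(2^8): P' = P ^ (Dold ^ Dnew) and Q' = Q ^ g^i*(Dold ^ Dnew). A toy userspace check of the Q identity for the combined drain/reconstruct pair, assuming nothing beyond the 0x11d reduction polynomial the Linux raid6 code uses; disk count and data bytes are arbitrary:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* GF(2^8) multiply, reduction polynomial x^8+x^4+x^3+x^2+1 (0x11d). */
static uint8_t gmul(uint8_t a, uint8_t b)
{
        uint8_t p = 0;

        while (b) {
                if (b & 1)
                        p ^= a;
                a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
                b >>= 1;
        }
        return p;
}

int main(void)
{
        uint8_t d[4] = { 7, 42, 99, 200 };      /* one byte per data disk */
        uint8_t coef[4], q = 0, c = 1;

        for (int i = 0; i < 4; i++) {
                coef[i] = c;            /* g^i with generator g = 2 */
                q ^= gmul(c, d[i]);
                c = gmul(c, 2);
        }

        /* rmw update of disk 2: xor out the old, xor in the new */
        uint8_t d_new = 123;
        uint8_t q_rmw = q ^ gmul(coef[2], (uint8_t)(d[2] ^ d_new));

        /* full reconstruct-write recomputation for comparison */
        d[2] = d_new;
        uint8_t q_rcw = 0;
        for (int i = 0; i < 4; i++)
                q_rcw ^= gmul(coef[i], d[i]);

        assert(q_rmw == q_rcw);
        printf("q_rmw == q_rcw == %#x\n", q_rmw);
        return 0;
}
]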
@@ -1743,6 +1783,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
        int count, i, j = 0;
        struct stripe_head *head_sh = sh;
        int last_stripe;
+       int synflags;
+       unsigned long txflags;
 
        pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
@@ -1762,14 +1804,23 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 
 again:
        blocks = to_addr_page(percpu, j);
-       count = set_syndrome_sources(blocks, sh);
+
+       if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+               synflags = SYNDROME_SRC_WRITTEN;
+               txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
+       } else {
+               synflags = SYNDROME_SRC_ALL;
+               txflags = ASYNC_TX_ACK;
+       }
+
+       count = set_syndrome_sources(blocks, sh, synflags);
        last_stripe = !head_sh->batch_head ||
                list_first_entry(&sh->batch_list,
                                 struct stripe_head, batch_list) == head_sh;
 
        if (last_stripe) {
                atomic_inc(&head_sh->count);
-               init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+               init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
                                  head_sh, to_addr_conv(sh, percpu, j));
        } else
                init_async_submit(&submit, 0, tx, NULL, NULL,
@@ -1840,7 +1891,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
                (unsigned long long)sh->sector, checkp);
 
        BUG_ON(sh->batch_head);
-       count = set_syndrome_sources(srcs, sh);
+       count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
        if (!checkp)
                srcs[count] = NULL;
 
@@ -1881,8 +1932,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
                        async_tx_ack(tx);
        }
 
-       if (test_bit(STRIPE_OP_PREXOR, &ops_request))
-               tx = ops_run_prexor(sh, percpu, tx);
+       if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
+               if (level < 6)
+                       tx = ops_run_prexor5(sh, percpu, tx);
+               else
+                       tx = ops_run_prexor6(sh, percpu, tx);
+       }
 
        if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
                tx = ops_run_biodrain(sh, tx);
@@ -1916,10 +1971,10 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
        put_cpu();
 }
 
-static int grow_one_stripe(struct r5conf *conf, int hash)
+static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 {
        struct stripe_head *sh;
-       sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
+       sh = kmem_cache_zalloc(conf->slab_cache, gfp);
        if (!sh)
                return 0;
 
@@ -1927,12 +1982,13 @@ static int grow_one_stripe(struct r5conf *conf, int hash)
 
        spin_lock_init(&sh->stripe_lock);
 
-       if (grow_buffers(sh)) {
+       if (grow_buffers(sh, gfp)) {
                shrink_buffers(sh);
                kmem_cache_free(conf->slab_cache, sh);
                return 0;
        }
-       sh->hash_lock_index = hash;
+       sh->hash_lock_index =
+               conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
        /* we just created an active stripe so... */
        atomic_set(&sh->count, 1);
        atomic_inc(&conf->active_stripes);
@@ -1942,6 +1998,7 @@ static int grow_one_stripe(struct r5conf *conf, int hash)
        INIT_LIST_HEAD(&sh->batch_list);
        sh->batch_head = NULL;
        release_stripe(sh);
+       conf->max_nr_stripes++;
        return 1;
 }
 
@@ -1949,7 +2006,6 @@ static int grow_stripes(struct r5conf *conf, int num)
 {
        struct kmem_cache *sc;
        int devs = max(conf->raid_disks, conf->previous_raid_disks);
-       int hash;
 
        if (conf->mddev->gendisk)
                sprintf(conf->cache_name[0],
@@ -1967,13 +2023,10 @@ static int grow_stripes(struct r5conf *conf, int num)
                return 1;
        conf->slab_cache = sc;
        conf->pool_size = devs;
-       hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
-       while (num--) {
-               if (!grow_one_stripe(conf, hash))
+       while (num--)
+               if (!grow_one_stripe(conf, GFP_KERNEL))
                        return 1;
-               conf->max_nr_stripes++;
-               hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
-       }
+
        return 0;
 }
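
[Note: grow_one_stripe() now derives its hash lock index from conf->max_nr_stripes and bumps that counter itself, so grow_stripes() and the sysfs resize path below no longer have to thread a hash cursor around; stripes still spread round-robin across the NR_STRIPE_HASH_LOCKS (8 in raid5.h) lock classes. A throwaway demo of the equivalence:

#include <stdio.h>

enum { NR_STRIPE_HASH_LOCKS = 8 };      /* matches raid5.h */

int main(void)
{
        int max_nr_stripes = 0;

        for (int n = 0; n < 12; n++) {
                int hash = max_nr_stripes % NR_STRIPE_HASH_LOCKS;

                max_nr_stripes++;       /* as grow_one_stripe() now does */
                printf("stripe %2d -> hash lock %d\n", n, hash);
        }
        return 0;
}
]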
 
@@ -2163,9 +2216,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        return err;
 }
 
-static int drop_one_stripe(struct r5conf *conf, int hash)
+static int drop_one_stripe(struct r5conf *conf)
 {
        struct stripe_head *sh;
+       int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
 
        spin_lock_irq(conf->hash_locks + hash);
        sh = get_free_stripe(conf, hash);
@@ -2176,15 +2230,15 @@ static int drop_one_stripe(struct r5conf *conf, int hash)
        shrink_buffers(sh);
        kmem_cache_free(conf->slab_cache, sh);
        atomic_dec(&conf->active_stripes);
+       conf->max_nr_stripes--;
        return 1;
 }
 
 static void shrink_stripes(struct r5conf *conf)
 {
-       int hash;
-       for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
-               while (drop_one_stripe(conf, hash))
-                       ;
+       while (conf->max_nr_stripes &&
+              drop_one_stripe(conf))
+               ;
 
        if (conf->slab_cache)
                kmem_cache_destroy(conf->slab_cache);
@@ -2380,6 +2434,9 @@ static void raid5_end_write_request(struct bio *bi, int error)
        }
        rdev_dec_pending(rdev, conf->mddev);
 
+       if (sh->batch_head && !uptodate)
+               set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
+
        if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
                clear_bit(R5_LOCKED, &sh->dev[i].flags);
        set_bit(STRIPE_HANDLE, &sh->state);
@@ -2764,7 +2821,7 @@ static void
 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                         int rcw, int expand)
 {
-       int i, pd_idx = sh->pd_idx, disks = sh->disks;
+       int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
        struct r5conf *conf = sh->raid_conf;
        int level = conf->level;
 
@@ -2800,13 +2857,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
                        if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
                                atomic_inc(&conf->pending_full_writes);
        } else {
-               BUG_ON(level == 6);
                BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
                        test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
+               BUG_ON(level == 6 &&
+                       (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
+                          test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
 
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
-                       if (i == pd_idx)
+                       if (i == pd_idx || i == qd_idx)
                                continue;
 
                        if (dev->towrite &&
@@ -3434,8 +3493,10 @@ unhash:
                                      struct stripe_head, batch_list);
                list_del_init(&sh->batch_list);
 
-               sh->state = head_sh->state & (~((1 << STRIPE_ACTIVE) |
-                       (1 << STRIPE_PREREAD_ACTIVE)));
+               set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+                             head_sh->state & ~((1 << STRIPE_ACTIVE) |
+                                                (1 << STRIPE_PREREAD_ACTIVE) |
+                                                STRIPE_EXPAND_SYNC_FLAG));
                sh->check_state = head_sh->check_state;
                sh->reconstruct_state = head_sh->reconstruct_state;
                for (i = 0; i < sh->disks; i++) {
@@ -3447,6 +3508,8 @@ unhash:
                spin_lock_irq(&sh->stripe_lock);
                sh->batch_head = NULL;
                spin_unlock_irq(&sh->stripe_lock);
+               if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
+                       set_bit(STRIPE_HANDLE, &sh->state);
                release_stripe(sh);
        }
 
@@ -3454,6 +3517,8 @@ unhash:
        head_sh->batch_head = NULL;
        spin_unlock_irq(&head_sh->stripe_lock);
        wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
+       if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
+               set_bit(STRIPE_HANDLE, &head_sh->state);
 }
 
 static void handle_stripe_dirtying(struct r5conf *conf,
@@ -3464,28 +3529,27 @@ static void handle_stripe_dirtying(struct r5conf *conf,
        int rmw = 0, rcw = 0, i;
        sector_t recovery_cp = conf->mddev->recovery_cp;
 
-       /* RAID6 requires 'rcw' in current implementation.
-        * Otherwise, check whether resync is now happening or should start.
+       /* Check whether resync is now happening or should start.
         * If yes, then the array is dirty (after unclean shutdown or
         * initial creation), so parity in some stripes might be inconsistent.
         * In this case, we need to always do reconstruct-write, to ensure
         * that in case of drive failure or read-error correction, we
         * generate correct data from the parity.
         */
-       if (conf->max_degraded == 2 ||
+       if (conf->rmw_level == PARITY_DISABLE_RMW ||
            (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
             s->failed == 0)) {
                /* Calculate the real rcw later - for now make it
                 * look like rcw is cheaper
                 */
                rcw = 1; rmw = 2;
-               pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
-                        conf->max_degraded, (unsigned long long)recovery_cp,
+               pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
+                        conf->rmw_level, (unsigned long long)recovery_cp,
                         (unsigned long long)sh->sector);
        } else for (i = disks; i--; ) {
                /* would I have to read this buffer for read_modify_write */
                struct r5dev *dev = &sh->dev[i];
-               if ((dev->towrite || i == sh->pd_idx) &&
+               if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
                    !test_bit(R5_LOCKED, &dev->flags) &&
                    !(test_bit(R5_UPTODATE, &dev->flags) ||
                      test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3495,7 +3559,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                                rmw += 2*disks;  /* cannot read it */
                }
                /* Would I have to read this buffer for reconstruct_write */
-               if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+               if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+                   i != sh->pd_idx && i != sh->qd_idx &&
                    !test_bit(R5_LOCKED, &dev->flags) &&
                    !(test_bit(R5_UPTODATE, &dev->flags) ||
                    test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3508,7 +3573,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
        pr_debug("for sector %llu, rmw=%d rcw=%d\n",
                (unsigned long long)sh->sector, rmw, rcw);
        set_bit(STRIPE_HANDLE, &sh->state);
-       if (rmw < rcw && rmw > 0) {
+       if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
                /* prefer read-modify-write, but need to get some data */
                if (conf->mddev->queue)
                        blk_add_trace_msg(conf->mddev->queue,
@@ -3516,7 +3581,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                                          (unsigned long long)sh->sector, rmw);
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
-                       if ((dev->towrite || i == sh->pd_idx) &&
+                       if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
                            !test_bit(R5_LOCKED, &dev->flags) &&
                            !(test_bit(R5_UPTODATE, &dev->flags) ||
                            test_bit(R5_Wantcompute, &dev->flags)) &&
@@ -3535,7 +3600,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                        }
                }
        }
-       if (rcw <= rmw && rcw > 0) {
+       if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
                /* want reconstruct write, but need to get some data */
                int qread =0;
                rcw = 0;
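
[Note: a worked example for the new rmw/rcw arithmetic above: on a 6-device RAID-6 stripe (4 data disks plus P and Q) with one data chunk fully overwritten and nothing cached, rmw must read the old chunk plus both parities (3 reads, now that qd_idx is counted), and rcw must read the 3 untouched data chunks; the rmw == rcw tie is then settled by conf->rmw_level. A toy restatement of the counting, skipping the R5_Insync test and the 2*disks cannot-read penalty of the real loop:

#include <stdio.h>

int main(void)
{
        enum { DISKS = 6, PD = 4, QD = 5 };
        int towrite[DISKS]   = { 0, 0, 1, 0, 0, 0 };    /* chunk 2 dirtied */
        int overwrite[DISKS] = { 0, 0, 1, 0, 0, 0 };    /* ...entirely */
        int rmw = 0, rcw = 0;

        for (int i = 0; i < DISKS; i++) {
                if (towrite[i] || i == PD || i == QD)
                        rmw++;          /* old data/parity needed for rmw */
                if (!overwrite[i] && i != PD && i != QD)
                        rcw++;          /* clean data needed for rcw */
        }
        printf("rmw=%d rcw=%d\n", rmw, rcw); /* 3 and 3: rmw_level decides */
        return 0;
}
]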
@@ -3921,8 +3986,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
        memset(s, 0, sizeof(*s));
 
-       s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-       s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+       s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
+       s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
        s->failed_num[0] = -1;
        s->failed_num[1] = -1;
 
@@ -4124,6 +4189,48 @@ static int clear_batch_ready(struct stripe_head *sh)
        return 0;
 }
 
+static void check_break_stripe_batch_list(struct stripe_head *sh)
+{
+       struct stripe_head *head_sh, *next;
+       int i;
+
+       if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+               return;
+
+       head_sh = sh;
+       do {
+               sh = list_first_entry(&sh->batch_list,
+                                     struct stripe_head, batch_list);
+               BUG_ON(sh == head_sh);
+       } while (!test_bit(STRIPE_DEGRADED, &sh->state));
+
+       while (sh != head_sh) {
+               next = list_first_entry(&sh->batch_list,
+                                       struct stripe_head, batch_list);
+               list_del_init(&sh->batch_list);
+
+               set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+                             head_sh->state & ~((1 << STRIPE_ACTIVE) |
+                                                (1 << STRIPE_PREREAD_ACTIVE) |
+                                                (1 << STRIPE_DEGRADED) |
+                                                STRIPE_EXPAND_SYNC_FLAG));
+               sh->check_state = head_sh->check_state;
+               sh->reconstruct_state = head_sh->reconstruct_state;
+               for (i = 0; i < sh->disks; i++)
+                       sh->dev[i].flags = head_sh->dev[i].flags &
+                               (~((1 << R5_WriteError) | (1 << R5_Overlap)));
+
+               spin_lock_irq(&sh->stripe_lock);
+               sh->batch_head = NULL;
+               spin_unlock_irq(&sh->stripe_lock);
+
+               set_bit(STRIPE_HANDLE, &sh->state);
+               release_stripe(sh);
+
+               sh = next;
+       }
+}
+
 static void handle_stripe(struct stripe_head *sh)
 {
        struct stripe_head_state s;
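
[Note: check_break_stripe_batch_list() splits a batch at the first STRIPE_DEGRADED member and releases that stripe and everything after it for individual handling, copying the head's state into each via set_mask_bits() so every member keeps only its own STRIPE_EXPAND_SYNC_FLAG bits. A userspace model of that primitive, which in the kernel is a cmpxchg loop in linux/bitops.h:

#include <stdatomic.h>
#include <stdio.h>

/* new = (old & ~mask) | bits, retried until the exchange sticks */
static unsigned long set_mask_bits_model(atomic_ulong *v,
                                         unsigned long mask,
                                         unsigned long bits)
{
        unsigned long old = atomic_load(v), new;

        do {
                new = (old & ~mask) | bits;
        } while (!atomic_compare_exchange_weak(v, &old, new));
        return new;
}

int main(void)
{
        atomic_ulong state = 0xf0f0;

        /* overwrite the low byte, preserve everything outside the mask */
        set_mask_bits_model(&state, 0xff, 0x3c);
        printf("state=%#lx\n", (unsigned long)atomic_load(&state)); /* 0xf03c */
        return 0;
}
]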
@@ -4146,7 +4253,9 @@ static void handle_stripe(struct stripe_head *sh)
                return;
        }
 
-       if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+       check_break_stripe_batch_list(sh);
+
+       if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
                spin_lock(&sh->stripe_lock);
                /* Cannot process 'sync' concurrently with 'discard' */
                if (!test_bit(STRIPE_DISCARD, &sh->state) &&
@@ -4501,7 +4610,7 @@ static int raid5_congested(struct mddev *mddev, int bits)
         * how busy the stripe_cache is
         */
 
-       if (conf->inactive_blocked)
+       if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
                return 1;
        if (conf->quiesce)
                return 1;
@@ -4523,8 +4632,12 @@ static int raid5_mergeable_bvec(struct mddev *mddev,
        unsigned int chunk_sectors = mddev->chunk_sectors;
        unsigned int bio_sectors = bvm->bi_size >> 9;
 
-       if ((bvm->bi_rw & 1) == WRITE)
-               return biovec->bv_len; /* always allow writes to be mergeable */
+       /*
+        * always allow writes to be mergeable; reads as well if the
+        * array is degraded, as we'll go through the stripe cache anyway.
+        */
+       if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
+               return biovec->bv_len;
 
        if (mddev->new_chunk_sectors < mddev->chunk_sectors)
                chunk_sectors = mddev->new_chunk_sectors;
@@ -5001,7 +5114,12 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 
        md_write_start(mddev, bi);
 
-       if (rw == READ &&
+       /*
+        * If array is degraded, better not do chunk aligned read because
+        * later we might have to read it again in order to reconstruct
+        * data on failed drives.
+        */
+       if (rw == READ && mddev->degraded == 0 &&
             mddev->reshape_position == MaxSector &&
             chunk_aligned_read(mddev,bi))
                return;
@@ -5657,6 +5775,8 @@ static void raid5d(struct md_thread *thread)
                int batch_size, released;
 
                released = release_stripe_list(conf, conf->temp_inactive_list);
+               if (released)
+                       clear_bit(R5_DID_ALLOC, &conf->cache_state);
 
                if (
                    !list_empty(&conf->bitmap_list)) {
@@ -5695,6 +5815,13 @@ static void raid5d(struct md_thread *thread)
        pr_debug("%d stripes handled\n", handled);
 
        spin_unlock_irq(&conf->device_lock);
+       if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+               grow_one_stripe(conf, __GFP_NOWARN);
+               /* Set flag even if allocation failed.  This helps
+                * slow down allocation requests when memory is short.
+                */
+               set_bit(R5_DID_ALLOC, &conf->cache_state);
+       }
 
        async_tx_issue_pending_all();
        blk_finish_plug(&plug);
@@ -5710,7 +5837,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
        spin_lock(&mddev->lock);
        conf = mddev->private;
        if (conf)
-               ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+               ret = sprintf(page, "%d\n", conf->min_nr_stripes);
        spin_unlock(&mddev->lock);
        return ret;
 }
@@ -5720,30 +5847,24 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 {
        struct r5conf *conf = mddev->private;
        int err;
-       int hash;
 
        if (size <= 16 || size > 32768)
                return -EINVAL;
-       hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
-       while (size < conf->max_nr_stripes) {
-               if (drop_one_stripe(conf, hash))
-                       conf->max_nr_stripes--;
-               else
-                       break;
-               hash--;
-               if (hash < 0)
-                       hash = NR_STRIPE_HASH_LOCKS - 1;
-       }
+
+       conf->min_nr_stripes = size;
+       while (size < conf->max_nr_stripes &&
+              drop_one_stripe(conf))
+               ;
+
+
        err = md_allow_write(mddev);
        if (err)
                return err;
-       hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
-       while (size > conf->max_nr_stripes) {
-               if (grow_one_stripe(conf, hash))
-                       conf->max_nr_stripes++;
-               else break;
-               hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
-       }
+
+       while (size > conf->max_nr_stripes)
+               if (!grow_one_stripe(conf, GFP_KERNEL))
+                       break;
+
        return 0;
 }
 EXPORT_SYMBOL(raid5_set_cache_size);
@@ -5777,6 +5898,49 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
                                raid5_show_stripe_cache_size,
                                raid5_store_stripe_cache_size);
 
+static ssize_t
+raid5_show_rmw_level(struct mddev  *mddev, char *page)
+{
+       struct r5conf *conf = mddev->private;
+       if (conf)
+               return sprintf(page, "%d\n", conf->rmw_level);
+       else
+               return 0;
+}
+
+static ssize_t
+raid5_store_rmw_level(struct mddev  *mddev, const char *page, size_t len)
+{
+       struct r5conf *conf = mddev->private;
+       unsigned long new;
+
+       if (!conf)
+               return -ENODEV;
+
+       if (len >= PAGE_SIZE)
+               return -EINVAL;
+
+       if (kstrtoul(page, 10, &new))
+               return -EINVAL;
+
+       if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
+               return -EINVAL;
+
+       if (new != PARITY_DISABLE_RMW &&
+           new != PARITY_ENABLE_RMW &&
+           new != PARITY_PREFER_RMW)
+               return -EINVAL;
+
+       conf->rmw_level = new;
+       return len;
+}
+
+static struct md_sysfs_entry
+raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
+                        raid5_show_rmw_level,
+                        raid5_store_rmw_level);
+
+
 static ssize_t
 raid5_show_preread_threshold(struct mddev *mddev, char *page)
 {
@@ -5808,7 +5972,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
        conf = mddev->private;
        if (!conf)
                err = -ENODEV;
-       else if (new > conf->max_nr_stripes)
+       else if (new > conf->min_nr_stripes)
                err = -EINVAL;
        else
                conf->bypass_threshold = new;
@@ -5963,6 +6127,7 @@ static struct attribute *raid5_attrs[] =  {
        &raid5_preread_bypass_threshold.attr,
        &raid5_group_thread_cnt.attr,
        &raid5_skip_copy.attr,
+       &raid5_rmw_level.attr,
        NULL,
 };
 static struct attribute_group raid5_attrs_group = {
@@ -6088,6 +6253,8 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+       if (conf->shrinker.seeks)
+               unregister_shrinker(&conf->shrinker);
        free_thread_groups(conf);
        shrink_stripes(conf);
        raid5_free_percpu(conf);
@@ -6155,6 +6322,30 @@ static int raid5_alloc_percpu(struct r5conf *conf)
        return err;
 }
 
+static unsigned long raid5_cache_scan(struct shrinker *shrink,
+                                     struct shrink_control *sc)
+{
+       struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+       int ret = 0;
+       while (ret < sc->nr_to_scan) {
+               if (drop_one_stripe(conf) == 0)
+                       return SHRINK_STOP;
+               ret++;
+       }
+       return ret;
+}
+
+static unsigned long raid5_cache_count(struct shrinker *shrink,
+                                      struct shrink_control *sc)
+{
+       struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+
+       if (conf->max_nr_stripes < conf->min_nr_stripes)
+               /* unlikely, but not impossible */
+               return 0;
+       return conf->max_nr_stripes - conf->min_nr_stripes;
+}
+
 static struct r5conf *setup_conf(struct mddev *mddev)
 {
        struct r5conf *conf;
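
[Note: raid5_cache_scan() and raid5_cache_count() wire the now-growable stripe cache into the generic shrinker API: count_objects reports how many stripes sit above min_nr_stripes and are therefore fair game, while scan_objects drops up to nr_to_scan of them and returns the number freed, or SHRINK_STOP once drop_one_stripe() cannot make progress. A compact userspace model of that contract; sizes and the drop rule are simplified stand-ins:

#include <stdio.h>

#define SHRINK_STOP (~0UL)

static int max_nr_stripes = 256, min_nr_stripes = 256;

/* Stand-in for drop_one_stripe(): fails once nothing is droppable. */
static int drop_one(void)
{
        if (max_nr_stripes == 0)
                return 0;
        max_nr_stripes--;
        return 1;
}

static unsigned long cache_count(void)
{
        if (max_nr_stripes < min_nr_stripes) /* unlikely, but not impossible */
                return 0;
        return (unsigned long)(max_nr_stripes - min_nr_stripes);
}

static unsigned long cache_scan(unsigned long nr_to_scan)
{
        unsigned long ret = 0;

        while (ret < nr_to_scan) {
                if (!drop_one())
                        return SHRINK_STOP;
                ret++;
        }
        return ret;
}

int main(void)
{
        max_nr_stripes = 300;   /* raid5d grew the cache under load */
        printf("reclaimable: %lu\n", cache_count());    /* 44 */
        printf("freed: %lu\n", cache_scan(16));         /* 16 */
        printf("reclaimable: %lu\n", cache_count());    /* 28 */
        return 0;
}
]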
@@ -6288,10 +6479,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        }
 
        conf->level = mddev->new_level;
-       if (conf->level == 6)
+       if (conf->level == 6) {
                conf->max_degraded = 2;
-       else
+               if (raid6_call.xor_syndrome)
+                       conf->rmw_level = PARITY_ENABLE_RMW;
+               else
+                       conf->rmw_level = PARITY_DISABLE_RMW;
+       } else {
                conf->max_degraded = 1;
+               conf->rmw_level = PARITY_ENABLE_RMW;
+       }
        conf->algorithm = mddev->new_layout;
        conf->reshape_progress = mddev->reshape_position;
        if (conf->reshape_progress != MaxSector) {
@@ -6299,10 +6496,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
                conf->prev_algo = mddev->layout;
        }
 
-       memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
+       conf->min_nr_stripes = NR_STRIPES;
+       memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
                 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
        atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
-       if (grow_stripes(conf, NR_STRIPES)) {
+       if (grow_stripes(conf, conf->min_nr_stripes)) {
                printk(KERN_ERR
                       "md/raid:%s: couldn't allocate %dkB for buffers\n",
                       mdname(mddev), memory);
@@ -6310,6 +6508,17 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        } else
                printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
                       mdname(mddev), memory);
+       /*
+        * Losing a stripe head costs more than the time to refill it,
+        * it reduces the queue depth and so can hurt throughput.
+        * So set it rather large, scaled by number of devices.
+        */
+       conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
+       conf->shrinker.scan_objects = raid5_cache_scan;
+       conf->shrinker.count_objects = raid5_cache_count;
+       conf->shrinker.batch = 128;
+       conf->shrinker.flags = 0;
+       register_shrinker(&conf->shrinker);
 
        sprintf(pers_name, "raid%d", mddev->new_level);
        conf->thread = md_register_thread(raid5d, mddev, pers_name);
@@ -6951,9 +7160,9 @@ static int check_stripe_cache(struct mddev *mddev)
         */
        struct r5conf *conf = mddev->private;
        if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
-           > conf->max_nr_stripes ||
+           > conf->min_nr_stripes ||
            ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
-           > conf->max_nr_stripes) {
+           > conf->min_nr_stripes) {
                printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
                       mdname(mddev),
                       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)