md/raid5: be more selective about distributing flags across batch.
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77dfd720aaa00ebc55d14234cd40b6e9b65bae5f..1141b7f62e6e84b46a4e83358afffe54d89cf712 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -749,6 +749,7 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 static bool stripe_can_batch(struct stripe_head *sh)
 {
        return test_bit(STRIPE_BATCH_READY, &sh->state) &&
+               !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
                is_full_stripe_write(sh);
 }
 
@@ -837,6 +838,15 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
                    < IO_THRESHOLD)
                        md_wakeup_thread(conf->mddev->thread);
 
+       if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
+               int seq = sh->bm_seq;
+               if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
+                   sh->batch_head->bm_seq > seq)
+                       seq = sh->batch_head->bm_seq;
+               set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
+               sh->batch_head->bm_seq = seq;
+       }
+
        atomic_inc(&sh->count);
 unlock_out:
        unlock_two_stripes(head, sh);
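These two hunks close a race between bitmap updates and batching: a stripe whose bitmap_startwrite() is still in flight (STRIPE_BITMAP_PENDING) may not join a batch, and when a STRIPE_BIT_DELAY stripe does join one, the batch head inherits the largest bm_seq of any member, so the whole batch waits for the most recent bitmap flush. A user-space toy model of that max-propagation (hypothetical names, not from the kernel):

        #include <stdbool.h>

        struct toy_stripe { int bm_seq; bool bit_delay; };

        /* "member" joins the batch headed by "head" */
        static void toy_join_batch(struct toy_stripe *head,
                                   const struct toy_stripe *member)
        {
                int seq = member->bm_seq;

                if (head->bit_delay && head->bm_seq > seq)
                        seq = head->bm_seq;     /* keep the later sequence */
                head->bit_delay = true;
                head->bm_seq = seq;
        }

With head.bm_seq == 3 and member.bm_seq == 5, the head ends up waiting for sequence 5.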
@@ -1078,9 +1088,6 @@ again:
                        pr_debug("skip op %ld on disc %d for sector %llu\n",
                                bi->bi_rw, i, (unsigned long long)sh->sector);
                        clear_bit(R5_LOCKED, &sh->dev[i].flags);
-                       if (sh->batch_head)
-                               set_bit(STRIPE_BATCH_ERR,
-                                       &sh->batch_head->state);
                        set_bit(STRIPE_HANDLE, &sh->state);
                }
 
@@ -1825,7 +1832,7 @@ again:
        } else
                init_async_submit(&submit, 0, tx, NULL, NULL,
                                  to_addr_conv(sh, percpu, j));
-       async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
+       tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
        if (!last_stripe) {
                j++;
                sh = list_first_entry(&sh->batch_list, struct stripe_head,
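The one-line change above keeps the async_tx dependency chain intact. Each async_tx operation returns a struct dma_async_tx_descriptor *, and the following operation must name that descriptor as its dependency via init_async_submit(); the old code discarded the return value while looping over a batch, so syndrome generation for consecutive batch members lost its ordering. The chaining pattern, as this function uses it:

        tx = async_gen_syndrome(blocks, 0, count + 2, STRIPE_SIZE, &submit);
        /* next loop iteration: the new operation depends on tx */
        init_async_submit(&submit, 0, tx, NULL, NULL,
                          to_addr_conv(sh, percpu, j));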
@@ -1971,17 +1978,30 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
        put_cpu();
 }
 
+static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
+{
+       struct stripe_head *sh;
+
+       sh = kmem_cache_zalloc(sc, gfp);
+       if (sh) {
+               spin_lock_init(&sh->stripe_lock);
+               spin_lock_init(&sh->batch_lock);
+               INIT_LIST_HEAD(&sh->batch_list);
+               INIT_LIST_HEAD(&sh->lru);
+               atomic_set(&sh->count, 1);
+       }
+       return sh;
+}
 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 {
        struct stripe_head *sh;
-       sh = kmem_cache_zalloc(conf->slab_cache, gfp);
+
+       sh = alloc_stripe(conf->slab_cache, gfp);
        if (!sh)
                return 0;
 
        sh->raid_conf = conf;
 
-       spin_lock_init(&sh->stripe_lock);
-
        if (grow_buffers(sh, gfp)) {
                shrink_buffers(sh);
                kmem_cache_free(conf->slab_cache, sh);
@@ -1990,13 +2010,8 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
        sh->hash_lock_index =
                conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
        /* we just created an active stripe so... */
-       atomic_set(&sh->count, 1);
        atomic_inc(&conf->active_stripes);
-       INIT_LIST_HEAD(&sh->lru);
 
-       spin_lock_init(&sh->batch_lock);
-       INIT_LIST_HEAD(&sh->batch_list);
-       sh->batch_head = NULL;
        release_stripe(sh);
        conf->max_nr_stripes++;
        return 1;
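alloc_stripe() gathers the initialisation that grow_one_stripe() and resize_stripes() previously duplicated, so a new stripe_head field needs setting up in only one place. kmem_cache_zalloc() hands back zeroed memory, which is also why the explicit "sh->batch_head = NULL" assignment above could simply go. Caller sketch:

        sh = alloc_stripe(conf->slab_cache, gfp);  /* GFP_KERNEL, or GFP_NOIO under reclaim */
        if (!sh)
                return 0;
        sh->raid_conf = conf;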
@@ -2060,6 +2075,35 @@ static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
        return ret;
 }
 
+static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
+{
+       unsigned long cpu;
+       int err = 0;
+
+       mddev_suspend(conf->mddev);
+       get_online_cpus();
+       for_each_present_cpu(cpu) {
+               struct raid5_percpu *percpu;
+               struct flex_array *scribble;
+
+               percpu = per_cpu_ptr(conf->percpu, cpu);
+               scribble = scribble_alloc(new_disks,
+                                         new_sectors / STRIPE_SECTORS,
+                                         GFP_NOIO);
+
+               if (scribble) {
+                       flex_array_free(percpu->scribble);
+                       percpu->scribble = scribble;
+               } else {
+                       err = -ENOMEM;
+                       break;
+               }
+       }
+       put_online_cpus();
+       mddev_resume(conf->mddev);
+       return err;
+}
+
 static int resize_stripes(struct r5conf *conf, int newsize)
 {
        /* Make all the stripes able to hold 'newsize' devices.
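resize_chunks(), added above, reallocates the per-CPU scribble flex_arrays before a reshape moves to a larger geometry. mddev_suspend()/mddev_resume() bracket the swap so no stripe operation can be using a scribble buffer while it is freed, and GFP_NOIO presumably avoids recursing into writeback against the suspended array. Simplified call-site sketch (the real one is in check_reshape() at the end of this diff):

        if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
            mddev->delta_disks > 0)
                if (resize_chunks(conf, new_disks, new_sectors) < 0)
                        return -ENOMEM;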
@@ -2088,7 +2132,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        struct stripe_head *osh, *nsh;
        LIST_HEAD(newstripes);
        struct disk_info *ndisks;
-       unsigned long cpu;
        int err;
        struct kmem_cache *sc;
        int i;
@@ -2109,13 +2152,11 @@ static int resize_stripes(struct r5conf *conf, int newsize)
                return -ENOMEM;
 
        for (i = conf->max_nr_stripes; i; i--) {
-               nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
+               nsh = alloc_stripe(sc, GFP_KERNEL);
                if (!nsh)
                        break;
 
                nsh->raid_conf = conf;
-               spin_lock_init(&nsh->stripe_lock);
-
                list_add(&nsh->lru, &newstripes);
        }
        if (i) {
@@ -2142,13 +2183,11 @@ static int resize_stripes(struct r5conf *conf, int newsize)
                                    lock_device_hash_lock(conf, hash));
                osh = get_free_stripe(conf, hash);
                unlock_device_hash_lock(conf, hash);
-               atomic_set(&nsh->count, 1);
+
                for(i=0; i<conf->pool_size; i++) {
                        nsh->dev[i].page = osh->dev[i].page;
                        nsh->dev[i].orig_page = osh->dev[i].page;
                }
-               for( ; i<newsize; i++)
-                       nsh->dev[i].page = NULL;
                nsh->hash_lock_index = hash;
                kmem_cache_free(conf->slab_cache, osh);
                cnt++;
@@ -2174,25 +2213,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        } else
                err = -ENOMEM;
 
-       get_online_cpus();
-       for_each_present_cpu(cpu) {
-               struct raid5_percpu *percpu;
-               struct flex_array *scribble;
-
-               percpu = per_cpu_ptr(conf->percpu, cpu);
-               scribble = scribble_alloc(newsize, conf->chunk_sectors /
-                       STRIPE_SECTORS, GFP_NOIO);
-
-               if (scribble) {
-                       flex_array_free(percpu->scribble);
-                       percpu->scribble = scribble;
-               } else {
-                       err = -ENOMEM;
-                       break;
-               }
-       }
-       put_online_cpus();
-
        /* Step 4, return new stripes to service */
        while(!list_empty(&newstripes)) {
                nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2212,7 +2232,8 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 
        conf->slab_cache = sc;
        conf->active_name = 1-conf->active_name;
-       conf->pool_size = newsize;
+       if (!err)
+               conf->pool_size = newsize;
        return err;
 }
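The new "if (!err)" guard means conf->pool_size only advances once every allocation step has succeeded. Teardown paths such as shrink_buffers() size their loops by pool_size, so recording newsize after a partial failure would describe pages that were never allocated. The invariant being protected:

        /* dev[0 .. pool_size-1].page is valid on every stripe_head */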
 
@@ -2434,7 +2455,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
        }
        rdev_dec_pending(rdev, conf->mddev);
 
-       if (sh->batch_head && !uptodate)
+       if (sh->batch_head && !uptodate && !replacement)
                set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
 
        if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
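The added "!replacement" test stops a failed write to a replacement device from flagging STRIPE_BATCH_ERR on the whole batch. On this reading of the change, the write to the original device may still succeed, so only a failure on the primary rdev needs to force every batch member through individual error handling:

        /* replacement failed: the original leg may still succeed,
         * so do not break up the batch */
        if (sh->batch_head && !uptodate && !replacement)
                set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);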
@@ -2976,14 +2997,32 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
        pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
                (unsigned long long)(*bip)->bi_iter.bi_sector,
                (unsigned long long)sh->sector, dd_idx);
-       spin_unlock_irq(&sh->stripe_lock);
 
        if (conf->mddev->bitmap && firstwrite) {
+               /* Cannot hold spinlock over bitmap_startwrite,
+                * but must ensure this isn't added to a batch until
+                * we have added to the bitmap and set bm_seq.
+                * So set STRIPE_BITMAP_PENDING to prevent
+                * batching.
+                * If multiple add_stripe_bio() calls race here they
+                * must all set STRIPE_BITMAP_PENDING.  So only the first one
+                * to complete "bitmap_startwrite" gets to set
+                * STRIPE_BIT_DELAY.  This is important as once a stripe
+                * is added to a batch, STRIPE_BIT_DELAY cannot be changed
+                * any more.
+                */
+               set_bit(STRIPE_BITMAP_PENDING, &sh->state);
+               spin_unlock_irq(&sh->stripe_lock);
                bitmap_startwrite(conf->mddev->bitmap, sh->sector,
                                  STRIPE_SECTORS, 0);
-               sh->bm_seq = conf->seq_flush+1;
-               set_bit(STRIPE_BIT_DELAY, &sh->state);
+               spin_lock_irq(&sh->stripe_lock);
+               clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
+               if (!sh->batch_head) {
+                       sh->bm_seq = conf->seq_flush+1;
+                       set_bit(STRIPE_BIT_DELAY, &sh->state);
+               }
        }
+       spin_unlock_irq(&sh->stripe_lock);
 
        if (stripe_can_batch(sh))
                stripe_add_to_batch_list(conf, sh);
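The rewritten tail of add_stripe_bio() drops stripe_lock around bitmap_startwrite(), which can sleep, and STRIPE_BITMAP_PENDING stands in as a "don't batch me yet" marker for the unlocked window. Only a stripe that is still unbatched afterwards records bm_seq and sets STRIPE_BIT_DELAY; once a stripe sits in a batch that flag must not change, and the head already carries the right sequence (see the stripe_add_to_batch_list() hunk near the top). Condensed ordering:

        set_bit(STRIPE_BITMAP_PENDING, &sh->state);   /* under stripe_lock */
        spin_unlock_irq(&sh->stripe_lock);
        bitmap_startwrite(...);                       /* may sleep */
        spin_lock_irq(&sh->stripe_lock);
        clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
        if (!sh->batch_head)                          /* still unbatched? */
                set_bit(STRIPE_BIT_DELAY, &sh->state);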
@@ -3278,7 +3317,9 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
                /* reconstruct-write isn't being forced */
                return 0;
        for (i = 0; i < s->failed; i++) {
-               if (!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+               if (s->failed_num[i] != sh->pd_idx &&
+                   s->failed_num[i] != sh->qd_idx &&
+                   !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
                    !test_bit(R5_OVERWRITE, &fdev[i]->flags))
                        return 1;
        }
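The extra tests make need_this_block() ignore "failed" devices that are the P or Q parity: a reconstruct-write regenerates parity from data anyway, so a failed parity device should never force other blocks to be read in.

        /* pd_idx/qd_idx get rewritten by the reconstruct-write itself */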
@@ -3298,6 +3339,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
                 */
                BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
                BUG_ON(test_bit(R5_Wantread, &dev->flags));
+               BUG_ON(sh->batch_head);
                if ((s->uptodate == disks - 1) &&
                    (s->failed && (disk_idx == s->failed_num[0] ||
                                   disk_idx == s->failed_num[1]))) {
@@ -3366,7 +3408,6 @@ static void handle_stripe_fill(struct stripe_head *sh,
 {
        int i;
 
-       BUG_ON(sh->batch_head);
        /* look for blocks to read/compute, skip this if a compute
         * is already in flight, or if the stripe contents are in the
         * midst of changing due to a write
@@ -3493,10 +3534,27 @@ unhash:
                                      struct stripe_head, batch_list);
                list_del_init(&sh->batch_list);
 
-               set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
-                             head_sh->state & ~((1 << STRIPE_ACTIVE) |
-                                                (1 << STRIPE_PREREAD_ACTIVE) |
-                                                STRIPE_EXPAND_SYNC_FLAG));
+               WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
+                                         (1 << STRIPE_SYNCING) |
+                                         (1 << STRIPE_REPLACED) |
+                                         (1 << STRIPE_PREREAD_ACTIVE) |
+                                         (1 << STRIPE_DELAYED) |
+                                         (1 << STRIPE_BIT_DELAY) |
+                                         (1 << STRIPE_FULL_WRITE) |
+                                         (1 << STRIPE_BIOFILL_RUN) |
+                                         (1 << STRIPE_COMPUTE_RUN)  |
+                                         (1 << STRIPE_OPS_REQ_PENDING) |
+                                         (1 << STRIPE_DISCARD) |
+                                         (1 << STRIPE_BATCH_READY) |
+                                         (1 << STRIPE_BATCH_ERR) |
+                                         (1 << STRIPE_BITMAP_PENDING)));
+               WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
+                                             (1 << STRIPE_REPLACED)));
+
+               set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
+                                           (1 << STRIPE_DEGRADED)),
+                             head_sh->state & (1 << STRIPE_INSYNC));
+
                sh->check_state = head_sh->check_state;
                sh->reconstruct_state = head_sh->reconstruct_state;
                for (i = 0; i < sh->disks; i++) {
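set_mask_bits(ptr, mask, bits) atomically replaces the bits selected by mask, i.e. *ptr becomes (*ptr & ~mask) | bits. With mask = ~(STRIPE_EXPAND_SYNC_FLAGS | (1 << STRIPE_DEGRADED)), a batch member keeps only its own expand/sync and DEGRADED bits and inherits nothing from the head except STRIPE_INSYNC; the WARN_ON_ONCE() above lists every flag that should never be found set on a member at this point. A non-atomic user-space model of the helper:

        static void toy_set_mask_bits(unsigned long *p,
                                      unsigned long mask,
                                      unsigned long bits)
        {
                *p = (*p & ~mask) | bits;  /* kernel version loops on cmpxchg */
        }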
@@ -3508,7 +3566,7 @@ unhash:
                spin_lock_irq(&sh->stripe_lock);
                sh->batch_head = NULL;
                spin_unlock_irq(&sh->stripe_lock);
-               if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
+               if (sh->state & STRIPE_EXPAND_SYNC_FLAGS)
                        set_bit(STRIPE_HANDLE, &sh->state);
                release_stripe(sh);
        }
@@ -3516,8 +3574,9 @@ unhash:
        spin_lock_irq(&head_sh->stripe_lock);
        head_sh->batch_head = NULL;
        spin_unlock_irq(&head_sh->stripe_lock);
-       wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
-       if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
+       if (wakeup_nr)
+               wake_up(&conf->wait_for_overlap);
+       if (head_sh->state & STRIPE_EXPAND_SYNC_FLAGS)
                set_bit(STRIPE_HANDLE, &head_sh->state);
 }
 
@@ -4159,9 +4218,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
 static int clear_batch_ready(struct stripe_head *sh)
 {
+       /* Return '1' if this is a member of a batch, or
+        * '0' if it is a lone stripe or a head which can now be
+        * handled.
+        */
        struct stripe_head *tmp;
        if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
-               return 0;
+               return (sh->batch_head && sh->batch_head != sh);
        spin_lock(&sh->stripe_lock);
        if (!sh->batch_head) {
                spin_unlock(&sh->stripe_lock);
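clear_batch_ready() now tells handle_stripe() whether someone else owns this stripe: returning 1 means "this is a batch member, the head will handle it", so the caller backs off instead of processing a member twice. Simplified caller sketch (assumed to mirror handle_stripe()):

        if (clear_batch_ready(sh)) {
                clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
                return;
        }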
@@ -4189,46 +4252,65 @@ static int clear_batch_ready(struct stripe_head *sh)
        return 0;
 }
 
-static void check_break_stripe_batch_list(struct stripe_head *sh)
+static void break_stripe_batch_list(struct stripe_head *head_sh,
+                                   unsigned long handle_flags)
 {
-       struct stripe_head *head_sh, *next;
+       struct stripe_head *sh, *next;
        int i;
+       int do_wakeup = 0;
 
-       if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
-               return;
-
-       head_sh = sh;
-       do {
-               sh = list_first_entry(&sh->batch_list,
-                                     struct stripe_head, batch_list);
-               BUG_ON(sh == head_sh);
-       } while (!test_bit(STRIPE_DEGRADED, &sh->state));
+       list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
 
-       while (sh != head_sh) {
-               next = list_first_entry(&sh->batch_list,
-                                       struct stripe_head, batch_list);
                list_del_init(&sh->batch_list);
 
-               set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
-                             head_sh->state & ~((1 << STRIPE_ACTIVE) |
-                                                (1 << STRIPE_PREREAD_ACTIVE) |
-                                                (1 << STRIPE_DEGRADED) |
-                                                STRIPE_EXPAND_SYNC_FLAG));
+               WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
+                                         (1 << STRIPE_SYNCING) |
+                                         (1 << STRIPE_REPLACED) |
+                                         (1 << STRIPE_PREREAD_ACTIVE) |
+                                         (1 << STRIPE_DELAYED) |
+                                         (1 << STRIPE_BIT_DELAY) |
+                                         (1 << STRIPE_FULL_WRITE) |
+                                         (1 << STRIPE_BIOFILL_RUN) |
+                                         (1 << STRIPE_COMPUTE_RUN)  |
+                                         (1 << STRIPE_OPS_REQ_PENDING) |
+                                         (1 << STRIPE_DISCARD) |
+                                         (1 << STRIPE_BATCH_READY) |
+                                         (1 << STRIPE_BATCH_ERR) |
+                                         (1 << STRIPE_BITMAP_PENDING)));
+               WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
+                                             (1 << STRIPE_REPLACED)));
+
+               set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
+                                           (1 << STRIPE_DEGRADED)),
+                             head_sh->state & (1 << STRIPE_INSYNC));
+
                sh->check_state = head_sh->check_state;
                sh->reconstruct_state = head_sh->reconstruct_state;
-               for (i = 0; i < sh->disks; i++)
+               for (i = 0; i < sh->disks; i++) {
+                       if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+                               do_wakeup = 1;
                        sh->dev[i].flags = head_sh->dev[i].flags &
                                (~((1 << R5_WriteError) | (1 << R5_Overlap)));
-
+               }
                spin_lock_irq(&sh->stripe_lock);
                sh->batch_head = NULL;
                spin_unlock_irq(&sh->stripe_lock);
-
-               set_bit(STRIPE_HANDLE, &sh->state);
+               if (handle_flags == 0 ||
+                   sh->state & handle_flags)
+                       set_bit(STRIPE_HANDLE, &sh->state);
                release_stripe(sh);
-
-               sh = next;
        }
+       spin_lock_irq(&head_sh->stripe_lock);
+       head_sh->batch_head = NULL;
+       spin_unlock_irq(&head_sh->stripe_lock);
+       for (i = 0; i < head_sh->disks; i++)
+               if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
+                       do_wakeup = 1;
+       if (head_sh->state & handle_flags)
+               set_bit(STRIPE_HANDLE, &head_sh->state);
+
+       if (do_wakeup)
+               wake_up(&head_sh->raid_conf->wait_for_overlap);
 }
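break_stripe_batch_list() replaces check_break_stripe_batch_list(): the caller now decides when to break the batch and passes handle_flags to choose which members get re-handled, where 0 means "handle every member" and a non-zero mask re-queues only members whose state intersects it. The new version also dissolves the batch completely, clearing batch_head on every member and on the head and waking wait_for_overlap waiters, where the old loop only started at the first STRIPE_DEGRADED member. Usage sketch:

        break_stripe_batch_list(sh, 0);  /* on STRIPE_BATCH_ERR (next hunk) */
        /* hypothetical second caller, re-handling only expand/sync work: */
        break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);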
 
 static void handle_stripe(struct stripe_head *sh)
@@ -4253,7 +4335,8 @@ static void handle_stripe(struct stripe_head *sh)
                return;
        }
 
-       check_break_stripe_batch_list(sh);
+       if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+               break_stripe_batch_list(sh, 0);
 
        if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
                spin_lock(&sh->stripe_lock);
@@ -6221,8 +6304,11 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
                percpu->spare_page = alloc_page(GFP_KERNEL);
        if (!percpu->scribble)
                percpu->scribble = scribble_alloc(max(conf->raid_disks,
-                       conf->previous_raid_disks), conf->chunk_sectors /
-                       STRIPE_SECTORS, GFP_KERNEL);
+                                                     conf->previous_raid_disks),
+                                                 max(conf->chunk_sectors,
+                                                     conf->prev_chunk_sectors)
+                                                  / STRIPE_SECTORS,
+                                                 GFP_KERNEL);
 
        if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
                free_scratch_buffer(conf, percpu);
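During a reshape both the old and the new geometry are in flight, so the scratch space must now cover the larger value in each dimension. For example, assuming 4 KiB pages (STRIPE_SECTORS == 8), reshaping from a 512 KiB chunk (1024 sectors) to a 1 MiB chunk (2048 sectors) needs max(2048, 1024) / 8 == 256 scribble rows, not 128:

        int disks = max(conf->raid_disks, conf->previous_raid_disks);
        int rows  = max(conf->chunk_sectors, conf->prev_chunk_sectors)
                        / STRIPE_SECTORS;
        percpu->scribble = scribble_alloc(disks, rows, GFP_KERNEL);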
@@ -7198,6 +7284,15 @@ static int check_reshape(struct mddev *mddev)
        if (!check_stripe_cache(mddev))
                return -ENOSPC;
 
+       if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
+           mddev->delta_disks > 0)
+               if (resize_chunks(conf,
+                                 conf->previous_raid_disks
+                                 + max(0, mddev->delta_disks),
+                                 max(mddev->new_chunk_sectors,
+                                     mddev->chunk_sectors)
+                           ) < 0)
+                       return -ENOMEM;
        return resize_stripes(conf, (conf->previous_raid_disks
                                     + mddev->delta_disks));
 }