md/raid5: recognise replacements when assembling array.
[deliverable/linux.git] / drivers / md / raid5.c
index 14878a9ae09d3be52ebe91fd21b32c16584c0c95..c80f8c2471ccef081967ca038fda43b7daa6f660 100644 (file)
@@ -503,6 +503,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
        for (i = disks; i--; ) {
                int rw;
+               int replace_only = 0;
                struct bio *bi, *rbi;
                struct md_rdev *rdev, *rrdev = NULL;
                if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
@@ -512,7 +513,11 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                                rw = WRITE;
                } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
                        rw = READ;
-               else
+               else if (test_and_clear_bit(R5_WantReplace,
+                                           &sh->dev[i].flags)) {
+                       rw = WRITE;
+                       replace_only = 1;
+               } else
                        continue;
 
                bi = &sh->dev[i].req;
@@ -527,11 +532,24 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                        bi->bi_end_io = raid5_end_read_request;
 
                rcu_read_lock();
+               rrdev = rcu_dereference(conf->disks[i].replacement);
+               smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
                rdev = rcu_dereference(conf->disks[i].rdev);
-               if (rw & WRITE)
-                       rrdev = rcu_dereference(conf->disks[i].replacement);
-               else if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
-                       rdev = rcu_dereference(conf->disks[i].replacement);
+               if (!rdev) {
+                       rdev = rrdev;
+                       rrdev = NULL;
+               }
+               if (rw & WRITE) {
+                       if (replace_only)
+                               rdev = NULL;
+                       if (rdev == rrdev)
+                               /* We raced and saw duplicates */
+                               rrdev = NULL;
+               } else {
+                       if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
+                               rdev = rrdev;
+                       rrdev = NULL;
+               }
 
                if (rdev && test_bit(Faulty, &rdev->flags))
                        rdev = NULL;
@@ -575,7 +593,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                }
 
                if (rdev) {
-                       if (s->syncing || s->expanding || s->expanded)
+                       if (s->syncing || s->expanding || s->expanded
+                           || s->replacing)
                                md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 
                        set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -597,7 +616,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                        generic_make_request(bi);
                }
                if (rrdev) {
-                       if (s->syncing || s->expanding || s->expanded)
+                       if (s->syncing || s->expanding || s->expanded
+                           || s->replacing)
                                md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
 
                        set_bit(STRIPE_IO_STARTED, &sh->state);
@@ -1628,7 +1648,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
        int disks = sh->disks, i;
        int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
        char b[BDEVNAME_SIZE];
-       struct md_rdev *rdev;
+       struct md_rdev *rdev = NULL;
 
 
        for (i=0 ; i<disks; i++)
@@ -1643,8 +1663,13 @@ static void raid5_end_read_request(struct bio * bi, int error)
                return;
        }
        if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
+               /* If replacement finished while this request was outstanding,
+                * 'replacement' might be NULL already.
+                * In that case it moved down to 'rdev'.
+                * rdev is not removed until all requests are finished.
+                */
                rdev = conf->disks[i].replacement;
-       else
+       if (!rdev)
                rdev = conf->disks[i].rdev;
 
        if (uptodate) {
@@ -1741,7 +1766,14 @@ static void raid5_end_write_request(struct bio *bi, int error)
                }
                if (bi == &sh->dev[i].rreq) {
                        rdev = conf->disks[i].replacement;
-                       replacement = 1;
+                       if (rdev)
+                               replacement = 1;
+                       else
+                               /* rdev was removed and 'replacement'
+                                * replaced it.  rdev is not removed
+                                * until all requests are finished.
+                                */
+                               rdev = conf->disks[i].rdev;
                        break;
                }
        }
@@ -2440,8 +2472,9 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
        md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
        clear_bit(STRIPE_SYNCING, &sh->state);
        s->syncing = 0;
+       s->replacing = 0;
        /* There is nothing more to do for sync/check/repair.
-        * For recover we need to record a bad block on all
+        * For recover/replace we need to record a bad block on all
         * non-sync devices, or abort the recovery
         */
        if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
@@ -2451,12 +2484,18 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
         */
        for (i = 0; i < conf->raid_disks; i++) {
                struct md_rdev *rdev = conf->disks[i].rdev;
-               if (!rdev
-                   || test_bit(Faulty, &rdev->flags)
-                   || test_bit(In_sync, &rdev->flags))
-                       continue;
-               if (!rdev_set_badblocks(rdev, sh->sector,
-                                       STRIPE_SECTORS, 0))
+               if (rdev
+                   && !test_bit(Faulty, &rdev->flags)
+                   && !test_bit(In_sync, &rdev->flags)
+                   && !rdev_set_badblocks(rdev, sh->sector,
+                                          STRIPE_SECTORS, 0))
+                       abort = 1;
+               rdev = conf->disks[i].replacement;
+               if (rdev
+                   && !test_bit(Faulty, &rdev->flags)
+                   && !test_bit(In_sync, &rdev->flags)
+                   && !rdev_set_badblocks(rdev, sh->sector,
+                                          STRIPE_SECTORS, 0))
                        abort = 1;
        }
        if (abort) {
@@ -2465,6 +2504,22 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
        }
 }
 
+static int want_replace(struct stripe_head *sh, int disk_idx)
+{
+       struct md_rdev *rdev;
+       int rv = 0;
+       /* Doing recovery so rcu locking not required */
+       rdev = sh->raid_conf->disks[disk_idx].replacement;
+       if (rdev
+           && !test_bit(Faulty, &rdev->flags)
+           && !test_bit(In_sync, &rdev->flags)
+           && (rdev->recovery_offset <= sh->sector
+               || rdev->mddev->recovery_cp <= sh->sector))
+               rv = 1;
+
+       return rv;
+}
+
 /* fetch_block - checks the given member device to see if its data needs
  * to be read or computed to satisfy a request.
  *
@@ -2484,6 +2539,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
            (dev->toread ||
             (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
             s->syncing || s->expanding ||
+            (s->replacing && want_replace(sh, disk_idx)) ||
             (s->failed >= 1 && fdev[0]->toread) ||
             (s->failed >= 2 && fdev[1]->toread) ||
             (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
@@ -3037,22 +3093,18 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
        }
 }
 
-
 /*
  * handle_stripe - do things to a stripe.
  *
- * We lock the stripe and then examine the state of various bits
- * to see what needs to be done.
+ * We lock the stripe by setting STRIPE_ACTIVE and then examine the
+ * state of various bits to see what needs to be done.
  * Possible results:
- *    return some read request which now have data
- *    return some write requests which are safely on disc
+ *    return some read requests which now have data
+ *    return some write requests which are safely on storage
  *    schedule a read on some buffers
  *    schedule a write of some buffers
  *    return confirmation of parity correctness
  *
- * buffers are taken off read_list or write_list, and bh_cache buffers
- * get BH_Lock set before the stripe lock is released.
- *
  */
 
 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
@@ -3061,10 +3113,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
        int disks = sh->disks;
        struct r5dev *dev;
        int i;
+       int do_recovery = 0;
 
        memset(s, 0, sizeof(*s));
 
-       s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
        s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
        s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
        s->failed_num[0] = -1;
@@ -3082,7 +3134,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
                dev = &sh->dev[i];
 
                pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
-                       i, dev->flags, dev->toread, dev->towrite, dev->written);
+                        i, dev->flags,
+                        dev->toread, dev->towrite, dev->written);
                /* maybe we can reply to a read
                 *
                 * new wantfill requests are only permitted while
@@ -3123,6 +3176,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
                                 &first_bad, &bad_sectors))
                        set_bit(R5_ReadRepl, &dev->flags);
                else {
+                       if (rdev)
+                               set_bit(R5_NeedReplace, &dev->flags);
                        rdev = rcu_dereference(conf->disks[i].rdev);
                        clear_bit(R5_ReadRepl, &dev->flags);
                }
@@ -3210,9 +3265,25 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
                        if (s->failed < 2)
                                s->failed_num[s->failed] = i;
                        s->failed++;
+                       if (rdev && !test_bit(Faulty, &rdev->flags))
+                               do_recovery = 1;
                }
        }
        spin_unlock_irq(&conf->device_lock);
+       if (test_bit(STRIPE_SYNCING, &sh->state)) {
+               /* If there is a failed device being replaced,
+                *     we must be recovering.
+                * else if we are after recovery_cp, we must be syncing
+                * else we can only be replacing
+                * sync and recovery both need to read all devices, and so
+                * use the same flag.
+                */
+               if (do_recovery ||
+                   sh->sector >= conf->mddev->recovery_cp)
+                       s->syncing = 1;
+               else
+                       s->replacing = 1;
+       }
        rcu_read_unlock();
 }
 
@@ -3254,7 +3325,7 @@ static void handle_stripe(struct stripe_head *sh)
 
        if (unlikely(s.blocked_rdev)) {
                if (s.syncing || s.expanding || s.expanded ||
-                   s.to_write || s.written) {
+                   s.replacing || s.to_write || s.written) {
                        set_bit(STRIPE_HANDLE, &sh->state);
                        goto finish;
                }
@@ -3280,7 +3351,7 @@ static void handle_stripe(struct stripe_head *sh)
                sh->reconstruct_state = 0;
                if (s.to_read+s.to_write+s.written)
                        handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
-               if (s.syncing)
+               if (s.syncing + s.replacing)
                        handle_failed_sync(conf, sh, &s);
        }
 
@@ -3311,7 +3382,9 @@ static void handle_stripe(struct stripe_head *sh)
         */
        if (s.to_read || s.non_overwrite
            || (conf->level == 6 && s.to_write && s.failed)
-           || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
+           || (s.syncing && (s.uptodate + s.compute < disks))
+           || s.replacing
+           || s.expanding)
                handle_stripe_fill(sh, &s, disks);
 
        /* Now we check to see if any write operations have recently
@@ -3373,7 +3446,20 @@ static void handle_stripe(struct stripe_head *sh)
                        handle_parity_checks5(conf, sh, &s, disks);
        }
 
-       if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+       if (s.replacing && s.locked == 0
+           && !test_bit(STRIPE_INSYNC, &sh->state)) {
+               /* Write out to replacement devices where possible */
+               for (i = 0; i < conf->raid_disks; i++)
+                       if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
+                           test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
+                               set_bit(R5_WantReplace, &sh->dev[i].flags);
+                               set_bit(R5_LOCKED, &sh->dev[i].flags);
+                               s.locked++;
+                       }
+               set_bit(STRIPE_INSYNC, &sh->state);
+       }
+       if ((s.syncing || s.replacing) && s.locked == 0 &&
+           test_bit(STRIPE_INSYNC, &sh->state)) {
                md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
                clear_bit(STRIPE_SYNCING, &sh->state);
        }
@@ -3473,6 +3559,9 @@ finish:
                        }
                        if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
                                rdev = conf->disks[i].replacement;
+                               if (!rdev)
+                                       /* rdev have been moved down */
+                                       rdev = conf->disks[i].rdev;
                                rdev_clear_badblocks(rdev, sh->sector,
                                                     STRIPE_SECTORS);
                                rdev_dec_pending(rdev, conf->mddev);
@@ -4262,7 +4351,6 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
                return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
        }
 
-
        bitmap_cond_end_sync(mddev->bitmap, sector_nr);
 
        sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
@@ -4759,7 +4847,15 @@ static struct r5conf *setup_conf(struct mddev *mddev)
                        continue;
                disk = conf->disks + raid_disk;
 
-               disk->rdev = rdev;
+               if (test_bit(Replacement, &rdev->flags)) {
+                       if (disk->replacement)
+                               goto abort;
+                       disk->replacement = rdev;
+               } else {
+                       if (disk->rdev)
+                               goto abort;
+                       disk->rdev = rdev;
+               }
 
                if (test_bit(In_sync, &rdev->flags)) {
                        char b[BDEVNAME_SIZE];
@@ -4848,6 +4944,7 @@ static int run(struct mddev *mddev)
        int dirty_parity_disks = 0;
        struct md_rdev *rdev;
        sector_t reshape_offset = 0;
+       int i;
 
        if (mddev->recovery_cp != MaxSector)
                printk(KERN_NOTICE "md/raid:%s: not clean"
@@ -4937,12 +5034,25 @@ static int run(struct mddev *mddev)
        conf->thread = NULL;
        mddev->private = conf;
 
-       /*
-        * 0 for a fully functional array, 1 or 2 for a degraded array.
-        */
-       list_for_each_entry(rdev, &mddev->disks, same_set) {
-               if (rdev->raid_disk < 0)
+       for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
+            i++) {
+               rdev = conf->disks[i].rdev;
+               if (!rdev && conf->disks[i].replacement) {
+                       /* The replacement is all we have yet */
+                       rdev = conf->disks[i].replacement;
+                       conf->disks[i].replacement = NULL;
+                       clear_bit(Replacement, &rdev->flags);
+                       conf->disks[i].rdev = rdev;
+               }
+               if (!rdev)
                        continue;
+               if (conf->disks[i].replacement &&
+                   conf->reshape_progress != MaxSector) {
+                       /* replacements and reshape simply do not mix. */
+                       printk(KERN_ERR "md: cannot handle concurrent "
+                              "replacement and reshape.\n");
+                       goto abort;
+               }
                if (test_bit(In_sync, &rdev->flags)) {
                        working_disks++;
                        continue;
@@ -4976,6 +5086,9 @@ static int run(struct mddev *mddev)
                dirty_parity_disks++;
        }
 
+       /*
+        * 0 for a fully functional array, 1 or 2 for a degraded array.
+        */
        mddev->degraded = calc_degraded(conf);
 
        if (has_failed(conf)) {
@@ -5139,7 +5252,25 @@ static int raid5_spare_active(struct mddev *mddev)
 
        for (i = 0; i < conf->raid_disks; i++) {
                tmp = conf->disks + i;
-               if (tmp->rdev
+               if (tmp->replacement
+                   && tmp->replacement->recovery_offset == MaxSector
+                   && !test_bit(Faulty, &tmp->replacement->flags)
+                   && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+                       /* Replacement has just become active. */
+                       if (!tmp->rdev
+                           || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+                               count++;
+                       if (tmp->rdev) {
+                               /* Replaced device not technically faulty,
+                                * but we need to be sure it gets removed
+                                * and never re-added.
+                                */
+                               set_bit(Faulty, &tmp->rdev->flags);
+                               sysfs_notify_dirent_safe(
+                                       tmp->rdev->sysfs_state);
+                       }
+                       sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
+               } else if (tmp->rdev
                    && tmp->rdev->recovery_offset == MaxSector
                    && !test_bit(Faulty, &tmp->rdev->flags)
                    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
@@ -5185,6 +5316,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
        if (!test_bit(Faulty, &rdev->flags) &&
            mddev->recovery_disabled != conf->recovery_disabled &&
            !has_failed(conf) &&
+           (!p->replacement || p->replacement == rdev) &&
            number < conf->raid_disks) {
                err = -EBUSY;
                goto abort;
@@ -5195,7 +5327,20 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
                /* lost the race, try later */
                err = -EBUSY;
                *rdevp = rdev;
-       }
+       } else if (p->replacement) {
+               /* We must have just cleared 'rdev' */
+               p->rdev = p->replacement;
+               clear_bit(Replacement, &p->replacement->flags);
+               smp_mb(); /* Make sure other CPUs may see both as identical
+                          * but will never see neither - if they are careful
+                          */
+               p->replacement = NULL;
+               clear_bit(WantReplacement, &rdev->flags);
+       } else
+               /* We might have just removed the Replacement as faulty-
+                * clear the bit just in case
+                */
+               clear_bit(WantReplacement, &rdev->flags);
 abort:
 
        print_raid5_conf(conf);
This page took 0.031513 seconds and 5 git commands to generate.