Btrfs: log changed inodes based on the extent map tree
[deliverable/linux.git] / fs / btrfs / volumes.c
index 5777e6a9aab17f3c0e9b2d5c620ed4817dd3dac6..d79b5b620e9407fb8b0b53390b34b11e4657d205 100644 (file)
@@ -36,6 +36,7 @@
 #include "check-integrity.h"
 #include "rcu-string.h"
 #include "math.h"
+#include "dev-replace.h"
 
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
@@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
        kfree(fs_devices);
 }
 
+static void btrfs_kobject_uevent(struct block_device *bdev,
+                                enum kobject_action action)
+{      /* send 'action' as a uevent on the kobject of bdev's disk device */
+       int ret;
+
+       ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
+       if (ret)        /* failure is only logged, not returned to the caller */
+               pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
+                       action,
+                       kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
+                       &disk_to_dev(bdev->bd_disk)->kobj);
+}
+
 void btrfs_cleanup_fs_uuids(void)
 {
        struct btrfs_fs_devices *fs_devices;
@@ -505,7 +519,8 @@ error:
        return ERR_PTR(-ENOMEM);
 }
 
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+                              struct btrfs_fs_devices *fs_devices, int step)
 {
        struct btrfs_device *device, *next;
 
@@ -528,6 +543,21 @@ again:
                        continue;
                }
 
+               if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
+                       /*
+                        * In the first step, keep the device which has
+                        * the correct fsid and the devid that is used
+                        * for the dev_replace procedure.
+                        * In the second step, the dev_replace state is
+                        * read from the device tree and it is known
+                        * whether the procedure is really active or
+                        * not, which means whether this device is
+                        * used or whether it should be removed.
+                        */
+                       if (step == 0 || device->is_tgtdev_for_dev_replace) {
+                               continue;
+                       }
+               }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
@@ -536,7 +566,8 @@ again:
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
-                       fs_devices->rw_devices--;
+                       if (!device->is_tgtdev_for_dev_replace)
+                               fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
@@ -594,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
                if (device->bdev)
                        fs_devices->open_devices--;
 
-               if (device->writeable) {
+               if (device->writeable && !device->is_tgtdev_for_dev_replace) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }
@@ -718,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                        fs_devices->rotating = 1;
 
                fs_devices->open_devices++;
-               if (device->writeable) {
+               if (device->writeable && !device->is_tgtdev_for_dev_replace) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
@@ -1350,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                root->fs_info->avail_system_alloc_bits |
                root->fs_info->avail_metadata_alloc_bits;
 
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
-           root->fs_info->fs_devices->num_devices <= 4) {
+       num_devices = root->fs_info->fs_devices->num_devices;
+       btrfs_dev_replace_lock(&root->fs_info->dev_replace);
+       if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
+               WARN_ON(num_devices < 1);
+               num_devices--;
+       }
+       btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
+
+       if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
                printk(KERN_ERR "btrfs: unable to go below four devices "
                       "on raid10\n");
                ret = -EINVAL;
                goto out;
        }
 
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
-           root->fs_info->fs_devices->num_devices <= 2) {
+       if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
                printk(KERN_ERR "btrfs: unable to go below two "
                       "devices on raid1\n");
                ret = -EINVAL;
@@ -1518,9 +1555,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
        ret = 0;
 
+       /* Notify udev that device has changed */
+       btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
 error_brelse:
        brelse(bh);
-error_close:
        if (bdev)
                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 out:
@@ -2935,6 +2974,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
        u64 allowed;
        int mixed = 0;
        int ret;
+       u64 num_devices;
 
        if (btrfs_fs_closing(fs_info) ||
            atomic_read(&fs_info->balance_pause_req) ||
@@ -2963,10 +3003,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
                }
        }
 
+       num_devices = fs_info->fs_devices->num_devices;
+       btrfs_dev_replace_lock(&fs_info->dev_replace);
+       if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+               BUG_ON(num_devices < 1);
+               num_devices--;
+       }
+       btrfs_dev_replace_unlock(&fs_info->dev_replace);
        allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-       if (fs_info->fs_devices->num_devices == 1)
+       if (num_devices == 1)
                allowed |= BTRFS_BLOCK_GROUP_DUP;
-       else if (fs_info->fs_devices->num_devices < 4)
+       else if (num_devices < 4)
                allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
        else
                allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
@@ -3591,6 +3638,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                devices_info[ndevs].total_avail = total_avail;
                devices_info[ndevs].dev = device;
                ++ndevs;
+               WARN_ON(ndevs > fs_devices->rw_devices);
        }
 
        /*
@@ -3971,19 +4019,46 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
        else
                ret = 1;
        free_extent_map(em);
+
+       btrfs_dev_replace_lock(&fs_info->dev_replace);
+       if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
+               ret++;
+       btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
        return ret;
 }
 
-static int find_live_mirror(struct map_lookup *map, int first, int num,
-                           int optimal)
+static int find_live_mirror(struct btrfs_fs_info *fs_info,
+                           struct map_lookup *map, int first, int num,
+                           int optimal, int dev_replace_is_ongoing)
 {
        int i;
-       if (map->stripes[optimal].dev->bdev)
-               return optimal;
-       for (i = first; i < first + num; i++) {
-               if (map->stripes[i].dev->bdev)
-                       return i;
+       int tolerance;
+       struct btrfs_device *srcdev;
+
+       if (dev_replace_is_ongoing &&
+           fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+            BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
+               srcdev = fs_info->dev_replace.srcdev;
+       else
+               srcdev = NULL;
+
+       /*
+        * try to avoid the drive that is the source drive for a
+        * dev-replace procedure, only choose it if no other non-missing
+        * mirror is available
+        */
+       for (tolerance = 0; tolerance < 2; tolerance++) {
+               if (map->stripes[optimal].dev->bdev &&
+                   (tolerance || map->stripes[optimal].dev != srcdev))
+                       return optimal;
+               for (i = first; i < first + num; i++) {
+                       if (map->stripes[i].dev->bdev &&
+                           (tolerance || map->stripes[i].dev != srcdev))
+                               return i;
+               }
        }
+
        /* we couldn't find one that doesn't fail.  Just return something
         * and the io error handling code will clean up eventually
         */
@@ -4011,6 +4086,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
        int num_stripes;
        int max_errors = 0;
        struct btrfs_bio *bbio = NULL;
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+       int dev_replace_is_ongoing = 0;
+       int num_alloc_stripes;
+       int patch_the_first_stripe_for_dev_replace = 0;
+       u64 physical_to_patch_in_first_stripe = 0;
 
        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4027,9 +4107,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
        map = (struct map_lookup *)em->bdev;
        offset = logical - em->start;
 
-       if (mirror_num > map->num_stripes)
-               mirror_num = 0;
-
        stripe_nr = offset;
        /*
         * stripe_nr counts the total number of stripes we have to stride
@@ -4056,6 +4133,93 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
        if (!bbio_ret)
                goto out;
 
+       btrfs_dev_replace_lock(dev_replace);
+       dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+       if (!dev_replace_is_ongoing)
+               btrfs_dev_replace_unlock(dev_replace);
+
+       if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
+           !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
+           dev_replace->tgtdev != NULL) {
+               /*
+                * in dev-replace case, for repair case (that's the only
+                * case where the mirror is selected explicitly when
+                * calling btrfs_map_block), blocks left of the left cursor
+                * can also be read from the target drive.
+                * For REQ_GET_READ_MIRRORS, the target drive is added as
+                * the last one to the array of stripes. For READ, it also
+                * needs to be supported using the same mirror number.
+                * If the requested block is not left of the left cursor,
+                * EIO is returned. This can happen because btrfs_num_copies()
+                * returns one more in the dev-replace case.
+                */
+               u64 tmp_length = *length;
+               struct btrfs_bio *tmp_bbio = NULL;
+               int tmp_num_stripes;
+               u64 srcdev_devid = dev_replace->srcdev->devid;
+               int index_srcdev = 0;
+               int found = 0;
+               u64 physical_of_found = 0;
+
+               ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
+                            logical, &tmp_length, &tmp_bbio, 0);
+               if (ret) {
+                       WARN_ON(tmp_bbio != NULL);
+                       goto out;
+               }
+
+               tmp_num_stripes = tmp_bbio->num_stripes;
+               if (mirror_num > tmp_num_stripes) {
+                       /*
+                        * REQ_GET_READ_MIRRORS does not contain this
+                        * mirror, that means that the requested area
+                        * is not left of the left cursor
+                        */
+                       ret = -EIO;
+                       kfree(tmp_bbio);
+                       goto out;
+               }
+
+               /*
+                * process the rest of the function using the mirror_num
+                * of the source drive. Therefore look it up first.
+                * At the end, patch the device pointer to the one of the
+                * target drive.
+                */
+               for (i = 0; i < tmp_num_stripes; i++) {
+                       if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
+                               /*
+                                * In case of DUP, in order to keep it
+                                * simple, only add the mirror with the
+                                * lowest physical address
+                                */
+                               if (found &&
+                                   physical_of_found <=
+                                    tmp_bbio->stripes[i].physical)
+                                       continue;
+                               index_srcdev = i;
+                               found = 1;
+                               physical_of_found =
+                                       tmp_bbio->stripes[i].physical;
+                       }
+               }
+
+               if (found) {
+                       mirror_num = index_srcdev + 1;
+                       patch_the_first_stripe_for_dev_replace = 1;
+                       physical_to_patch_in_first_stripe = physical_of_found;
+               } else {
+                       WARN_ON(1);
+                       ret = -EIO;
+                       kfree(tmp_bbio);
+                       goto out;
+               }
+
+               kfree(tmp_bbio);
+       } else if (mirror_num > map->num_stripes) {
+               mirror_num = 0;
+       }
+
        num_stripes = 1;
        stripe_index = 0;
        stripe_nr_orig = stripe_nr;
@@ -4070,19 +4234,20 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                                            stripe_nr_end - stripe_nr_orig);
                stripe_index = do_div(stripe_nr, map->num_stripes);
        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-               if (rw & (REQ_WRITE | REQ_DISCARD))
+               if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
                        num_stripes = map->num_stripes;
                else if (mirror_num)
                        stripe_index = mirror_num - 1;
                else {
-                       stripe_index = find_live_mirror(map, 0,
+                       stripe_index = find_live_mirror(fs_info, map, 0,
                                            map->num_stripes,
-                                           current->pid % map->num_stripes);
+                                           current->pid % map->num_stripes,
+                                           dev_replace_is_ongoing);
                        mirror_num = stripe_index + 1;
                }
 
        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-               if (rw & (REQ_WRITE | REQ_DISCARD)) {
+               if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
                        num_stripes = map->num_stripes;
                } else if (mirror_num) {
                        stripe_index = mirror_num - 1;
@@ -4096,7 +4261,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                stripe_index = do_div(stripe_nr, factor);
                stripe_index *= map->sub_stripes;
 
-               if (rw & REQ_WRITE)
+               if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
                        num_stripes = map->sub_stripes;
                else if (rw & REQ_DISCARD)
                        num_stripes = min_t(u64, map->sub_stripes *
@@ -4106,9 +4271,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                        stripe_index += mirror_num - 1;
                else {
                        int old_stripe_index = stripe_index;
-                       stripe_index = find_live_mirror(map, stripe_index,
+                       stripe_index = find_live_mirror(fs_info, map,
+                                             stripe_index,
                                              map->sub_stripes, stripe_index +
-                                             current->pid % map->sub_stripes);
+                                             current->pid % map->sub_stripes,
+                                             dev_replace_is_ongoing);
                        mirror_num = stripe_index - old_stripe_index + 1;
                }
        } else {
@@ -4122,7 +4289,14 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
        }
        BUG_ON(stripe_index >= map->num_stripes);
 
-       bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
+       num_alloc_stripes = num_stripes;
+       if (dev_replace_is_ongoing) {
+               if (rw & (REQ_WRITE | REQ_DISCARD))
+                       num_alloc_stripes <<= 1;
+               if (rw & REQ_GET_READ_MIRRORS)
+                       num_alloc_stripes++;
+       }
+       bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
        if (!bbio) {
                ret = -ENOMEM;
                goto out;
@@ -4209,7 +4383,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                }
        }
 
-       if (rw & REQ_WRITE) {
+       if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
                if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
                                 BTRFS_BLOCK_GROUP_RAID10 |
                                 BTRFS_BLOCK_GROUP_DUP)) {
@@ -4217,11 +4391,106 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                }
        }
 
+       if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
+           dev_replace->tgtdev != NULL) {
+               int index_where_to_add;
+               u64 srcdev_devid = dev_replace->srcdev->devid;
+
+               /*
+                * duplicate the write operations while the dev replace
+                * procedure is running. Since the copying of the old disk
+                * to the new disk takes place at run time while the
+                * filesystem is mounted writable, the regular write
+                * operations to the old disk have to be duplicated to go
+                * to the new disk as well.
+                * Note that device->missing is handled by the caller, and
+                * that the write to the old disk is already set up in the
+                * stripes array.
+                */
+               index_where_to_add = num_stripes;
+               for (i = 0; i < num_stripes; i++) {
+                       if (bbio->stripes[i].dev->devid == srcdev_devid) {
+                               /* write to new disk, too */
+                               struct btrfs_bio_stripe *new =
+                                       bbio->stripes + index_where_to_add;
+                               struct btrfs_bio_stripe *old =
+                                       bbio->stripes + i;
+
+                               new->physical = old->physical;
+                               new->length = old->length;
+                               new->dev = dev_replace->tgtdev;
+                               index_where_to_add++;
+                               max_errors++;
+                       }
+               }
+               num_stripes = index_where_to_add;
+       } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
+                  dev_replace->tgtdev != NULL) {
+               u64 srcdev_devid = dev_replace->srcdev->devid;
+               int index_srcdev = 0;
+               int found = 0;
+               u64 physical_of_found = 0;
+
+               /*
+                * During the dev-replace procedure, the target drive can
+                * also be used to read data in case it is needed to repair
+                * a corrupt block elsewhere. This is possible if the
+                * requested area is left of the left cursor. In this area,
+                * the target drive is a full copy of the source drive.
+                */
+               for (i = 0; i < num_stripes; i++) {
+                       if (bbio->stripes[i].dev->devid == srcdev_devid) {
+                               /*
+                                * In case of DUP, in order to keep it
+                                * simple, only add the mirror with the
+                                * lowest physical address
+                                */
+                               if (found &&
+                                   physical_of_found <=
+                                    bbio->stripes[i].physical)
+                                       continue;
+                               index_srcdev = i;
+                               found = 1;
+                               physical_of_found = bbio->stripes[i].physical;
+                       }
+               }
+               if (found) {
+                       u64 length = map->stripe_len;
+
+                       if (physical_of_found + length <=
+                           dev_replace->cursor_left) {
+                               struct btrfs_bio_stripe *tgtdev_stripe =
+                                       bbio->stripes + num_stripes;
+
+                               tgtdev_stripe->physical = physical_of_found;
+                               tgtdev_stripe->length =
+                                       bbio->stripes[index_srcdev].length;
+                               tgtdev_stripe->dev = dev_replace->tgtdev;
+
+                               num_stripes++;
+                       }
+               }
+       }
+
        *bbio_ret = bbio;
        bbio->num_stripes = num_stripes;
        bbio->max_errors = max_errors;
        bbio->mirror_num = mirror_num;
+
+       /*
+        * this is the case that REQ_READ && dev_replace_is_ongoing &&
+        * mirror_num == num_stripes + 1 && dev_replace target drive is
+        * available as a mirror
+        */
+       if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
+               WARN_ON(num_stripes > 1);
+               bbio->stripes[0].dev = dev_replace->tgtdev;
+               bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
+               bbio->mirror_num = map->num_stripes + 1;
+       }
 out:
+       if (dev_replace_is_ongoing)
+               btrfs_dev_replace_unlock(dev_replace);
        free_extent_map(em);
        return ret;
 }
@@ -4714,6 +4983,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
        em->bdev = (struct block_device *)map;
        em->start = logical;
        em->len = length;
+       em->orig_start = 0;
        em->block_start = 0;
        em->block_len = em->len;
 
@@ -4773,6 +5043,7 @@ static void fill_device_from_item(struct extent_buffer *leaf,
        device->io_align = btrfs_device_io_align(leaf, dev_item);
        device->io_width = btrfs_device_io_width(leaf, dev_item);
        device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+       WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
        device->is_tgtdev_for_dev_replace = 0;
 
        ptr = (unsigned long)btrfs_device_uuid(dev_item);
This page took 0.030917 seconds and 5 git commands to generate.