#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
+#include "async-thread.h"
struct map_lookup {
u64 type;
mutex_unlock(&uuid_mutex);
}
+static void lock_chunks(struct btrfs_root *root)
+{
+ mutex_lock(&root->fs_info->alloc_mutex);
+ mutex_lock(&root->fs_info->chunk_mutex);
+}
+
+static void unlock_chunks(struct btrfs_root *root)
+{
+ mutex_unlock(&root->fs_info->alloc_mutex);
+ mutex_unlock(&root->fs_info->chunk_mutex);
+}
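+
+/*
+ * sketch of the intended usage (mirroring the hunks further down): chunk
+ * tree updates are bracketed by this pair, e.g.
+ *
+ *	lock_chunks(root);
+ *	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ *	...
+ *	unlock_chunks(root);
+ *
+ * taking alloc_mutex before chunk_mutex in every caller gives one global
+ * lock order, which is what keeps these paths deadlock free.
+ */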
+
int btrfs_cleanup_fs_uuids(void)
{
struct btrfs_fs_devices *fs_devices;
return NULL;
}
+/*
+ * we try to collect pending bios for a device so we don't get a large
+ * number of procs sending bios down to the same device. This greatly
+ * improves the scheduler's ability to collect and merge the bios.
+ *
+ * But, it also turns into a long list of bios to process and that is sure
+ * to eventually make the worker thread block. The solution here is to
+ * make some progress and then put this work struct back at the end of
+ * the list if the block device is congested. This way, multiple devices
+ * can make progress from a single worker thread.
+ */
+int run_scheduled_bios(struct btrfs_device *device)
+{
+ struct bio *pending;
+ struct backing_dev_info *bdi;
+ struct bio *tail;
+ struct bio *cur;
+ int again = 0;
+ unsigned long num_run = 0;
+
+ bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+loop:
+ spin_lock(&device->io_lock);
+
+ /* take all the bios off the list at once and process them
+ * later on (without the lock held). But, remember the
+ * tail and other pointers so the bios can be properly reinserted
+ * into the list if we hit congestion
+ */
+ pending = device->pending_bios;
+ tail = device->pending_bio_tail;
+ WARN_ON(pending && !tail);
+ device->pending_bios = NULL;
+ device->pending_bio_tail = NULL;
+
+ /*
+ * if pending was null this time around, no bios need processing
+ * at all and we can stop. Otherwise it'll loop back up again
+ * and do an additional check so no bios are missed.
+ *
+ * device->running_pending is used to synchronize with the
+ * schedule_bio code.
+ */
+ if (pending) {
+ again = 1;
+ device->running_pending = 1;
+ } else {
+ again = 0;
+ device->running_pending = 0;
+ }
+ spin_unlock(&device->io_lock);
+
+ while (pending) {
+ cur = pending;
+ pending = pending->bi_next;
+ cur->bi_next = NULL;
+ atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+ submit_bio(cur->bi_rw, cur);
+ num_run++;
+
+ /*
+ * we made progress, there is more work to do and the bdi
+ * is now congested. Back off and let other work structs
+ * run instead
+ */
+ if (pending && num_run && bdi_write_congested(bdi)) {
+ struct bio *old_head;
+
+ spin_lock(&device->io_lock);
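+ /* splice the unsubmitted bios back onto the front of the device list */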
+ old_head = device->pending_bios;
+ device->pending_bios = pending;
+ if (device->pending_bio_tail)
+ tail->bi_next = old_head;
+ else
+ device->pending_bio_tail = tail;
+
+ spin_unlock(&device->io_lock);
+ btrfs_requeue_work(&device->work);
+ goto done;
+ }
+ }
+ if (again)
+ goto loop;
+done:
+ return 0;
+}
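+
+/*
+ * worked example of the splice above (a sketch, not part of the patch):
+ * suppose bios A->B->C were detached, only A was submitted before the
+ * bdi congested, and a producer queued X->Y in the meantime:
+ *
+ *	leftover from this run:  B -> C
+ *	device->pending_bios:    X -> Y
+ *	after the splice:        B -> C -> X -> Y
+ *
+ * so the oldest bios still go down first on the next run of the work
+ * struct.
+ */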
+
+void pending_bios_fn(struct btrfs_work *work)
+{
+ struct btrfs_device *device;
+
+ device = container_of(work, struct btrfs_device, work);
+ run_scheduled_bios(device);
+}
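+
+/*
+ * how the pieces connect (sketch; names mirror the hunks below): each
+ * device wires its embedded work struct up once,
+ *
+ *	device->work.func = pending_bios_fn;
+ *
+ * and schedule_bio() hands that struct to the submit_workers pool:
+ *
+ *	btrfs_queue_worker(&root->fs_info->submit_workers, &device->work);
+ *
+ * container_of() then recovers the btrfs_device because the btrfs_work is
+ * embedded directly in it, so no per-run allocation is needed.
+ */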
+
static int device_list_add(const char *path,
struct btrfs_super_block *disk_super,
u64 devid, struct btrfs_fs_devices **fs_devices_ret)
fs_devices = find_fsid(disk_super->fsid);
if (!fs_devices) {
- fs_devices = kmalloc(sizeof(*fs_devices), GFP_NOFS);
+ fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
if (!fs_devices)
return -ENOMEM;
INIT_LIST_HEAD(&fs_devices->devices);
memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
fs_devices->latest_devid = devid;
fs_devices->latest_trans = found_transid;
- fs_devices->num_devices = 0;
device = NULL;
} else {
device = __find_device(&fs_devices->devices, devid,
return -ENOMEM;
}
device->devid = devid;
+ device->work.func = pending_bios_fn;
memcpy(device->uuid, disk_super->dev_item.uuid,
BTRFS_UUID_SIZE);
device->barriers = 1;
list_for_each(cur, head) {
device = list_entry(cur, struct btrfs_device, dev_list);
if (!device->in_fs_metadata) {
- if (device->bdev) {
- close_bdev_excl(device->bdev);
- fs_devices->open_devices--;
- }
+ struct block_device *bdev;
list_del(&device->dev_list);
list_del(&device->dev_alloc_list);
fs_devices->num_devices--;
+ if (device->bdev) {
+ bdev = device->bdev;
+ fs_devices->open_devices--;
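+ /* close_bdev_excl can block, so drop uuid_mutex while calling it;
+ * the list walk is restarted below via 'goto again' */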
+ mutex_unlock(&uuid_mutex);
+ close_bdev_excl(bdev);
+ mutex_lock(&uuid_mutex);
+ }
kfree(device->name);
kfree(device);
goto again;
goto error_brelse;
transid = btrfs_super_generation(disk_super);
- if (transid > latest_transid) {
+ if (!latest_transid || transid > latest_transid) {
latest_devid = devid;
latest_transid = transid;
latest_bdev = bdev;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ lock_chunks(root);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
if (bdev == fs_devices->latest_bdev)
fs_devices->latest_bdev = next_dev->bdev;
- total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
- btrfs_set_super_total_bytes(&root->fs_info->super_copy,
- total_bytes - device->total_bytes);
-
total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
btrfs_set_super_num_devices(&root->fs_info->super_copy,
total_bytes - 1);
out:
btrfs_free_path(path);
+ unlock_chunks(root);
btrfs_commit_transaction(trans, root);
return ret;
}
u64 devid;
int ret = 0;
- mutex_lock(&root->fs_info->fs_mutex);
mutex_lock(&uuid_mutex);
+ mutex_lock(&root->fs_info->volume_mutex);
all_avail = root->fs_info->avail_data_alloc_bits |
root->fs_info->avail_system_alloc_bits |
}
root->fs_info->fs_devices->num_devices--;
+ root->fs_info->fs_devices->open_devices--;
ret = btrfs_shrink_device(device, 0);
if (ret)
if (device->bdev) {
/* one close for the device struct or super_block */
close_bdev_excl(device->bdev);
- root->fs_info->fs_devices->open_devices--;
}
if (bdev) {
/* one close for us */
if (bdev)
close_bdev_excl(bdev);
out:
+ mutex_unlock(&root->fs_info->volume_mutex);
mutex_unlock(&uuid_mutex);
- mutex_unlock(&root->fs_info->fs_mutex);
return ret;
}
if (!bdev) {
return -EIO;
}
- mutex_lock(&root->fs_info->fs_mutex);
+
+ mutex_lock(&root->fs_info->volume_mutex);
+
trans = btrfs_start_transaction(root, 1);
+ lock_chunks(root);
devices = &root->fs_info->fs_devices->devices;
list_for_each(cur, devices) {
device = list_entry(cur, struct btrfs_device, dev_list);
}
device->barriers = 1;
+ device->work.func = pending_bios_fn;
generate_random_uuid(device->uuid);
spin_lock_init(&device->io_lock);
device->name = kstrdup(device_path, GFP_NOFS);
root->fs_info->fs_devices->num_devices++;
root->fs_info->fs_devices->open_devices++;
out:
+ unlock_chunks(root);
btrfs_end_transaction(trans, root);
- mutex_unlock(&root->fs_info->fs_mutex);
+ mutex_unlock(&root->fs_info->volume_mutex);
+
return ret;
out_close_bdev:
return ret;
}
-int btrfs_grow_device(struct btrfs_trans_handle *trans,
+static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
struct btrfs_super_block *super_copy =
return btrfs_update_device(trans, device);
}
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 new_size)
+{
+ int ret;
+ lock_chunks(device->dev_root);
+ ret = __btrfs_grow_device(trans, device, new_size);
+ unlock_chunks(device->dev_root);
+ return ret;
+}
+
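+/*
+ * the __btrfs_grow_device/btrfs_grow_device split follows the usual kernel
+ * idiom: a caller that already holds the chunk mutexes (presumably
+ * elsewhere in this series) uses the __ variant, everyone else goes
+ * through the locked wrapper above.
+ */
+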
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 chunk_tree, u64 chunk_objectid,
trans = btrfs_start_transaction(root, 1);
BUG_ON(!trans);
+ lock_chunks(root);
+
/*
* step two, delete the device extents and the
* chunk tree entries
/* once for us */
free_extent_map(em);
+ unlock_chunks(root);
btrfs_end_transaction(trans, root);
return 0;
}
struct btrfs_key found_key;
+ mutex_lock(&dev_root->fs_info->volume_mutex);
dev_root = dev_root->fs_info->dev_root;
- mutex_lock(&dev_root->fs_info->fs_mutex);
/* step one make some room on all the devices */
list_for_each(cur, devices) {
device = list_entry(cur, struct btrfs_device, dev_list);
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
- if (ret) {
+ if (ret)
break;
- }
+
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
if (found_key.objectid != key.objectid)
break;
+
chunk = btrfs_item_ptr(path->nodes[0],
path->slots[0],
struct btrfs_chunk);
if (key.offset == 0)
break;
+ btrfs_release_path(chunk_root, path);
ret = btrfs_relocate_chunk(chunk_root,
chunk_root->root_key.objectid,
found_key.objectid,
found_key.offset);
BUG_ON(ret);
- btrfs_release_path(chunk_root, path);
}
ret = 0;
error:
btrfs_free_path(path);
- mutex_unlock(&dev_root->fs_info->fs_mutex);
+ mutex_unlock(&dev_root->fs_info->volume_mutex);
return ret;
}
path->reada = 2;
+ lock_chunks(root);
+
device->total_bytes = new_size;
ret = btrfs_update_device(trans, device);
if (ret) {
+ unlock_chunks(root);
btrfs_end_transaction(trans, root);
goto done;
}
WARN_ON(diff > old_total);
btrfs_set_super_total_bytes(super_copy, old_total - diff);
+ unlock_chunks(root);
btrfs_end_transaction(trans, root);
key.objectid = device->devid;
return -ENOSPC;
if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
- num_stripes = btrfs_super_num_devices(&info->super_copy);
+ num_stripes = extent_root->fs_info->fs_devices->open_devices;
min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_DUP)) {
}
if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
num_stripes = min_t(u64, 2,
- btrfs_super_num_devices(&info->super_copy));
+ extent_root->fs_info->fs_devices->open_devices);
if (num_stripes < 2)
return -ENOSPC;
min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
- num_stripes = btrfs_super_num_devices(&info->super_copy);
+ num_stripes = extent_root->fs_info->fs_devices->open_devices;
if (num_stripes < 4)
return -ENOSPC;
num_stripes &= ~(u32)1;
#endif
}
+struct async_sched {
+ struct bio *bio;
+ int rw;
+ struct btrfs_fs_info *info;
+ struct btrfs_work work;
+};
+
+/*
+ * see run_scheduled_bios for a description of why bios are collected for
+ * async submit.
+ *
+ * This will add one bio to the pending list for a device and make sure
+ * the work struct is scheduled.
+ */
+int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
+ int rw, struct bio *bio)
+{
+ int should_queue = 1;
+
+ /* don't bother with additional async steps for reads, right now */
+ if (!(rw & (1 << BIO_RW))) {
+ submit_bio(rw, bio);
+ return 0;
+ }
+
+ /*
+ * nr_async_submits allows us to reliably return congestion to the
+ * higher layers. Otherwise, the async bio makes it appear we have
+ * made progress against dirty pages when we've really just put it
+ * on a queue for later
+ */
+ atomic_inc(&root->fs_info->nr_async_submits);
+ bio->bi_next = NULL;
+ bio->bi_rw |= rw;
+
+ spin_lock(&device->io_lock);
+
+ if (device->pending_bio_tail)
+ device->pending_bio_tail->bi_next = bio;
+
+ device->pending_bio_tail = bio;
+ if (!device->pending_bios)
+ device->pending_bios = bio;
+ if (device->running_pending)
+ should_queue = 0;
+
+ spin_unlock(&device->io_lock);
+
+ if (should_queue)
+ btrfs_queue_worker(&root->fs_info->submit_workers,
+ &device->work);
+ return 0;
+}
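+
+/*
+ * sketch of the handshake with run_scheduled_bios(): the worker is only
+ * queued when running_pending is zero; the running worker sets it while
+ * bios were still on the list and re-checks via its 'goto loop' pass, so
+ * bios appended during a run are drained without queueing the work struct
+ * twice.  The congestion accounting also pairs across the two functions:
+ *
+ *	schedule_bio():        atomic_inc(&fs_info->nr_async_submits);
+ *	run_scheduled_bios():  atomic_dec(&fs_info->nr_async_submits);
+ */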
+
int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
- int mirror_num)
+ int mirror_num, int async_submit)
{
struct btrfs_mapping_tree *map_tree;
struct btrfs_device *dev;
dev = multi->stripes[dev_nr].dev;
if (dev && dev->bdev) {
bio->bi_bdev = dev->bdev;
- spin_lock(&dev->io_lock);
- dev->total_ios++;
- spin_unlock(&dev->io_lock);
- submit_bio(rw, bio);
+ if (async_submit)
+ schedule_bio(root, dev, rw, bio);
+ else
+ submit_bio(rw, bio);
} else {
bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
bio->bi_sector = logical >> 9;
device->barriers = 1;
device->dev_root = root->fs_info->dev_root;
device->devid = devid;
+ device->work.func = pending_bios_fn;
fs_devices->num_devices++;
spin_lock_init(&device->io_lock);
memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);