X-Git-Url: http://drtracing.org/?a=blobdiff_plain;f=fs%2Fbtrfs%2Fvolumes.c;h=4e7cee27aab565cef76d1e4041e643382d2438b1;hb=a74a4b97b61beede185b4b3ad359d7d378b0d312;hp=501d23d3ebfd2f0c0a23ad4672af2d445a9391a7;hpb=a0af469b58944f6e8c5c8ecbebb42997baf0cb9e;p=deliverable%2Flinux.git

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 501d23d3ebfd..4e7cee27aab5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -27,6 +27,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "async-thread.h"
 
 struct map_lookup {
 	u64 type;
@@ -110,6 +111,101 @@ static struct btrfs_fs_devices *find_fsid(u8 *fsid)
 	return NULL;
 }
 
+/*
+ * we try to collect pending bios for a device so we don't get a large
+ * number of procs sending bios down to the same device.  This greatly
+ * improves the scheduler's ability to collect and merge the bios.
+ *
+ * But, it also turns into a long list of bios to process and that is sure
+ * to eventually make the worker thread block.  The solution here is to
+ * make some progress and then put this work struct back at the end of
+ * the list if the block device is congested.  This way, multiple devices
+ * can make progress from a single worker thread.  Always returns 0.
+ */
+int run_scheduled_bios(struct btrfs_device *device)
+{
+	struct bio *pending;
+	struct backing_dev_info *bdi;
+	struct bio *tail;
+	struct bio *cur;
+	int again = 0;
+	unsigned long num_run = 0;
+
+	bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+loop:
+	spin_lock(&device->io_lock);
+
+	/* take all the bios off the list at once and process them
+	 * later on (without the lock held).  But, remember the
+	 * tail and other pointers so the bios can be properly reinserted
+	 * into the list if we hit congestion
+	 */
+	pending = device->pending_bios;
+	tail = device->pending_bio_tail;
+	WARN_ON(pending && !tail);
+	device->pending_bios = NULL;
+	device->pending_bio_tail = NULL;
+
+	/*
+	 * if pending was null this time around, no bios need processing
+	 * at all and we can stop.  Otherwise it'll loop back up again
+	 * and do an additional check so no bios are missed.
+	 *
+	 * device->running_pending is set and cleared under io_lock; while
+	 * it is set, schedule_bio will not queue the work struct again.
+	 */
+	if (pending) {
+		again = 1;
+		device->running_pending = 1;
+	} else {
+		again = 0;
+		device->running_pending = 0;
+	}
+	spin_unlock(&device->io_lock);
+
+	while(pending) {
+		cur = pending;
+		pending = pending->bi_next;
+		cur->bi_next = NULL;
+		atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+		submit_bio(cur->bi_rw, cur);
+		num_run++;
+
+		/*
+		 * we made progress, there is more work to do and the bdi
+		 * is now congested.  Put the unsubmitted bios back in front
+		 * of anything queued meanwhile and let other work structs run
+		 */
+		if (pending && num_run && bdi_write_congested(bdi)) {
+			struct bio *old_head;
+
+			spin_lock(&device->io_lock);
+			old_head = device->pending_bios;
+			device->pending_bios = pending;
+			if (device->pending_bio_tail)
+				tail->bi_next = old_head;
+			else
+				device->pending_bio_tail = tail;
+
+			spin_unlock(&device->io_lock);
+			btrfs_requeue_work(&device->work);
+			goto done;
+		}
+	}
+	if (again)
+		goto loop;
+done:
+	return 0;
+}
+
+void pending_bios_fn(struct btrfs_work *work)
+{
+	struct btrfs_device *device;
+	/* work is embedded in a btrfs_device: recover it and run its bios */
+	device = container_of(work, struct btrfs_device, work);
+	run_scheduled_bios(device);
+}
+
 static int device_list_add(const char *path,
 			   struct btrfs_super_block *disk_super,
 			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
@@ -120,7 +216,7 @@ static int device_list_add(const char *path,
 
 	fs_devices = find_fsid(disk_super->fsid);
 	if (!fs_devices) {
-		fs_devices = kmalloc(sizeof(*fs_devices), GFP_NOFS);
+		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
 		if (!fs_devices)
 			return -ENOMEM;
 		INIT_LIST_HEAD(&fs_devices->devices);
@@ -129,7 +225,6 @@ static int device_list_add(const char *path,
 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 		fs_devices->latest_devid = devid;
 		fs_devices->latest_trans = found_transid;
-		fs_devices->num_devices = 0;
 		device = NULL;
 	} else {
 		device = __find_device(&fs_devices->devices, devid,
@@ -142,6 +237,7 @@ static int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		device->devid = devid;
+		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, disk_super->dev_item.uuid,
 		       BTRFS_UUID_SIZE);
 		device->barriers = 1;
@@ -175,13 +271,17 @@ again:
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (!device->in_fs_metadata) {
-			if (device->bdev) {
-				close_bdev_excl(device->bdev);
-				fs_devices->open_devices--;
-			}
+			struct block_device *bdev;
 			list_del(&device->dev_list);
 			list_del(&device->dev_alloc_list);
 			fs_devices->num_devices--;
+			if (device->bdev) {
+				bdev = device->bdev;
+				fs_devices->open_devices--;
+				mutex_unlock(&uuid_mutex);
+				close_bdev_excl(bdev);
+				mutex_lock(&uuid_mutex);
+			}
 			kfree(device->name);
 			kfree(device);
 			goto again;
@@ -262,7 +362,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			goto error_brelse;
 
 		transid = btrfs_super_generation(disk_super);
-		if (transid > latest_transid) {
+		if (!latest_transid || transid > latest_transid) {
 			latest_devid = devid;
 			latest_transid = transid;
 			latest_bdev = bdev;
@@ -751,10 +851,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 	if (bdev == fs_devices->latest_bdev)
 		fs_devices->latest_bdev = next_dev->bdev;
 
-	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
-	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
-				    total_bytes - device->total_bytes);
-
 	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
 	btrfs_set_super_num_devices(&root->fs_info->super_copy,
 				    total_bytes - 1);
@@ -774,7 +870,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	u64 devid;
 	int ret = 0;
 
-	mutex_lock(&root->fs_info->fs_mutex);
+	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->chunk_mutex);
 	mutex_lock(&uuid_mutex);
 
 	all_avail = root->fs_info->avail_data_alloc_bits |
@@ -850,6 +947,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	}
 	root->fs_info->fs_devices->num_devices--;
+	root->fs_info->fs_devices->open_devices--;
 
 	ret = btrfs_shrink_device(device, 0);
 	if (ret)
@@ -874,7 +972,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev) {
 		/* one close for the device struct or super_block */
 		close_bdev_excl(device->bdev);
-		root->fs_info->fs_devices->open_devices--;
 	}
 	if (bdev) {
 		/* one close for us */
@@ -892,7 +989,8 @@ error_close:
 		close_bdev_excl(bdev);
 out:
 	mutex_unlock(&uuid_mutex);
-	mutex_unlock(&root->fs_info->fs_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -911,7 +1009,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if (!bdev) {
 		return -EIO;
 	}
-	mutex_lock(&root->fs_info->fs_mutex);
+
+	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->chunk_mutex);
+
 	trans = btrfs_start_transaction(root, 1);
 	devices = &root->fs_info->fs_devices->devices;
 	list_for_each(cur, devices) {
@@ -930,6 +1031,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	}
 
 	device->barriers = 1;
+	device->work.func = pending_bios_fn;
 	generate_random_uuid(device->uuid);
 	spin_lock_init(&device->io_lock);
 	device->name = kstrdup(device_path, GFP_NOFS);
@@ -964,7 +1066,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	root->fs_info->fs_devices->open_devices++;
 out:
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->alloc_mutex);
+
 	return ret;
 
 out_close_bdev:
@@ -1204,9 +1308,10 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	struct btrfs_key found_key;
 
 
+	BUG(); /* FIXME, needs locking */
+
 	dev_root = dev_root->fs_info->dev_root;
 
-	mutex_lock(&dev_root->fs_info->fs_mutex);
 	/* step one make some room on all the devices */
 	list_for_each(cur, devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
@@ -1275,7 +1380,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	ret = 0;
 error:
 	btrfs_free_path(path);
-	mutex_unlock(&dev_root->fs_info->fs_mutex);
 	return ret;
 }
 
@@ -1451,7 +1555,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		return -ENOSPC;
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-		num_stripes = btrfs_super_num_devices(&info->super_copy);
+		num_stripes = extent_root->fs_info->fs_devices->open_devices;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
@@ -1460,13 +1564,13 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		num_stripes = min_t(u64, 2,
-				  btrfs_super_num_devices(&info->super_copy));
+			    extent_root->fs_info->fs_devices->open_devices);
 		if (num_stripes < 2)
 			return -ENOSPC;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-		num_stripes = btrfs_super_num_devices(&info->super_copy);
+		num_stripes = extent_root->fs_info->fs_devices->open_devices;
 		if (num_stripes < 4)
 			return -ENOSPC;
 		num_stripes &= ~(u32)1;
@@ -1970,8 +2074,62 @@ static int end_bio_multi_stripe(struct bio *bio,
 #endif
 }
 
+struct async_sched { /* NOTE(review): no user visible in this patch - confirm */
+	struct bio *bio;
+	int rw;
+	struct btrfs_fs_info *info;
+	struct btrfs_work work;
+};
+
+/*
+ * see run_scheduled_bios for a description of why bios are collected for
+ * async submit.
+ *
+ * This will add one write bio to the pending list for a device and make
+ * sure the work struct is scheduled.  Reads go straight to submit_bio.
+ */
+int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
+		 int rw, struct bio *bio)
+{
+	int should_queue = 1;
+
+	/* don't bother with additional async steps for reads, right now */
+	if (!(rw & (1 << BIO_RW))) {
+		submit_bio(rw, bio);
+		return 0;
+	}
+
+	/*
+	 * nr_async_submits allows us to reliably return congestion to the
+	 * higher layers.  Otherwise, the async bio makes it appear we have
+	 * made progress against dirty pages when we've really just put it
+	 * on a queue for later
+	 */
+	atomic_inc(&root->fs_info->nr_async_submits);
+	bio->bi_next = NULL;
+	bio->bi_rw |= rw;
+
+	spin_lock(&device->io_lock);
+
+	if (device->pending_bio_tail)
+		device->pending_bio_tail->bi_next = bio;
+
+	device->pending_bio_tail = bio;
+	if (!device->pending_bios)
+		device->pending_bios = bio;
+	if (device->running_pending)
+		should_queue = 0;
+
+	spin_unlock(&device->io_lock);
+
+	if (should_queue)
+		btrfs_queue_worker(&root->fs_info->submit_workers,
+				   &device->work);
+	return 0;
+}
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
-		  int mirror_num)
+		  int mirror_num, int async_submit)
 {
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
@@ -2017,10 +2175,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		dev = multi->stripes[dev_nr].dev;
 		if (dev && dev->bdev) {
 			bio->bi_bdev = dev->bdev;
-			spin_lock(&dev->io_lock);
-			dev->total_ios++;
-			spin_unlock(&dev->io_lock);
-			submit_bio(rw, bio);
+			if (async_submit)
+				schedule_bio(root, dev, rw, bio);
+			else
+				submit_bio(rw, bio);
 		} else {
 			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
 			bio->bi_sector = logical >> 9;
@@ -2059,6 +2217,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->barriers = 1;
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
+	device->work.func = pending_bios_fn;
 	fs_devices->num_devices++;
 	spin_lock_init(&device->io_lock);
 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);