#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
+#include "async-thread.h"
struct map_lookup {
u64 type;
mutex_unlock(&uuid_mutex);
}
+static void lock_chunks(struct btrfs_root *root)
+{
+ mutex_lock(&root->fs_info->alloc_mutex);
+ mutex_lock(&root->fs_info->chunk_mutex);
+}
+
+static void unlock_chunks(struct btrfs_root *root)
+{
+ mutex_unlock(&root->fs_info->alloc_mutex);
+ mutex_unlock(&root->fs_info->chunk_mutex);
+}
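+
+/*
+ * sketch of the intended usage (mirroring the hunks further down): chunk
+ * tree updates are bracketed by this pair, e.g.
+ *
+ *	lock_chunks(root);
+ *	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ *	...
+ *	unlock_chunks(root);
+ *
+ * taking alloc_mutex before chunk_mutex in every caller gives one global
+ * lock order, which is what keeps these paths deadlock free.
+ */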
+
int btrfs_cleanup_fs_uuids(void)
{
struct btrfs_fs_devices *fs_devices;
return NULL;
}
+/*
+ * we try to collect pending bios for a device so we don't get a large
+ * number of procs sending bios down to the same device. This greatly
+ * improves the scheduler's ability to collect and merge the bios.
+ *
+ * But, it also turns into a long list of bios to process and that is sure
+ * to eventually make the worker thread block. The solution here is to
+ * make some progress and then put this work struct back at the end of
+ * the list if the block device is congested. This way, multiple devices
+ * can make progress from a single worker thread.
+ */
+int run_scheduled_bios(struct btrfs_device *device)
+{
+ struct bio *pending;
+ struct backing_dev_info *bdi;
+ struct bio *tail;
+ struct bio *cur;
+ int again = 0;
+ unsigned long num_run = 0;
+
+ bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+loop:
+ spin_lock(&device->io_lock);
+
+ /* take all the bios off the list at once and process them
+ * later on (without the lock held). But, remember the
+ * tail and other pointers so the bios can be properly reinserted
+ * into the list if we hit congestion
+ */
+ pending = device->pending_bios;
+ tail = device->pending_bio_tail;
+ WARN_ON(pending && !tail);
+ device->pending_bios = NULL;
+ device->pending_bio_tail = NULL;
+
+ /*
+ * if pending was null this time around, no bios need processing
+ * at all and we can stop. Otherwise it'll loop back up again
+ * and do an additional check so no bios are missed.
+ *
+ * device->running_pending is used to synchronize with the
+ * schedule_bio code.
+ */
+ if (pending) {
+ again = 1;
+ device->running_pending = 1;
+ } else {
+ again = 0;
+ device->running_pending = 0;
+ }
+ spin_unlock(&device->io_lock);
+
+ while (pending) {
+ cur = pending;
+ pending = pending->bi_next;
+ cur->bi_next = NULL;
+ atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+ submit_bio(cur->bi_rw, cur);
+ num_run++;
+
+ /*
+ * we made progress, there is more work to do and the bdi
+ * is now congested. Back off and let other work structs
+ * run instead
+ */
+ if (pending && num_run && bdi_write_congested(bdi)) {
+ struct bio *old_head;
+
+ spin_lock(&device->io_lock);
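+ /* splice the unsubmitted bios back onto the front of the device list */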
+ old_head = device->pending_bios;
+ device->pending_bios = pending;
+ if (device->pending_bio_tail)
+ tail->bi_next = old_head;
+ else
+ device->pending_bio_tail = tail;
+
+ spin_unlock(&device->io_lock);
+ btrfs_requeue_work(&device->work);
+ goto done;
+ }
+ }
+ if (again)
+ goto loop;
+done:
+ return 0;
+}
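+
+/*
+ * worked example of the splice above (a sketch, not part of the patch):
+ * suppose bios A->B->C were detached, only A was submitted before the
+ * bdi congested, and a producer queued X->Y in the meantime:
+ *
+ *	leftover from this run:  B -> C
+ *	device->pending_bios:    X -> Y
+ *	after the splice:        B -> C -> X -> Y
+ *
+ * so the oldest bios still go down first on the next run of the work
+ * struct.
+ */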
+
+void pending_bios_fn(struct btrfs_work *work)
+{
+ struct btrfs_device *device;
+
+ device = container_of(work, struct btrfs_device, work);
+ run_scheduled_bios(device);
+}
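+
+/*
+ * how the pieces connect (sketch; names mirror the hunks below): each
+ * device wires its embedded work struct up once,
+ *
+ *	device->work.func = pending_bios_fn;
+ *
+ * and schedule_bio() hands that struct to the submit_workers pool:
+ *
+ *	btrfs_queue_worker(&root->fs_info->submit_workers, &device->work);
+ *
+ * container_of() then recovers the btrfs_device because the btrfs_work is
+ * embedded directly in it, so no per-run allocation is needed.
+ */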
+
static int device_list_add(const char *path,
struct btrfs_super_block *disk_super,
u64 devid, struct btrfs_fs_devices **fs_devices_ret)
fs_devices = find_fsid(disk_super->fsid);
if (!fs_devices) {
- fs_devices = kmalloc(sizeof(*fs_devices), GFP_NOFS);
+ fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
if (!fs_devices)
return -ENOMEM;
INIT_LIST_HEAD(&fs_devices->devices);
memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
fs_devices->latest_devid = devid;
fs_devices->latest_trans = found_transid;
- fs_devices->num_devices = 0;
device = NULL;
} else {
device = __find_device(&fs_devices->devices, devid,
return -ENOMEM;
}
device->devid = devid;
+ device->work.func = pending_bios_fn;
memcpy(device->uuid, disk_super->dev_item.uuid,
BTRFS_UUID_SIZE);
device->barriers = 1;
list_for_each(cur, head) {
device = list_entry(cur, struct btrfs_device, dev_list);
if (!device->in_fs_metadata) {
- if (device->bdev) {
- close_bdev_excl(device->bdev);
- fs_devices->open_devices--;
- }
+ struct block_device *bdev;
list_del(&device->dev_list);
list_del(&device->dev_alloc_list);
fs_devices->num_devices--;
+ if (device->bdev) {
+ bdev = device->bdev;
+ fs_devices->open_devices--;
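+ /* close_bdev_excl can block, so drop uuid_mutex while calling it;
+ * the list walk is restarted below via 'goto again' */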
+ mutex_unlock(&uuid_mutex);
+ close_bdev_excl(bdev);
+ mutex_lock(&uuid_mutex);
+ }
kfree(device->name);
kfree(device);
goto again;
goto error_brelse;
transid = btrfs_super_generation(disk_super);
- if (transid > latest_transid) {
+ if (!latest_transid || transid > latest_transid) {
latest_devid = devid;
latest_transid = transid;
latest_bdev = bdev;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ lock_chunks(root);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
if (bdev == fs_devices->latest_bdev)
fs_devices->latest_bdev = next_dev->bdev;
- total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
- btrfs_set_super_total_bytes(&root->fs_info->super_copy,
- total_bytes - device->total_bytes);
-
total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
btrfs_set_super_num_devices(&root->fs_info->super_copy,
total_bytes - 1);
out:
btrfs_free_path(path);
+ unlock_chunks(root);
btrfs_commit_transaction(trans, root);
return ret;
}
u64 devid;
int ret = 0;
- mutex_lock(&root->fs_info->fs_mutex);
mutex_lock(&uuid_mutex);
+ mutex_lock(&root->fs_info->volume_mutex);
all_avail = root->fs_info->avail_data_alloc_bits |
root->fs_info->avail_system_alloc_bits |
}
root->fs_info->fs_devices->num_devices--;
+ root->fs_info->fs_devices->open_devices--;
ret = btrfs_shrink_device(device, 0);
if (ret)
if (device->bdev) {
/* one close for the device struct or super_block */
close_bdev_excl(device->bdev);
- root->fs_info->fs_devices->open_devices--;
}
if (bdev) {
/* one close for us */
if (bdev)
close_bdev_excl(bdev);
out:
+ mutex_unlock(&root->fs_info->volume_mutex);
mutex_unlock(&uuid_mutex);
- mutex_unlock(&root->fs_info->fs_mutex);
return ret;
}
if (!bdev) {
return -EIO;
}
- mutex_lock(&root->fs_info->fs_mutex);
+
+ mutex_lock(&root->fs_info->volume_mutex);
+
trans = btrfs_start_transaction(root, 1);
+ lock_chunks(root);
devices = &root->fs_info->fs_devices->devices;
list_for_each(cur, devices) {
device = list_entry(cur, struct btrfs_device, dev_list);
}
device->barriers = 1;
+ device->work.func = pending_bios_fn;
generate_random_uuid(device->uuid);
spin_lock_init(&device->io_lock);
device->name = kstrdup(device_path, GFP_NOFS);
root->fs_info->fs_devices->num_devices++;
root->fs_info->fs_devices->open_devices++;
out:
+ unlock_chunks(root);
btrfs_end_transaction(trans, root);
- mutex_unlock(&root->fs_info->fs_mutex);
+ mutex_unlock(&root->fs_info->volume_mutex);
+
return ret;
out_close_bdev:
return ret;
}
-int btrfs_grow_device(struct btrfs_trans_handle *trans,
+static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
struct btrfs_super_block *super_copy =
return btrfs_update_device(trans, device);
}
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 new_size)
+{
+ int ret;
+ lock_chunks(device->dev_root);
+ ret = __btrfs_grow_device(trans, device, new_size);
+ unlock_chunks(device->dev_root);
+ return ret;
+}
+
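+/*
+ * the __btrfs_grow_device/btrfs_grow_device split follows the usual kernel
+ * idiom: a caller that already holds the chunk mutexes (presumably
+ * elsewhere in this series) uses the __ variant, everyone else goes
+ * through the locked wrapper above.
+ */
+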
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 chunk_tree, u64 chunk_objectid,
trans = btrfs_start_transaction(root, 1);
BUG_ON(!trans);
+ lock_chunks(root);
+
/*
* step two, delete the device extents and the
* chunk tree entries
/* once for us */
free_extent_map(em);
+ unlock_chunks(root);
btrfs_end_transaction(trans, root);
return 0;
}
struct btrfs_key found_key;
+ mutex_lock(&dev_root->fs_info->volume_mutex);
dev_root = dev_root->fs_info->dev_root;
- mutex_lock(&dev_root->fs_info->fs_mutex);
/* step one make some room on all the devices */
list_for_each(cur, devices) {
device = list_entry(cur, struct btrfs_device, dev_list);
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
- if (ret) {
+ if (ret)
break;
- }
+
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
if (found_key.objectid != key.objectid)
break;
+
chunk = btrfs_item_ptr(path->nodes[0],
path->slots[0],
struct btrfs_chunk);
if (key.offset == 0)
break;
+ btrfs_release_path(chunk_root, path);
ret = btrfs_relocate_chunk(chunk_root,
chunk_root->root_key.objectid,
found_key.objectid,
found_key.offset);
BUG_ON(ret);
- btrfs_release_path(chunk_root, path);
}
ret = 0;
error:
btrfs_free_path(path);
- mutex_unlock(&dev_root->fs_info->fs_mutex);
+ mutex_unlock(&dev_root->fs_info->volume_mutex);
return ret;
}
path->reada = 2;
+ lock_chunks(root);
+
device->total_bytes = new_size;
ret = btrfs_update_device(trans, device);
if (ret) {
+ unlock_chunks(root);
btrfs_end_transaction(trans, root);
goto done;
}
WARN_ON(diff > old_total);
btrfs_set_super_total_bytes(super_copy, old_total - diff);
+ unlock_chunks(root);
btrfs_end_transaction(trans, root);
key.objectid = device->devid;
return -ENOSPC;
if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
- num_stripes = btrfs_super_num_devices(&info->super_copy);
+ num_stripes = extent_root->fs_info->fs_devices->open_devices;
min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_DUP)) {
}
if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
num_stripes = min_t(u64, 2,
- btrfs_super_num_devices(&info->super_copy));
+ extent_root->fs_info->fs_devices->open_devices);
if (num_stripes < 2)
return -ENOSPC;
min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
- num_stripes = btrfs_super_num_devices(&info->super_copy);
+ num_stripes = extent_root->fs_info->fs_devices->open_devices;
if (num_stripes < 4)
return -ENOSPC;
num_stripes &= ~(u32)1;
#endif
}
+struct async_sched {
+ struct bio *bio;
+ int rw;
+ struct btrfs_fs_info *info;
+ struct btrfs_work work;
+};
+
+/*
+ * see run_scheduled_bios for a description of why bios are collected for
+ * async submit.
+ *
+ * This will add one bio to the pending list for a device and make sure
+ * the work struct is scheduled.
+ */
+int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
+ int rw, struct bio *bio)
+{
+ int should_queue = 1;
+
+ /* don't bother with additional async steps for reads, right now */
+ if (!(rw & (1 << BIO_RW))) {
+ submit_bio(rw, bio);
+ return 0;
+ }
+
+ /*
+ * nr_async_submits allows us to reliably return congestion to the
+ * higher layers. Otherwise, the async bio makes it appear we have
+ * made progress against dirty pages when we've really just put it
+ * on a queue for later
+ */
+ atomic_inc(&root->fs_info->nr_async_submits);
+ bio->bi_next = NULL;
+ bio->bi_rw |= rw;
+
+ spin_lock(&device->io_lock);
+
+ if (device->pending_bio_tail)
+ device->pending_bio_tail->bi_next = bio;
+
+ device->pending_bio_tail = bio;
+ if (!device->pending_bios)
+ device->pending_bios = bio;
+ if (device->running_pending)
+ should_queue = 0;
+
+ spin_unlock(&device->io_lock);
+
+ if (should_queue)
+ btrfs_queue_worker(&root->fs_info->submit_workers,
+ &device->work);
+ return 0;
+}
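+
+/*
+ * sketch of the handshake with run_scheduled_bios(): the worker is only
+ * queued when running_pending is zero; the running worker sets it while
+ * bios were still on the list and re-checks via its 'goto loop' pass, so
+ * bios appended during a run are drained without queueing the work struct
+ * twice.  The congestion accounting also pairs across the two functions:
+ *
+ *	schedule_bio():        atomic_inc(&fs_info->nr_async_submits);
+ *	run_scheduled_bios():  atomic_dec(&fs_info->nr_async_submits);
+ */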
+
int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
- int mirror_num)
+ int mirror_num, int async_submit)
{
struct btrfs_mapping_tree *map_tree;
struct btrfs_device *dev;
dev = multi->stripes[dev_nr].dev;
if (dev && dev->bdev) {
bio->bi_bdev = dev->bdev;
- spin_lock(&dev->io_lock);
- dev->total_ios++;
- spin_unlock(&dev->io_lock);
- submit_bio(rw, bio);
+ if (async_submit)
+ schedule_bio(root, dev, rw, bio);
+ else
+ submit_bio(rw, bio);
} else {
bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
bio->bi_sector = logical >> 9;
device->barriers = 1;
device->dev_root = root->fs_info->dev_root;
device->devid = devid;
+ device->work.func = pending_bios_fn;
fs_devices->num_devices++;
spin_lock_init(&device->io_lock);
memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);