X-Git-Url: http://drtracing.org/?a=blobdiff_plain;f=fs%2Fbtrfs%2Fdisk-io.c;h=31ca9f89388dceac0dc8619b9845abb065f7d03c;hb=a74a4b97b61beede185b4b3ad359d7d378b0d312;hp=e444b99e02da989ea5d03777f02cc2248dd114f7;hpb=728131d8e40c2a47c59ca91a806299c4708029f9;p=deliverable%2Flinux.git diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e444b99e02da..31ca9f89388d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -16,20 +16,30 @@ * Boston, MA 021110-1307, USA. */ +#include #include #include -#include #include #include #include #include #include // for block_sync_page +#include +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20) +# include +#else +# include +#endif +#include "crc32c.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" #include "volumes.h" #include "print-tree.h" +#include "async-thread.h" +#include "locking.h" #if 0 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) @@ -45,6 +55,28 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) #endif static struct extent_io_ops btree_extent_io_ops; +static void end_workqueue_fn(struct btrfs_work *work); + +struct end_io_wq { + struct bio *bio; + bio_end_io_t *end_io; + void *private; + struct btrfs_fs_info *info; + int error; + int metadata; + struct list_head list; + struct btrfs_work work; +}; + +struct async_submit_bio { + struct inode *inode; + struct bio *bio; + struct list_head list; + extent_submit_bio_hook_t *submit_bio_hook; + int rw; + int mirror_num; + struct btrfs_work work; +}; struct extent_map *btree_get_extent(struct inode *inode, struct page *page, size_t page_offset, u64 start, u64 len, @@ -54,41 +86,66 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page, struct extent_map *em; int ret; -again: spin_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); - spin_unlock(&em_tree->lock); if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + spin_unlock(&em_tree->lock); goto out; } + spin_unlock(&em_tree->lock); + em = alloc_extent_map(GFP_NOFS); if (!em) { em = ERR_PTR(-ENOMEM); goto out; } em->start = 0; - em->len = i_size_read(inode); + em->len = (u64)-1; em->block_start = 0; - em->bdev = inode->i_sb->s_bdev; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; spin_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); - if (ret == -EEXIST) { + u64 failed_start = em->start; + u64 failed_len = em->len; + + printk("failed to insert %Lu %Lu -> %Lu into tree\n", + em->start, em->len, em->block_start); free_extent_map(em); - em = NULL; - goto again; + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + printk("after failing, found %Lu %Lu %Lu\n", + em->start, em->len, em->block_start); + ret = 0; + } else { + em = lookup_extent_mapping(em_tree, failed_start, + failed_len); + if (em) { + printk("double failure lookup gives us " + "%Lu %Lu -> %Lu\n", em->start, + em->len, em->block_start); + free_extent_map(em); + } + ret = -EIO; + } } else if (ret) { - em = ERR_PTR(ret); + free_extent_map(em); + em = NULL; } + spin_unlock(&em_tree->lock); + + if (ret) + em = ERR_PTR(ret); out: return em; } u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) { - return crc32c(seed, data, len); + return btrfs_crc32c(seed, data, len); } void btrfs_csum_final(u32 crc, char *result) @@ -144,7 +201,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, BTRFS_CRC32_SIZE); read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE); - WARN_ON(1); printk("btrfs: %s checksum verify failed on %llu " "wanted %X found %X from_this_trans %d " "level %d\n", @@ -159,6 +215,61 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, return 0; } +static int verify_parent_transid(struct extent_io_tree *io_tree, + struct extent_buffer *eb, u64 parent_transid) +{ + int ret; + + if (!parent_transid || btrfs_header_generation(eb) == parent_transid) + return 0; + + lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); + if (extent_buffer_uptodate(io_tree, eb) && + btrfs_header_generation(eb) == parent_transid) { + ret = 0; + goto out; + } + printk("parent transid verify failed on %llu wanted %llu found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + ret = 1; +out: + clear_extent_buffer_uptodate(io_tree, eb); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + return ret; + +} + +static int btree_read_extent_buffer_pages(struct btrfs_root *root, + struct extent_buffer *eb, + u64 start, u64 parent_transid) +{ + struct extent_io_tree *io_tree; + int ret; + int num_copies = 0; + int mirror_num = 0; + + io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + while (1) { + ret = read_extent_buffer_pages(io_tree, eb, start, 1, + btree_get_extent, mirror_num); + if (!ret && + !verify_parent_transid(io_tree, eb, parent_transid)) + return ret; + + num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + eb->start, eb->len); + if (num_copies == 1) + return ret; + + mirror_num++; + if (mirror_num > num_copies) + return ret; + } + return -EIO; +} int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { @@ -168,6 +279,8 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page) int found_level; unsigned long len; struct extent_buffer *eb; + int ret; + tree = &BTRFS_I(page->mapping->host)->io_tree; if (page->private == EXTENT_PAGE_PRIVATE) @@ -179,8 +292,9 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page) WARN_ON(1); } eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); - read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1, - btree_get_extent); + ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, + btrfs_header_generation(eb)); + BUG_ON(ret); btrfs_clear_buffer_defrag(eb); found_start = btrfs_header_bytenr(eb); if (found_start != start) { @@ -219,17 +333,185 @@ static int btree_writepage_io_hook(struct page *page, u64 start, u64 end) return 0; } -static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio) +int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct extent_io_tree *tree; + u64 found_start; + int found_level; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + int ret = 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + len = page->private >> 2; + if (len == 0) { + WARN_ON(1); + } + eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + + btrfs_clear_buffer_defrag(eb); + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + ret = -EIO; + goto err; + } + if (eb->first_page != page) { + printk("bad first page %lu %lu\n", eb->first_page->index, + page->index); + WARN_ON(1); + ret = -EIO; + goto err; + } + if (memcmp_extent_buffer(eb, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(eb), + BTRFS_FSID_SIZE)) { + printk("bad fsid on block %Lu\n", eb->start); + ret = -EIO; + goto err; + } + found_level = btrfs_header_level(eb); + + ret = csum_tree_block(root, eb, 1); + if (ret) + ret = -EIO; + + end = min_t(u64, eb->len, PAGE_CACHE_SIZE); + end = eb->start + end - 1; + release_extent_buffer_tail_pages(eb); +err: + free_extent_buffer(eb); +out: + return ret; +} + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) +static void end_workqueue_bio(struct bio *bio, int err) +#else +static int end_workqueue_bio(struct bio *bio, + unsigned int bytes_done, int err) +#endif +{ + struct end_io_wq *end_io_wq = bio->bi_private; + struct btrfs_fs_info *fs_info; + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + if (bio->bi_size) + return 1; +#endif + + fs_info = end_io_wq->info; + end_io_wq->error = err; + end_io_wq->work.func = end_workqueue_fn; + end_io_wq->work.flags = 0; + btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work); + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + return 0; +#endif +} + +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata) +{ + struct end_io_wq *end_io_wq; + end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); + if (!end_io_wq) + return -ENOMEM; + + end_io_wq->private = bio->bi_private; + end_io_wq->end_io = bio->bi_end_io; + end_io_wq->info = info; + end_io_wq->error = 0; + end_io_wq->bio = bio; + end_io_wq->metadata = metadata; + + bio->bi_private = end_io_wq; + bio->bi_end_io = end_workqueue_bio; + return 0; +} + +static void run_one_async_submit(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + atomic_dec(&fs_info->nr_async_submits); + async->submit_bio_hook(async->inode, async->rw, async->bio, + async->mirror_num); + kfree(async); +} + +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + extent_submit_bio_hook_t *submit_bio_hook) +{ + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->inode = inode; + async->rw = rw; + async->bio = bio; + async->mirror_num = mirror_num; + async->submit_bio_hook = submit_bio_hook; + async->work.func = run_one_async_submit; + async->work.flags = 0; + atomic_inc(&fs_info->nr_async_submits); + btrfs_queue_worker(&fs_info->workers, &async->work); + return 0; +} + +static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 offset; + int ret; + offset = bio->bi_sector << 9; - if (offset == BTRFS_SUPER_INFO_OFFSET) { - bio->bi_bdev = root->fs_info->sb->s_bdev; - submit_bio(rw, bio); - return 0; + + /* + * when we're called for a write, we're already in the async + * submission context. Just jump ingo btrfs_map_bio + */ + if (rw & (1 << BIO_RW)) { + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); } - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio); + + /* + * called for a read, do the setup so that checksum validation + * can happen in the async kernel threads + */ + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1); + BUG_ON(ret); + + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); +} + +static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num) +{ + /* + * kthread helpers are used to submit writes so that checksumming + * can happen in parallel across all CPUs + */ + if (!(rw & (1 << BIO_RW))) { + return __btree_submit_bio_hook(inode, rw, bio, mirror_num); + } + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, + __btree_submit_bio_hook); } static int btree_writepage(struct page *page, struct writeback_control *wbc) @@ -279,9 +561,15 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) struct extent_map_tree *map; int ret; + if (page_count(page) > 3) { + /* once for page->private, once for the caller, once + * once for the page cache + */ + return 0; + } tree = &BTRFS_I(page->mapping->host)->io_tree; map = &BTRFS_I(page->mapping->host)->extent_tree; - ret = try_release_extent_mapping(map, tree, page, gfp_flags); + ret = try_release_extent_state(map, tree, page, gfp_flags); if (ret == 1) { invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE); ClearPagePrivate(page); @@ -297,6 +585,12 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) tree = &BTRFS_I(page->mapping->host)->io_tree; extent_invalidatepage(tree, page, offset); btree_releasepage(page, GFP_NOFS); + if (PagePrivate(page)) { + invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE); + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } } #if 0 @@ -329,7 +623,8 @@ static struct address_space_operations btree_aops = { .sync_page = block_sync_page, }; -int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid) { struct extent_buffer *buf = NULL; struct inode *btree_inode = root->fs_info->btree_inode; @@ -339,62 +634,11 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) if (!buf) return 0; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, 0, 0, btree_get_extent); + buf, 0, 0, btree_get_extent, 0); free_extent_buffer(buf); return ret; } -static int close_all_devices(struct btrfs_fs_info *fs_info) -{ - struct list_head *list; - struct list_head *next; - struct btrfs_device *device; - - list = &fs_info->fs_devices->devices; - list_for_each(next, list) { - device = list_entry(next, struct btrfs_device, dev_list); - if (device->bdev && device->bdev != fs_info->sb->s_bdev) - close_bdev_excl(device->bdev); - device->bdev = NULL; - } - return 0; -} - -int btrfs_verify_block_csum(struct btrfs_root *root, - struct extent_buffer *buf) -{ - struct extent_io_tree *io_tree; - u64 end; - int ret; - - io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; - if (buf->flags & EXTENT_CSUM) - return 0; - - end = min_t(u64, buf->len, PAGE_CACHE_SIZE); - end = buf->start + end - 1; - if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) { - buf->flags |= EXTENT_CSUM; - return 0; - } - lock_extent(io_tree, buf->start, end, GFP_NOFS); - - if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) { - buf->flags |= EXTENT_CSUM; - ret = 0; - goto out_unlock; - } -WARN_ON(buf->flags & EXTENT_CSUM); - - ret = csum_tree_block(root, buf, 1); - set_extent_bits(io_tree, buf->start, end, EXTENT_CSUM, GFP_NOFS); - buf->flags |= EXTENT_CSUM; - -out_unlock: - unlock_extent(io_tree, buf->start, end, GFP_NOFS); - return ret; -} - struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { @@ -418,7 +662,7 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, - u32 blocksize) + u32 blocksize, u64 parent_transid) { struct extent_buffer *buf = NULL; struct inode *btree_inode = root->fs_info->btree_inode; @@ -430,11 +674,14 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, buf = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!buf) return NULL; - read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, 1, - btree_get_extent); - ret = btrfs_verify_block_csum(root, buf); + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + + if (ret == 0) { + buf->flags |= EXTENT_UPTODATE; + } return buf; + } int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -442,9 +689,11 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct inode *btree_inode = root->fs_info->btree_inode; if (btrfs_header_generation(buf) == - root->fs_info->running_transaction->transid) + root->fs_info->running_transaction->transid) { + WARN_ON(!btrfs_tree_locked(buf)); clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); + } return 0; } @@ -481,6 +730,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->in_sysfs = 0; INIT_LIST_HEAD(&root->dirty_list); + spin_lock_init(&root->node_lock); + mutex_init(&root->objectid_mutex); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); @@ -509,7 +760,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root, blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), - blocksize); + blocksize, 0); BUG_ON(!root->node); return 0; } @@ -565,7 +816,7 @@ out: } blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), - blocksize); + blocksize, 0); BUG_ON(!root->node); insert: root->ref_cows = 1; @@ -602,6 +853,10 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, return fs_info->tree_root; if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) return fs_info->extent_root; + if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) + return fs_info->chunk_root; + if (location->objectid == BTRFS_DEV_TREE_OBJECTID) + return fs_info->dev_root; root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)location->objectid); @@ -680,12 +935,20 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) { struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; int ret = 0; + int limit = 256 * info->fs_devices->open_devices; struct list_head *cur; struct btrfs_device *device; struct backing_dev_info *bdi; + if ((bdi_bits & (1 << BDI_write_congested)) && + atomic_read(&info->nr_async_submits) > limit) { + return 1; + } + list_for_each(cur, &info->fs_devices->devices) { device = list_entry(cur, struct btrfs_device, dev_list); + if (!device->bdev) + continue; bdi = blk_get_backing_dev_info(device->bdev); if (bdi && bdi_congested(bdi, bdi_bits)) { ret = 1; @@ -695,7 +958,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) return ret; } -void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +/* + * this unplugs every device on the box, and it is only used when page + * is null + */ +static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { struct list_head *cur; struct btrfs_device *device; @@ -711,10 +978,51 @@ void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) } } +void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ + struct inode *inode; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct address_space *mapping; + u64 offset; + + /* the generic O_DIRECT read code does this */ + if (!page) { + __unplug_io_fn(bdi, page); + return; + } + + /* + * page->mapping may change at any time. Get a consistent copy + * and use that for everything below + */ + smp_mb(); + mapping = page->mapping; + if (!mapping) + return; + + inode = mapping->host; + offset = page_offset(page); + + em_tree = &BTRFS_I(inode)->extent_tree; + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + if (!em) + return; + + offset = offset - em->start; + btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree, + em->block_start + offset, page); + free_extent_map(em); +} + static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) bdi_init(bdi); - bdi->ra_pages = default_backing_dev_info.ra_pages * 4; +#endif + bdi->ra_pages = default_backing_dev_info.ra_pages; bdi->state = 0; bdi->capabilities = default_backing_dev_info.capabilities; bdi->unplug_io_fn = btrfs_unplug_io_fn; @@ -724,14 +1032,172 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) return 0; } +static int bio_ready_for_csum(struct bio *bio) +{ + u64 length = 0; + u64 buf_len = 0; + u64 start = 0; + struct page *page; + struct extent_io_tree *io_tree = NULL; + struct btrfs_fs_info *info = NULL; + struct bio_vec *bvec; + int i; + int ret; + + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + if (page->private == EXTENT_PAGE_PRIVATE) { + length += bvec->bv_len; + continue; + } + if (!page->private) { + length += bvec->bv_len; + continue; + } + length = bvec->bv_len; + buf_len = page->private >> 2; + start = page_offset(page) + bvec->bv_offset; + io_tree = &BTRFS_I(page->mapping->host)->io_tree; + info = BTRFS_I(page->mapping->host)->root->fs_info; + } + /* are we fully contained in this bio? */ + if (buf_len <= length) + return 1; + + ret = extent_range_uptodate(io_tree, start + length, + start + buf_len - 1); + if (ret == 1) + return ret; + return ret; +} + +/* + * called by the kthread helper functions to finally call the bio end_io + * functions. This is where read checksum verification actually happens + */ +static void end_workqueue_fn(struct btrfs_work *work) +{ + struct bio *bio; + struct end_io_wq *end_io_wq; + struct btrfs_fs_info *fs_info; + int error; + + end_io_wq = container_of(work, struct end_io_wq, work); + bio = end_io_wq->bio; + fs_info = end_io_wq->info; + + /* metadata bios are special because the whole tree block must + * be checksummed at once. This makes sure the entire block is in + * ram and up to date before trying to verify things. For + * blocksize <= pagesize, it is basically a noop + */ + if (end_io_wq->metadata && !bio_ready_for_csum(bio)) { + btrfs_queue_worker(&fs_info->endio_workers, + &end_io_wq->work); + return; + } + error = end_io_wq->error; + bio->bi_private = end_io_wq->private; + bio->bi_end_io = end_io_wq->end_io; + kfree(end_io_wq); +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) + bio_endio(bio, bio->bi_size, error); +#else + bio_endio(bio, error); +#endif +} + +static int cleaner_kthread(void *arg) +{ + struct btrfs_root *root = arg; + + do { + smp_mb(); + if (root->fs_info->closing) + break; + + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + mutex_lock(&root->fs_info->cleaner_mutex); +printk("cleaner awake\n"); + btrfs_clean_old_snapshots(root); +printk("cleaner done\n"); + mutex_unlock(&root->fs_info->cleaner_mutex); + + if (freezing(current)) { + refrigerator(); + } else { + smp_mb(); + if (root->fs_info->closing) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +static int transaction_kthread(void *arg) +{ + struct btrfs_root *root = arg; + struct btrfs_trans_handle *trans; + struct btrfs_transaction *cur; + unsigned long now; + unsigned long delay; + int ret; + + do { + smp_mb(); + if (root->fs_info->closing) + break; + + delay = HZ * 30; + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + mutex_lock(&root->fs_info->trans_mutex); + cur = root->fs_info->running_transaction; + if (!cur) { + mutex_unlock(&root->fs_info->trans_mutex); + goto sleep; + } + now = get_seconds(); + if (now < cur->start_time || now - cur->start_time < 30) { + mutex_unlock(&root->fs_info->trans_mutex); + delay = HZ * 5; + goto sleep; + } + mutex_unlock(&root->fs_info->trans_mutex); + btrfs_defrag_dirty_roots(root->fs_info); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); +sleep: + wake_up_process(root->fs_info->cleaner_kthread); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + + if (freezing(current)) { + refrigerator(); + } else { + if (root->fs_info->closing) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(delay); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + struct btrfs_root *open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices) + struct btrfs_fs_devices *fs_devices, + char *options) { u32 sectorsize; u32 nodesize; u32 leafsize; u32 blocksize; u32 stripesize; + struct buffer_head *bh; struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root), GFP_NOFS); struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root), @@ -744,6 +1210,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, GFP_NOFS); int ret; int err = -EINVAL; + struct btrfs_super_block *disk_super; if (!extent_root || !tree_root || !fs_info) { @@ -759,7 +1226,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->new_trans_lock); init_completion(&fs_info->kobj_unregister); - sb_set_blocksize(sb, 4096); fs_info->tree_root = tree_root; fs_info->extent_root = extent_root; fs_info->chunk_root = chunk_root; @@ -768,6 +1234,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); + atomic_set(&fs_info->nr_async_submits, 0); + atomic_set(&fs_info->throttles, 0); fs_info->sb = sb; fs_info->max_extent = (u64)-1; fs_info->max_inline = 8192 * 1024; @@ -775,7 +1243,17 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode = new_inode(sb); fs_info->btree_inode->i_ino = 1; fs_info->btree_inode->i_nlink = 1; - fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size; + fs_info->thread_pool_size = min(num_online_cpus() + 2, 8); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + + /* + * we set the i_size on the btree inode to the max possible int. + * the real end of the address space is determined by all of + * the devices in the system + */ + fs_info->btree_inode->i_size = OFFSET_MAX; fs_info->btree_inode->i_mapping->a_ops = &btree_aops; fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; @@ -799,11 +1277,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode->i_mapping, GFP_NOFS); fs_info->do_barriers = 1; -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) - INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info); -#else - INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner); -#endif BTRFS_I(fs_info->btree_inode)->root = tree_root; memset(&BTRFS_I(fs_info->btree_inode)->location, 0, sizeof(struct btrfs_key)); @@ -811,7 +1284,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); mutex_init(&fs_info->trans_mutex); - mutex_init(&fs_info->fs_mutex); + mutex_init(&fs_info->drop_mutex); + mutex_init(&fs_info->alloc_mutex); + mutex_init(&fs_info->chunk_mutex); + mutex_init(&fs_info->transaction_kthread_mutex); + mutex_init(&fs_info->cleaner_mutex); #if 0 ret = add_hasher(fs_info, "crc32c"); @@ -824,30 +1301,51 @@ struct btrfs_root *open_ctree(struct super_block *sb, __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); - fs_info->sb_buffer = read_tree_block(tree_root, - BTRFS_SUPER_INFO_OFFSET, - 4096); - if (!fs_info->sb_buffer) + bh = __bread(fs_devices->latest_bdev, + BTRFS_SUPER_INFO_OFFSET / 4096, 4096); + if (!bh) goto fail_iput; - read_extent_buffer(fs_info->sb_buffer, &fs_info->super_copy, 0, - sizeof(fs_info->super_copy)); + memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); + brelse(bh); - read_extent_buffer(fs_info->sb_buffer, fs_info->fsid, - (unsigned long)btrfs_super_fsid(fs_info->sb_buffer), - BTRFS_FSID_SIZE); + memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); disk_super = &fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_sb_buffer; - if (btrfs_super_num_devices(disk_super) != fs_devices->num_devices) { + err = btrfs_parse_options(tree_root, options); + if (err) + goto fail_sb_buffer; + + /* + * we need to start all the end_io workers up front because the + * queue work function gets called at interrupt time, and so it + * cannot dynamically grow. + */ + btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->submit_workers, 1); + btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); + + err = -EINVAL; + if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) { printk("Btrfs: wanted %llu devices, but found %llu\n", (unsigned long long)btrfs_super_num_devices(disk_super), - (unsigned long long)fs_devices->num_devices); - goto fail_sb_buffer; + (unsigned long long)fs_devices->open_devices); + if (btrfs_test_opt(tree_root, DEGRADED)) + printk("continuing in degraded mode\n"); + else { + goto fail_sb_buffer; + } } + + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + nodesize = btrfs_super_nodesize(disk_super); leafsize = btrfs_super_leafsize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); @@ -856,10 +1354,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, tree_root->leafsize = leafsize; tree_root->sectorsize = sectorsize; tree_root->stripesize = stripesize; - sb_set_blocksize(sb, sectorsize); - i_size_write(fs_info->btree_inode, - btrfs_super_total_bytes(disk_super)); + sb->s_blocksize = sectorsize; + sb->s_blocksize_bits = blksize_bits(sectorsize); if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, sizeof(disk_super->magic))) { @@ -867,10 +1364,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_sb_buffer; } - mutex_lock(&fs_info->fs_mutex); - + mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(tree_root); - BUG_ON(ret); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk("btrfs: failed to read the system array on %s\n", + sb->s_id); + goto fail_sys_array; + } blocksize = btrfs_level_size(tree_root, btrfs_super_chunk_root_level(disk_super)); @@ -880,19 +1381,27 @@ struct btrfs_root *open_ctree(struct super_block *sb, chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), - blocksize); + blocksize, 0); BUG_ON(!chunk_root->node); + read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), + BTRFS_UUID_SIZE); + + mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_chunk_tree(chunk_root); + mutex_unlock(&fs_info->chunk_mutex); BUG_ON(ret); + btrfs_close_extra_devices(fs_devices); + blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), - blocksize); + blocksize, 0); if (!tree_root->node) goto fail_sb_buffer; @@ -916,44 +1425,192 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->data_alloc_profile = (u64)-1; fs_info->metadata_alloc_profile = (u64)-1; fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, + "btrfs-cleaner"); + if (!fs_info->cleaner_kthread) + goto fail_extent_root; + + fs_info->transaction_kthread = kthread_run(transaction_kthread, + tree_root, + "btrfs-transaction"); + if (!fs_info->transaction_kthread) + goto fail_trans_kthread; + - mutex_unlock(&fs_info->fs_mutex); return tree_root; +fail_trans_kthread: + kthread_stop(fs_info->cleaner_kthread); fail_extent_root: free_extent_buffer(extent_root->node); fail_tree_root: - mutex_unlock(&fs_info->fs_mutex); free_extent_buffer(tree_root->node); +fail_sys_array: fail_sb_buffer: - free_extent_buffer(fs_info->sb_buffer); extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->submit_workers); fail_iput: iput(fs_info->btree_inode); fail: - close_all_devices(fs_info); + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + kfree(extent_root); kfree(tree_root); +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) bdi_destroy(&fs_info->bdi); +#endif kfree(fs_info); return ERR_PTR(err); } +static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + /* note, we dont' set_buffer_write_io_error because we have + * our own ways of dealing with the IO errors + */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +int write_all_supers(struct btrfs_root *root) +{ + struct list_head *cur; + struct list_head *head = &root->fs_info->fs_devices->devices; + struct btrfs_device *dev; + struct btrfs_super_block *sb; + struct btrfs_dev_item *dev_item; + struct buffer_head *bh; + int ret; + int do_barriers; + int max_errors; + int total_errors = 0; + u32 crc; + u64 flags; + + max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + do_barriers = !btrfs_test_opt(root, NOBARRIER); + + sb = &root->fs_info->super_for_commit; + dev_item = &sb->dev_item; + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (!dev->bdev) { + total_errors++; + continue; + } + if (!dev->in_fs_metadata) + continue; + + btrfs_set_stack_device_type(dev_item, dev->type); + btrfs_set_stack_device_id(dev_item, dev->devid); + btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); + btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); + btrfs_set_stack_device_io_align(dev_item, dev->io_align); + btrfs_set_stack_device_io_width(dev_item, dev->io_width); + btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); + memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); + flags = btrfs_super_flags(sb); + btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + + + crc = ~(u32)0; + crc = btrfs_csum_data(root, (char *)sb + BTRFS_CSUM_SIZE, crc, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, sb->csum); + + bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / 4096, + BTRFS_SUPER_INFO_SIZE); + + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + dev->pending_io = bh; + + get_bh(bh); + set_buffer_uptodate(bh); + lock_buffer(bh); + bh->b_end_io = btrfs_end_buffer_write_sync; + + if (do_barriers && dev->barriers) { + ret = submit_bh(WRITE_BARRIER, bh); + if (ret == -EOPNOTSUPP) { + printk("btrfs: disabling barriers on dev %s\n", + dev->name); + set_buffer_uptodate(bh); + dev->barriers = 0; + get_bh(bh); + lock_buffer(bh); + ret = submit_bh(WRITE, bh); + } + } else { + ret = submit_bh(WRITE, bh); + } + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + printk("btrfs: %d errors while writing supers\n", total_errors); + BUG(); + } + total_errors = 0; + + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (!dev->bdev) + continue; + if (!dev->in_fs_metadata) + continue; + + BUG_ON(!dev->pending_io); + bh = dev->pending_io; + wait_on_buffer(bh); + if (!buffer_uptodate(dev->pending_io)) { + if (do_barriers && dev->barriers) { + printk("btrfs: disabling barriers on dev %s\n", + dev->name); + set_buffer_uptodate(bh); + get_bh(bh); + lock_buffer(bh); + dev->barriers = 0; + ret = submit_bh(WRITE, bh); + BUG_ON(ret); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + total_errors++; + } else { + total_errors++; + } + + } + dev->pending_io = NULL; + brelse(bh); + } + if (total_errors > max_errors) { + printk("btrfs: %d errors while writing supers\n", total_errors); + BUG(); + } + return 0; +} + int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - struct extent_buffer *super = root->fs_info->sb_buffer; - struct inode *btree_inode = root->fs_info->btree_inode; - struct super_block *sb = root->fs_info->sb; - - if (!btrfs_test_opt(root, NOBARRIER)) - blkdev_issue_flush(sb->s_bdev, NULL); - set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, super); - ret = sync_page_range_nolock(btree_inode, btree_inode->i_mapping, - super->start, super->len); - if (!btrfs_test_opt(root, NOBARRIER)) - blkdev_issue_flush(sb->s_bdev, NULL); + + ret = write_all_supers(root); return ret; } @@ -1000,9 +1657,13 @@ int close_ctree(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; fs_info->closing = 1; - btrfs_transaction_flush_work(root); - mutex_lock(&fs_info->fs_mutex); + smp_mb(); + + kthread_stop(root->fs_info->transaction_kthread); + kthread_stop(root->fs_info->cleaner_kthread); + btrfs_defrag_dirty_roots(root->fs_info); + btrfs_clean_old_snapshots(root); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); /* run commit again to drop the original snapshot */ @@ -1010,8 +1671,8 @@ int close_ctree(struct btrfs_root *root) btrfs_commit_transaction(trans, root); ret = btrfs_write_and_wait_transaction(NULL, root); BUG_ON(ret); + write_ctree_super(NULL, root); - mutex_unlock(&fs_info->fs_mutex); if (fs_info->delalloc_bytes) { printk("btrfs: at unmount delalloc count %Lu\n", @@ -1029,8 +1690,6 @@ int close_ctree(struct btrfs_root *root) if (root->fs_info->dev_root->node); free_extent_buffer(root->fs_info->dev_root->node); - free_extent_buffer(fs_info->sb_buffer); - btrfs_free_block_groups(root->fs_info); del_fs_roots(fs_info); @@ -1045,6 +1704,10 @@ int close_ctree(struct btrfs_root *root) truncate_inode_pages(fs_info->btree_inode->i_mapping, 0); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->submit_workers); + iput(fs_info->btree_inode); #if 0 while(!list_empty(&fs_info->hashers)) { @@ -1056,9 +1719,12 @@ int close_ctree(struct btrfs_root *root) kfree(hasher); } #endif - close_all_devices(fs_info); + btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) bdi_destroy(&fs_info->bdi); +#endif kfree(fs_info->extent_root); kfree(fs_info->tree_root); @@ -1067,10 +1733,18 @@ int close_ctree(struct btrfs_root *root) return 0; } -int btrfs_buffer_uptodate(struct extent_buffer *buf) +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) { + int ret; struct inode *btree_inode = buf->first_page->mapping->host; - return extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); + + ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); + if (!ret) + return ret; + + ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, + parent_transid); + return !ret; } int btrfs_set_buffer_uptodate(struct extent_buffer *buf) @@ -1086,6 +1760,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) u64 transid = btrfs_header_generation(buf); struct inode *btree_inode = root->fs_info->btree_inode; + WARN_ON(!btrfs_tree_locked(buf)); if (transid != root->fs_info->generation) { printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n", (unsigned long long)buf->start, @@ -1095,24 +1770,28 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); } -void btrfs_throttle(struct btrfs_root *root) -{ - struct backing_dev_info *bdi; - - bdi = root->fs_info->sb->s_bdev->bd_inode->i_mapping->backing_dev_info; - if (root->fs_info->throttles && bdi_write_congested(bdi)) { -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18) - congestion_wait(WRITE, HZ/20); -#else - blk_congestion_wait(WRITE, HZ/20); -#endif - } -} - void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) { - balance_dirty_pages_ratelimited_nr( + /* + * looks as though older kernels can get into trouble with + * this code, they end up stuck in balance_dirty_pages forever + */ + struct extent_io_tree *tree; + u64 num_dirty; + u64 start = 0; + unsigned long thresh = 16 * 1024 * 1024; + tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + + if (current_is_pdflush()) + return; + + num_dirty = count_range_bits(tree, &start, (u64)-1, + thresh, EXTENT_DIRTY); + if (num_dirty > thresh) { + balance_dirty_pages_ratelimited_nr( root->fs_info->btree_inode->i_mapping, 1); + } + return; } void btrfs_set_buffer_defrag(struct extent_buffer *buf) @@ -1167,16 +1846,20 @@ int btrfs_clear_buffer_defrag(struct extent_buffer *buf) EXTENT_DEFRAG, GFP_NOFS); } -int btrfs_read_buffer(struct extent_buffer *buf) +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; - struct inode *btree_inode = root->fs_info->btree_inode; - return read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, 0, 1, btree_get_extent); + int ret; + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + if (ret == 0) { + buf->flags |= EXTENT_UPTODATE; + } + return ret; } static struct extent_io_ops btree_extent_io_ops = { .writepage_io_hook = btree_writepage_io_hook, + .readpage_end_io_hook = btree_readpage_end_io_hook, .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook,