* Boston, MA 021110-1307, USA.
*/
+#include <linux/version.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> // for block_sync_page
#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+# include <linux/freezer.h>
+#else
+# include <linux/sched.h>
+#endif
#include "crc32c.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "async-thread.h"
#include "locking.h"
+#include "ref-cache.h"
#if 0
static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
btrfs_csum_final(crc, result);
if (verify) {
- int from_this_trans = 0;
-
- if (root->fs_info->running_transaction &&
- btrfs_header_generation(buf) ==
- root->fs_info->running_transaction->transid)
- from_this_trans = 1;
-
/* FIXME, this is not good */
if (memcmp_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE)) {
u32 val;
read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE);
printk("btrfs: %s checksum verify failed on %llu "
- "wanted %X found %X from_this_trans %d "
- "level %d\n",
+ "wanted %X found %X level %d\n",
root->fs_info->sb->s_id,
- buf->start, val, found, from_this_trans,
- btrfs_header_level(buf));
+ buf->start, val, found, btrfs_header_level(buf));
return 1;
}
} else {
(unsigned long long)parent_transid,
(unsigned long long)btrfs_header_generation(eb));
ret = 1;
-out:
clear_extent_buffer_uptodate(io_tree, eb);
+out:
unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
GFP_NOFS);
return ret;
ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
btrfs_header_generation(eb));
BUG_ON(ret);
- btrfs_clear_buffer_defrag(eb);
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
}
eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
- btrfs_clear_buffer_defrag(eb);
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
ret = -EIO;
end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
end = eb->start + end - 1;
- release_extent_buffer_tail_pages(eb);
err:
free_extent_buffer(eb);
out:
end_io_wq->error = err;
end_io_wq->work.func = end_workqueue_fn;
end_io_wq->work.flags = 0;
- btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
+ if (bio->bi_rw & (1 << BIO_RW))
+ btrfs_queue_worker(&fs_info->endio_write_workers,
+ &end_io_wq->work);
+ else
+ btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
return 0;
return 0;
}
+/*
+ * Report whether the async submission counters are over their limit.
+ *
+ * The limit is 256 per open device and is raised by half when @iodone is
+ * non-zero.  Returns 1 when either nr_async_submits or nr_async_bios is
+ * above the limit, 0 otherwise.
+ */
+int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
+{
+	int limit = 256 * info->fs_devices->open_devices;
+
+	if (iodone)
+		limit = (limit * 3) / 2;
+
+	/* nr_async_bios is only consulted when nr_async_submits is in bounds */
+	if (atomic_read(&info->nr_async_submits) > limit ||
+	    atomic_read(&info->nr_async_bios) > limit)
+		return 1;
+
+	return 0;
+}
+
static void run_one_async_submit(struct btrfs_work *work)
{
struct btrfs_fs_info *fs_info;
async = container_of(work, struct async_submit_bio, work);
fs_info = BTRFS_I(async->inode)->root->fs_info;
atomic_dec(&fs_info->nr_async_submits);
+
+ if ((async->bio->bi_rw & (1 << BIO_RW)) &&
+ !btrfs_congested_async(fs_info, 1)) {
+ clear_bdi_congested(&fs_info->bdi, WRITE);
+ }
async->submit_bio_hook(async->inode, async->rw, async->bio,
async->mirror_num);
kfree(async);
struct extent_map_tree *map;
int ret;
- if (page_count(page) > 3) {
- /* once for page->private, once for the caller, once
- * once for the page cache
- */
- return 0;
- }
tree = &BTRFS_I(page->mapping->host)->io_tree;
map = &BTRFS_I(page->mapping->host)->extent_tree;
+
ret = try_release_extent_state(map, tree, page, gfp_flags);
+ if (!ret) {
+ return 0;
+ }
+
+ ret = try_release_extent_buffer(tree, page);
if (ret == 1) {
- invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
ClearPagePrivate(page);
set_page_private(page, 0);
page_cache_release(page);
}
+
return ret;
}
extent_invalidatepage(tree, page, offset);
btree_releasepage(page, GFP_NOFS);
if (PagePrivate(page)) {
- invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
+ printk("warning page private not zero on page %Lu\n",
+ page_offset(page));
ClearPagePrivate(page);
set_page_private(page, 0);
page_cache_release(page);
root->node = NULL;
root->inode = NULL;
root->commit_root = NULL;
+ root->ref_tree = NULL;
root->sectorsize = sectorsize;
root->nodesize = nodesize;
root->leafsize = leafsize;
root->in_sysfs = 0;
INIT_LIST_HEAD(&root->dirty_list);
+ INIT_LIST_HEAD(&root->orphan_list);
+ INIT_LIST_HEAD(&root->dead_list);
spin_lock_init(&root->node_lock);
+ spin_lock_init(&root->list_lock);
mutex_init(&root->objectid_mutex);
+
+ btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
+ root->ref_tree = &root->ref_tree_struct;
+
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
memset(&root->root_kobj, 0, sizeof(root->root_kobj));
+ root->defrag_trans_start = fs_info->generation;
init_completion(&root->kobj_unregister);
root->defrag_running = 0;
root->defrag_level = 0;
{
struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
int ret = 0;
- int limit = 256 * info->fs_devices->open_devices;
struct list_head *cur;
struct btrfs_device *device;
struct backing_dev_info *bdi;
if ((bdi_bits & (1 << BDI_write_congested)) &&
- atomic_read(&info->nr_async_submits) > limit) {
+ btrfs_congested_async(info, 0))
return 1;
- }
list_for_each(cur, &info->fs_devices->devices) {
device = list_entry(cur, struct btrfs_device, dev_list);
spin_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
spin_unlock(&em_tree->lock);
- if (!em)
+ if (!em) {
+ __unplug_io_fn(bdi, page);
return;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ free_extent_map(em);
+ __unplug_io_fn(bdi, page);
+ return;
+ }
offset = offset - em->start;
btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
em->block_start + offset, page);
#endif
}
+/*
+ * Body of the "cleaner" kernel thread.
+ *
+ * @arg is the struct btrfs_root the thread works on.  Each pass calls
+ * vfs_check_frozen() on the super block, runs btrfs_clean_old_snapshots()
+ * under fs_info->cleaner_mutex and then sleeps until it is woken up.  The
+ * loop ends once fs_info->closing is set or kthread_should_stop() reports
+ * true.  Always returns 0.
+ */
+static int cleaner_kthread(void *arg)
+{
+	struct btrfs_root *root = arg;
+
+	do {
+		smp_mb();
+		if (root->fs_info->closing)
+			break;
+
+		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+		mutex_lock(&root->fs_info->cleaner_mutex);
+		btrfs_clean_old_snapshots(root);
+		mutex_unlock(&root->fs_info->cleaner_mutex);
+
+		if (freezing(current)) {
+			refrigerator();
+		} else {
+			/*
+			 * Change the task state *before* testing the exit
+			 * conditions.  If the tests came first, a wake-up
+			 * issued between the test and the state change would
+			 * be lost and schedule() could sleep forever.
+			 * set_current_state() also supplies the memory
+			 * barrier needed before reading ->closing.
+			 */
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (root->fs_info->closing) {
+				__set_current_state(TASK_RUNNING);
+				break;
+			}
+			if (!kthread_should_stop())
+				schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+	} while (!kthread_should_stop());
+	return 0;
+}
+
+/*
+ * Body of the "transaction" kernel thread.
+ *
+ * @arg is the struct btrfs_root the thread works on.  Each pass looks at
+ * fs_info->running_transaction; when one exists and started at least 30
+ * seconds ago it is committed.  The thread then wakes the cleaner thread and
+ * sleeps for up to 30 seconds (5 seconds when the running transaction is
+ * still too young).  The loop ends once fs_info->closing is set or
+ * kthread_should_stop() reports true.  Always returns 0.
+ */
+static int transaction_kthread(void *arg)
+{
+	struct btrfs_root *root = arg;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_transaction *cur;
+	unsigned long now;
+	unsigned long delay;
+	int ret;
+
+	do {
+		smp_mb();
+		if (root->fs_info->closing)
+			break;
+
+		delay = HZ * 30;
+		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+		mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+		if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
+			printk("btrfs: total reference cache size %Lu\n",
+				root->fs_info->total_ref_cache_size);
+		}
+
+		mutex_lock(&root->fs_info->trans_mutex);
+		cur = root->fs_info->running_transaction;
+		if (!cur) {
+			mutex_unlock(&root->fs_info->trans_mutex);
+			goto sleep;
+		}
+
+		now = get_seconds();
+		/* a start time in the future is treated like a young transaction */
+		if (now < cur->start_time || now - cur->start_time < 30) {
+			mutex_unlock(&root->fs_info->trans_mutex);
+			delay = HZ * 5;
+			goto sleep;
+		}
+		mutex_unlock(&root->fs_info->trans_mutex);
+		trans = btrfs_start_transaction(root, 1);
+		ret = btrfs_commit_transaction(trans, root);
+sleep:
+		wake_up_process(root->fs_info->cleaner_kthread);
+		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+
+		if (freezing(current)) {
+			refrigerator();
+		} else {
+			/*
+			 * Change the task state *before* testing the exit
+			 * conditions.  If the tests came first, a wake-up
+			 * issued between the test and the state change would
+			 * be lost and the thread would sleep for the whole
+			 * timeout.  set_current_state() also supplies the
+			 * memory barrier needed before reading ->closing.
+			 */
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (root->fs_info->closing) {
+				__set_current_state(TASK_RUNNING);
+				break;
+			}
+			if (!kthread_should_stop())
+				schedule_timeout(delay);
+			__set_current_state(TASK_RUNNING);
+		}
+	} while (!kthread_should_stop());
+	return 0;
+}
+
struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options)
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->hashers);
+ INIT_LIST_HEAD(&fs_info->delalloc_inodes);
spin_lock_init(&fs_info->hash_lock);
spin_lock_init(&fs_info->delalloc_lock);
spin_lock_init(&fs_info->new_trans_lock);
+ spin_lock_init(&fs_info->ref_cache_lock);
init_completion(&fs_info->kobj_unregister);
fs_info->tree_root = tree_root;
INIT_LIST_HEAD(&fs_info->space_info);
btrfs_mapping_init(&fs_info->mapping_tree);
atomic_set(&fs_info->nr_async_submits, 0);
+ atomic_set(&fs_info->nr_async_bios, 0);
atomic_set(&fs_info->throttles, 0);
+ atomic_set(&fs_info->throttle_gen, 0);
fs_info->sb = sb;
fs_info->max_extent = (u64)-1;
fs_info->max_inline = 8192 * 1024;
fs_info->btree_inode->i_nlink = 1;
fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
+ INIT_LIST_HEAD(&fs_info->ordered_extents);
+ spin_lock_init(&fs_info->ordered_extent_lock);
+
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
fs_info->btree_inode->i_mapping, GFP_NOFS);
fs_info->do_barriers = 1;
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
- INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
-#else
- INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
-#endif
BTRFS_I(fs_info->btree_inode)->root = tree_root;
memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
sizeof(struct btrfs_key));
mutex_init(&fs_info->drop_mutex);
mutex_init(&fs_info->alloc_mutex);
mutex_init(&fs_info->chunk_mutex);
+ mutex_init(&fs_info->transaction_kthread_mutex);
+ mutex_init(&fs_info->cleaner_mutex);
+ mutex_init(&fs_info->volume_mutex);
+ init_waitqueue_head(&fs_info->transaction_throttle);
+ init_waitqueue_head(&fs_info->transaction_wait);
#if 0
ret = add_hasher(fs_info, "crc32c");
* cannot dynamically grow.
*/
btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
- btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
+ btrfs_init_workers(&fs_info->submit_workers,
+ min_t(u64, fs_devices->num_devices,
+ fs_info->thread_pool_size));
+
+ /* a higher idle thresh on the submit workers makes it much more
+ * likely that bios will be sent down in a sane order to the
+ * devices
+ */
+ fs_info->submit_workers.idle_thresh = 64;
+
+ btrfs_init_workers(&fs_info->fixup_workers, 1);
btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+ btrfs_init_workers(&fs_info->endio_write_workers,
+ fs_info->thread_pool_size);
+
+ /*
+ * endios are largely parallel and should have a very
+ * low idle thresh
+ */
+ fs_info->endio_workers.idle_thresh = 4;
+ fs_info->endio_write_workers.idle_thresh = 4;
+
btrfs_start_workers(&fs_info->workers, 1);
btrfs_start_workers(&fs_info->submit_workers, 1);
+ btrfs_start_workers(&fs_info->fixup_workers, 1);
btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
-
+ btrfs_start_workers(&fs_info->endio_write_workers,
+ fs_info->thread_pool_size);
err = -EINVAL;
if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
fs_info->data_alloc_profile = (u64)-1;
fs_info->metadata_alloc_profile = (u64)-1;
fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+ fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
+ "btrfs-cleaner");
+ if (!fs_info->cleaner_kthread)
+ goto fail_extent_root;
+
+ fs_info->transaction_kthread = kthread_run(transaction_kthread,
+ tree_root,
+ "btrfs-transaction");
+ if (!fs_info->transaction_kthread)
+ goto fail_cleaner;
+
return tree_root;
+fail_cleaner:
+ kthread_stop(fs_info->cleaner_kthread);
fail_extent_root:
free_extent_buffer(extent_root->node);
fail_tree_root:
free_extent_buffer(tree_root->node);
fail_sys_array:
fail_sb_buffer:
- extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
+ btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
+ btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
fail_iput:
iput(fs_info->btree_inode);
fs_info->closing = 1;
smp_mb();
- btrfs_transaction_flush_work(root);
- btrfs_defrag_dirty_roots(root->fs_info);
+ kthread_stop(root->fs_info->transaction_kthread);
+ kthread_stop(root->fs_info->cleaner_kthread);
+
+ btrfs_clean_old_snapshots(root);
trans = btrfs_start_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
/* run commit again to drop the original snapshot */
write_ctree_super(NULL, root);
- btrfs_transaction_flush_work(root);
-
if (fs_info->delalloc_bytes) {
printk("btrfs: at unmount delalloc count %Lu\n",
fs_info->delalloc_bytes);
}
+ if (fs_info->total_ref_cache_size) {
+ printk("btrfs: at umount reference cache size %Lu\n",
+ fs_info->total_ref_cache_size);
+ }
+
if (fs_info->extent_root->node)
free_extent_buffer(fs_info->extent_root->node);
free_extent_buffer(root->fs_info->dev_root->node);
btrfs_free_block_groups(root->fs_info);
+ fs_info->closing = 2;
del_fs_roots(fs_info);
filemap_write_and_wait(fs_info->btree_inode->i_mapping);
- extent_io_tree_empty_lru(&fs_info->free_space_cache);
- extent_io_tree_empty_lru(&fs_info->block_group_cache);
- extent_io_tree_empty_lru(&fs_info->pinned_extents);
- extent_io_tree_empty_lru(&fs_info->pending_del);
- extent_io_tree_empty_lru(&fs_info->extent_ins);
- extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
-
truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
+ btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
+ btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
iput(fs_info->btree_inode);
struct extent_io_tree *tree;
u64 num_dirty;
u64 start = 0;
- unsigned long thresh = 16 * 1024 * 1024;
+ unsigned long thresh = 2 * 1024 * 1024;
tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
if (current_is_pdflush())
return;
}
-void btrfs_set_buffer_defrag(struct extent_buffer *buf)
-{
- struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
- struct inode *btree_inode = root->fs_info->btree_inode;
- set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start,
- buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS);
-}
-
-void btrfs_set_buffer_defrag_done(struct extent_buffer *buf)
-{
- struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
- struct inode *btree_inode = root->fs_info->btree_inode;
- set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start,
- buf->start + buf->len - 1, EXTENT_DEFRAG_DONE,
- GFP_NOFS);
-}
-
-int btrfs_buffer_defrag(struct extent_buffer *buf)
-{
- struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
- struct inode *btree_inode = root->fs_info->btree_inode;
- return test_range_bit(&BTRFS_I(btree_inode)->io_tree,
- buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, 0);
-}
-
-int btrfs_buffer_defrag_done(struct extent_buffer *buf)
-{
- struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
- struct inode *btree_inode = root->fs_info->btree_inode;
- return test_range_bit(&BTRFS_I(btree_inode)->io_tree,
- buf->start, buf->start + buf->len - 1,
- EXTENT_DEFRAG_DONE, 0);
-}
-
-int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf)
-{
- struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
- struct inode *btree_inode = root->fs_info->btree_inode;
- return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree,
- buf->start, buf->start + buf->len - 1,
- EXTENT_DEFRAG_DONE, GFP_NOFS);
-}
-
-int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
-{
- struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
- struct inode *btree_inode = root->fs_info->btree_inode;
- return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree,
- buf->start, buf->start + buf->len - 1,
- EXTENT_DEFRAG, GFP_NOFS);
-}
-
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
{
struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;