Merge branch 'freespace-4.5' into for-linus-4.5
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index acf3ed11cfb60e95b685aeb009e4d72fbdba3c3b..0617cb73669dda8206ebc070f2cfb7d8a49ee6b6 100644
@@ -33,6 +33,7 @@
 #include "raid56.h"
 #include "locking.h"
 #include "free-space-cache.h"
+#include "free-space-tree.h"
 #include "math.h"
 #include "sysfs.h"
 #include "qgroup.h"
@@ -124,7 +125,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
        return (cache->flags & bits) == bits;
 }
 
-static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 {
        atomic_inc(&cache->count);
 }
@@ -357,8 +358,8 @@ static void fragment_free_space(struct btrfs_root *root,
  * we need to check the pinned_extents for any extents that can't be used yet
  * since their free space will be released as soon as the transaction commits.
  */
-static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
-                             struct btrfs_fs_info *info, u64 start, u64 end)
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+                      struct btrfs_fs_info *info, u64 start, u64 end)
 {
        u64 extent_start, extent_end, size, total_added = 0;
        int ret;
@@ -395,11 +396,10 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
        return total_added;
 }
 
-static noinline void caching_thread(struct btrfs_work *work)
+static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 {
        struct btrfs_block_group_cache *block_group;
        struct btrfs_fs_info *fs_info;
-       struct btrfs_caching_control *caching_ctl;
        struct btrfs_root *extent_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
@@ -407,17 +407,16 @@ static noinline void caching_thread(struct btrfs_work *work)
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
-       int ret = -ENOMEM;
+       int ret;
        bool wakeup = true;
 
-       caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
        fs_info = block_group->fs_info;
        extent_root = fs_info->extent_root;
 
        path = btrfs_alloc_path();
        if (!path)
-               goto out;
+               return -ENOMEM;
 
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 
@@ -443,15 +442,11 @@ static noinline void caching_thread(struct btrfs_work *work)
        key.objectid = last;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;
-again:
-       mutex_lock(&caching_ctl->mutex);
-       /* need to make sure the commit_root doesn't disappear */
-       down_read(&fs_info->commit_root_sem);
 
 next:
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
-               goto err;
+               goto out;
 
        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);
@@ -477,12 +472,14 @@ next:
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
                                cond_resched();
-                               goto again;
+                               mutex_lock(&caching_ctl->mutex);
+                               down_read(&fs_info->commit_root_sem);
+                               goto next;
                        }
 
                        ret = btrfs_next_leaf(extent_root, path);
                        if (ret < 0)
-                               goto err;
+                               goto out;
                        if (ret)
                                break;
                        leaf = path->nodes[0];
@@ -521,7 +518,7 @@ next:
                        else
                                last = key.objectid + key.offset;
 
-                       if (total_found > (1024 * 1024 * 2)) {
+                       if (total_found > CACHING_CTL_WAKE_UP) {
                                total_found = 0;
                                if (wakeup)
                                        wake_up(&caching_ctl->wait);
@@ -534,9 +531,35 @@ next:
        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
+       caching_ctl->progress = (u64)-1;
+
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+static noinline void caching_thread(struct btrfs_work *work)
+{
+       struct btrfs_block_group_cache *block_group;
+       struct btrfs_fs_info *fs_info;
+       struct btrfs_caching_control *caching_ctl;
+       int ret;
+
+       caching_ctl = container_of(work, struct btrfs_caching_control, work);
+       block_group = caching_ctl->block_group;
+       fs_info = block_group->fs_info;
+
+       mutex_lock(&caching_ctl->mutex);
+       down_read(&fs_info->commit_root_sem);
+
+       if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+               ret = load_free_space_tree(caching_ctl);
+       else
+               ret = load_extent_tree_free(caching_ctl);
+
        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
-       block_group->cached = BTRFS_CACHE_FINISHED;
+       block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);
 
 #ifdef CONFIG_BTRFS_DEBUG
@@ -555,20 +578,11 @@ next:
 #endif
 
        caching_ctl->progress = (u64)-1;
-err:
-       btrfs_free_path(path);
-       up_read(&fs_info->commit_root_sem);
-
-       free_excluded_extents(extent_root, block_group);
 
+       up_read(&fs_info->commit_root_sem);
+       free_excluded_extents(fs_info->extent_root, block_group);
        mutex_unlock(&caching_ctl->mutex);
-out:
-       if (ret) {
-               spin_lock(&block_group->lock);
-               block_group->caching_ctl = NULL;
-               block_group->cached = BTRFS_CACHE_ERROR;
-               spin_unlock(&block_group->lock);
-       }
+
        wake_up(&caching_ctl->wait);
 
        put_caching_control(caching_ctl);
@@ -680,8 +694,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                }
        } else {
                /*
-                * We are not going to do the fast caching, set cached to the
-                * appropriate value and wakeup any waiters.
+                * We're either using the free space tree or no caching at all.
+                * Set cached to the appropriate value and wakeup any waiters.
                 */
                spin_lock(&cache->lock);
                if (load_cache_only) {
@@ -3684,11 +3698,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                return -ENOMEM;
 
        /*
-        * We don't need the lock here since we are protected by the transaction
-        * commit.  We want to do the cache_save_setup first and then run the
+        * Even though we are in the critical section of the transaction commit,
+        * we can still have concurrent tasks adding elements to this
+        * transaction's list of dirty block groups. These tasks correspond to
+        * endio free space workers started when writeback finishes for a
+        * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+        * allocate new block groups as a result of COWing nodes of the root
+        * tree when updating the free space inode. The writeback for the space
+        * caches is triggered by an earlier call to
+        * btrfs_start_dirty_block_groups() and iterations of the following
+        * loop.
+        * Also we want to do the cache_save_setup first and then run the
         * delayed refs to make sure we have the best chance at doing this all
         * in one shot.
         */
+       spin_lock(&cur_trans->dirty_bgs_lock);
        while (!list_empty(&cur_trans->dirty_bgs)) {
                cache = list_first_entry(&cur_trans->dirty_bgs,
                                         struct btrfs_block_group_cache,
@@ -3700,11 +3724,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                 * finish and then do it all again
                 */
                if (!list_empty(&cache->io_list)) {
+                       spin_unlock(&cur_trans->dirty_bgs_lock);
                        list_del_init(&cache->io_list);
                        btrfs_wait_cache_io(root, trans, cache,
                                            &cache->io_ctl, path,
                                            cache->key.objectid);
                        btrfs_put_block_group(cache);
+                       spin_lock(&cur_trans->dirty_bgs_lock);
                }
 
                /*
@@ -3712,6 +3738,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                 * on any pending IO
                 */
                list_del_init(&cache->dirty_list);
+               spin_unlock(&cur_trans->dirty_bgs_lock);
                should_put = 1;
 
                cache_save_setup(cache, trans, path);
@@ -3743,7 +3770,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                /* if its not on the io list, we need to put the block group */
                if (should_put)
                        btrfs_put_block_group(cache);
+               spin_lock(&cur_trans->dirty_bgs_lock);
        }
+       spin_unlock(&cur_trans->dirty_bgs_lock);
 
        while (!list_empty(io)) {
                cache = list_first_entry(io, struct btrfs_block_group_cache,
@@ -5915,19 +5944,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                        set_extent_dirty(info->pinned_extents,
                                         bytenr, bytenr + num_bytes - 1,
                                         GFP_NOFS | __GFP_NOFAIL);
-                       /*
-                        * No longer have used bytes in this block group, queue
-                        * it for deletion.
-                        */
-                       if (old_val == 0) {
-                               spin_lock(&info->unused_bgs_lock);
-                               if (list_empty(&cache->bg_list)) {
-                                       btrfs_get_block_group(cache);
-                                       list_add_tail(&cache->bg_list,
-                                                     &info->unused_bgs);
-                               }
-                               spin_unlock(&info->unused_bgs_lock);
-                       }
                }
 
                spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -5939,6 +5955,22 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                }
                spin_unlock(&trans->transaction->dirty_bgs_lock);
 
+               /*
+                * No longer have used bytes in this block group, queue it for
+                * deletion. We do this after adding the block group to the
+                * dirty list to avoid races between cleaner kthread and space
+                * cache writeout.
+                */
+               if (!alloc && old_val == 0) {
+                       spin_lock(&info->unused_bgs_lock);
+                       if (list_empty(&cache->bg_list)) {
+                               btrfs_get_block_group(cache);
+                               list_add_tail(&cache->bg_list,
+                                             &info->unused_bgs);
+                       }
+                       spin_unlock(&info->unused_bgs_lock);
+               }
+
                btrfs_put_block_group(cache);
                total -= num_bytes;
                bytenr += num_bytes;
@@ -6658,6 +6690,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        }
                }
 
+               ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
+                                            num_bytes);
+               if (ret) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
+
                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
                if (ret) {
                        btrfs_abort_transaction(trans, extent_root, ret);
@@ -7669,6 +7708,11 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
 
+       ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+                                         ins->offset);
+       if (ret)
+               return ret;
+
        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7749,6 +7793,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
 
+       ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+                                         num_bytes);
+       if (ret)
+               return ret;
+
        ret = update_block_group(trans, root, ins->objectid, root->nodesize,
                                 1);
        if (ret) { /* -ENOENT, logic error */
@@ -7831,7 +7880,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
        btrfs_set_lock_blocking(buf);
-       btrfs_set_buffer_uptodate(buf);
+       set_extent_buffer_uptodate(buf);
 
        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
                buf->log_index = root->log_transid % 2;
@@ -8105,21 +8154,47 @@ reada:
 }
 
 /*
- * TODO: Modify related function to add related node/leaf to dirty_extent_root,
- * for later qgroup accounting.
- *
- * Current, this function does nothing.
+ * These may not be seen by the usual inc/dec ref code so we have to
+ * add them here.
  */
+static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root, u64 bytenr,
+                                    u64 num_bytes)
+{
+       struct btrfs_qgroup_extent_record *qrecord;
+       struct btrfs_delayed_ref_root *delayed_refs;
+
+       qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
+       if (!qrecord)
+               return -ENOMEM;
+
+       qrecord->bytenr = bytenr;
+       qrecord->num_bytes = num_bytes;
+       qrecord->old_roots = NULL;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+       if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+               kfree(qrecord);
+       spin_unlock(&delayed_refs->lock);
+
+       return 0;
+}
+
 static int account_leaf_items(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct extent_buffer *eb)
 {
        int nr = btrfs_header_nritems(eb);
-       int i, extent_type;
+       int i, extent_type, ret;
        struct btrfs_key key;
        struct btrfs_file_extent_item *fi;
        u64 bytenr, num_bytes;
 
+       /* We can be called directly from walk_up_proc() */
+       if (!root->fs_info->quota_enabled)
+               return 0;
+
        for (i = 0; i < nr; i++) {
                btrfs_item_key_to_cpu(eb, &key, i);
 
@@ -8138,6 +8213,10 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
                        continue;
 
                num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+               ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
+               if (ret)
+                       return ret;
        }
        return 0;
 }
@@ -8206,8 +8285,6 @@ static int adjust_slots_upwards(struct btrfs_root *root,
 
 /*
  * root_eb is the subtree root and is locked before this function is called.
- * TODO: Modify this function to mark all (including complete shared node)
- * to dirty_extent_root to allow it get accounted in qgroup.
  */
 static int account_shared_subtree(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
@@ -8285,6 +8362,11 @@ walk_down:
                        btrfs_tree_read_lock(eb);
                        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+                       ret = record_one_subtree_extent(trans, root, child_bytenr,
+                                                       root->nodesize);
+                       if (ret)
+                               goto out;
                }
 
                if (level == 0) {
@@ -9620,6 +9702,8 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
        cache->full_stripe_len = btrfs_full_stripe_len(root,
                                               &root->fs_info->mapping_tree,
                                               start);
+       set_free_space_tree_thresholds(cache);
+
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);
        init_rwsem(&cache->data_rwsem);
@@ -9631,6 +9715,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
        INIT_LIST_HEAD(&cache->io_list);
        btrfs_init_free_space_ctl(cache);
        atomic_set(&cache->trimming, 0);
+       mutex_init(&cache->free_space_lock);
 
        return cache;
 }
@@ -9841,6 +9926,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
                                               key.objectid, key.offset);
                if (ret)
                        btrfs_abort_transaction(trans, extent_root, ret);
+               add_block_group_free_space(trans, root->fs_info, block_group);
+               /* already aborted the transaction if it failed. */
 next:
                list_del_init(&block_group->bg_list);
        }
@@ -9871,6 +9958,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->flags = type;
        cache->last_byte_to_unpin = (u64)-1;
        cache->cached = BTRFS_CACHE_FINISHED;
+       cache->needs_free_space = 1;
        ret = exclude_super_stripes(root, cache);
        if (ret) {
                /*
@@ -10241,6 +10329,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
        unlock_chunks(root);
 
+       ret = remove_block_group_free_space(trans, root->fs_info, block_group);
+       if (ret)
+               goto out;
+
        btrfs_put_block_group(block_group);
        btrfs_put_block_group(block_group);
 
@@ -10256,6 +10348,47 @@ out:
        return ret;
 }
 
+struct btrfs_trans_handle *
+btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
+                                    const u64 chunk_offset)
+{
+       struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+       struct extent_map *em;
+       struct map_lookup *map;
+       unsigned int num_items;
+
+       read_lock(&em_tree->lock);
+       em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+       read_unlock(&em_tree->lock);
+       ASSERT(em && em->start == chunk_offset);
+
+       /*
+        * We need to reserve 3 + N units from the metadata space info in order
+        * to remove a block group (done at btrfs_remove_chunk() and at
+        * btrfs_remove_block_group()), which are used for:
+        *
+        * 1 unit for adding the free space inode's orphan (located in the tree
+        * of tree roots).
+        * 1 unit for deleting the block group item (located in the extent
+        * tree).
+        * 1 unit for deleting the free space item (located in tree of tree
+        * roots).
+        * N units for deleting N device extent items corresponding to each
+        * stripe (located in the device tree).
+        *
+        * In order to remove a block group we also need to reserve units in the
+        * system space info in order to update the chunk tree (update one or
+        * more device items and remove one chunk item), but this is done at
+        * btrfs_remove_chunk() through a call to check_system_chunk().
+        */
+       map = (struct map_lookup *)em->bdev;
+       num_items = 3 + map->num_stripes;
+       free_extent_map(em);
+
+       return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
+                                                          num_items, 1);
+}
+
 /*
  * Process the unused_bgs list and remove any that don't have any allocated
  * space inside of them.
@@ -10322,8 +10455,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 * Want to do this before we do anything else so we can recover
                 * properly if we fail to join the transaction.
                 */
-               /* 1 for btrfs_orphan_reserve_metadata() */
-               trans = btrfs_start_transaction(root, 1);
+               trans = btrfs_start_trans_remove_block_group(fs_info,
+                                                    block_group->key.objectid);
                if (IS_ERR(trans)) {
                        btrfs_dec_block_group_ro(root, block_group);
                        ret = PTR_ERR(trans);
@@ -10403,11 +10536,15 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 * until transaction commit to do the actual discard.
                 */
                if (trimming) {
-                       WARN_ON(!list_empty(&block_group->bg_list));
-                       spin_lock(&trans->transaction->deleted_bgs_lock);
+                       spin_lock(&fs_info->unused_bgs_lock);
+                       /*
+                        * A concurrent scrub might have added us to the list
+                        * fs_info->unused_bgs, so use a list_move operation
+                        * to add the block group to the deleted_bgs list.
+                        */
                        list_move(&block_group->bg_list,
                                  &trans->transaction->deleted_bgs);
-                       spin_unlock(&trans->transaction->deleted_bgs_lock);
+                       spin_unlock(&fs_info->unused_bgs_lock);
                        btrfs_get_block_group(block_group);
                }
 end_trans: