Merge branch 'freespace-4.5' into for-linus-4.5
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index acf3ed11cfb60e95b685aeb009e4d72fbdba3c3b..0617cb73669dda8206ebc070f2cfb7d8a49ee6b6 100644
@@ -33,6 +33,7 @@
 #include "raid56.h"
 #include "locking.h"
 #include "free-space-cache.h"
+#include "free-space-tree.h"
 #include "math.h"
 #include "sysfs.h"
 #include "qgroup.h"
@@ -124,7 +125,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
        return (cache->flags & bits) == bits;
 }
 
-static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 {
        atomic_inc(&cache->count);
 }
@@ -357,8 +358,8 @@ static void fragment_free_space(struct btrfs_root *root,
  * we need to check the pinned_extents for any extents that can't be used yet
  * since their free space will be released as soon as the transaction commits.
  */
-static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
-                             struct btrfs_fs_info *info, u64 start, u64 end)
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+                      struct btrfs_fs_info *info, u64 start, u64 end)
 {
        u64 extent_start, extent_end, size, total_added = 0;
        int ret;
@@ -395,11 +396,10 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
        return total_added;
 }
 
-static noinline void caching_thread(struct btrfs_work *work)
+static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 {
        struct btrfs_block_group_cache *block_group;
        struct btrfs_fs_info *fs_info;
-       struct btrfs_caching_control *caching_ctl;
        struct btrfs_root *extent_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
@@ -407,17 +407,16 @@ static noinline void caching_thread(struct btrfs_work *work)
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
-       int ret = -ENOMEM;
+       int ret;
        bool wakeup = true;
 
-       caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
        fs_info = block_group->fs_info;
        extent_root = fs_info->extent_root;
 
        path = btrfs_alloc_path();
        if (!path)
-               goto out;
+               return -ENOMEM;
 
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 
@@ -443,15 +442,11 @@ static noinline void caching_thread(struct btrfs_work *work)
        key.objectid = last;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;
-again:
-       mutex_lock(&caching_ctl->mutex);
-       /* need to make sure the commit_root doesn't disappear */
-       down_read(&fs_info->commit_root_sem);
 
 next:
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
-               goto err;
+               goto out;
 
        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);
@@ -477,12 +472,14 @@ next:
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
                                cond_resched();
-                               goto again;
+                               mutex_lock(&caching_ctl->mutex);
+                               down_read(&fs_info->commit_root_sem);
+                               goto next;
                        }
 
                        ret = btrfs_next_leaf(extent_root, path);
                        if (ret < 0)
-                               goto err;
+                               goto out;
                        if (ret)
                                break;
                        leaf = path->nodes[0];
@@ -521,7 +518,7 @@ next:
                        else
                                last = key.objectid + key.offset;
 
-                       if (total_found > (1024 * 1024 * 2)) {
+                       if (total_found > CACHING_CTL_WAKE_UP) {
                                total_found = 0;
                                if (wakeup)
                                        wake_up(&caching_ctl->wait);
@@ -534,9 +531,35 @@ next:
        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
+       caching_ctl->progress = (u64)-1;
+
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+static noinline void caching_thread(struct btrfs_work *work)
+{
+       struct btrfs_block_group_cache *block_group;
+       struct btrfs_fs_info *fs_info;
+       struct btrfs_caching_control *caching_ctl;
+       int ret;
+
+       caching_ctl = container_of(work, struct btrfs_caching_control, work);
+       block_group = caching_ctl->block_group;
+       fs_info = block_group->fs_info;
+
+       mutex_lock(&caching_ctl->mutex);
+       down_read(&fs_info->commit_root_sem);
+
+       if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+               ret = load_free_space_tree(caching_ctl);
+       else
+               ret = load_extent_tree_free(caching_ctl);
+
        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
-       block_group->cached = BTRFS_CACHE_FINISHED;
+       block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);
 
 #ifdef CONFIG_BTRFS_DEBUG
@@ -555,20 +578,11 @@ next:
 #endif
 
        caching_ctl->progress = (u64)-1;
-err:
-       btrfs_free_path(path);
-       up_read(&fs_info->commit_root_sem);
-
-       free_excluded_extents(extent_root, block_group);
 
+       up_read(&fs_info->commit_root_sem);
+       free_excluded_extents(fs_info->extent_root, block_group);
        mutex_unlock(&caching_ctl->mutex);
-out:
-       if (ret) {
-               spin_lock(&block_group->lock);
-               block_group->caching_ctl = NULL;
-               block_group->cached = BTRFS_CACHE_ERROR;
-               spin_unlock(&block_group->lock);
-       }
+
        wake_up(&caching_ctl->wait);
 
        put_caching_control(caching_ctl);
@@ -680,8 +694,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                }
        } else {
                /*
-                * We are not going to do the fast caching, set cached to the
-                * appropriate value and wakeup any waiters.
+                * We're either using the free space tree or no caching at all.
+                * Set cached to the appropriate value and wakeup any waiters.
                 */
                spin_lock(&cache->lock);
                if (load_cache_only) {
@@ -3684,11 +3698,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                return -ENOMEM;
 
        /*
-        * We don't need the lock here since we are protected by the transaction
-        * commit.  We want to do the cache_save_setup first and then run the
+        * Even though we are in the critical section of the transaction commit,
+        * we can still have concurrent tasks adding elements to this
+        * transaction's list of dirty block groups. These tasks correspond to
+        * endio free space workers started when writeback finishes for a
+        * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+        * allocate new block groups as a result of COWing nodes of the root
+        * tree when updating the free space inode. The writeback for the space
+        * caches is triggered by an earlier call to
+        * btrfs_start_dirty_block_groups() and iterations of the following
+        * loop.
+        * Also we want to do the cache_save_setup first and then run the
         * delayed refs to make sure we have the best chance at doing this all
         * in one shot.
         */
+       spin_lock(&cur_trans->dirty_bgs_lock);
        while (!list_empty(&cur_trans->dirty_bgs)) {
                cache = list_first_entry(&cur_trans->dirty_bgs,
                                         struct btrfs_block_group_cache,
@@ -3700,11 +3724,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                 * finish and then do it all again
                 */
                if (!list_empty(&cache->io_list)) {
+                       spin_unlock(&cur_trans->dirty_bgs_lock);
                        list_del_init(&cache->io_list);
                        btrfs_wait_cache_io(root, trans, cache,
                                            &cache->io_ctl, path,
                                            cache->key.objectid);
                        btrfs_put_block_group(cache);
+                       spin_lock(&cur_trans->dirty_bgs_lock);
                }
 
                /*
@@ -3712,6 +3738,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                 * on any pending IO
                 */
                list_del_init(&cache->dirty_list);
+               spin_unlock(&cur_trans->dirty_bgs_lock);
                should_put = 1;
 
                cache_save_setup(cache, trans, path);
@@ -3743,7 +3770,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                /* if its not on the io list, we need to put the block group */
                if (should_put)
                        btrfs_put_block_group(cache);
+               spin_lock(&cur_trans->dirty_bgs_lock);
        }
+       spin_unlock(&cur_trans->dirty_bgs_lock);
 
        while (!list_empty(io)) {
                cache = list_first_entry(io, struct btrfs_block_group_cache,
@@ -5915,19 +5944,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                        set_extent_dirty(info->pinned_extents,
                                         bytenr, bytenr + num_bytes - 1,
                                         GFP_NOFS | __GFP_NOFAIL);
-                       /*
-                        * No longer have used bytes in this block group, queue
-                        * it for deletion.
-                        */
-                       if (old_val == 0) {
-                               spin_lock(&info->unused_bgs_lock);
-                               if (list_empty(&cache->bg_list)) {
-                                       btrfs_get_block_group(cache);
-                                       list_add_tail(&cache->bg_list,
-                                                     &info->unused_bgs);
-                               }
-                               spin_unlock(&info->unused_bgs_lock);
-                       }
                }
 
                spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -5939,6 +5955,22 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                }
                spin_unlock(&trans->transaction->dirty_bgs_lock);
 
+               /*
+                * No longer have used bytes in this block group, queue it for
+                * deletion. We do this after adding the block group to the
+                * dirty list to avoid races between cleaner kthread and space
+                * cache writeout.
+                */
+               if (!alloc && old_val == 0) {
+                       spin_lock(&info->unused_bgs_lock);
+                       if (list_empty(&cache->bg_list)) {
+                               btrfs_get_block_group(cache);
+                               list_add_tail(&cache->bg_list,
+                                             &info->unused_bgs);
+                       }
+                       spin_unlock(&info->unused_bgs_lock);
+               }
+
                btrfs_put_block_group(cache);
                total -= num_bytes;
                bytenr += num_bytes;
@@ -6658,6 +6690,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        }
                }
 
+               ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
+                                            num_bytes);
+               if (ret) {
+                       btrfs_abort_transaction(trans, extent_root, ret);
+                       goto out;
+               }
+
                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
                if (ret) {
                        btrfs_abort_transaction(trans, extent_root, ret);
@@ -7669,6 +7708,11 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
 
+       ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+                                         ins->offset);
+       if (ret)
+               return ret;
+
        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7749,6 +7793,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
 
+       ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+                                         num_bytes);
+       if (ret)
+               return ret;
+
        ret = update_block_group(trans, root, ins->objectid, root->nodesize,
                                 1);
        if (ret) { /* -ENOENT, logic error */
@@ -7831,7 +7880,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
        btrfs_set_lock_blocking(buf);
-       btrfs_set_buffer_uptodate(buf);
+       set_extent_buffer_uptodate(buf);
 
        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
                buf->log_index = root->log_transid % 2;
@@ -8105,21 +8154,47 @@ reada:
 }
 
 /*
- * TODO: Modify related function to add related node/leaf to dirty_extent_root,
- * for later qgroup accounting.
- *
- * Current, this function does nothing.
+ * These may not be seen by the usual inc/dec ref code so we have to
+ * add them here.
  */
+static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root, u64 bytenr,
+                                    u64 num_bytes)
+{
+       struct btrfs_qgroup_extent_record *qrecord;
+       struct btrfs_delayed_ref_root *delayed_refs;
+
+       qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
+       if (!qrecord)
+               return -ENOMEM;
+
+       qrecord->bytenr = bytenr;
+       qrecord->num_bytes = num_bytes;
+       qrecord->old_roots = NULL;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+       if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+               kfree(qrecord);
+       spin_unlock(&delayed_refs->lock);
+
+       return 0;
+}
+
 static int account_leaf_items(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct extent_buffer *eb)
 {
        int nr = btrfs_header_nritems(eb);
-       int i, extent_type;
+       int i, extent_type, ret;
        struct btrfs_key key;
        struct btrfs_file_extent_item *fi;
        u64 bytenr, num_bytes;
 
+       /* We can be called directly from walk_up_proc() */
+       if (!root->fs_info->quota_enabled)
+               return 0;
+
        for (i = 0; i < nr; i++) {
                btrfs_item_key_to_cpu(eb, &key, i);
 
@@ -8138,6 +8213,10 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
                        continue;
 
                num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+               ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
+               if (ret)
+                       return ret;
        }
        return 0;
 }
@@ -8206,8 +8285,6 @@ static int adjust_slots_upwards(struct btrfs_root *root,
 
 /*
  * root_eb is the subtree root and is locked before this function is called.
- * TODO: Modify this function to mark all (including complete shared node)
- * to dirty_extent_root to allow it get accounted in qgroup.
  */
 static int account_shared_subtree(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
@@ -8285,6 +8362,11 @@ walk_down:
                        btrfs_tree_read_lock(eb);
                        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+                       ret = record_one_subtree_extent(trans, root, child_bytenr,
+                                                       root->nodesize);
+                       if (ret)
+                               goto out;
                }
 
                if (level == 0) {
@@ -9620,6 +9702,8 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
        cache->full_stripe_len = btrfs_full_stripe_len(root,
                                               &root->fs_info->mapping_tree,
                                               start);
+       set_free_space_tree_thresholds(cache);
+
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);
        init_rwsem(&cache->data_rwsem);
@@ -9631,6 +9715,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
        INIT_LIST_HEAD(&cache->io_list);
        btrfs_init_free_space_ctl(cache);
        atomic_set(&cache->trimming, 0);
+       mutex_init(&cache->free_space_lock);
 
        return cache;
 }
@@ -9841,6 +9926,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
                                               key.objectid, key.offset);
                if (ret)
                        btrfs_abort_transaction(trans, extent_root, ret);
+               add_block_group_free_space(trans, root->fs_info, block_group);
+               /* already aborted the transaction if it failed. */
 next:
                list_del_init(&block_group->bg_list);
        }
@@ -9871,6 +9958,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->flags = type;
        cache->last_byte_to_unpin = (u64)-1;
        cache->cached = BTRFS_CACHE_FINISHED;
+       cache->needs_free_space = 1;
        ret = exclude_super_stripes(root, cache);
        if (ret) {
                /*
@@ -10241,6 +10329,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
        unlock_chunks(root);
 
+       ret = remove_block_group_free_space(trans, root->fs_info, block_group);
+       if (ret)
+               goto out;
+
        btrfs_put_block_group(block_group);
        btrfs_put_block_group(block_group);
 
@@ -10256,6 +10348,47 @@ out:
        return ret;
 }
 
+struct btrfs_trans_handle *
+btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
+                                    const u64 chunk_offset)
+{
+       struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+       struct extent_map *em;
+       struct map_lookup *map;
+       unsigned int num_items;
+
+       read_lock(&em_tree->lock);
+       em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+       read_unlock(&em_tree->lock);
+       ASSERT(em && em->start == chunk_offset);
+
+       /*
+        * We need to reserve 3 + N units from the metadata space info in order
+        * to remove a block group (done at btrfs_remove_chunk() and at
+        * btrfs_remove_block_group()), which are used for:
+        *
+        * 1 unit for adding the free space inode's orphan (located in the tree
+        * of tree roots).
+        * 1 unit for deleting the block group item (located in the extent
+        * tree).
+        * 1 unit for deleting the free space item (located in tree of tree
+        * roots).
+        * N units for deleting N device extent items corresponding to each
+        * stripe (located in the device tree).
+        *
+        * In order to remove a block group we also need to reserve units in the
+        * system space info in order to update the chunk tree (update one or
+        * more device items and remove one chunk item), but this is done at
+        * btrfs_remove_chunk() through a call to check_system_chunk().
+        */
+       map = (struct map_lookup *)em->bdev;
+       num_items = 3 + map->num_stripes;
+       free_extent_map(em);
+
+       return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
+                                                          num_items, 1);
+}
+
 /*
  * Process the unused_bgs list and remove any that don't have any allocated
  * space inside of them.
@@ -10322,8 +10455,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 * Want to do this before we do anything else so we can recover
                 * properly if we fail to join the transaction.
                 */
-               /* 1 for btrfs_orphan_reserve_metadata() */
-               trans = btrfs_start_transaction(root, 1);
+               trans = btrfs_start_trans_remove_block_group(fs_info,
+                                                    block_group->key.objectid);
                if (IS_ERR(trans)) {
                        btrfs_dec_block_group_ro(root, block_group);
                        ret = PTR_ERR(trans);
@@ -10403,11 +10536,15 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 * until transaction commit to do the actual discard.
                 */
                if (trimming) {
-                       WARN_ON(!list_empty(&block_group->bg_list));
-                       spin_lock(&trans->transaction->deleted_bgs_lock);
+                       spin_lock(&fs_info->unused_bgs_lock);
+                       /*
+                        * A concurrent scrub might have added us to the list
+                        * fs_info->unused_bgs, so use a list_move operation
+                        * to add the block group to the deleted_bgs list.
+                        */
                        list_move(&block_group->bg_list,
                                  &trans->transaction->deleted_bgs);
-                       spin_unlock(&trans->transaction->deleted_bgs_lock);
+                       spin_unlock(&fs_info->unused_bgs_lock);
                        btrfs_get_block_group(block_group);
                }
 end_trans: