Btrfs: qgroup: free reserved in exceeding quota.
fs/btrfs/extent-tree.c
index 571f402d3fc46e5f0205451e85a7b78f3cc16b50..695d5110e02057abf9f8b3f4a0f1cbf878a6441d 100644
@@ -2538,6 +2538,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                 * list before we release it.
                 */
                if (btrfs_delayed_ref_is_head(ref)) {
+                       if (locked_ref->is_data &&
+                           locked_ref->total_ref_mod < 0) {
+                               spin_lock(&delayed_refs->lock);
+                               delayed_refs->pending_csums -= ref->num_bytes;
+                               spin_unlock(&delayed_refs->lock);
+                       }
                        btrfs_delayed_ref_unlock(locked_ref);
                        locked_ref = NULL;
                }
@@ -2561,8 +2567,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                 */
                spin_lock(&delayed_refs->lock);
                avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
-               avg = div64_u64(avg, 4);
-               fs_info->avg_delayed_ref_runtime = avg;
+               fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
                spin_unlock(&delayed_refs->lock);
        }
        return 0;
@@ -2624,7 +2629,26 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
         * We don't ever fill up leaves all the way so multiply by 2 just to be
         * closer to what we're really going to want to use.
         */
-       return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+       return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+}
+
+/*
+ * Takes the number of bytes to be csumm'ed and figures out how many leaves it
+ * would require to store the csums for that many bytes.
+ */
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
+{
+       u64 csum_size;
+       u64 num_csums_per_leaf;
+       u64 num_csums;
+
+       csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+       num_csums_per_leaf = div64_u64(csum_size,
+                       (u64)btrfs_super_csum_size(root->fs_info->super_copy));
+       num_csums = div64_u64(csum_bytes, root->sectorsize);
+       num_csums += num_csums_per_leaf - 1;
+       num_csums = div64_u64(num_csums, num_csums_per_leaf);
+       return num_csums;
 }
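
A rough worked example of the math above, assuming the common defaults of a
16 KiB nodesize, 4 KiB sectorsize and 4-byte crc32c checksums (leaf header and
item overhead taken here to be roughly 101 and 25 bytes; exact values depend on
the on-disk format):

    csum_size          ~ 16384 - 101 - 25 = 16258 bytes of usable leaf space
    num_csums_per_leaf ~ 16258 / 4        = 4064 checksums per leaf
    num_csums for 1 GiB of dirty data     = 1 GiB / 4 KiB = 262144
    leaves             = roundup(262144 / 4064) = 65

so checksumming 1 GiB of data costs on the order of 65 csum-tree leaves.
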
 
 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
@@ -2632,7 +2656,9 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 {
        struct btrfs_block_rsv *global_rsv;
        u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
-       u64 num_bytes;
+       u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
+       u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
+       u64 num_bytes, num_dirty_bgs_bytes;
        int ret = 0;
 
        num_bytes = btrfs_calc_trans_metadata_size(root, 1);
@@ -2640,17 +2666,22 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
        if (num_heads > 1)
                num_bytes += (num_heads - 1) * root->nodesize;
        num_bytes <<= 1;
+       num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
+       num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
+                                                            num_dirty_bgs);
        global_rsv = &root->fs_info->global_block_rsv;
 
        /*
         * If we can't allocate any more chunks, let's make sure we have _lots_ of
         * wiggle room since running delayed refs can create more delayed refs.
         */
-       if (global_rsv->space_info->full)
+       if (global_rsv->space_info->full) {
+               num_dirty_bgs_bytes <<= 1;
                num_bytes <<= 1;
+       }
 
        spin_lock(&global_rsv->lock);
-       if (global_rsv->reserved <= num_bytes)
+       if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
                ret = 1;
        spin_unlock(&global_rsv->lock);
        return ret;
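
To get a feel for the two new terms (illustrative numbers only, assuming a
16 KiB nodesize and 4 KiB sectorsize): with 64 MiB of pending_csums there are
64 MiB / 4 KiB = 16384 checksums, btrfs_csum_bytes_to_leaves() comes to
roundup(16384 / ~4064) = 5 leaves, so the csum term adds 5 * 16 KiB = 80 KiB on
top of the doubled per-head estimate; num_dirty_bgs_bytes then charges one
metadata item update per dirty block group, and both totals are doubled again
when the space_info is full.
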
@@ -3193,7 +3224,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
        struct inode *inode = NULL;
        u64 alloc_hint = 0;
        int dcs = BTRFS_DC_ERROR;
-       int num_pages = 0;
+       u64 num_pages = 0;
        int retries = 0;
        int ret = 0;
 
@@ -3208,6 +3239,8 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
                return 0;
        }
 
+       if (trans->aborted)
+               return 0;
 again:
        inode = lookup_free_space_inode(root, block_group, path);
        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -3243,6 +3276,20 @@ again:
         */
        BTRFS_I(inode)->generation = 0;
        ret = btrfs_update_inode(trans, root, inode);
+       if (ret) {
+               /*
+                * So theoretically we could recover from this by simply setting
+                * the super cache generation to 0 so we know to invalidate the
+                * cache, but then we'd have to keep track of the block groups
+                * that fail this way so we know we _have_ to reset this cache
+                * before the next commit or risk reading stale cache.  So to
+                * limit our exposure to horrible edge cases, let's just abort
+                * the transaction; this only happens in really bad situations
+                * anyway.
+                */
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_put;
+       }
        WARN_ON(ret);
 
        if (i_size_read(inode) > 0) {
@@ -3251,7 +3298,7 @@ again:
                if (ret)
                        goto out_put;
 
-               ret = btrfs_truncate_free_space_cache(root, trans, inode);
+               ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
                if (ret)
                        goto out_put;
        }
@@ -3277,7 +3324,7 @@ again:
         * taking up quite a bit since it's not folded into the other space
         * cache.
         */
-       num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
+       num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
        if (!num_pages)
                num_pages = 1;
 
@@ -3309,16 +3356,182 @@ out:
        return ret;
 }
 
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root)
+{
+       struct btrfs_block_group_cache *cache, *tmp;
+       struct btrfs_transaction *cur_trans = trans->transaction;
+       struct btrfs_path *path;
+
+       if (list_empty(&cur_trans->dirty_bgs) ||
+           !btrfs_test_opt(root, SPACE_CACHE))
+               return 0;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       /* Could add new block groups, use _safe just in case */
+       list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
+                                dirty_list) {
+               if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+                       cache_save_setup(cache, trans, path);
+       }
+
+       btrfs_free_path(path);
+       return 0;
+}
+
+/*
+ * transaction commit does final block group cache writeback during a
+ * critical section where nothing is allowed to change the FS.  This is
+ * required in order for the cache to actually match the block group,
+ * but can introduce a lot of latency into the commit.
+ *
+ * So, btrfs_start_dirty_block_groups is here to kick off block group
+ * cache IO.  There's a chance we'll have to redo some of it if the
+ * block group changes again during the commit, but it greatly reduces
+ * the commit latency by getting rid of the easy block groups while
+ * we're still allowing others to join the commit.
+ */
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
        struct btrfs_block_group_cache *cache;
        struct btrfs_transaction *cur_trans = trans->transaction;
        int ret = 0;
-       struct btrfs_path *path;
+       int should_put;
+       struct btrfs_path *path = NULL;
+       LIST_HEAD(dirty);
+       struct list_head *io = &cur_trans->io_bgs;
+       int num_started = 0;
+       int loops = 0;
 
-       if (list_empty(&cur_trans->dirty_bgs))
+       spin_lock(&cur_trans->dirty_bgs_lock);
+       if (!list_empty(&cur_trans->dirty_bgs)) {
+               list_splice_init(&cur_trans->dirty_bgs, &dirty);
+       }
+       spin_unlock(&cur_trans->dirty_bgs_lock);
+
+again:
+       if (list_empty(&dirty)) {
+               btrfs_free_path(path);
                return 0;
+       }
+
+       /*
+        * make sure all the block groups on our dirty list actually
+        * exist
+        */
+       btrfs_create_pending_block_groups(trans, root);
+
+       if (!path) {
+               path = btrfs_alloc_path();
+               if (!path)
+                       return -ENOMEM;
+       }
+
+       while (!list_empty(&dirty)) {
+               cache = list_first_entry(&dirty,
+                                        struct btrfs_block_group_cache,
+                                        dirty_list);
+
+               /*
+                * cache_write_mutex is here only to save us from balance
+                * deleting this block group while we are writing out the
+                * cache
+                */
+               mutex_lock(&trans->transaction->cache_write_mutex);
+
+               /*
+                * this can happen if something re-dirties a block
+                * group that is already under IO.  Just wait for it to
+                * finish and then do it all again
+                */
+               if (!list_empty(&cache->io_list)) {
+                       list_del_init(&cache->io_list);
+                       btrfs_wait_cache_io(root, trans, cache,
+                                           &cache->io_ctl, path,
+                                           cache->key.objectid);
+                       btrfs_put_block_group(cache);
+               }
+
+
+               /*
+                * btrfs_wait_cache_io uses the cache->dirty_list to decide
+                * if it should update the cache_state.  Don't delete
+                * until after we wait.
+                *
+                * Since we're not running in the commit critical section
+                * we need the dirty_bgs_lock to protect from update_block_group
+                */
+               spin_lock(&cur_trans->dirty_bgs_lock);
+               list_del_init(&cache->dirty_list);
+               spin_unlock(&cur_trans->dirty_bgs_lock);
+
+               should_put = 1;
+
+               cache_save_setup(cache, trans, path);
+
+               if (cache->disk_cache_state == BTRFS_DC_SETUP) {
+                       cache->io_ctl.inode = NULL;
+                       ret = btrfs_write_out_cache(root, trans, cache, path);
+                       if (ret == 0 && cache->io_ctl.inode) {
+                               num_started++;
+                               should_put = 0;
+
+                               /*
+                                * the cache_write_mutex is protecting
+                                * the io_list
+                                */
+                               list_add_tail(&cache->io_list, io);
+                       } else {
+                               /*
+                                * if we failed to write the cache, the
+                                * generation will be bad and life goes on
+                                */
+                               ret = 0;
+                       }
+               }
+               if (!ret)
+                       ret = write_one_cache_group(trans, root, path, cache);
+               mutex_unlock(&trans->transaction->cache_write_mutex);
+
+               /* if it's not on the io list, we need to put the block group */
+               if (should_put)
+                       btrfs_put_block_group(cache);
+
+               if (ret)
+                       break;
+       }
+
+       /*
+        * go through delayed refs for all the stuff we've just kicked off
+        * and then loop back (just once)
+        */
+       ret = btrfs_run_delayed_refs(trans, root, 0);
+       if (!ret && loops == 0) {
+               loops++;
+               spin_lock(&cur_trans->dirty_bgs_lock);
+               list_splice_init(&cur_trans->dirty_bgs, &dirty);
+               spin_unlock(&cur_trans->dirty_bgs_lock);
+               goto again;
+       }
+
+       btrfs_free_path(path);
+       return ret;
+}
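
The intended split, roughly (a sketch of the call order, not the actual
transaction-commit code):

    /* early in the commit, while writers may still join the transaction: */
    ret = btrfs_start_dirty_block_groups(trans, root);  /* kick off cache IO */
    ...
    /* later, inside the commit critical section: */
    ret = btrfs_write_dirty_block_groups(trans, root);  /* redo anything that
                                                           was re-dirtied and
                                                           wait on io_bgs */
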
+
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root)
+{
+       struct btrfs_block_group_cache *cache;
+       struct btrfs_transaction *cur_trans = trans->transaction;
+       int ret = 0;
+       int should_put;
+       struct btrfs_path *path;
+       struct list_head *io = &cur_trans->io_bgs;
+       int num_started = 0;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -3334,16 +3547,61 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                cache = list_first_entry(&cur_trans->dirty_bgs,
                                         struct btrfs_block_group_cache,
                                         dirty_list);
+
+               /*
+                * this can happen if cache_save_setup re-dirties a block
+                * group that is already under IO.  Just wait for it to
+                * finish and then do it all again
+                */
+               if (!list_empty(&cache->io_list)) {
+                       list_del_init(&cache->io_list);
+                       btrfs_wait_cache_io(root, trans, cache,
+                                           &cache->io_ctl, path,
+                                           cache->key.objectid);
+                       btrfs_put_block_group(cache);
+               }
+
+               /*
+                * don't remove from the dirty list until after we've waited
+                * on any pending IO
+                */
                list_del_init(&cache->dirty_list);
-               if (cache->disk_cache_state == BTRFS_DC_CLEAR)
-                       cache_save_setup(cache, trans, path);
+               should_put = 1;
+
+               cache_save_setup(cache, trans, path);
+
                if (!ret)
-                       ret = btrfs_run_delayed_refs(trans, root,
-                                                    (unsigned long) -1);
-               if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
-                       btrfs_write_out_cache(root, trans, cache, path);
+                       ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
+
+               if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
+                       cache->io_ctl.inode = NULL;
+                       ret = btrfs_write_out_cache(root, trans, cache, path);
+                       if (ret == 0 && cache->io_ctl.inode) {
+                               num_started++;
+                               should_put = 0;
+                               list_add_tail(&cache->io_list, io);
+                       } else {
+                               /*
+                                * if we failed to write the cache, the
+                                * generation will be bad and life goes on
+                                */
+                               ret = 0;
+                       }
+               }
                if (!ret)
                        ret = write_one_cache_group(trans, root, path, cache);
+
+               /* if it's not on the io list, we need to put the block group */
+               if (should_put)
+                       btrfs_put_block_group(cache);
+       }
+
+       while (!list_empty(io)) {
+               cache = list_first_entry(io, struct btrfs_block_group_cache,
+                                        io_list);
+               list_del_init(&cache->io_list);
+               btrfs_wait_cache_io(root, trans, cache,
+                                   &cache->io_ctl, path, cache->key.objectid);
                btrfs_put_block_group(cache);
        }
 
@@ -3599,13 +3857,15 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 used;
-       int ret = 0, committed = 0, alloc_chunk = 1;
+       int ret = 0;
+       int need_commit = 2;
+       int have_pinned_space;
 
        /* make sure bytes are sectorsize aligned */
        bytes = ALIGN(bytes, root->sectorsize);
 
        if (btrfs_is_free_space_inode(inode)) {
-               committed = 1;
+               need_commit = 0;
                ASSERT(current->journal_info);
        }
 
@@ -3627,7 +3887,7 @@ again:
                 * if we don't have enough free bytes in this space then we need
                 * to alloc a new chunk.
                 */
-               if (!data_sinfo->full && alloc_chunk) {
+               if (!data_sinfo->full) {
                        u64 alloc_target;
 
                        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
@@ -3655,8 +3915,10 @@ alloc:
                        if (ret < 0) {
                                if (ret != -ENOSPC)
                                        return ret;
-                               else
+                               else {
+                                       have_pinned_space = 1;
                                        goto commit_trans;
+                               }
                        }
 
                        if (!data_sinfo)
@@ -3667,26 +3929,39 @@ alloc:
 
                /*
                 * If we don't have enough pinned space to deal with this
-                * allocation don't bother committing the transaction.
+                * allocation, and no chunk was removed in the current transaction,
+                * don't bother committing the transaction.
                 */
-               if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
-                                          bytes) < 0)
-                       committed = 1;
+               have_pinned_space = percpu_counter_compare(
+                       &data_sinfo->total_bytes_pinned,
+                       used + bytes - data_sinfo->total_bytes);
                spin_unlock(&data_sinfo->lock);
 
                /* commit the current transaction and try again */
 commit_trans:
-               if (!committed &&
+               if (need_commit &&
                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
-                       committed = 1;
+                       need_commit--;
 
                        trans = btrfs_join_transaction(root);
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
-                       ret = btrfs_commit_transaction(trans, root);
-                       if (ret)
-                               return ret;
-                       goto again;
+                       if (have_pinned_space >= 0 ||
+                           trans->transaction->have_free_bgs ||
+                           need_commit > 0) {
+                               ret = btrfs_commit_transaction(trans, root);
+                               if (ret)
+                                       return ret;
+                               /*
+                                * make sure that all running delayed iput are
+                                * make sure that all running delayed iputs are
+                                */
+                               down_write(&root->fs_info->delayed_iput_sem);
+                               up_write(&root->fs_info->delayed_iput_sem);
+                               goto again;
+                       } else {
+                               btrfs_end_transaction(trans, root);
+                       }
                }
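
A sketch of the resulting retry behaviour (a summary, not code from the patch):
need_commit starts at 2, so at most two commits are attempted.  The first pass
through commit_trans commits unconditionally, assuming commits are permitted at
all (no open ioctl transaction, not the free space inode); the second pass
commits only if the pinned-bytes counter suggests a commit could cover the
shortfall (have_pinned_space >= 0) or the transaction has freed whole block
groups (have_free_bgs); otherwise the transaction is simply ended and the
ENOSPC path below is taken.
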
 
                trace_btrfs_space_reservation(root->fs_info,
@@ -4256,8 +4531,13 @@ out:
 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
                                        struct btrfs_fs_info *fs_info, u64 used)
 {
-       return (used >= div_factor_fine(space_info->total_bytes, 98) &&
-               !btrfs_fs_closing(fs_info) &&
+       u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+
+       /* If we're just plain full then async reclaim just slows us down. */
+       if (space_info->bytes_used >= thresh)
+               return 0;
+
+       return (used >= thresh && !btrfs_fs_closing(fs_info) &&
                !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
 }
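
For example (illustrative only): on a metadata space_info with
total_bytes = 100 GiB the threshold is 98 GiB.  If bytes_used alone already
reaches 98 GiB the space is simply full and the async reclaimer is skipped,
since flushing delalloc or delayed refs cannot bring truly used bytes back
down; otherwise reclaim kicks in once the overall "used" figure (which also
counts pinned and reserved space) crosses the same 98% line.
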
 
@@ -4312,10 +4592,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
                if (!btrfs_need_do_async_reclaim(space_info, fs_info,
                                                 flush_state))
                        return;
-       } while (flush_state <= COMMIT_TRANS);
-
-       if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
-               queue_work(system_unbound_wq, work);
+       } while (flush_state < COMMIT_TRANS);
 }
 
 void btrfs_init_async_reclaim_work(struct work_struct *work)
@@ -4658,6 +4935,11 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
        kfree(rsv);
 }
 
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
+{
+       kfree(rsv);
+}
+
 int btrfs_block_rsv_add(struct btrfs_root *root,
                        struct btrfs_block_rsv *block_rsv, u64 num_bytes,
                        enum btrfs_reserve_flush_enum flush)
@@ -4770,10 +5052,10 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 
        num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
                    csum_size * 2;
-       num_bytes += div64_u64(data_used + meta_used, 50);
+       num_bytes += div_u64(data_used + meta_used, 50);
 
        if (num_bytes * 3 > meta_used)
-               num_bytes = div64_u64(meta_used, 3);
+               num_bytes = div_u64(meta_used, 3);
 
        return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
 }
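
A worked example of the sizing above (illustrative numbers; assuming 4 KiB
blocks, 4-byte crc32c checksums and a 16 KiB nodesize): with
data_used = 100 GiB and meta_used = 2 GiB the csum term is
(100 GiB / 4 KiB) * 4 * 2 = 200 MiB and the 2% term is 102 GiB / 50 ~ 2.04 GiB,
about 2.24 GiB in total; since three times that exceeds meta_used, the result
is clamped to meta_used / 3 ~ 683 MiB and then rounded up to a 16 MiB
(nodesize << 10) boundary.
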
@@ -5024,30 +5306,18 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
                                   int reserve)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       u64 csum_size;
-       int num_csums_per_leaf;
-       int num_csums;
-       int old_csums;
+       u64 old_csums, num_csums;
 
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
            BTRFS_I(inode)->csum_bytes == 0)
                return 0;
 
-       old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+       old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
        if (reserve)
                BTRFS_I(inode)->csum_bytes += num_bytes;
        else
                BTRFS_I(inode)->csum_bytes -= num_bytes;
-       csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
-       num_csums_per_leaf = (int)div64_u64(csum_size,
-                                           sizeof(struct btrfs_csum_item) +
-                                           sizeof(struct btrfs_disk_key));
-       num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
-       num_csums = num_csums + num_csums_per_leaf - 1;
-       num_csums = num_csums / num_csums_per_leaf;
-
-       old_csums = old_csums + num_csums_per_leaf - 1;
-       old_csums = old_csums / num_csums_per_leaf;
+       num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
 
        /* No change, no need to reserve more */
        if (old_csums == num_csums)
@@ -5094,7 +5364,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        num_bytes = ALIGN(num_bytes, root->sectorsize);
 
        spin_lock(&BTRFS_I(inode)->lock);
-       BTRFS_I(inode)->outstanding_extents++;
+       nr_extents = (unsigned)div64_u64(num_bytes +
+                                        BTRFS_MAX_EXTENT_SIZE - 1,
+                                        BTRFS_MAX_EXTENT_SIZE);
+       BTRFS_I(inode)->outstanding_extents += nr_extents;
+       nr_extents = 0;
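
The nr_extents computation is a round-up division by BTRFS_MAX_EXTENT_SIZE
(128 MiB at the time of this change): reserving, say, 300 MiB of delalloc in
one call now accounts for roundup(300 MiB / 128 MiB) = 3 outstanding extents up
front instead of just 1, matching how the range may later be split into
multiple extents at writeback time.
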
 
        if (BTRFS_I(inode)->outstanding_extents >
            BTRFS_I(inode)->reserved_extents)
@@ -5201,8 +5475,11 @@ out_fail:
                        to_free = 0;
        }
        spin_unlock(&BTRFS_I(inode)->lock);
-       if (dropped)
+       if (dropped) {
+               if (root->fs_info->quota_enabled)
+                       btrfs_qgroup_free(root, dropped * root->nodesize);
                to_free += btrfs_calc_trans_metadata_size(root, dropped);
+       }
 
        if (to_free) {
                btrfs_block_rsv_release(root, block_rsv, to_free);
@@ -5239,6 +5516,9 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
        if (dropped > 0)
                to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
+       if (btrfs_test_is_dummy_root(root))
+               return;
+
        trace_btrfs_space_reservation(root->fs_info, "delalloc",
                                      btrfs_ino(inode), to_free, 0);
        if (root->fs_info->quota_enabled) {
@@ -5341,14 +5621,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                if (!alloc && cache->cached == BTRFS_CACHE_NO)
                        cache_block_group(cache, 1);
 
-               spin_lock(&trans->transaction->dirty_bgs_lock);
-               if (list_empty(&cache->dirty_list)) {
-                       list_add_tail(&cache->dirty_list,
-                                     &trans->transaction->dirty_bgs);
-                       btrfs_get_block_group(cache);
-               }
-               spin_unlock(&trans->transaction->dirty_bgs_lock);
-
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
 
@@ -5397,6 +5669,16 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                                spin_unlock(&info->unused_bgs_lock);
                        }
                }
+
+               spin_lock(&trans->transaction->dirty_bgs_lock);
+               if (list_empty(&cache->dirty_list)) {
+                       list_add_tail(&cache->dirty_list,
+                                     &trans->transaction->dirty_bgs);
+                       trans->transaction->num_dirty_bgs++;
+                       btrfs_get_block_group(cache);
+               }
+               spin_unlock(&trans->transaction->dirty_bgs_lock);
+
                btrfs_put_block_group(cache);
                total -= num_bytes;
                bytenr += num_bytes;
@@ -6907,12 +7189,11 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
                return -ENOSPC;
        }
 
-       if (btrfs_test_opt(root, DISCARD))
-               ret = btrfs_discard_extent(root, start, len, NULL);
-
        if (pin)
                pin_down_extent(root, cache, start, len, 1);
        else {
+               if (btrfs_test_opt(root, DISCARD))
+                       ret = btrfs_discard_extent(root, start, len, NULL);
                btrfs_add_free_space(cache, start, len);
                btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
        }
@@ -7046,9 +7327,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
                                      ins, size);
        if (ret) {
+               btrfs_free_path(path);
                btrfs_free_and_pin_reserved_extent(root, ins->objectid,
                                                   root->nodesize);
-               btrfs_free_path(path);
                return ret;
        }
 
@@ -7168,7 +7449,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        btrfs_set_header_generation(buf, trans->transid);
        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
        btrfs_tree_lock(buf);
-       clean_tree_block(trans, root, buf);
+       clean_tree_block(trans, root->fs_info, buf);
        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
        btrfs_set_lock_blocking(buf);
@@ -7766,7 +8047,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
        blocksize = root->nodesize;
 
-       next = btrfs_find_tree_block(root, bytenr);
+       next = btrfs_find_tree_block(root->fs_info, bytenr);
        if (!next) {
                next = btrfs_find_create_tree_block(root, bytenr);
                if (!next)
@@ -7967,7 +8248,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                        btrfs_set_lock_blocking(eb);
                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
                }
-               clean_tree_block(trans, root, eb);
+               clean_tree_block(trans, root->fs_info, eb);
        }
 
        if (eb == root->node) {
@@ -8484,10 +8765,30 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 
        BUG_ON(cache->ro);
 
+again:
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
+       /*
+        * we're not allowed to set block groups readonly after the dirty
+        * block groups cache has started writing.  If it already started,
+        * back off and let this transaction commit
+        */
+       mutex_lock(&root->fs_info->ro_block_group_mutex);
+       if (trans->transaction->dirty_bg_run) {
+               u64 transid = trans->transid;
+
+               mutex_unlock(&root->fs_info->ro_block_group_mutex);
+               btrfs_end_transaction(trans, root);
+
+               ret = btrfs_wait_for_commit(root, transid);
+               if (ret)
+                       return ret;
+               goto again;
+       }
+
+
        ret = set_block_group_ro(cache, 0);
        if (!ret)
                goto out;
@@ -8502,6 +8803,7 @@ out:
                alloc_flags = update_block_group_flags(root, cache->flags);
                check_system_chunk(trans, root, alloc_flags);
        }
+       mutex_unlock(&root->fs_info->ro_block_group_mutex);
 
        btrfs_end_transaction(trans, root);
        return ret;
@@ -8671,7 +8973,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                min_free <<= 1;
        } else if (index == BTRFS_RAID_RAID0) {
                dev_min = fs_devices->rw_devices;
-               do_div(min_free, dev_min);
+               min_free = div64_u64(min_free, dev_min);
        }
 
        /* We need to do this so that we can look at pending chunks */
@@ -8943,6 +9245,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
        INIT_LIST_HEAD(&cache->bg_list);
        INIT_LIST_HEAD(&cache->ro_list);
        INIT_LIST_HEAD(&cache->dirty_list);
+       INIT_LIST_HEAD(&cache->io_list);
        btrfs_init_free_space_ctl(cache);
        atomic_set(&cache->trimming, 0);
 
@@ -9306,7 +9609,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                goto out;
        }
 
+       /*
+        * get the inode first so any iput calls done for the io_list
+        * aren't the final iput (no unlinks allowed now)
+        */
        inode = lookup_free_space_inode(tree_root, block_group, path);
+
+       mutex_lock(&trans->transaction->cache_write_mutex);
+       /*
+        * make sure our free space cache IO is done before we remove the
+        * free space inode
+        */
+       spin_lock(&trans->transaction->dirty_bgs_lock);
+       if (!list_empty(&block_group->io_list)) {
+               list_del_init(&block_group->io_list);
+
+               WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+               spin_unlock(&trans->transaction->dirty_bgs_lock);
+               btrfs_wait_cache_io(root, trans, block_group,
+                                   &block_group->io_ctl, path,
+                                   block_group->key.objectid);
+               btrfs_put_block_group(block_group);
+               spin_lock(&trans->transaction->dirty_bgs_lock);
+       }
+
+       if (!list_empty(&block_group->dirty_list)) {
+               list_del_init(&block_group->dirty_list);
+               btrfs_put_block_group(block_group);
+       }
+       spin_unlock(&trans->transaction->dirty_bgs_lock);
+       mutex_unlock(&trans->transaction->cache_write_mutex);
+
        if (!IS_ERR(inode)) {
                ret = btrfs_orphan_add(trans, inode);
                if (ret) {
@@ -9399,18 +9733,29 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
        spin_lock(&trans->transaction->dirty_bgs_lock);
        if (!list_empty(&block_group->dirty_list)) {
-               list_del_init(&block_group->dirty_list);
-               btrfs_put_block_group(block_group);
+               WARN_ON(1);
+       }
+       if (!list_empty(&block_group->io_list)) {
+               WARN_ON(1);
        }
        spin_unlock(&trans->transaction->dirty_bgs_lock);
-
        btrfs_remove_free_space_cache(block_group);
 
        spin_lock(&block_group->space_info->lock);
        list_del_init(&block_group->ro_list);
+
+       if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+               WARN_ON(block_group->space_info->total_bytes
+                       < block_group->key.offset);
+               WARN_ON(block_group->space_info->bytes_readonly
+                       < block_group->key.offset);
+               WARN_ON(block_group->space_info->disk_total
+                       < block_group->key.offset * factor);
+       }
        block_group->space_info->total_bytes -= block_group->key.offset;
        block_group->space_info->bytes_readonly -= block_group->key.offset;
        block_group->space_info->disk_total -= block_group->key.offset * factor;
+
        spin_unlock(&block_group->space_info->lock);
 
        memcpy(&key, &block_group->key, sizeof(key));
@@ -9598,8 +9943,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 
                /* Reset pinned so btrfs_put_block_group doesn't complain */
+               spin_lock(&space_info->lock);
+               spin_lock(&block_group->lock);
+
+               space_info->bytes_pinned -= block_group->pinned;
+               space_info->bytes_readonly += block_group->pinned;
+               percpu_counter_add(&space_info->total_bytes_pinned,
+                                  -block_group->pinned);
                block_group->pinned = 0;
 
+               spin_unlock(&block_group->lock);
+               spin_unlock(&space_info->lock);
+
                /*
                 * Btrfs_remove_chunk will abort the transaction if things go
                 * horribly wrong.