Btrfs: qgroup: free reserved in exceeding quota.
fs/btrfs/extent-tree.c
index 571f402d3fc46e5f0205451e85a7b78f3cc16b50..695d5110e02057abf9f8b3f4a0f1cbf878a6441d 100644
@@ -2538,6 +2538,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                 * list before we release it.
                 */
                if (btrfs_delayed_ref_is_head(ref)) {
+                       if (locked_ref->is_data &&
+                           locked_ref->total_ref_mod < 0) {
+                               spin_lock(&delayed_refs->lock);
+                               delayed_refs->pending_csums -= ref->num_bytes;
+                               spin_unlock(&delayed_refs->lock);
+                       }
                        btrfs_delayed_ref_unlock(locked_ref);
                        locked_ref = NULL;
                }
@@ -2561,8 +2567,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                 */
                spin_lock(&delayed_refs->lock);
                avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
-               avg = div64_u64(avg, 4);
-               fs_info->avg_delayed_ref_runtime = avg;
+               fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
                spin_unlock(&delayed_refs->lock);
        }
        return 0;
@@ -2624,7 +2629,26 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
         * We don't ever fill up leaves all the way so multiply by 2 just to be
         * closer to what we're really going to want to use.
         */
-       return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+       return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+}
+
+/*
+ * Takes the number of bytes to be csumm'ed and figures out how many leaves it
+ * would require to store the csums for that many bytes.
+ */
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
+{
+       u64 csum_size;
+       u64 num_csums_per_leaf;
+       u64 num_csums;
+
+       csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+       num_csums_per_leaf = div64_u64(csum_size,
+                       (u64)btrfs_super_csum_size(root->fs_info->super_copy));
+       num_csums = div64_u64(csum_bytes, root->sectorsize);
+       num_csums += num_csums_per_leaf - 1;
+       num_csums = div64_u64(num_csums, num_csums_per_leaf);
+       return num_csums;
 }
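
A rough worked example of the math above, assuming the common defaults of a
16 KiB nodesize, 4 KiB sectorsize and 4-byte crc32c checksums (leaf header and
item overhead taken here to be roughly 101 and 25 bytes; exact values depend on
the on-disk format):

    csum_size          ~ 16384 - 101 - 25 = 16258 bytes of usable leaf space
    num_csums_per_leaf ~ 16258 / 4        = 4064 checksums per leaf
    num_csums for 1 GiB of dirty data     = 1 GiB / 4 KiB = 262144
    leaves             = roundup(262144 / 4064) = 65

so checksumming 1 GiB of data costs on the order of 65 csum-tree leaves.
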
 
 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
@@ -2632,7 +2656,9 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 {
        struct btrfs_block_rsv *global_rsv;
        u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
-       u64 num_bytes;
+       u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
+       u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
+       u64 num_bytes, num_dirty_bgs_bytes;
        int ret = 0;
 
        num_bytes = btrfs_calc_trans_metadata_size(root, 1);
@@ -2640,17 +2666,22 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
        if (num_heads > 1)
                num_bytes += (num_heads - 1) * root->nodesize;
        num_bytes <<= 1;
+       num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
+       num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
+                                                            num_dirty_bgs);
        global_rsv = &root->fs_info->global_block_rsv;
 
        /*
         * If we can't allocate any more chunks, let's make sure we have _lots_ of
         * wiggle room since running delayed refs can create more delayed refs.
         */
-       if (global_rsv->space_info->full)
+       if (global_rsv->space_info->full) {
+               num_dirty_bgs_bytes <<= 1;
                num_bytes <<= 1;
+       }
 
        spin_lock(&global_rsv->lock);
-       if (global_rsv->reserved <= num_bytes)
+       if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
                ret = 1;
        spin_unlock(&global_rsv->lock);
        return ret;
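
To get a feel for the two new terms (illustrative numbers only, assuming a
16 KiB nodesize and 4 KiB sectorsize): with 64 MiB of pending_csums there are
64 MiB / 4 KiB = 16384 checksums, btrfs_csum_bytes_to_leaves() comes to
roundup(16384 / ~4064) = 5 leaves, so the csum term adds 5 * 16 KiB = 80 KiB on
top of the doubled per-head estimate; num_dirty_bgs_bytes then charges one
metadata item update per dirty block group, and both totals are doubled again
when the space_info is full.
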
@@ -3193,7 +3224,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
        struct inode *inode = NULL;
        u64 alloc_hint = 0;
        int dcs = BTRFS_DC_ERROR;
-       int num_pages = 0;
+       u64 num_pages = 0;
        int retries = 0;
        int ret = 0;
 
@@ -3208,6 +3239,8 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
                return 0;
        }
 
+       if (trans->aborted)
+               return 0;
 again:
        inode = lookup_free_space_inode(root, block_group, path);
        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -3243,6 +3276,20 @@ again:
         */
        BTRFS_I(inode)->generation = 0;
        ret = btrfs_update_inode(trans, root, inode);
+       if (ret) {
+               /*
+                * So theoretically we could recover from this by simply setting
+                * the super cache generation to 0 so we know to invalidate the
+                * cache, but then we'd have to keep track of the block groups
+                * that fail this way so we know we _have_ to reset this cache
+                * before the next commit or risk reading stale cache.  So to
+                * limit our exposure to horrible edge cases, let's just abort
+                * the transaction; this only happens in really bad situations
+                * anyway.
+                */
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_put;
+       }
        WARN_ON(ret);
 
        if (i_size_read(inode) > 0) {
@@ -3251,7 +3298,7 @@ again:
                if (ret)
                        goto out_put;
 
-               ret = btrfs_truncate_free_space_cache(root, trans, inode);
+               ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
                if (ret)
                        goto out_put;
        }
@@ -3277,7 +3324,7 @@ again:
         * taking up quite a bit since it's not folded into the other space
         * cache.
         */
-       num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
+       num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
        if (!num_pages)
                num_pages = 1;
 
@@ -3309,16 +3356,182 @@ out:
        return ret;
 }
 
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root)
+{
+       struct btrfs_block_group_cache *cache, *tmp;
+       struct btrfs_transaction *cur_trans = trans->transaction;
+       struct btrfs_path *path;
+
+       if (list_empty(&cur_trans->dirty_bgs) ||
+           !btrfs_test_opt(root, SPACE_CACHE))
+               return 0;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       /* Could add new block groups, use _safe just in case */
+       list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
+                                dirty_list) {
+               if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+                       cache_save_setup(cache, trans, path);
+       }
+
+       btrfs_free_path(path);
+       return 0;
+}
+
+/*
+ * transaction commit does final block group cache writeback during a
+ * critical section where nothing is allowed to change the FS.  This is
+ * required in order for the cache to actually match the block group,
+ * but can introduce a lot of latency into the commit.
+ *
+ * So, btrfs_start_dirty_block_groups is here to kick off block group
+ * cache IO.  There's a chance we'll have to redo some of it if the
+ * block group changes again during the commit, but it greatly reduces
+ * the commit latency by getting rid of the easy block groups while
+ * we're still allowing others to join the commit.
+ */
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
        struct btrfs_block_group_cache *cache;
        struct btrfs_transaction *cur_trans = trans->transaction;
        int ret = 0;
-       struct btrfs_path *path;
+       int should_put;
+       struct btrfs_path *path = NULL;
+       LIST_HEAD(dirty);
+       struct list_head *io = &cur_trans->io_bgs;
+       int num_started = 0;
+       int loops = 0;
 
-       if (list_empty(&cur_trans->dirty_bgs))
+       spin_lock(&cur_trans->dirty_bgs_lock);
+       if (!list_empty(&cur_trans->dirty_bgs)) {
+               list_splice_init(&cur_trans->dirty_bgs, &dirty);
+       }
+       spin_unlock(&cur_trans->dirty_bgs_lock);
+
+again:
+       if (list_empty(&dirty)) {
+               btrfs_free_path(path);
                return 0;
+       }
+
+       /*
+        * make sure all the block groups on our dirty list actually
+        * exist
+        */
+       btrfs_create_pending_block_groups(trans, root);
+
+       if (!path) {
+               path = btrfs_alloc_path();
+               if (!path)
+                       return -ENOMEM;
+       }
+
+       while (!list_empty(&dirty)) {
+               cache = list_first_entry(&dirty,
+                                        struct btrfs_block_group_cache,
+                                        dirty_list);
+
+               /*
+                * cache_write_mutex is here only to save us from balance
+                * deleting this block group while we are writing out the
+                * cache
+                */
+               mutex_lock(&trans->transaction->cache_write_mutex);
+
+               /*
+                * this can happen if something re-dirties a block
+                * group that is already under IO.  Just wait for it to
+                * finish and then do it all again
+                */
+               if (!list_empty(&cache->io_list)) {
+                       list_del_init(&cache->io_list);
+                       btrfs_wait_cache_io(root, trans, cache,
+                                           &cache->io_ctl, path,
+                                           cache->key.objectid);
+                       btrfs_put_block_group(cache);
+               }
+
+
+               /*
+                * btrfs_wait_cache_io uses the cache->dirty_list to decide
+                * if it should update the cache_state.  Don't delete
+                * until after we wait.
+                *
+                * Since we're not running in the commit critical section
+                * we need the dirty_bgs_lock to protect from update_block_group
+                */
+               spin_lock(&cur_trans->dirty_bgs_lock);
+               list_del_init(&cache->dirty_list);
+               spin_unlock(&cur_trans->dirty_bgs_lock);
+
+               should_put = 1;
+
+               cache_save_setup(cache, trans, path);
+
+               if (cache->disk_cache_state == BTRFS_DC_SETUP) {
+                       cache->io_ctl.inode = NULL;
+                       ret = btrfs_write_out_cache(root, trans, cache, path);
+                       if (ret == 0 && cache->io_ctl.inode) {
+                               num_started++;
+                               should_put = 0;
+
+                               /*
+                                * the cache_write_mutex is protecting
+                                * the io_list
+                                */
+                               list_add_tail(&cache->io_list, io);
+                       } else {
+                               /*
+                                * if we failed to write the cache, the
+                                * generation will be bad and life goes on
+                                */
+                               ret = 0;
+                       }
+               }
+               if (!ret)
+                       ret = write_one_cache_group(trans, root, path, cache);
+               mutex_unlock(&trans->transaction->cache_write_mutex);
+
+               /* if it's not on the io list, we need to put the block group */
+               if (should_put)
+                       btrfs_put_block_group(cache);
+
+               if (ret)
+                       break;
+       }
+
+       /*
+        * go through delayed refs for all the stuff we've just kicked off
+        * and then loop back (just once)
+        */
+       ret = btrfs_run_delayed_refs(trans, root, 0);
+       if (!ret && loops == 0) {
+               loops++;
+               spin_lock(&cur_trans->dirty_bgs_lock);
+               list_splice_init(&cur_trans->dirty_bgs, &dirty);
+               spin_unlock(&cur_trans->dirty_bgs_lock);
+               goto again;
+       }
+
+       btrfs_free_path(path);
+       return ret;
+}
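
The intended split, roughly (a sketch of the call order, not the actual
transaction-commit code):

    /* early in the commit, while writers may still join the transaction: */
    ret = btrfs_start_dirty_block_groups(trans, root);  /* kick off cache IO */
    ...
    /* later, inside the commit critical section: */
    ret = btrfs_write_dirty_block_groups(trans, root);  /* redo anything that
                                                           was re-dirtied and
                                                           wait on io_bgs */
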
+
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root)
+{
+       struct btrfs_block_group_cache *cache;
+       struct btrfs_transaction *cur_trans = trans->transaction;
+       int ret = 0;
+       int should_put;
+       struct btrfs_path *path;
+       struct list_head *io = &cur_trans->io_bgs;
+       int num_started = 0;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -3334,16 +3547,61 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                cache = list_first_entry(&cur_trans->dirty_bgs,
                                         struct btrfs_block_group_cache,
                                         dirty_list);
+
+               /*
+                * this can happen if cache_save_setup re-dirties a block
+                * group that is already under IO.  Just wait for it to
+                * finish and then do it all again
+                */
+               if (!list_empty(&cache->io_list)) {
+                       list_del_init(&cache->io_list);
+                       btrfs_wait_cache_io(root, trans, cache,
+                                           &cache->io_ctl, path,
+                                           cache->key.objectid);
+                       btrfs_put_block_group(cache);
+               }
+
+               /*
+                * don't remove from the dirty list until after we've waited
+                * on any pending IO
+                */
                list_del_init(&cache->dirty_list);
-               if (cache->disk_cache_state == BTRFS_DC_CLEAR)
-                       cache_save_setup(cache, trans, path);
+               should_put = 1;
+
+               cache_save_setup(cache, trans, path);
+
                if (!ret)
-                       ret = btrfs_run_delayed_refs(trans, root,
-                                                    (unsigned long) -1);
-               if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
-                       btrfs_write_out_cache(root, trans, cache, path);
+                       ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
+
+               if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
+                       cache->io_ctl.inode = NULL;
+                       ret = btrfs_write_out_cache(root, trans, cache, path);
+                       if (ret == 0 && cache->io_ctl.inode) {
+                               num_started++;
+                               should_put = 0;
+                               list_add_tail(&cache->io_list, io);
+                       } else {
+                               /*
+                                * if we failed to write the cache, the
+                                * generation will be bad and life goes on
+                                */
+                               ret = 0;
+                       }
+               }
                if (!ret)
                        ret = write_one_cache_group(trans, root, path, cache);
+
+               /* if it's not on the io list, we need to put the block group */
+               if (should_put)
+                       btrfs_put_block_group(cache);
+       }
+
+       while (!list_empty(io)) {
+               cache = list_first_entry(io, struct btrfs_block_group_cache,
+                                        io_list);
+               list_del_init(&cache->io_list);
+               btrfs_wait_cache_io(root, trans, cache,
+                                   &cache->io_ctl, path, cache->key.objectid);
                btrfs_put_block_group(cache);
        }
 
@@ -3599,13 +3857,15 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 used;
-       int ret = 0, committed = 0, alloc_chunk = 1;
+       int ret = 0;
+       int need_commit = 2;
+       int have_pinned_space;
 
        /* make sure bytes are sectorsize aligned */
        bytes = ALIGN(bytes, root->sectorsize);
 
        if (btrfs_is_free_space_inode(inode)) {
-               committed = 1;
+               need_commit = 0;
                ASSERT(current->journal_info);
        }
 
@@ -3627,7 +3887,7 @@ again:
                 * if we don't have enough free bytes in this space then we need
                 * to alloc a new chunk.
                 */
-               if (!data_sinfo->full && alloc_chunk) {
+               if (!data_sinfo->full) {
                        u64 alloc_target;
 
                        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
@@ -3655,8 +3915,10 @@ alloc:
                        if (ret < 0) {
                                if (ret != -ENOSPC)
                                        return ret;
-                               else
+                               else {
+                                       have_pinned_space = 1;
                                        goto commit_trans;
+                               }
                        }
 
                        if (!data_sinfo)
@@ -3667,26 +3929,39 @@ alloc:
 
                /*
                 * If we don't have enough pinned space to deal with this
-                * allocation don't bother committing the transaction.
+                * allocation, and no chunk was removed in the current transaction,
+                * don't bother committing the transaction.
                 */
-               if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
-                                          bytes) < 0)
-                       committed = 1;
+               have_pinned_space = percpu_counter_compare(
+                       &data_sinfo->total_bytes_pinned,
+                       used + bytes - data_sinfo->total_bytes);
                spin_unlock(&data_sinfo->lock);
 
                /* commit the current transaction and try again */
 commit_trans:
-               if (!committed &&
+               if (need_commit &&
                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
-                       committed = 1;
+                       need_commit--;
 
                        trans = btrfs_join_transaction(root);
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
-                       ret = btrfs_commit_transaction(trans, root);
-                       if (ret)
-                               return ret;
-                       goto again;
+                       if (have_pinned_space >= 0 ||
+                           trans->transaction->have_free_bgs ||
+                           need_commit > 0) {
+                               ret = btrfs_commit_transaction(trans, root);
+                               if (ret)
+                                       return ret;
+                               /*
+                                * make sure that all running delayed iput are
+                                * make sure that all running delayed iputs are
+                                */
+                               down_write(&root->fs_info->delayed_iput_sem);
+                               up_write(&root->fs_info->delayed_iput_sem);
+                               goto again;
+                       } else {
+                               btrfs_end_transaction(trans, root);
+                       }
                }
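
A sketch of the resulting retry behaviour (a summary, not code from the patch):
need_commit starts at 2, so at most two commits are attempted.  The first pass
through commit_trans commits unconditionally, assuming commits are permitted at
all (no open ioctl transaction, not the free space inode); the second pass
commits only if the pinned-bytes counter suggests a commit could cover the
shortfall (have_pinned_space >= 0) or the transaction has freed whole block
groups (have_free_bgs); otherwise the transaction is simply ended and the
ENOSPC path below is taken.
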
 
                trace_btrfs_space_reservation(root->fs_info,
@@ -4256,8 +4531,13 @@ out:
 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
                                        struct btrfs_fs_info *fs_info, u64 used)
 {
-       return (used >= div_factor_fine(space_info->total_bytes, 98) &&
-               !btrfs_fs_closing(fs_info) &&
+       u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+
+       /* If we're just plain full then async reclaim just slows us down. */
+       if (space_info->bytes_used >= thresh)
+               return 0;
+
+       return (used >= thresh && !btrfs_fs_closing(fs_info) &&
                !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
 }
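
For example (illustrative only): on a metadata space_info with
total_bytes = 100 GiB the threshold is 98 GiB.  If bytes_used alone already
reaches 98 GiB the space is simply full and the async reclaimer is skipped,
since flushing delalloc or delayed refs cannot bring truly used bytes back
down; otherwise reclaim kicks in once the overall "used" figure (which also
counts pinned and reserved space) crosses the same 98% line.
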
 
@@ -4312,10 +4592,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
                if (!btrfs_need_do_async_reclaim(space_info, fs_info,
                                                 flush_state))
                        return;
-       } while (flush_state <= COMMIT_TRANS);
-
-       if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
-               queue_work(system_unbound_wq, work);
+       } while (flush_state < COMMIT_TRANS);
 }
 
 void btrfs_init_async_reclaim_work(struct work_struct *work)
@@ -4658,6 +4935,11 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
        kfree(rsv);
 }
 
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
+{
+       kfree(rsv);
+}
+
 int btrfs_block_rsv_add(struct btrfs_root *root,
                        struct btrfs_block_rsv *block_rsv, u64 num_bytes,
                        enum btrfs_reserve_flush_enum flush)
@@ -4770,10 +5052,10 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 
        num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
                    csum_size * 2;
-       num_bytes += div64_u64(data_used + meta_used, 50);
+       num_bytes += div_u64(data_used + meta_used, 50);
 
        if (num_bytes * 3 > meta_used)
-               num_bytes = div64_u64(meta_used, 3);
+               num_bytes = div_u64(meta_used, 3);
 
        return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
 }
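
A worked example of the sizing above (illustrative numbers; assuming 4 KiB
blocks, 4-byte crc32c checksums and a 16 KiB nodesize): with
data_used = 100 GiB and meta_used = 2 GiB the csum term is
(100 GiB / 4 KiB) * 4 * 2 = 200 MiB and the 2% term is 102 GiB / 50 ~ 2.04 GiB,
about 2.24 GiB in total; since three times that exceeds meta_used, the result
is clamped to meta_used / 3 ~ 683 MiB and then rounded up to a 16 MiB
(nodesize << 10) boundary.
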
@@ -5024,30 +5306,18 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
                                   int reserve)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       u64 csum_size;
-       int num_csums_per_leaf;
-       int num_csums;
-       int old_csums;
+       u64 old_csums, num_csums;
 
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
            BTRFS_I(inode)->csum_bytes == 0)
                return 0;
 
-       old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+       old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
        if (reserve)
                BTRFS_I(inode)->csum_bytes += num_bytes;
        else
                BTRFS_I(inode)->csum_bytes -= num_bytes;
-       csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
-       num_csums_per_leaf = (int)div64_u64(csum_size,
-                                           sizeof(struct btrfs_csum_item) +
-                                           sizeof(struct btrfs_disk_key));
-       num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
-       num_csums = num_csums + num_csums_per_leaf - 1;
-       num_csums = num_csums / num_csums_per_leaf;
-
-       old_csums = old_csums + num_csums_per_leaf - 1;
-       old_csums = old_csums / num_csums_per_leaf;
+       num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
 
        /* No change, no need to reserve more */
        if (old_csums == num_csums)
@@ -5094,7 +5364,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        num_bytes = ALIGN(num_bytes, root->sectorsize);
 
        spin_lock(&BTRFS_I(inode)->lock);
-       BTRFS_I(inode)->outstanding_extents++;
+       nr_extents = (unsigned)div64_u64(num_bytes +
+                                        BTRFS_MAX_EXTENT_SIZE - 1,
+                                        BTRFS_MAX_EXTENT_SIZE);
+       BTRFS_I(inode)->outstanding_extents += nr_extents;
+       nr_extents = 0;
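
The nr_extents computation is a round-up division by BTRFS_MAX_EXTENT_SIZE
(128 MiB at the time of this change): reserving, say, 300 MiB of delalloc in
one call now accounts for roundup(300 MiB / 128 MiB) = 3 outstanding extents up
front instead of just 1, matching how the range may later be split into
multiple extents at writeback time.
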
 
        if (BTRFS_I(inode)->outstanding_extents >
            BTRFS_I(inode)->reserved_extents)
@@ -5201,8 +5475,11 @@ out_fail:
                        to_free = 0;
        }
        spin_unlock(&BTRFS_I(inode)->lock);
-       if (dropped)
+       if (dropped) {
+               if (root->fs_info->quota_enabled)
+                       btrfs_qgroup_free(root, dropped * root->nodesize);
                to_free += btrfs_calc_trans_metadata_size(root, dropped);
+       }
 
        if (to_free) {
                btrfs_block_rsv_release(root, block_rsv, to_free);
@@ -5239,6 +5516,9 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
        if (dropped > 0)
                to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
+       if (btrfs_test_is_dummy_root(root))
+               return;
+
        trace_btrfs_space_reservation(root->fs_info, "delalloc",
                                      btrfs_ino(inode), to_free, 0);
        if (root->fs_info->quota_enabled) {
@@ -5341,14 +5621,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                if (!alloc && cache->cached == BTRFS_CACHE_NO)
                        cache_block_group(cache, 1);
 
-               spin_lock(&trans->transaction->dirty_bgs_lock);
-               if (list_empty(&cache->dirty_list)) {
-                       list_add_tail(&cache->dirty_list,
-                                     &trans->transaction->dirty_bgs);
-                       btrfs_get_block_group(cache);
-               }
-               spin_unlock(&trans->transaction->dirty_bgs_lock);
-
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
 
@@ -5397,6 +5669,16 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                                spin_unlock(&info->unused_bgs_lock);
                        }
                }
+
+               spin_lock(&trans->transaction->dirty_bgs_lock);
+               if (list_empty(&cache->dirty_list)) {
+                       list_add_tail(&cache->dirty_list,
+                                     &trans->transaction->dirty_bgs);
+                       trans->transaction->num_dirty_bgs++;
+                       btrfs_get_block_group(cache);
+               }
+               spin_unlock(&trans->transaction->dirty_bgs_lock);
+
                btrfs_put_block_group(cache);
                total -= num_bytes;
                bytenr += num_bytes;
@@ -6907,12 +7189,11 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
                return -ENOSPC;
        }
 
-       if (btrfs_test_opt(root, DISCARD))
-               ret = btrfs_discard_extent(root, start, len, NULL);
-
        if (pin)
                pin_down_extent(root, cache, start, len, 1);
        else {
+               if (btrfs_test_opt(root, DISCARD))
+                       ret = btrfs_discard_extent(root, start, len, NULL);
                btrfs_add_free_space(cache, start, len);
                btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
        }
@@ -7046,9 +7327,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
                                      ins, size);
        if (ret) {
+               btrfs_free_path(path);
                btrfs_free_and_pin_reserved_extent(root, ins->objectid,
                                                   root->nodesize);
-               btrfs_free_path(path);
                return ret;
        }
 
@@ -7168,7 +7449,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        btrfs_set_header_generation(buf, trans->transid);
        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
        btrfs_tree_lock(buf);
-       clean_tree_block(trans, root, buf);
+       clean_tree_block(trans, root->fs_info, buf);
        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
        btrfs_set_lock_blocking(buf);
@@ -7766,7 +8047,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
        blocksize = root->nodesize;
 
-       next = btrfs_find_tree_block(root, bytenr);
+       next = btrfs_find_tree_block(root->fs_info, bytenr);
        if (!next) {
                next = btrfs_find_create_tree_block(root, bytenr);
                if (!next)
@@ -7967,7 +8248,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                        btrfs_set_lock_blocking(eb);
                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
                }
-               clean_tree_block(trans, root, eb);
+               clean_tree_block(trans, root->fs_info, eb);
        }
 
        if (eb == root->node) {
@@ -8484,10 +8765,30 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 
        BUG_ON(cache->ro);
 
+again:
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
+       /*
+        * we're not allowed to set block groups readonly after the dirty
+        * block groups cache has started writing.  If it already started,
+        * back off and let this transaction commit
+        */
+       mutex_lock(&root->fs_info->ro_block_group_mutex);
+       if (trans->transaction->dirty_bg_run) {
+               u64 transid = trans->transid;
+
+               mutex_unlock(&root->fs_info->ro_block_group_mutex);
+               btrfs_end_transaction(trans, root);
+
+               ret = btrfs_wait_for_commit(root, transid);
+               if (ret)
+                       return ret;
+               goto again;
+       }
+
+
        ret = set_block_group_ro(cache, 0);
        if (!ret)
                goto out;
@@ -8502,6 +8803,7 @@ out:
                alloc_flags = update_block_group_flags(root, cache->flags);
                check_system_chunk(trans, root, alloc_flags);
        }
+       mutex_unlock(&root->fs_info->ro_block_group_mutex);
 
        btrfs_end_transaction(trans, root);
        return ret;
@@ -8671,7 +8973,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                min_free <<= 1;
        } else if (index == BTRFS_RAID_RAID0) {
                dev_min = fs_devices->rw_devices;
-               do_div(min_free, dev_min);
+               min_free = div64_u64(min_free, dev_min);
        }
 
        /* We need to do this so that we can look at pending chunks */
@@ -8943,6 +9245,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
        INIT_LIST_HEAD(&cache->bg_list);
        INIT_LIST_HEAD(&cache->ro_list);
        INIT_LIST_HEAD(&cache->dirty_list);
+       INIT_LIST_HEAD(&cache->io_list);
        btrfs_init_free_space_ctl(cache);
        atomic_set(&cache->trimming, 0);
 
@@ -9306,7 +9609,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                goto out;
        }
 
+       /*
+        * get the inode first so any iput calls done for the io_list
+        * aren't the final iput (no unlinks allowed now)
+        */
        inode = lookup_free_space_inode(tree_root, block_group, path);
+
+       mutex_lock(&trans->transaction->cache_write_mutex);
+       /*
+        * make sure our free space cache IO is done before we remove the
+        * free space inode
+        */
+       spin_lock(&trans->transaction->dirty_bgs_lock);
+       if (!list_empty(&block_group->io_list)) {
+               list_del_init(&block_group->io_list);
+
+               WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+               spin_unlock(&trans->transaction->dirty_bgs_lock);
+               btrfs_wait_cache_io(root, trans, block_group,
+                                   &block_group->io_ctl, path,
+                                   block_group->key.objectid);
+               btrfs_put_block_group(block_group);
+               spin_lock(&trans->transaction->dirty_bgs_lock);
+       }
+
+       if (!list_empty(&block_group->dirty_list)) {
+               list_del_init(&block_group->dirty_list);
+               btrfs_put_block_group(block_group);
+       }
+       spin_unlock(&trans->transaction->dirty_bgs_lock);
+       mutex_unlock(&trans->transaction->cache_write_mutex);
+
        if (!IS_ERR(inode)) {
                ret = btrfs_orphan_add(trans, inode);
                if (ret) {
@@ -9399,18 +9733,29 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
        spin_lock(&trans->transaction->dirty_bgs_lock);
        if (!list_empty(&block_group->dirty_list)) {
-               list_del_init(&block_group->dirty_list);
-               btrfs_put_block_group(block_group);
+               WARN_ON(1);
+       }
+       if (!list_empty(&block_group->io_list)) {
+               WARN_ON(1);
        }
        spin_unlock(&trans->transaction->dirty_bgs_lock);
-
        btrfs_remove_free_space_cache(block_group);
 
        spin_lock(&block_group->space_info->lock);
        list_del_init(&block_group->ro_list);
+
+       if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+               WARN_ON(block_group->space_info->total_bytes
+                       < block_group->key.offset);
+               WARN_ON(block_group->space_info->bytes_readonly
+                       < block_group->key.offset);
+               WARN_ON(block_group->space_info->disk_total
+                       < block_group->key.offset * factor);
+       }
        block_group->space_info->total_bytes -= block_group->key.offset;
        block_group->space_info->bytes_readonly -= block_group->key.offset;
        block_group->space_info->disk_total -= block_group->key.offset * factor;
+
        spin_unlock(&block_group->space_info->lock);
 
        memcpy(&key, &block_group->key, sizeof(key));
@@ -9598,8 +9943,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 
                /* Reset pinned so btrfs_put_block_group doesn't complain */
+               spin_lock(&space_info->lock);
+               spin_lock(&block_group->lock);
+
+               space_info->bytes_pinned -= block_group->pinned;
+               space_info->bytes_readonly += block_group->pinned;
+               percpu_counter_add(&space_info->total_bytes_pinned,
+                                  -block_group->pinned);
                block_group->pinned = 0;
 
+               spin_unlock(&block_group->lock);
+               spin_unlock(&space_info->lock);
+
                /*
                 * Btrfs_remove_chunk will abort the transaction if things go
                 * horribly wrong.