fs/btrfs/extent-tree.c

   1 /*
   2  * Copyright (C) 2007 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 02111-1307, USA.
  17  */
  18 #include <linux/sched.h>
  19 #include <linux/pagemap.h>
  20 #include <linux/writeback.h>
  21 #include <linux/blkdev.h>
  22 #include <linux/sort.h>
  23 #include <linux/rcupdate.h>
  24 #include <linux/kthread.h>
  25 #include <linux/slab.h>
  26 #include <linux/ratelimit.h>
  27 #include <linux/percpu_counter.h>
  28 #include "hash.h"
  29 #include "tree-log.h"
  30 #include "disk-io.h"
  31 #include "print-tree.h"
  32 #include "volumes.h"
  33 #include "raid56.h"
  34 #include "locking.h"
  35 #include "free-space-cache.h"
  36 #include "free-space-tree.h"
  37 #include "math.h"
  38 #include "sysfs.h"
  39 #include "qgroup.h"
  40
  41 #undef SCRAMBLE_DELAYED_REFS
  42
  43 /*
  44  * control flags for do_chunk_alloc's force field
  45  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  46  * if we really need one.
  47  *
  48  * CHUNK_ALLOC_LIMITED means to only try and allocate one
  49  * if we have very few chunks already allocated.  This is
  50  * used as part of the clustering code to help make sure
  51  * we have a good pool of storage to cluster in, without
  52  * filling the FS with empty chunks
  53  *
  54  * CHUNK_ALLOC_FORCE means it must try to allocate one
  55  *
  56  */
  57 enum {
  58         CHUNK_ALLOC_NO_FORCE = 0,
  59         CHUNK_ALLOC_LIMITED = 1,
  60         CHUNK_ALLOC_FORCE = 2,
  61 };
  62
  63 /*
  64  * Control how reservations are dealt with.
  65  *
  66  * RESERVE_FREE - freeing a reservation.
  67  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
  68  *   ENOSPC accounting
  69  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
  70  *   bytes_may_use as the ENOSPC accounting is done elsewhere
  71  */
  72 enum {
  73         RESERVE_FREE = 0,
  74         RESERVE_ALLOC = 1,
  75         RESERVE_ALLOC_NO_ACCOUNT = 2,
  76 };
  77
  78 static int update_block_group(struct btrfs_trans_handle *trans,
  79                               struct btrfs_root *root, u64 bytenr,
  80                               u64 num_bytes, int alloc);
  81 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  82                                 struct btrfs_root *root,
  83                                 struct btrfs_delayed_ref_node *node, u64 parent,
  84                                 u64 root_objectid, u64 owner_objectid,
  85                                 u64 owner_offset, int refs_to_drop,
  86                                 struct btrfs_delayed_extent_op *extra_op);
  87 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  88                                     struct extent_buffer *leaf,
  89                                     struct btrfs_extent_item *ei);
  90 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  91                                       struct btrfs_root *root,
  92                                       u64 parent, u64 root_objectid,
  93                                       u64 flags, u64 owner, u64 offset,
  94                                       struct btrfs_key *ins, int ref_mod);
  95 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  96                                      struct btrfs_root *root,
  97                                      u64 parent, u64 root_objectid,
  98                                      u64 flags, struct btrfs_disk_key *key,
  99                                      int level, struct btrfs_key *ins);
 100 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 101                           struct btrfs_root *extent_root, u64 flags,
 102                           int force);
 103 static int find_next_key(struct btrfs_path *path, int level,
 104                          struct btrfs_key *key);
 105 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 106                             int dump_block_groups);
 107 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 108                                        u64 num_bytes, int reserve,
 109                                        int delalloc);
 110 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
 111                                u64 num_bytes);
 112 int btrfs_pin_extent(struct btrfs_root *root,
 113                      u64 bytenr, u64 num_bytes, int reserved);
 114
 115 static noinline int
 116 block_group_cache_done(struct btrfs_block_group_cache *cache)
 117 {
 118         smp_mb();
 119         return cache->cached == BTRFS_CACHE_FINISHED ||
 120                 cache->cached == BTRFS_CACHE_ERROR;
 121 }
 122
 123 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 124 {
 125         return (cache->flags & bits) == bits;
 126 }
 127
 128 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 129 {
 130         atomic_inc(&cache->count);
 131 }
 132
 133 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 134 {
 135         if (atomic_dec_and_test(&cache->count)) {
 136                 WARN_ON(cache->pinned > 0);
 137                 WARN_ON(cache->reserved > 0);
 138                 kfree(cache->free_space_ctl);
 139                 kfree(cache);
 140         }
 141 }
 142
 143 /*
 144  * this adds the block group to the fs_info rb tree for the block group
 145  * cache
 146  */
 147 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 148                                 struct btrfs_block_group_cache *block_group)
 149 {
 150         struct rb_node **p;
 151         struct rb_node *parent = NULL;
 152         struct btrfs_block_group_cache *cache;
 153
 154         spin_lock(&info->block_group_cache_lock);
 155         p = &info->block_group_cache_tree.rb_node;
 156
 157         while (*p) {
 158                 parent = *p;
 159                 cache = rb_entry(parent, struct btrfs_block_group_cache,
 160                                  cache_node);
 161                 if (block_group->key.objectid < cache->key.objectid) {
 162                         p = &(*p)->rb_left;
 163                 } else if (block_group->key.objectid > cache->key.objectid) {
 164                         p = &(*p)->rb_right;
 165                 } else {
 166                         spin_unlock(&info->block_group_cache_lock);
 167                         return -EEXIST;
 168                 }
 169         }
 170
 171         rb_link_node(&block_group->cache_node, parent, p);
 172         rb_insert_color(&block_group->cache_node,
 173                         &info->block_group_cache_tree);
 174
 175         if (info->first_logical_byte > block_group->key.objectid)
 176                 info->first_logical_byte = block_group->key.objectid;
 177
 178         spin_unlock(&info->block_group_cache_lock);
 179
 180         return 0;
 181 }
 182
 183 /*
 184  * This will return the block group at or after bytenr if contains is 0, else
 185  * it will return the block group that contains the bytenr
 186  */
 187 static struct btrfs_block_group_cache *
 188 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 189                               int contains)
 190 {
 191         struct btrfs_block_group_cache *cache, *ret = NULL;
 192         struct rb_node *n;
 193         u64 end, start;
 194
 195         spin_lock(&info->block_group_cache_lock);
 196         n = info->block_group_cache_tree.rb_node;
 197
 198         while (n) {
 199                 cache = rb_entry(n, struct btrfs_block_group_cache,
 200                                  cache_node);
 201                 end = cache->key.objectid + cache->key.offset - 1;
 202                 start = cache->key.objectid;
 203
 204                 if (bytenr < start) {
 205                         if (!contains && (!ret || start < ret->key.objectid))
 206                                 ret = cache;
 207                         n = n->rb_left;
 208                 } else if (bytenr > start) {
 209                         if (contains && bytenr <= end) {
 210                                 ret = cache;
 211                                 break;
 212                         }
 213                         n = n->rb_right;
 214                 } else {
 215                         ret = cache;
 216                         break;
 217                 }
 218         }
 219         if (ret) {
 220                 btrfs_get_block_group(ret);
 221                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
 222                         info->first_logical_byte = ret->key.objectid;
 223         }
 224         spin_unlock(&info->block_group_cache_lock);
 225
 226         return ret;
 227 }
 228
 229 static int add_excluded_extent(struct btrfs_root *root,
 230                                u64 start, u64 num_bytes)
 231 {
 232         u64 end = start + num_bytes - 1;
 233         set_extent_bits(&root->fs_info->freed_extents[0],
 234                         start, end, EXTENT_UPTODATE, GFP_NOFS);
 235         set_extent_bits(&root->fs_info->freed_extents[1],
 236                         start, end, EXTENT_UPTODATE, GFP_NOFS);
 237         return 0;
 238 }
 239
 240 static void free_excluded_extents(struct btrfs_root *root,
 241                                   struct btrfs_block_group_cache *cache)
 242 {
 243         u64 start, end;
 244
 245         start = cache->key.objectid;
 246         end = start + cache->key.offset - 1;
 247
 248         clear_extent_bits(&root->fs_info->freed_extents[0],
 249                           start, end, EXTENT_UPTODATE, GFP_NOFS);
 250         clear_extent_bits(&root->fs_info->freed_extents[1],
 251                           start, end, EXTENT_UPTODATE, GFP_NOFS);
 252 }
 253
 254 static int exclude_super_stripes(struct btrfs_root *root,
 255                                  struct btrfs_block_group_cache *cache)
 256 {
 257         u64 bytenr;
 258         u64 *logical;
 259         int stripe_len;
 260         int i, nr, ret;
 261
 262         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 263                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 264                 cache->bytes_super += stripe_len;
 265                 ret = add_excluded_extent(root, cache->key.objectid,
 266                                           stripe_len);
 267                 if (ret)
 268                         return ret;
 269         }
 270
 271         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 272                 bytenr = btrfs_sb_offset(i);
 273                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
 274                                        cache->key.objectid, bytenr,
 275                                        0, &logical, &nr, &stripe_len);
 276                 if (ret)
 277                         return ret;
 278
 279                 while (nr--) {
 280                         u64 start, len;
 281
 282                         if (logical[nr] > cache->key.objectid +
 283                             cache->key.offset)
 284                                 continue;
 285
 286                         if (logical[nr] + stripe_len <= cache->key.objectid)
 287                                 continue;
 288
 289                         start = logical[nr];
 290                         if (start < cache->key.objectid) {
 291                                 start = cache->key.objectid;
 292                                 len = (logical[nr] + stripe_len) - start;
 293                         } else {
 294                                 len = min_t(u64, stripe_len,
 295                                             cache->key.objectid +
 296                                             cache->key.offset - start);
 297                         }
 298
 299                         cache->bytes_super += len;
 300                         ret = add_excluded_extent(root, start, len);
 301                         if (ret) {
 302                                 kfree(logical);
 303                                 return ret;
 304                         }
 305                 }
 306
 307                 kfree(logical);
 308         }
 309         return 0;
 310 }
 311
 312 static struct btrfs_caching_control *
 313 get_caching_control(struct btrfs_block_group_cache *cache)
 314 {
 315         struct btrfs_caching_control *ctl;
 316
 317         spin_lock(&cache->lock);
 318         if (!cache->caching_ctl) {
 319                 spin_unlock(&cache->lock);
 320                 return NULL;
 321         }
 322
 323         ctl = cache->caching_ctl;
 324         atomic_inc(&ctl->count);
 325         spin_unlock(&cache->lock);
 326         return ctl;
 327 }
 328
 329 static void put_caching_control(struct btrfs_caching_control *ctl)
 330 {
 331         if (atomic_dec_and_test(&ctl->count))
 332                 kfree(ctl);
 333 }
 334
 335 #ifdef CONFIG_BTRFS_DEBUG
 336 static void fragment_free_space(struct btrfs_root *root,
 337                                 struct btrfs_block_group_cache *block_group)
 338 {
 339         u64 start = block_group->key.objectid;
 340         u64 len = block_group->key.offset;
 341         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 342                 root->nodesize : root->sectorsize;
 343         u64 step = chunk << 1;
 344
 345         while (len > chunk) {
 346                 btrfs_remove_free_space(block_group, start, chunk);
 347                 start += step;
 348                 if (len < step)
 349                         len = 0;
 350                 else
 351                         len -= step;
 352         }
 353 }
 354 #endif
 355
 356 /*
 357  * this is only called by cache_block_group, since we could have freed extents
 358  * we need to check the pinned_extents for any extents that can't be used yet
 359  * since their free space will be released as soon as the transaction commits.
 360  */
 361 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 362                        struct btrfs_fs_info *info, u64 start, u64 end)
 363 {
 364         u64 extent_start, extent_end, size, total_added = 0;
 365         int ret;
 366
 367         while (start < end) {
 368                 ret = find_first_extent_bit(info->pinned_extents, start,
 369                                             &extent_start, &extent_end,
 370                                             EXTENT_DIRTY | EXTENT_UPTODATE,
 371                                             NULL);
 372                 if (ret)
 373                         break;
 374
 375                 if (extent_start <= start) {
 376                         start = extent_end + 1;
 377                 } else if (extent_start > start && extent_start < end) {
 378                         size = extent_start - start;
 379                         total_added += size;
 380                         ret = btrfs_add_free_space(block_group, start,
 381                                                    size);
 382                         BUG_ON(ret); /* -ENOMEM or logic error */
 383                         start = extent_end + 1;
 384                 } else {
 385                         break;
 386                 }
 387         }
 388
 389         if (start < end) {
 390                 size = end - start;
 391                 total_added += size;
 392                 ret = btrfs_add_free_space(block_group, start, size);
 393                 BUG_ON(ret); /* -ENOMEM or logic error */
 394         }
 395
 396         return total_added;
 397 }
 398
 399 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 400 {
 401         struct btrfs_block_group_cache *block_group;
 402         struct btrfs_fs_info *fs_info;
 403         struct btrfs_root *extent_root;
 404         struct btrfs_path *path;
 405         struct extent_buffer *leaf;
 406         struct btrfs_key key;
 407         u64 total_found = 0;
 408         u64 last = 0;
 409         u32 nritems;
 410         int ret;
 411         bool wakeup = true;
 412
 413         block_group = caching_ctl->block_group;
 414         fs_info = block_group->fs_info;
 415         extent_root = fs_info->extent_root;
 416
 417         path = btrfs_alloc_path();
 418         if (!path)
 419                 return -ENOMEM;
 420
 421         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 422
 423 #ifdef CONFIG_BTRFS_DEBUG
 424         /*
 425          * If we're fragmenting we don't want to make anybody think we can
 426          * allocate from this block group until we've had a chance to fragment
 427          * the free space.
 428          */
 429         if (btrfs_should_fragment_free_space(extent_root, block_group))
 430                 wakeup = false;
 431 #endif
 432         /*
 433          * We don't want to deadlock with somebody trying to allocate a new
 434          * extent for the extent root while also trying to search the extent
 435          * root to add free space.  So we skip locking and search the commit
 436          * root, since its read-only
 437          */
 438         path->skip_locking = 1;
 439         path->search_commit_root = 1;
 440         path->reada = 1;
 441
 442         key.objectid = last;
 443         key.offset = 0;
 444         key.type = BTRFS_EXTENT_ITEM_KEY;
 445
 446 next:
 447         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 448         if (ret < 0)
 449                 goto out;
 450
 451         leaf = path->nodes[0];
 452         nritems = btrfs_header_nritems(leaf);
 453
 454         while (1) {
 455                 if (btrfs_fs_closing(fs_info) > 1) {
 456                         last = (u64)-1;
 457                         break;
 458                 }
 459
 460                 if (path->slots[0] < nritems) {
 461                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 462                 } else {
 463                         ret = find_next_key(path, 0, &key);
 464                         if (ret)
 465                                 break;
 466
 467                         if (need_resched() ||
 468                             rwsem_is_contended(&fs_info->commit_root_sem)) {
 469                                 if (wakeup)
 470                                         caching_ctl->progress = last;
 471                                 btrfs_release_path(path);
 472                                 up_read(&fs_info->commit_root_sem);
 473                                 mutex_unlock(&caching_ctl->mutex);
 474                                 cond_resched();
 475                                 mutex_lock(&caching_ctl->mutex);
 476                                 down_read(&fs_info->commit_root_sem);
 477                                 goto next;
 478                         }
 479
 480                         ret = btrfs_next_leaf(extent_root, path);
 481                         if (ret < 0)
 482                                 goto out;
 483                         if (ret)
 484                                 break;
 485                         leaf = path->nodes[0];
 486                         nritems = btrfs_header_nritems(leaf);
 487                         continue;
 488                 }
 489
 490                 if (key.objectid < last) {
 491                         key.objectid = last;
 492                         key.offset = 0;
 493                         key.type = BTRFS_EXTENT_ITEM_KEY;
 494
 495                         if (wakeup)
 496                                 caching_ctl->progress = last;
 497                         btrfs_release_path(path);
 498                         goto next;
 499                 }
 500
 501                 if (key.objectid < block_group->key.objectid) {
 502                         path->slots[0]++;
 503                         continue;
 504                 }
 505
 506                 if (key.objectid >= block_group->key.objectid +
 507                     block_group->key.offset)
 508                         break;
 509
 510                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 511                     key.type == BTRFS_METADATA_ITEM_KEY) {
 512                         total_found += add_new_free_space(block_group,
 513                                                           fs_info, last,
 514                                                           key.objectid);
 515                         if (key.type == BTRFS_METADATA_ITEM_KEY)
 516                                 last = key.objectid +
 517                                         fs_info->tree_root->nodesize;
 518                         else
 519                                 last = key.objectid + key.offset;
 520
 521                         if (total_found > CACHING_CTL_WAKE_UP) {
 522                                 total_found = 0;
 523                                 if (wakeup)
 524                                         wake_up(&caching_ctl->wait);
 525                         }
 526                 }
 527                 path->slots[0]++;
 528         }
 529         ret = 0;
 530
 531         total_found += add_new_free_space(block_group, fs_info, last,
 532                                           block_group->key.objectid +
 533                                           block_group->key.offset);
 534         caching_ctl->progress = (u64)-1;
 535
 536 out:
 537         btrfs_free_path(path);
 538         return ret;
 539 }
 540
 541 static noinline void caching_thread(struct btrfs_work *work)
 542 {
 543         struct btrfs_block_group_cache *block_group;
 544         struct btrfs_fs_info *fs_info;
 545         struct btrfs_caching_control *caching_ctl;
 546         int ret;
 547
 548         caching_ctl = container_of(work, struct btrfs_caching_control, work);
 549         block_group = caching_ctl->block_group;
 550         fs_info = block_group->fs_info;
 551
 552         mutex_lock(&caching_ctl->mutex);
 553         down_read(&fs_info->commit_root_sem);
 554
 555         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
 556                 ret = load_free_space_tree(caching_ctl);
 557         else
 558                 ret = load_extent_tree_free(caching_ctl);
 559
 560         spin_lock(&block_group->lock);
 561         block_group->caching_ctl = NULL;
 562         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 563         spin_unlock(&block_group->lock);
 564
 565 #ifdef CONFIG_BTRFS_DEBUG
 566         if (btrfs_should_fragment_free_space(extent_root, block_group)) {
 567                 u64 bytes_used;
 568
 569                 spin_lock(&block_group->space_info->lock);
 570                 spin_lock(&block_group->lock);
 571                 bytes_used = block_group->key.offset -
 572                         btrfs_block_group_used(&block_group->item);
 573                 block_group->space_info->bytes_used += bytes_used >> 1;
 574                 spin_unlock(&block_group->lock);
 575                 spin_unlock(&block_group->space_info->lock);
 576                 fragment_free_space(extent_root, block_group);
 577         }
 578 #endif
 579
 580         caching_ctl->progress = (u64)-1;
 581
 582         up_read(&fs_info->commit_root_sem);
 583         free_excluded_extents(fs_info->extent_root, block_group);
 584         mutex_unlock(&caching_ctl->mutex);
 585
 586         wake_up(&caching_ctl->wait);
 587
 588         put_caching_control(caching_ctl);
 589         btrfs_put_block_group(block_group);
 590 }
 591
 592 static int cache_block_group(struct btrfs_block_group_cache *cache,
 593                              int load_cache_only)
 594 {
 595         DEFINE_WAIT(wait);
 596         struct btrfs_fs_info *fs_info = cache->fs_info;
 597         struct btrfs_caching_control *caching_ctl;
 598         int ret = 0;
 599
 600         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 601         if (!caching_ctl)
 602                 return -ENOMEM;
 603
 604         INIT_LIST_HEAD(&caching_ctl->list);
 605         mutex_init(&caching_ctl->mutex);
 606         init_waitqueue_head(&caching_ctl->wait);
 607         caching_ctl->block_group = cache;
 608         caching_ctl->progress = cache->key.objectid;
 609         atomic_set(&caching_ctl->count, 1);
 610         btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
 611                         caching_thread, NULL, NULL);
 612
 613         spin_lock(&cache->lock);
 614         /*
 615          * This should be a rare occasion, but this could happen I think in the
 616          * case where one thread starts to load the space cache info, and then
 617          * some other thread starts a transaction commit which tries to do an
 618          * allocation while the other thread is still loading the space cache
 619          * info.  The previous loop should have kept us from choosing this block
 620          * group, but if we've moved to the state where we will wait on caching
 621          * block groups we need to first check if we're doing a fast load here,
 622          * so we can wait for it to finish, otherwise we could end up allocating
 623          * from a block group who's cache gets evicted for one reason or
 624          * another.
 625          */
 626         while (cache->cached == BTRFS_CACHE_FAST) {
 627                 struct btrfs_caching_control *ctl;
 628
 629                 ctl = cache->caching_ctl;
 630                 atomic_inc(&ctl->count);
 631                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 632                 spin_unlock(&cache->lock);
 633
 634                 schedule();
 635
 636                 finish_wait(&ctl->wait, &wait);
 637                 put_caching_control(ctl);
 638                 spin_lock(&cache->lock);
 639         }
 640
 641         if (cache->cached != BTRFS_CACHE_NO) {
 642                 spin_unlock(&cache->lock);
 643                 kfree(caching_ctl);
 644                 return 0;
 645         }
 646         WARN_ON(cache->caching_ctl);
 647         cache->caching_ctl = caching_ctl;
 648         cache->cached = BTRFS_CACHE_FAST;
 649         spin_unlock(&cache->lock);
 650
 651         if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
 652                 mutex_lock(&caching_ctl->mutex);
 653                 ret = load_free_space_cache(fs_info, cache);
 654
 655                 spin_lock(&cache->lock);
 656                 if (ret == 1) {
 657                         cache->caching_ctl = NULL;
 658                         cache->cached = BTRFS_CACHE_FINISHED;
 659                         cache->last_byte_to_unpin = (u64)-1;
 660                         caching_ctl->progress = (u64)-1;
 661                 } else {
 662                         if (load_cache_only) {
 663                                 cache->caching_ctl = NULL;
 664                                 cache->cached = BTRFS_CACHE_NO;
 665                         } else {
 666                                 cache->cached = BTRFS_CACHE_STARTED;
 667                                 cache->has_caching_ctl = 1;
 668                         }
 669                 }
 670                 spin_unlock(&cache->lock);
 671 #ifdef CONFIG_BTRFS_DEBUG
 672                 if (ret == 1 &&
 673                     btrfs_should_fragment_free_space(fs_info->extent_root,
 674                                                      cache)) {
 675                         u64 bytes_used;
 676
 677                         spin_lock(&cache->space_info->lock);
 678                         spin_lock(&cache->lock);
 679                         bytes_used = cache->key.offset -
 680                                 btrfs_block_group_used(&cache->item);
 681                         cache->space_info->bytes_used += bytes_used >> 1;
 682                         spin_unlock(&cache->lock);
 683                         spin_unlock(&cache->space_info->lock);
 684                         fragment_free_space(fs_info->extent_root, cache);
 685                 }
 686 #endif
 687                 mutex_unlock(&caching_ctl->mutex);
 688
 689                 wake_up(&caching_ctl->wait);
 690                 if (ret == 1) {
 691                         put_caching_control(caching_ctl);
 692                         free_excluded_extents(fs_info->extent_root, cache);
 693                         return 0;
 694                 }
 695         } else {
 696                 /*
 697                  * We're either using the free space tree or no caching at all.
 698                  * Set cached to the appropriate value and wakeup any waiters.
 699                  */
 700                 spin_lock(&cache->lock);
 701                 if (load_cache_only) {
 702                         cache->caching_ctl = NULL;
 703                         cache->cached = BTRFS_CACHE_NO;
 704                 } else {
 705                         cache->cached = BTRFS_CACHE_STARTED;
 706                         cache->has_caching_ctl = 1;
 707                 }
 708                 spin_unlock(&cache->lock);
 709                 wake_up(&caching_ctl->wait);
 710         }
 711
 712         if (load_cache_only) {
 713                 put_caching_control(caching_ctl);
 714                 return 0;
 715         }
 716
 717         down_write(&fs_info->commit_root_sem);
 718         atomic_inc(&caching_ctl->count);
 719         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 720         up_write(&fs_info->commit_root_sem);
 721
 722         btrfs_get_block_group(cache);
 723
 724         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 725
 726         return ret;
 727 }
 728
 729 /*
 730  * return the block group that starts at or after bytenr
 731  */
 732 static struct btrfs_block_group_cache *
 733 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 734 {
 735         struct btrfs_block_group_cache *cache;
 736
 737         cache = block_group_cache_tree_search(info, bytenr, 0);
 738
 739         return cache;
 740 }
 741
 742 /*
 743  * return the block group that contains the given bytenr
 744  */
 745 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 746                                                  struct btrfs_fs_info *info,
 747                                                  u64 bytenr)
 748 {
 749         struct btrfs_block_group_cache *cache;
 750
 751         cache = block_group_cache_tree_search(info, bytenr, 1);
 752
 753         return cache;
 754 }
 755
 756 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 757                                                   u64 flags)
 758 {
 759         struct list_head *head = &info->space_info;
 760         struct btrfs_space_info *found;
 761
 762         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 763
 764         rcu_read_lock();
 765         list_for_each_entry_rcu(found, head, list) {
 766                 if (found->flags & flags) {
 767                         rcu_read_unlock();
 768                         return found;
 769                 }
 770         }
 771         rcu_read_unlock();
 772         return NULL;
 773 }
 774
 775 /*
 776  * after adding space to the filesystem, we need to clear the full flags
 777  * on all the space infos.
 778  */
 779 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 780 {
 781         struct list_head *head = &info->space_info;
 782         struct btrfs_space_info *found;
 783
 784         rcu_read_lock();
 785         list_for_each_entry_rcu(found, head, list)
 786                 found->full = 0;
 787         rcu_read_unlock();
 788 }
 789
 790 /* simple helper to search for an existing data extent at a given offset */
 791 int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
 792 {
 793         int ret;
 794         struct btrfs_key key;
 795         struct btrfs_path *path;
 796
 797         path = btrfs_alloc_path();
 798         if (!path)
 799                 return -ENOMEM;
 800
 801         key.objectid = start;
 802         key.offset = len;
 803         key.type = BTRFS_EXTENT_ITEM_KEY;
 804         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 805                                 0, 0);
 806         btrfs_free_path(path);
 807         return ret;
 808 }
 809
/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be once all
 * of the pending delayed refs have been applied.
 *
 * @trans:    transaction handle; when NULL the commit root is searched
 *            without locking and delayed refs are not consulted.
 * @root:     any root of the filesystem (only root->fs_info and
 *            root->nodesize are used).
 * @bytenr:   start of the extent to look up.
 * @offset:   key offset to search for; replaced by root->nodesize when
 *            @metadata is set but SKINNY_METADATA is not enabled.
 * @metadata: non-zero to search for a BTRFS_METADATA_ITEM_KEY first.
 * @refs:     optional out parameter for the reference count.
 * @flags:    optional out parameter for the extent flags.
 *
 * Returns a negative errno if the path allocation or the tree search
 * fails (in which case @refs and @flags are left untouched), 0 otherwise.
 * If no extent item is found the on-disk count and flags are taken as 0.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 bytenr,
                             u64 offset, int metadata, u64 *refs, u64 *flags)
{
        struct btrfs_delayed_ref_head *head;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_path *path;
        struct btrfs_extent_item *ei;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u32 item_size;
        u64 num_refs;
        u64 extent_flags;
        int ret;

        /*
         * If we don't have skinny metadata, don't bother doing anything
         * different
         */
        if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
                offset = root->nodesize;
                metadata = 0;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* without a transaction, read the commit root and skip tree locks */
        if (!trans) {
                path->skip_locking = 1;
                path->search_commit_root = 1;
        }

search_again:
        key.objectid = bytenr;
        key.offset = offset;
        if (metadata)
                key.type = BTRFS_METADATA_ITEM_KEY;
        else
                key.type = BTRFS_EXTENT_ITEM_KEY;

        ret = btrfs_search_slot(trans, root->fs_info->extent_root,
                                &key, path, 0, 0);
        if (ret < 0)
                goto out_free;

        /*
         * No exact METADATA_ITEM match: accept the item in the previous
         * slot if it is an EXTENT_ITEM for the same bytenr whose offset
         * equals the node size.
         */
        if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
                if (path->slots[0]) {
                        path->slots[0]--;
                        btrfs_item_key_to_cpu(path->nodes[0], &key,
                                              path->slots[0]);
                        if (key.objectid == bytenr &&
                            key.type == BTRFS_EXTENT_ITEM_KEY &&
                            key.offset == root->nodesize)
                                ret = 0;
                }
        }

        if (ret == 0) {
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
                if (item_size >= sizeof(*ei)) {
                        ei = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_extent_item);
                        num_refs = btrfs_extent_refs(leaf, ei);
                        extent_flags = btrfs_extent_flags(leaf, ei);
                } else {
                        /* item is smaller than the current extent item */
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                        struct btrfs_extent_item_v0 *ei0;
                        BUG_ON(item_size != sizeof(*ei0));
                        ei0 = btrfs_item_ptr(leaf, path->slots[0],
                                             struct btrfs_extent_item_v0);
                        num_refs = btrfs_extent_refs_v0(leaf, ei0);
                        /* FIXME: this isn't correct for data */
                        extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
#else
                        BUG();
#endif
                }
                BUG_ON(num_refs == 0);
        } else {
                /* nothing on disk: start from zero, not treated as an error */
                num_refs = 0;
                extent_flags = 0;
                ret = 0;
        }

        if (!trans)
                goto out;

        /* fold in the modifications queued on the delayed ref head, if any */
        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
        head = btrfs_find_delayed_ref_head(trans, bytenr);
        if (head) {
                if (!mutex_trylock(&head->mutex)) {
                        /*
                         * take a reference on the head before dropping
                         * delayed_refs->lock; it is released again by
                         * btrfs_put_delayed_ref() below
                         */
                        atomic_inc(&head->node.refs);
                        spin_unlock(&delayed_refs->lock);

                        btrfs_release_path(path);

                        /*
                         * Mutex was contended, block until it's released and try
                         * again
                         */
                        mutex_lock(&head->mutex);
                        mutex_unlock(&head->mutex);
                        btrfs_put_delayed_ref(&head->node);
                        goto search_again;
                }
                spin_lock(&head->lock);
                if (head->extent_op && head->extent_op->update_flags)
                        extent_flags |= head->extent_op->flags_to_set;
                else
                        BUG_ON(num_refs == 0);

                num_refs += head->node.ref_mod;
                spin_unlock(&head->lock);
                mutex_unlock(&head->mutex);
        }
        spin_unlock(&delayed_refs->lock);
out:
        WARN_ON(num_refs == 0);
        /* both out parameters are optional */
        if (refs)
                *refs = num_refs;
        if (flags)
                *flags = extent_flags;
out_free:
        btrfs_free_path(path);
        return ret;
}
 948
 949 /*
 950  * Back reference rules.  Back refs have three main goals:
 951  *
 952  * 1) differentiate between all holders of references to an extent so that
 953  *    when a reference is dropped we can make sure it was a valid reference
 954  *    before freeing the extent.
 955  *
 956  * 2) Provide enough information to quickly find the holders of an extent
 957  *    if we notice a given block is corrupted or bad.
 958  *
 959  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 960  *    maintenance.  This is actually the same as #2, but with a slightly
 961  *    different use case.
 962  *
 963  * There are two kinds of back refs. The implicit back refs is optimized
 964  * for pointers in non-shared tree blocks. For a given pointer in a block,
 965  * back refs of this kind provide information about the block's owner tree
 966  * and the pointer's key. This information allows us to find the block by
 967  * b-tree searching. The full back refs is for pointers in tree blocks not
 968  * referenced by their owner trees. The location of tree block is recorded
 969  * in the back refs. Actually the full back refs is generic, and can be
 970  * used in all cases the implicit back refs is used. The major shortcoming
 971  * of the full back refs is its overhead. Every time a tree block gets
 972  * COWed, we have to update back refs entry for all pointers in it.
 973  *
 974  * For a newly allocated tree block, we use implicit back refs for
 975  * pointers in it. This means most tree related operations only involve
 976  * implicit back refs. For a tree block created in old transaction, the
 977  * only way to drop a reference to it is COW it. So we can detect the
 978  * event that tree block loses its owner tree's reference and do the
 979  * back refs conversion.
 980  *
 981  * When a tree block is COW'd through a tree, there are four cases:
 982  *
 983  * The reference count of the block is one and the tree is the block's
 984  * owner tree. Nothing to do in this case.
 985  *
 986  * The reference count of the block is one and the tree is not the
 987  * block's owner tree. In this case, full back refs is used for pointers
 988  * in the block. Remove these full back refs, add implicit back refs for
 989  * every pointer in the new block.
 990  *
 991  * The reference count of the block is greater than one and the tree is
 992  * the block's owner tree. In this case, implicit back refs is used for
 993  * pointers in the block. Add full back refs for every pointer in the
 994  * block, increase lower level extents' reference counts. The original
 995  * implicit back refs are entailed to the new block.
 996  *
 997  * The reference count of the block is greater than one and the tree is
 998  * not the block's owner tree. Add implicit back refs for every pointer in
 999  * the new block, increase lower level extents' reference count.
1000  *
1001  * Back Reference Key composing:
1002  *
1003  * The key objectid corresponds to the first byte in the extent,
1004  * The key type is used to differentiate between types of back refs.
1005  * There are different meanings of the key offset for different types
1006  * of back refs.
1007  *
1008  * File extents can be referenced by:
1009  *
1010  * - multiple snapshots, subvolumes, or different generations in one subvol
1011  * - different files inside a single subvolume
1012  * - different offsets inside a file (bookend extents in file.c)
1013  *
1014  * The extent ref structure for the implicit back refs has fields for:
1015  *
1016  * - Objectid of the subvolume root
1017  * - objectid of the file holding the reference
1018  * - original offset in the file
1019  * - how many bookend extents
1020  *
1021  * The key offset for the implicit back refs is hash of the first
1022  * three fields.
1023  *
1024  * The extent ref structure for the full back refs has field for:
1025  *
1026  * - number of pointers in the tree leaf
1027  *
1028  * The key offset for the full back refs is the first byte of
1029  * the tree leaf
1030  *
1031  * When a file extent is allocated, The implicit back refs is used.
1032  * the fields are filled in:
1033  *
1034  *     (root_key.objectid, inode objectid, offset in file, 1)
1035  *
1036  * When a file extent is removed by file truncation, we find the
1037  * corresponding implicit back refs and check the following fields:
1038  *
1039  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1040  *
1041  * Btree extents can be referenced by:
1042  *
1043  * - Different subvolumes
1044  *
1045  * Both the implicit back refs and the full back refs for tree blocks
1046  * only consist of key. The key offset for the implicit back refs is
1047  * objectid of block's owner tree. The key offset for the full back refs
1048  * is the first byte of parent block.
1049  *
1050  * When implicit back refs is used, information about the lowest key and
1051  * level of the tree block are required. This information is stored in
1052  * tree block info structure.
1053  */
1054
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
/*
 * Convert the v0 extent item that @path points at into a
 * struct btrfs_extent_item, in place.
 *
 * @owner:      owner of the extent; when it is (u64)-1 the owner is read
 *              from the first BTRFS_EXTENT_REF_V0_KEY item found by
 *              scanning forward from the current slot.
 * @extra_size: additional bytes requested from btrfs_search_slot() on top
 *              of the growth needed for the conversion itself.
 *
 * When the owner is below BTRFS_FIRST_FREE_OBJECTID the item is treated as
 * a tree block: a zeroed struct btrfs_tree_block_info is appended and the
 * flags are set to TREE_BLOCK | FULL_BACKREF.  Otherwise the item is
 * flagged as data.  The generation is written as 0.
 *
 * Returns 0 on success or a negative error from btrfs_next_leaf() /
 * btrfs_search_slot().  Unexpected tree contents trigger BUG_ON().
 */
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_path *path,
                                  u64 owner, u32 extra_size)
{
        struct btrfs_extent_item *item;
        struct btrfs_extent_item_v0 *ei0;
        struct btrfs_extent_ref_v0 *ref0;
        struct btrfs_tree_block_info *bi;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        u32 new_size = sizeof(*item);
        u64 refs;
        int ret;

        leaf = path->nodes[0];
        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

        /* remember the item's key and ref count before the path moves */
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        ei0 = btrfs_item_ptr(leaf, path->slots[0],
                             struct btrfs_extent_item_v0);
        refs = btrfs_extent_refs_v0(leaf, ei0);

        if (owner == (u64)-1) {
                /* owner unknown: scan forward for a v0 ref of this extent */
                while (1) {
                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                                ret = btrfs_next_leaf(root, path);
                                if (ret < 0)
                                        return ret;
                                BUG_ON(ret > 0); /* Corruption */
                                leaf = path->nodes[0];
                        }
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0]);
                        BUG_ON(key.objectid != found_key.objectid);
                        if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
                                path->slots[0]++;
                                continue;
                        }
                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                              struct btrfs_extent_ref_v0);
                        owner = btrfs_ref_objectid_v0(leaf, ref0);
                        break;
                }
        }
        btrfs_release_path(path);

        if (owner < BTRFS_FIRST_FREE_OBJECTID)
                new_size += sizeof(*bi);

        /* new_size becomes the number of bytes the item has to grow by */
        new_size -= sizeof(*ei0);
        ret = btrfs_search_slot(trans, root, &key, path,
                                new_size + extra_size, 1);
        if (ret < 0)
                return ret;
        BUG_ON(ret); /* Corruption */

        btrfs_extend_item(root, path, new_size);

        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        btrfs_set_extent_refs(leaf, item, refs);
        /* FIXME: get real generation */
        btrfs_set_extent_generation(leaf, item, 0);
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                btrfs_set_extent_flags(leaf, item,
                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
                /* the tree block info sits directly behind the extent item */
                bi = (struct btrfs_tree_block_info *)(item + 1);
                /* FIXME: get first key of the block */
                memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
                /* the owner value is stored as the block's level here */
                btrfs_set_tree_block_level(leaf, bi, (int)owner);
        } else {
                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
        }
        btrfs_mark_buffer_dirty(leaf);
        return 0;
}
#endif
1136
1137 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1138 {
1139         u32 high_crc = ~(u32)0;
1140         u32 low_crc = ~(u32)0;
1141         __le64 lenum;
1142
1143         lenum = cpu_to_le64(root_objectid);
1144         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1145         lenum = cpu_to_le64(owner);
1146         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1147         lenum = cpu_to_le64(offset);
1148         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1149
1150         return ((u64)high_crc << 31) ^ (u64)low_crc;
1151 }
1152
1153 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1154                                      struct btrfs_extent_data_ref *ref)
1155 {
1156         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1157                                     btrfs_extent_data_ref_objectid(leaf, ref),
1158                                     btrfs_extent_data_ref_offset(leaf, ref));
1159 }
1160
1161 static int match_extent_data_ref(struct extent_buffer *leaf,
1162                                  struct btrfs_extent_data_ref *ref,
1163                                  u64 root_objectid, u64 owner, u64 offset)
1164 {
1165         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1166             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1167             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1168                 return 0;
1169         return 1;
1170 }
1171
1172 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1173                                            struct btrfs_root *root,
1174                                            struct btrfs_path *path,
1175                                            u64 bytenr, u64 parent,
1176                                            u64 root_objectid,
1177                                            u64 owner, u64 offset)
1178 {
1179         struct btrfs_key key;
1180         struct btrfs_extent_data_ref *ref;
1181         struct extent_buffer *leaf;
1182         u32 nritems;
1183         int ret;
1184         int recow;
1185         int err = -ENOENT;
1186
1187         key.objectid = bytenr;
1188         if (parent) {
1189                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1190                 key.offset = parent;
1191         } else {
1192                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1193                 key.offset = hash_extent_data_ref(root_objectid,
1194                                                   owner, offset);
1195         }
1196 again:
1197         recow = 0;
1198         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1199         if (ret < 0) {
1200                 err = ret;
1201                 goto fail;
1202         }
1203
1204         if (parent) {
1205                 if (!ret)
1206                         return 0;
1207 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1208                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1209                 btrfs_release_path(path);
1210                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1211                 if (ret < 0) {
1212                         err = ret;
1213                         goto fail;
1214                 }
1215                 if (!ret)
1216                         return 0;
1217 #endif
1218                 goto fail;
1219         }
1220
1221         leaf = path->nodes[0];
1222         nritems = btrfs_header_nritems(leaf);
1223         while (1) {
1224                 if (path->slots[0] >= nritems) {
1225                         ret = btrfs_next_leaf(root, path);
1226                         if (ret < 0)
1227                                 err = ret;
1228                         if (ret)
1229                                 goto fail;
1230
1231                         leaf = path->nodes[0];
1232                         nritems = btrfs_header_nritems(leaf);
1233                         recow = 1;
1234                 }
1235
1236                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1237                 if (key.objectid != bytenr ||
1238                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1239                         goto fail;
1240
1241                 ref = btrfs_item_ptr(leaf, path->slots[0],
1242                                      struct btrfs_extent_data_ref);
1243
1244                 if (match_extent_data_ref(leaf, ref, root_objectid,
1245                                           owner, offset)) {
1246                         if (recow) {
1247                                 btrfs_release_path(path);
1248                                 goto again;
1249                         }
1250                         err = 0;
1251                         break;
1252                 }
1253                 path->slots[0]++;
1254         }
1255 fail:
1256         return err;
1257 }
1258
1259 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1260                                            struct btrfs_root *root,
1261                                            struct btrfs_path *path,
1262                                            u64 bytenr, u64 parent,
1263                                            u64 root_objectid, u64 owner,
1264                                            u64 offset, int refs_to_add)
1265 {
1266         struct btrfs_key key;
1267         struct extent_buffer *leaf;
1268         u32 size;
1269         u32 num_refs;
1270         int ret;
1271
1272         key.objectid = bytenr;
1273         if (parent) {
1274                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1275                 key.offset = parent;
1276                 size = sizeof(struct btrfs_shared_data_ref);
1277         } else {
1278                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1279                 key.offset = hash_extent_data_ref(root_objectid,
1280                                                   owner, offset);
1281                 size = sizeof(struct btrfs_extent_data_ref);
1282         }
1283
1284         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1285         if (ret && ret != -EEXIST)
1286                 goto fail;
1287
1288         leaf = path->nodes[0];
1289         if (parent) {
1290                 struct btrfs_shared_data_ref *ref;
1291                 ref = btrfs_item_ptr(leaf, path->slots[0],
1292                                      struct btrfs_shared_data_ref);
1293                 if (ret == 0) {
1294                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1295                 } else {
1296                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1297                         num_refs += refs_to_add;
1298                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1299                 }
1300         } else {
1301                 struct btrfs_extent_data_ref *ref;
1302                 while (ret == -EEXIST) {
1303                         ref = btrfs_item_ptr(leaf, path->slots[0],
1304                                              struct btrfs_extent_data_ref);
1305                         if (match_extent_data_ref(leaf, ref, root_objectid,
1306                                                   owner, offset))
1307                                 break;
1308                         btrfs_release_path(path);
1309                         key.offset++;
1310                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1311                                                       size);
1312                         if (ret && ret != -EEXIST)
1313                                 goto fail;
1314
1315                         leaf = path->nodes[0];
1316                 }
1317                 ref = btrfs_item_ptr(leaf, path->slots[0],
1318                                      struct btrfs_extent_data_ref);
1319                 if (ret == 0) {
1320                         btrfs_set_extent_data_ref_root(leaf, ref,
1321                                                        root_objectid);
1322                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1323                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1324                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1325                 } else {
1326                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1327                         num_refs += refs_to_add;
1328                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1329                 }
1330         }
1331         btrfs_mark_buffer_dirty(leaf);
1332         ret = 0;
1333 fail:
1334         btrfs_release_path(path);
1335         return ret;
1336 }
1337
1338 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1339                                            struct btrfs_root *root,
1340                                            struct btrfs_path *path,
1341                                            int refs_to_drop, int *last_ref)
1342 {
1343         struct btrfs_key key;
1344         struct btrfs_extent_data_ref *ref1 = NULL;
1345         struct btrfs_shared_data_ref *ref2 = NULL;
1346         struct extent_buffer *leaf;
1347         u32 num_refs = 0;
1348         int ret = 0;
1349
1350         leaf = path->nodes[0];
1351         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1352
1353         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1354                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1355                                       struct btrfs_extent_data_ref);
1356                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1357         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1358                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1359                                       struct btrfs_shared_data_ref);
1360                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1361 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1362         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1363                 struct btrfs_extent_ref_v0 *ref0;
1364                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1365                                       struct btrfs_extent_ref_v0);
1366                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1367 #endif
1368         } else {
1369                 BUG();
1370         }
1371
1372         BUG_ON(num_refs < refs_to_drop);
1373         num_refs -= refs_to_drop;
1374
1375         if (num_refs == 0) {
1376                 ret = btrfs_del_item(trans, root, path);
1377                 *last_ref = 1;
1378         } else {
1379                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1380                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1381                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1382                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1383 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1384                 else {
1385                         struct btrfs_extent_ref_v0 *ref0;
1386                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1387                                         struct btrfs_extent_ref_v0);
1388                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1389                 }
1390 #endif
1391                 btrfs_mark_buffer_dirty(leaf);
1392         }
1393         return ret;
1394 }
1395
1396 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1397                                           struct btrfs_extent_inline_ref *iref)
1398 {
1399         struct btrfs_key key;
1400         struct extent_buffer *leaf;
1401         struct btrfs_extent_data_ref *ref1;
1402         struct btrfs_shared_data_ref *ref2;
1403         u32 num_refs = 0;
1404
1405         leaf = path->nodes[0];
1406         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1407         if (iref) {
1408                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1409                     BTRFS_EXTENT_DATA_REF_KEY) {
1410                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1411                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1412                 } else {
1413                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1414                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1415                 }
1416         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1417                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1418                                       struct btrfs_extent_data_ref);
1419                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1420         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1421                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1422                                       struct btrfs_shared_data_ref);
1423                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1424 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1425         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1426                 struct btrfs_extent_ref_v0 *ref0;
1427                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1428                                       struct btrfs_extent_ref_v0);
1429                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1430 #endif
1431         } else {
1432                 WARN_ON(1);
1433         }
1434         return num_refs;
1435 }
1436
1437 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1438                                           struct btrfs_root *root,
1439                                           struct btrfs_path *path,
1440                                           u64 bytenr, u64 parent,
1441                                           u64 root_objectid)
1442 {
1443         struct btrfs_key key;
1444         int ret;
1445
1446         key.objectid = bytenr;
1447         if (parent) {
1448                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1449                 key.offset = parent;
1450         } else {
1451                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1452                 key.offset = root_objectid;
1453         }
1454
1455         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1456         if (ret > 0)
1457                 ret = -ENOENT;
1458 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1459         if (ret == -ENOENT && parent) {
1460                 btrfs_release_path(path);
1461                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1462                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1463                 if (ret > 0)
1464                         ret = -ENOENT;
1465         }
1466 #endif
1467         return ret;
1468 }
1469
1470 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1471                                           struct btrfs_root *root,
1472                                           struct btrfs_path *path,
1473                                           u64 bytenr, u64 parent,
1474                                           u64 root_objectid)
1475 {
1476         struct btrfs_key key;
1477         int ret;
1478
1479         key.objectid = bytenr;
1480         if (parent) {
1481                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1482                 key.offset = parent;
1483         } else {
1484                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1485                 key.offset = root_objectid;
1486         }
1487
1488         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1489         btrfs_release_path(path);
1490         return ret;
1491 }
1492
1493 static inline int extent_ref_type(u64 parent, u64 owner)
1494 {
1495         int type;
1496         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1497                 if (parent > 0)
1498                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1499                 else
1500                         type = BTRFS_TREE_BLOCK_REF_KEY;
1501         } else {
1502                 if (parent > 0)
1503                         type = BTRFS_SHARED_DATA_REF_KEY;
1504                 else
1505                         type = BTRFS_EXTENT_DATA_REF_KEY;
1506         }
1507         return type;
1508 }
1509
1510 static int find_next_key(struct btrfs_path *path, int level,
1511                          struct btrfs_key *key)
1512
1513 {
1514         for (; level < BTRFS_MAX_LEVEL; level++) {
1515                 if (!path->nodes[level])
1516                         break;
1517                 if (path->slots[level] + 1 >=
1518                     btrfs_header_nritems(path->nodes[level]))
1519                         continue;
1520                 if (level == 0)
1521                         btrfs_item_key_to_cpu(path->nodes[level], key,
1522                                               path->slots[level] + 1);
1523                 else
1524                         btrfs_node_key_to_cpu(path->nodes[level], key,
1525                                               path->slots[level] + 1);
1526                 return 0;
1527         }
1528         return 1;
1529 }
1530
/*
 * Look for an inline back ref inside the extent item of the extent that
 * starts at @bytenr.
 *
 * If the back ref is found, *ref_ret is set to the address of the inline
 * back ref and 0 is returned.
 *
 * If the back ref isn't found, *ref_ret is set to the address where it
 * should be inserted and -ENOENT is returned.  On the early -ENOENT exits
 * (extent item not found, or a v0-sized item, both only when @insert is 0)
 * *ref_ret is left untouched.
 *
 * If @insert is true and the extent item would become too large, or the
 * key following the extent item has the same objectid and a type below
 * BTRFS_BLOCK_GROUP_ITEM_KEY, the path points to the extent item and
 * -EAGAIN is returned (*ref_ret is not written in that case).
 *
 * Negative errors from btrfs_search_slot() and convert_extent_item_v0() are
 * returned as is.  -EIO is returned (with a warning) when @insert is true
 * but the extent item cannot be found.
 *
 * When @insert is true, path->keep_locks is set for the duration of the
 * search and cleared again, followed by btrfs_unlock_up_safe(path, 1), on
 * every exit path.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *       items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref **ref_ret,
                                 u64 bytenr, u64 num_bytes,
                                 u64 parent, u64 root_objectid,
                                 u64 owner, u64 offset, int insert)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        u64 flags;
        u64 item_size;
        unsigned long ptr;
        unsigned long end;
        int extra_size;
        int type;
        int want;
        int ret;
        int err = 0;
        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
                                                 SKINNY_METADATA);

        /* default to the classic extent item key: (bytenr, EXTENT_ITEM, size) */
        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = num_bytes;

        /* the ref type we are looking for decides how much room an insert needs */
        want = extent_ref_type(parent, owner);
        if (insert) {
                extra_size = btrfs_extent_inline_ref_size(want);
                path->keep_locks = 1;
        } else
                /*
                 * NOTE(review): -1 is handed to btrfs_search_slot() as its
                 * ins_len argument; presumably it marks a lookup that needs
                 * no extra room - confirm against btrfs_search_slot().
                 */
                extra_size = -1;

        /*
         * With skinny metadata, a tree block (owner below
         * BTRFS_FIRST_FREE_OBJECTID) is looked up through a METADATA_ITEM
         * key whose offset is @owner itself; nothing is added to it.
         * NOTE(review): @owner presumably carries the tree level for tree
         * blocks - confirm with the callers.
         */
        if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
                key.type = BTRFS_METADATA_ITEM_KEY;
                key.offset = owner;
        }

again:
        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
        if (ret < 0) {
                err = ret;
                goto out;
        }

        /*
         * We may be a newly converted file system which still has the old fat
         * extent entries for metadata, so try and see if we have one of those.
         * skinny_metadata is cleared first, so this fallback runs at most once.
         */
        if (ret > 0 && skinny_metadata) {
                skinny_metadata = false;
                if (path->slots[0]) {
                        /* check whether the previous slot is the fat extent item */
                        path->slots[0]--;
                        btrfs_item_key_to_cpu(path->nodes[0], &key,
                                              path->slots[0]);
                        if (key.objectid == bytenr &&
                            key.type == BTRFS_EXTENT_ITEM_KEY &&
                            key.offset == num_bytes)
                                ret = 0;
                }
                if (ret) {
                        /* not adjacent: redo the search with the EXTENT_ITEM key */
                        key.objectid = bytenr;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
                        key.offset = num_bytes;
                        btrfs_release_path(path);
                        goto again;
                }
        }

        /*
         * A missing extent item is an ordinary miss for a lookup, but is
         * unexpected when inserting: warn and report -EIO in that case.
         */
        if (ret && !insert) {
                err = -ENOENT;
                goto out;
        } else if (WARN_ON(ret)) {
                err = -EIO;
                goto out;
        }

        leaf = path->nodes[0];
        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        /* an item smaller than the current extent item struct is a v0 item */
        if (item_size < sizeof(*ei)) {
                if (!insert) {
                        err = -ENOENT;
                        goto out;
                }
                ret = convert_extent_item_v0(trans, root, path, owner,
                                             extra_size);
                if (ret < 0) {
                        err = ret;
                        goto out;
                }
                /* re-read leaf and size after the conversion */
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
        }
#endif
        BUG_ON(item_size < sizeof(*ei));

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        flags = btrfs_extent_flags(leaf, ei);

        /* the inline refs start right behind the extent item header ... */
        ptr = (unsigned long)(ei + 1);
        end = (unsigned long)ei + item_size;

        /* ... except that fat tree block items carry a tree_block_info first */
        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
                ptr += sizeof(struct btrfs_tree_block_info);
                BUG_ON(ptr > end);
        }

        /*
         * Walk the inline refs in order.  The loop leaves ptr either at the
         * matching ref (err == 0) or at the position where the wanted ref
         * would have to be inserted (err == -ENOENT).
         */
        err = -ENOENT;
        while (1) {
                if (ptr >= end) {
                        WARN_ON(ptr > end);
                        break;
                }
                iref = (struct btrfs_extent_inline_ref *)ptr;
                type = btrfs_extent_inline_ref_type(leaf, iref);
                /* passed the slot where refs of the wanted type would live */
                if (want < type)
                        break;
                /* refs of smaller types come first, skip over them */
                if (want > type) {
                        ptr += btrfs_extent_inline_ref_size(type);
                        continue;
                }

                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                        struct btrfs_extent_data_ref *dref;
                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                        if (match_extent_data_ref(leaf, dref, root_objectid,
                                                  owner, offset)) {
                                err = 0;
                                break;
                        }
                        /* stop at the first ref whose hash is below ours */
                        if (hash_extent_data_ref_item(leaf, dref) <
                            hash_extent_data_ref(root_objectid, owner, offset))
                                break;
                } else {
                        u64 ref_offset;
                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
                        /* shared refs are matched by parent, others by root */
                        if (parent > 0) {
                                if (parent == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < parent)
                                        break;
                        } else {
                                if (root_objectid == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < root_objectid)
                                        break;
                        }
                }
                ptr += btrfs_extent_inline_ref_size(type);
        }
        if (err == -ENOENT && insert) {
                /* no room left in the extent item for another inline ref */
                if (item_size + extra_size >=
                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
                        err = -EAGAIN;
                        goto out;
                }
                /*
                 * To add new inline back ref, we have to make sure
                 * there is no corresponding back ref item.
                 * For simplicity, we just do not add new inline back
                 * ref if there is any kind of item for this block
                 */
                if (find_next_key(path, 0, &key) == 0 &&
                    key.objectid == bytenr &&
                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
                        err = -EAGAIN;
                        goto out;
                }
        }
        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
        if (insert) {
                /* undo the keep_locks setting made before the search */
                path->keep_locks = 0;
                btrfs_unlock_up_safe(path, 1);
        }
        return err;
}
1733
1734 /*
1735  * helper to add new inline back ref
1736  */
1737 static noinline_for_stack
1738 void setup_inline_extent_backref(struct btrfs_root *root,
1739                                  struct btrfs_path *path,
1740                                  struct btrfs_extent_inline_ref *iref,
1741                                  u64 parent, u64 root_objectid,
1742                                  u64 owner, u64 offset, int refs_to_add,
1743                                  struct btrfs_delayed_extent_op *extent_op)
1744 {
1745         struct extent_buffer *leaf;
1746         struct btrfs_extent_item *ei;
1747         unsigned long ptr;
1748         unsigned long end;
1749         unsigned long item_offset;
1750         u64 refs;
1751         int size;
1752         int type;
1753
1754         leaf = path->nodes[0];
1755         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1756         item_offset = (unsigned long)iref - (unsigned long)ei;
1757
1758         type = extent_ref_type(parent, owner);
1759         size = btrfs_extent_inline_ref_size(type);
1760
1761         btrfs_extend_item(root, path, size);
1762
1763         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1764         refs = btrfs_extent_refs(leaf, ei);
1765         refs += refs_to_add;
1766         btrfs_set_extent_refs(leaf, ei, refs);
1767         if (extent_op)
1768                 __run_delayed_extent_op(extent_op, leaf, ei);
1769
1770         ptr = (unsigned long)ei + item_offset;
1771         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1772         if (ptr < end - size)
1773                 memmove_extent_buffer(leaf, ptr + size, ptr,
1774                                       end - size - ptr);
1775
1776         iref = (struct btrfs_extent_inline_ref *)ptr;
1777         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1778         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1779                 struct btrfs_extent_data_ref *dref;
1780                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1781                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1782                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1783                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1784                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1785         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1786                 struct btrfs_shared_data_ref *sref;
1787                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1788                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1789                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1790         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1791                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1792         } else {
1793                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1794         }
1795         btrfs_mark_buffer_dirty(leaf);
1796 }
1797
1798 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1799                                  struct btrfs_root *root,
1800                                  struct btrfs_path *path,
1801                                  struct btrfs_extent_inline_ref **ref_ret,
1802                                  u64 bytenr, u64 num_bytes, u64 parent,
1803                                  u64 root_objectid, u64 owner, u64 offset)
1804 {
1805         int ret;
1806
1807         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1808                                            bytenr, num_bytes, parent,
1809                                            root_objectid, owner, offset, 0);
1810         if (ret != -ENOENT)
1811                 return ret;
1812
1813         btrfs_release_path(path);
1814         *ref_ret = NULL;
1815
1816         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1817                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1818                                             root_objectid);
1819         } else {
1820                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1821                                              root_objectid, owner, offset);
1822         }
1823         return ret;
1824 }
1825
1826 /*
1827  * helper to update/remove inline back ref
1828  */
1829 static noinline_for_stack
1830 void update_inline_extent_backref(struct btrfs_root *root,
1831                                   struct btrfs_path *path,
1832                                   struct btrfs_extent_inline_ref *iref,
1833                                   int refs_to_mod,
1834                                   struct btrfs_delayed_extent_op *extent_op,
1835                                   int *last_ref)
1836 {
1837         struct extent_buffer *leaf;
1838         struct btrfs_extent_item *ei;
1839         struct btrfs_extent_data_ref *dref = NULL;
1840         struct btrfs_shared_data_ref *sref = NULL;
1841         unsigned long ptr;
1842         unsigned long end;
1843         u32 item_size;
1844         int size;
1845         int type;
1846         u64 refs;
1847
1848         leaf = path->nodes[0];
1849         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1850         refs = btrfs_extent_refs(leaf, ei);
1851         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1852         refs += refs_to_mod;
1853         btrfs_set_extent_refs(leaf, ei, refs);
1854         if (extent_op)
1855                 __run_delayed_extent_op(extent_op, leaf, ei);
1856
1857         type = btrfs_extent_inline_ref_type(leaf, iref);
1858
1859         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1860                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1861                 refs = btrfs_extent_data_ref_count(leaf, dref);
1862         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1863                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1864                 refs = btrfs_shared_data_ref_count(leaf, sref);
1865         } else {
1866                 refs = 1;
1867                 BUG_ON(refs_to_mod != -1);
1868         }
1869
1870         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1871         refs += refs_to_mod;
1872
1873         if (refs > 0) {
1874                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1875                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1876                 else
1877                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1878         } else {
1879                 *last_ref = 1;
1880                 size =  btrfs_extent_inline_ref_size(type);
1881                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1882                 ptr = (unsigned long)iref;
1883                 end = (unsigned long)ei + item_size;
1884                 if (ptr + size < end)
1885                         memmove_extent_buffer(leaf, ptr, ptr + size,
1886                                               end - ptr - size);
1887                 item_size -= size;
1888                 btrfs_truncate_item(root, path, item_size, 1);
1889         }
1890         btrfs_mark_buffer_dirty(leaf);
1891 }
1892
1893 static noinline_for_stack
1894 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1895                                  struct btrfs_root *root,
1896                                  struct btrfs_path *path,
1897                                  u64 bytenr, u64 num_bytes, u64 parent,
1898                                  u64 root_objectid, u64 owner,
1899                                  u64 offset, int refs_to_add,
1900                                  struct btrfs_delayed_extent_op *extent_op)
1901 {
1902         struct btrfs_extent_inline_ref *iref;
1903         int ret;
1904
1905         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1906                                            bytenr, num_bytes, parent,
1907                                            root_objectid, owner, offset, 1);
1908         if (ret == 0) {
1909                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1910                 update_inline_extent_backref(root, path, iref,
1911                                              refs_to_add, extent_op, NULL);
1912         } else if (ret == -ENOENT) {
1913                 setup_inline_extent_backref(root, path, iref, parent,
1914                                             root_objectid, owner, offset,
1915                                             refs_to_add, extent_op);
1916                 ret = 0;
1917         }
1918         return ret;
1919 }
1920
1921 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1922                                  struct btrfs_root *root,
1923                                  struct btrfs_path *path,
1924                                  u64 bytenr, u64 parent, u64 root_objectid,
1925                                  u64 owner, u64 offset, int refs_to_add)
1926 {
1927         int ret;
1928         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1929                 BUG_ON(refs_to_add != 1);
1930                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1931                                             parent, root_objectid);
1932         } else {
1933                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1934                                              parent, root_objectid,
1935                                              owner, offset, refs_to_add);
1936         }
1937         return ret;
1938 }
1939
1940 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1941                                  struct btrfs_root *root,
1942                                  struct btrfs_path *path,
1943                                  struct btrfs_extent_inline_ref *iref,
1944                                  int refs_to_drop, int is_data, int *last_ref)
1945 {
1946         int ret = 0;
1947
1948         BUG_ON(!is_data && refs_to_drop != 1);
1949         if (iref) {
1950                 update_inline_extent_backref(root, path, iref,
1951                                              -refs_to_drop, NULL, last_ref);
1952         } else if (is_data) {
1953                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
1954                                              last_ref);
1955         } else {
1956                 *last_ref = 1;
1957                 ret = btrfs_del_item(trans, root, path);
1958         }
1959         return ret;
1960 }
1961
1962 #define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
1963 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1964                                u64 *discarded_bytes)
1965 {
1966         int j, ret = 0;
1967         u64 bytes_left, end;
1968         u64 aligned_start = ALIGN(start, 1 << 9);
1969
1970         if (WARN_ON(start != aligned_start)) {
1971                 len -= aligned_start - start;
1972                 len = round_down(len, 1 << 9);
1973                 start = aligned_start;
1974         }
1975
1976         *discarded_bytes = 0;
1977
1978         if (!len)
1979                 return 0;
1980
1981         end = start + len;
1982         bytes_left = len;
1983
1984         /* Skip any superblocks on this device. */
1985         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1986                 u64 sb_start = btrfs_sb_offset(j);
1987                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1988                 u64 size = sb_start - start;
1989
1990                 if (!in_range(sb_start, start, bytes_left) &&
1991                     !in_range(sb_end, start, bytes_left) &&
1992                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1993                         continue;
1994
1995                 /*
1996                  * Superblock spans beginning of range.  Adjust start and
1997                  * try again.
1998                  */
1999                 if (sb_start <= start) {
2000                         start += sb_end - start;
2001                         if (start > end) {
2002                                 bytes_left = 0;
2003                                 break;
2004                         }
2005                         bytes_left = end - start;
2006                         continue;
2007                 }
2008
2009                 if (size) {
2010                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
2011                                                    GFP_NOFS, 0);
2012                         if (!ret)
2013                                 *discarded_bytes += size;
2014                         else if (ret != -EOPNOTSUPP)
2015                                 return ret;
2016                 }
2017
2018                 start = sb_end;
2019                 if (start > end) {
2020                         bytes_left = 0;
2021                         break;
2022                 }
2023                 bytes_left = end - start;
2024         }
2025
2026         if (bytes_left) {
2027                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
2028                                            GFP_NOFS, 0);
2029                 if (!ret)
2030                         *discarded_bytes += bytes_left;
2031         }
2032         return ret;
2033 }
2034
2035 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
2036                          u64 num_bytes, u64 *actual_bytes)
2037 {
2038         int ret;
2039         u64 discarded_bytes = 0;
2040         struct btrfs_bio *bbio = NULL;
2041
2042
2043         /* Tell the block device(s) that the sectors can be discarded */
2044         ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
2045                               bytenr, &num_bytes, &bbio, 0);
2046         /* Error condition is -ENOMEM */
2047         if (!ret) {
2048                 struct btrfs_bio_stripe *stripe = bbio->stripes;
2049                 int i;
2050
2051
2052                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2053                         u64 bytes;
2054                         if (!stripe->dev->can_discard)
2055                                 continue;
2056
2057                         ret = btrfs_issue_discard(stripe->dev->bdev,
2058                                                   stripe->physical,
2059                                                   stripe->length,
2060                                                   &bytes);
2061                         if (!ret)
2062                                 discarded_bytes += bytes;
2063                         else if (ret != -EOPNOTSUPP)
2064                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
2065
2066                         /*
2067                          * Just in case we get back EOPNOTSUPP for some reason,
2068                          * just ignore the return value so we don't screw up
2069                          * people calling discard_extent.
2070                          */
2071                         ret = 0;
2072                 }
2073                 btrfs_put_bbio(bbio);
2074         }
2075
2076         if (actual_bytes)
2077                 *actual_bytes = discarded_bytes;
2078
2079
2080         if (ret == -EOPNOTSUPP)
2081                 ret = 0;
2082         return ret;
2083 }
2084
2085 /* Can return -ENOMEM */
2086 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2087                          struct btrfs_root *root,
2088                          u64 bytenr, u64 num_bytes, u64 parent,
2089                          u64 root_objectid, u64 owner, u64 offset)
2090 {
2091         int ret;
2092         struct btrfs_fs_info *fs_info = root->fs_info;
2093
2094         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2095                root_objectid == BTRFS_TREE_LOG_OBJECTID);
2096
2097         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2098                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2099                                         num_bytes,
2100                                         parent, root_objectid, (int)owner,
2101                                         BTRFS_ADD_DELAYED_REF, NULL);
2102         } else {
2103                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2104                                         num_bytes, parent, root_objectid,
2105                                         owner, offset, 0,
2106                                         BTRFS_ADD_DELAYED_REF, NULL);
2107         }
2108         return ret;
2109 }
2110
2111 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2112                                   struct btrfs_root *root,
2113                                   struct btrfs_delayed_ref_node *node,
2114                                   u64 parent, u64 root_objectid,
2115                                   u64 owner, u64 offset, int refs_to_add,
2116                                   struct btrfs_delayed_extent_op *extent_op)
2117 {
2118         struct btrfs_fs_info *fs_info = root->fs_info;
2119         struct btrfs_path *path;
2120         struct extent_buffer *leaf;
2121         struct btrfs_extent_item *item;
2122         struct btrfs_key key;
2123         u64 bytenr = node->bytenr;
2124         u64 num_bytes = node->num_bytes;
2125         u64 refs;
2126         int ret;
2127
2128         path = btrfs_alloc_path();
2129         if (!path)
2130                 return -ENOMEM;
2131
2132         path->reada = 1;
2133         path->leave_spinning = 1;
2134         /* this will setup the path even if it fails to insert the back ref */
2135         ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
2136                                            bytenr, num_bytes, parent,
2137                                            root_objectid, owner, offset,
2138                                            refs_to_add, extent_op);
2139         if ((ret < 0 && ret != -EAGAIN) || !ret)
2140                 goto out;
2141
2142         /*
2143          * Ok we had -EAGAIN which means we didn't have space to insert and
2144          * inline extent ref, so just update the reference count and add a
2145          * normal backref.
2146          */
2147         leaf = path->nodes[0];
2148         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2149         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2150         refs = btrfs_extent_refs(leaf, item);
2151         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2152         if (extent_op)
2153                 __run_delayed_extent_op(extent_op, leaf, item);
2154
2155         btrfs_mark_buffer_dirty(leaf);
2156         btrfs_release_path(path);
2157
2158         path->reada = 1;
2159         path->leave_spinning = 1;
2160         /* now insert the actual backref */
2161         ret = insert_extent_backref(trans, root->fs_info->extent_root,
2162                                     path, bytenr, parent, root_objectid,
2163                                     owner, offset, refs_to_add);
2164         if (ret)
2165                 btrfs_abort_transaction(trans, root, ret);
2166 out:
2167         btrfs_free_path(path);
2168         return ret;
2169 }
2170
2171 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2172                                 struct btrfs_root *root,
2173                                 struct btrfs_delayed_ref_node *node,
2174                                 struct btrfs_delayed_extent_op *extent_op,
2175                                 int insert_reserved)
2176 {
2177         int ret = 0;
2178         struct btrfs_delayed_data_ref *ref;
2179         struct btrfs_key ins;
2180         u64 parent = 0;
2181         u64 ref_root = 0;
2182         u64 flags = 0;
2183
2184         ins.objectid = node->bytenr;
2185         ins.offset = node->num_bytes;
2186         ins.type = BTRFS_EXTENT_ITEM_KEY;
2187
2188         ref = btrfs_delayed_node_to_data_ref(node);
2189         trace_run_delayed_data_ref(node, ref, node->action);
2190
2191         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2192                 parent = ref->parent;
2193         ref_root = ref->root;
2194
2195         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2196                 if (extent_op)
2197                         flags |= extent_op->flags_to_set;
2198                 ret = alloc_reserved_file_extent(trans, root,
2199                                                  parent, ref_root, flags,
2200                                                  ref->objectid, ref->offset,
2201                                                  &ins, node->ref_mod);
2202         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2203                 ret = __btrfs_inc_extent_ref(trans, root, node, parent,
2204                                              ref_root, ref->objectid,
2205                                              ref->offset, node->ref_mod,
2206                                              extent_op);
2207         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2208                 ret = __btrfs_free_extent(trans, root, node, parent,
2209                                           ref_root, ref->objectid,
2210                                           ref->offset, node->ref_mod,
2211                                           extent_op);
2212         } else {
2213                 BUG();
2214         }
2215         return ret;
2216 }
2217
2218 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2219                                     struct extent_buffer *leaf,
2220                                     struct btrfs_extent_item *ei)
2221 {
2222         u64 flags = btrfs_extent_flags(leaf, ei);
2223         if (extent_op->update_flags) {
2224                 flags |= extent_op->flags_to_set;
2225                 btrfs_set_extent_flags(leaf, ei, flags);
2226         }
2227
2228         if (extent_op->update_key) {
2229                 struct btrfs_tree_block_info *bi;
2230                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2231                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2232                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2233         }
2234 }
2235
/*
 * Look up the extent item for node->bytenr in the extent root and apply
 * @extent_op to it via __run_delayed_extent_op(), then mark the leaf dirty.
 *
 * Returns 0 on success or when the transaction is already aborted,
 * -ENOMEM if no path can be allocated, a negative error from
 * btrfs_search_slot() (or from the v0 conversion), and -EIO when an
 * EXTENT_ITEM lookup finds no exact match.
 */
2236 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2237                                  struct btrfs_root *root,
2238                                  struct btrfs_delayed_ref_node *node,
2239                                  struct btrfs_delayed_extent_op *extent_op)
2240 {
2241         struct btrfs_key key;
2242         struct btrfs_path *path;
2243         struct btrfs_extent_item *ei;
2244         struct extent_buffer *leaf;
2245         u32 item_size;
2246         int ret;
2247         int err = 0;
2248         int metadata = !extent_op->is_data;
2249
2250         if (trans->aborted)
2251                 return 0;
2252
             /* METADATA_ITEM keys are only tried when SKINNY_METADATA is enabled */
2253         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2254                 metadata = 0;
2255
2256         path = btrfs_alloc_path();
2257         if (!path)
2258                 return -ENOMEM;
2259
2260         key.objectid = node->bytenr;
2261
2262         if (metadata) {
2263                 key.type = BTRFS_METADATA_ITEM_KEY;
2264                 key.offset = extent_op->level;
2265         } else {
2266                 key.type = BTRFS_EXTENT_ITEM_KEY;
2267                 key.offset = node->num_bytes;
2268         }
2269
2270 again:
2271         path->reada = 1;
2272         path->leave_spinning = 1;
2273         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2274                                 path, 0, 1);
2275         if (ret < 0) {
2276                 err = ret;
2277                 goto out;
2278         }
2279         if (ret > 0) {
2280                 if (metadata) {
                             /*
                              * No METADATA_ITEM was found.  The previous slot
                              * may hold an EXTENT_ITEM for the same bytenr and
                              * num_bytes; accept it if so.
                              */
2281                         if (path->slots[0] > 0) {
2282                                 path->slots[0]--;
2283                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2284                                                       path->slots[0]);
2285                                 if (key.objectid == node->bytenr &&
2286                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2287                                     key.offset == node->num_bytes)
2288                                         ret = 0;
2289                         }
                             /*
                              * Still nothing: redo the search once with an
                              * EXTENT_ITEM key.  metadata is cleared, so a
                              * second miss ends in -EIO below.
                              */
2290                         if (ret > 0) {
2291                                 btrfs_release_path(path);
2292                                 metadata = 0;
2293
2294                                 key.objectid = node->bytenr;
2295                                 key.offset = node->num_bytes;
2296                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2297                                 goto again;
2298                         }
2299                 } else {
2300                         err = -EIO;
2301                         goto out;
2302                 }
2303         }
2304
2305         leaf = path->nodes[0];
2306         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2307 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
             /* an item smaller than btrfs_extent_item is converted from v0 first */
2308         if (item_size < sizeof(*ei)) {
2309                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2310                                              path, (u64)-1, 0);
2311                 if (ret < 0) {
2312                         err = ret;
2313                         goto out;
2314                 }
                     /* re-read leaf and size after the conversion */
2315                 leaf = path->nodes[0];
2316                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2317         }
2318 #endif
2319         BUG_ON(item_size < sizeof(*ei));
2320         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2321         __run_delayed_extent_op(extent_op, leaf, ei);
2322
2323         btrfs_mark_buffer_dirty(leaf);
2324 out:
2325         btrfs_free_path(path);
2326         return err;
2327 }
2328
2329 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2330                                 struct btrfs_root *root,
2331                                 struct btrfs_delayed_ref_node *node,
2332                                 struct btrfs_delayed_extent_op *extent_op,
2333                                 int insert_reserved)
2334 {
2335         int ret = 0;
2336         struct btrfs_delayed_tree_ref *ref;
2337         struct btrfs_key ins;
2338         u64 parent = 0;
2339         u64 ref_root = 0;
2340         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2341                                                  SKINNY_METADATA);
2342
2343         ref = btrfs_delayed_node_to_tree_ref(node);
2344         trace_run_delayed_tree_ref(node, ref, node->action);
2345
2346         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2347                 parent = ref->parent;
2348         ref_root = ref->root;
2349
2350         ins.objectid = node->bytenr;
2351         if (skinny_metadata) {
2352                 ins.offset = ref->level;
2353                 ins.type = BTRFS_METADATA_ITEM_KEY;
2354         } else {
2355                 ins.offset = node->num_bytes;
2356                 ins.type = BTRFS_EXTENT_ITEM_KEY;
2357         }
2358
2359         BUG_ON(node->ref_mod != 1);
2360         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2361                 BUG_ON(!extent_op || !extent_op->update_flags);
2362                 ret = alloc_reserved_tree_block(trans, root,
2363                                                 parent, ref_root,
2364                                                 extent_op->flags_to_set,
2365                                                 &extent_op->key,
2366                                                 ref->level, &ins);
2367         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2368                 ret = __btrfs_inc_extent_ref(trans, root, node,
2369                                              parent, ref_root,
2370                                              ref->level, 0, 1,
2371                                              extent_op);
2372         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2373                 ret = __btrfs_free_extent(trans, root, node,
2374                                           parent, ref_root,
2375                                           ref->level, 0, 1, extent_op);
2376         } else {
2377                 BUG();
2378         }
2379         return ret;
2380 }
2381
2382 /* helper function to actually process a single delayed ref entry */
2383 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2384                                struct btrfs_root *root,
2385                                struct btrfs_delayed_ref_node *node,
2386                                struct btrfs_delayed_extent_op *extent_op,
2387                                int insert_reserved)
2388 {
2389         int ret = 0;
2390
2391         if (trans->aborted) {
2392                 if (insert_reserved)
2393                         btrfs_pin_extent(root, node->bytenr,
2394                                          node->num_bytes, 1);
2395                 return 0;
2396         }
2397
2398         if (btrfs_delayed_ref_is_head(node)) {
2399                 struct btrfs_delayed_ref_head *head;
2400                 /*
2401                  * we've hit the end of the chain and we were supposed
2402                  * to insert this extent into the tree.  But, it got
2403                  * deleted before we ever needed to insert it, so all
2404                  * we have to do is clean up the accounting
2405                  */
2406                 BUG_ON(extent_op);
2407                 head = btrfs_delayed_node_to_head(node);
2408                 trace_run_delayed_ref_head(node, head, node->action);
2409
2410                 if (insert_reserved) {
2411                         btrfs_pin_extent(root, node->bytenr,
2412                                          node->num_bytes, 1);
2413                         if (head->is_data) {
2414                                 ret = btrfs_del_csums(trans, root,
2415                                                       node->bytenr,
2416                                                       node->num_bytes);
2417                         }
2418                 }
2419
2420                 /* Also free its reserved qgroup space */
2421                 btrfs_qgroup_free_delayed_ref(root->fs_info,
2422                                               head->qgroup_ref_root,
2423                                               head->qgroup_reserved);
2424                 return ret;
2425         }
2426
2427         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2428             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2429                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2430                                            insert_reserved);
2431         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2432                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2433                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2434                                            insert_reserved);
2435         else
2436                 BUG();
2437         return ret;
2438 }
2439
2440 static inline struct btrfs_delayed_ref_node *
2441 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2442 {
2443         struct btrfs_delayed_ref_node *ref;
2444
2445         if (list_empty(&head->ref_list))
2446                 return NULL;
2447
2448         /*
2449          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2450          * This is to prevent a ref count from going down to zero, which deletes
2451          * the extent item from the extent tree, when there still are references
2452          * to add, which would fail because they would not find the extent item.
2453          */
2454         list_for_each_entry(ref, &head->ref_list, list) {
2455                 if (ref->action == BTRFS_ADD_DELAYED_REF)
2456                         return ref;
2457         }
2458
2459         return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
2460                           list);
2461 }
2462
2463 /*
      * Run delayed refs of the current transaction until @nr entries have
      * been counted or btrfs_select_ref_head() returns no more heads.
      *
2464  * Returns 0 on success or if called with an already aborted transaction.
2465  * Otherwise returns the first non-zero result of run_delayed_extent_op()
      * or run_one_delayed_ref().  This function does not abort the
      * transaction itself; btrfs_run_delayed_refs() does that on a negative
      * return value.
2466  */
2467 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2468                                              struct btrfs_root *root,
2469                                              unsigned long nr)
2470 {
2471         struct btrfs_delayed_ref_root *delayed_refs;
2472         struct btrfs_delayed_ref_node *ref;
2473         struct btrfs_delayed_ref_head *locked_ref = NULL;
2474         struct btrfs_delayed_extent_op *extent_op;
2475         struct btrfs_fs_info *fs_info = root->fs_info;
2476         ktime_t start = ktime_get();
2477         int ret;
             /* count: loop iterations charged against @nr, skipped heads included */
2478         unsigned long count = 0;
             /* actual_count: non-head refs taken off a ref_list; gates the runtime average */
2479         unsigned long actual_count = 0;
2480         int must_insert_reserved = 0;
2481
2482         delayed_refs = &trans->transaction->delayed_refs;
2483         while (1) {
2484                 if (!locked_ref) {
2485                         if (count >= nr)
2486                                 break;
2487
2488                         spin_lock(&delayed_refs->lock);
2489                         locked_ref = btrfs_select_ref_head(trans);
2490                         if (!locked_ref) {
2491                                 spin_unlock(&delayed_refs->lock);
2492                                 break;
2493                         }
2494
2495                         /* grab the lock that says we are going to process
2496                          * all the refs for this head */
2497                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2498                         spin_unlock(&delayed_refs->lock);
2499                         /*
2500                          * we may have dropped the spin lock to get the head
2501                          * mutex lock, and that might have given someone else
2502                          * time to free the head.  If that's true, it has been
2503                          * removed from our list and we can move on.
2504                          */
2505                         if (ret == -EAGAIN) {
2506                                 locked_ref = NULL;
2507                                 count++;
2508                                 continue;
2509                         }
2510                 }
2511
2512                 /*
2513                  * We need to try and merge add/drops of the same ref since we
2514                  * can run into issues with relocate dropping the implicit ref
2515                  * and then it being added back again before the drop can
2516                  * finish.  If we merged anything we need to re-loop so we can
2517                  * get a good ref.
2518                  * Or we can get node references of the same type that weren't
2519                  * merged when created due to bumps in the tree mod seq, and
2520                  * we need to merge them to prevent adding an inline extent
2521                  * backref before dropping it (triggering a BUG_ON at
2522                  * insert_inline_extent_backref()).
2523                  */
2524                 spin_lock(&locked_ref->lock);
2525                 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2526                                          locked_ref);
2527
2528                 /*
2529                  * pick the next ref queued on this head (ADD refs first);
2530                  * NULL means the head's ref_list is empty
2531                  */
2532                 ref = select_delayed_ref(locked_ref);
2533
                     /*
                      * The ref cannot be run yet (presumably its seq is still
                      * held back; confirm in btrfs_check_delayed_seq()).
                      * Release the head, mark it ready again and pick another.
                      */
2534                 if (ref && ref->seq &&
2535                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2536                         spin_unlock(&locked_ref->lock);
2537                         btrfs_delayed_ref_unlock(locked_ref);
2538                         spin_lock(&delayed_refs->lock);
2539                         locked_ref->processing = 0;
2540                         delayed_refs->num_heads_ready++;
2541                         spin_unlock(&delayed_refs->lock);
2542                         locked_ref = NULL;
2543                         cond_resched();
2544                         count++;
2545                         continue;
2546                 }
2547
2548                 /*
2549                  * record the must insert reserved flag before we
2550                  * drop the spin lock.
2551                  */
2552                 must_insert_reserved = locked_ref->must_insert_reserved;
2553                 locked_ref->must_insert_reserved = 0;
2554
2555                 extent_op = locked_ref->extent_op;
2556                 locked_ref->extent_op = NULL;
2557
2558                 if (!ref) {
2559
2560
2561                         /* All delayed refs have been processed, Go ahead
2562                          * and send the head node to run_one_delayed_ref,
2563                          * so that any accounting fixes can happen
2564                          */
2565                         ref = &locked_ref->node;
2566
                             /* the extent op is dropped unrun when the reserved extent was never inserted */
2567                         if (extent_op && must_insert_reserved) {
2568                                 btrfs_free_delayed_extent_op(extent_op);
2569                                 extent_op = NULL;
2570                         }
2571
2572                         if (extent_op) {
2573                                 spin_unlock(&locked_ref->lock);
2574                                 ret = run_delayed_extent_op(trans, root,
2575                                                             ref, extent_op);
2576                                 btrfs_free_delayed_extent_op(extent_op);
2577
2578                                 if (ret) {
2579                                         /*
2580                                          * Need to reset must_insert_reserved if
2581                                          * there was an error so the abort stuff
2582                                          * can cleanup the reserved space
2583                                          * properly.
2584                                          */
2585                                         if (must_insert_reserved)
2586                                                 locked_ref->must_insert_reserved = 1;
                                             /*
                                              * NOTE(review): processing is cleared
                                              * here without delayed_refs->lock and
                                              * without bumping num_heads_ready,
                                              * unlike the seq-check path above;
                                              * confirm this is intended.
                                              */
2587                                         locked_ref->processing = 0;
2588                                         btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2589                                         btrfs_delayed_ref_unlock(locked_ref);
2590                                         return ret;
2591                                 }
                                     /* loop again with the same head still locked */
2592                                 continue;
2593                         }
2594
2595                         /*
2596                          * Need to drop our head ref lock and re-acquire the
2597                          * delayed ref lock and then re-check to make sure
2598                          * nobody got added.
2599                          */
2600                         spin_unlock(&locked_ref->lock);
2601                         spin_lock(&delayed_refs->lock);
2602                         spin_lock(&locked_ref->lock);
2603                         if (!list_empty(&locked_ref->ref_list) ||
2604                             locked_ref->extent_op) {
2605                                 spin_unlock(&locked_ref->lock);
2606                                 spin_unlock(&delayed_refs->lock);
2607                                 continue;
2608                         }
                             /* nothing new was queued: take the head out of the rbtree */
2609                         ref->in_tree = 0;
2610                         delayed_refs->num_heads--;
2611                         rb_erase(&locked_ref->href_node,
2612                                  &delayed_refs->href_root);
2613                         spin_unlock(&delayed_refs->lock);
2614                 } else {
2615                         actual_count++;
2616                         ref->in_tree = 0;
2617                         list_del(&ref->list);
2618                 }
2619                 atomic_dec(&delayed_refs->num_entries);
2620
2621                 if (!btrfs_delayed_ref_is_head(ref)) {
2622                         /*
2623                          * when we play the delayed ref, also correct the
2624                          * ref_mod on head
2625                          */
2626                         switch (ref->action) {
2627                         case BTRFS_ADD_DELAYED_REF:
2628                         case BTRFS_ADD_DELAYED_EXTENT:
2629                                 locked_ref->node.ref_mod -= ref->ref_mod;
2630                                 break;
2631                         case BTRFS_DROP_DELAYED_REF:
2632                                 locked_ref->node.ref_mod += ref->ref_mod;
2633                                 break;
2634                         default:
2635                                 WARN_ON(1);
2636                         }
2637                 }
2638                 spin_unlock(&locked_ref->lock);
2639
2640                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2641                                           must_insert_reserved);
2642
2643                 btrfs_free_delayed_extent_op(extent_op);
2644                 if (ret) {
2645                         locked_ref->processing = 0;
2646                         btrfs_delayed_ref_unlock(locked_ref);
2647                         btrfs_put_delayed_ref(ref);
2648                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2649                         return ret;
2650                 }
2651
2652                 /*
2653                  * If this node is a head, that means all the refs in this head
2654                  * have been dealt with, and we will pick the next head to deal
2655                  * with, so we must unlock the head and drop it from the cluster
2656                  * list before we release it.
2657                  */
2658                 if (btrfs_delayed_ref_is_head(ref)) {
2659                         if (locked_ref->is_data &&
2660                             locked_ref->total_ref_mod < 0) {
2661                                 spin_lock(&delayed_refs->lock);
2662                                 delayed_refs->pending_csums -= ref->num_bytes;
2663                                 spin_unlock(&delayed_refs->lock);
2664                         }
2665                         btrfs_delayed_ref_unlock(locked_ref);
2666                         locked_ref = NULL;
2667                 }
2668                 btrfs_put_delayed_ref(ref);
2669                 count++;
2670                 cond_resched();
2671         }
2672
2673         /*
2674          * We don't want to include ref heads since we can have empty ref heads
2675          * and those will drastically skew our runtime down since we just do
2676          * accounting, no actual extent tree updates.
2677          */
2678         if (actual_count > 0) {
2679                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2680                 u64 avg;
2681
2682                 /*
2683                  * We weigh the current average higher than our current runtime
2684                  * to avoid large swings in the average.
2685                  */
2686                 spin_lock(&delayed_refs->lock);
2687                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2688                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2689                 spin_unlock(&delayed_refs->lock);
2690         }
2691         return 0;
2692 }
2693
#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning.
 *
 * Descends from the root of @root, alternating between the left and the
 * right child, and returns the bytenr of the last entry visited.  An empty
 * tree yields 0 instead of an uninitialized value.
 */
static u64 find_middle(struct rb_root *root)
{
        struct rb_node *n = root->rb_node;
        struct btrfs_delayed_ref_node *entry;
        bool go_left = true;
        u64 middle = 0;

        while (n) {
                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
                WARN_ON(!entry->in_tree);

                middle = entry->bytenr;

                /* zig-zag down the tree: left, right, left, ... */
                n = go_left ? n->rb_left : n->rb_right;
                go_left = !go_left;
        }
        return middle;
}
#endif
2736
2737 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2738 {
2739         u64 num_bytes;
2740
2741         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2742                              sizeof(struct btrfs_extent_inline_ref));
2743         if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2744                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2745
2746         /*
2747          * We don't ever fill up leaves all the way so multiply by 2 just to be
2748          * closer to what we're really going to want to ouse.
2749          */
2750         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2751 }
2752
2753 /*
2754  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2755  * would require to store the csums for that many bytes.
2756  */
2757 u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
2758 {
2759         u64 csum_size;
2760         u64 num_csums_per_leaf;
2761         u64 num_csums;
2762
2763         csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
2764         num_csums_per_leaf = div64_u64(csum_size,
2765                         (u64)btrfs_super_csum_size(root->fs_info->super_copy));
2766         num_csums = div64_u64(csum_bytes, root->sectorsize);
2767         num_csums += num_csums_per_leaf - 1;
2768         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2769         return num_csums;
2770 }
2771
2772 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2773                                        struct btrfs_root *root)
2774 {
2775         struct btrfs_block_rsv *global_rsv;
2776         u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2777         u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2778         u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
2779         u64 num_bytes, num_dirty_bgs_bytes;
2780         int ret = 0;
2781
2782         num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2783         num_heads = heads_to_leaves(root, num_heads);
2784         if (num_heads > 1)
2785                 num_bytes += (num_heads - 1) * root->nodesize;
2786         num_bytes <<= 1;
2787         num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
2788         num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
2789                                                              num_dirty_bgs);
2790         global_rsv = &root->fs_info->global_block_rsv;
2791
2792         /*
2793          * If we can't allocate any more chunks lets make sure we have _lots_ of
2794          * wiggle room since running delayed refs can create more delayed refs.
2795          */
2796         if (global_rsv->space_info->full) {
2797                 num_dirty_bgs_bytes <<= 1;
2798                 num_bytes <<= 1;
2799         }
2800
2801         spin_lock(&global_rsv->lock);
2802         if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2803                 ret = 1;
2804         spin_unlock(&global_rsv->lock);
2805         return ret;
2806 }
2807
2808 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2809                                        struct btrfs_root *root)
2810 {
2811         struct btrfs_fs_info *fs_info = root->fs_info;
2812         u64 num_entries =
2813                 atomic_read(&trans->transaction->delayed_refs.num_entries);
2814         u64 avg_runtime;
2815         u64 val;
2816
2817         smp_mb();
2818         avg_runtime = fs_info->avg_delayed_ref_runtime;
2819         val = num_entries * avg_runtime;
2820         if (num_entries * avg_runtime >= NSEC_PER_SEC)
2821                 return 1;
2822         if (val >= NSEC_PER_SEC / 2)
2823                 return 2;
2824
2825         return btrfs_check_space_for_delayed_refs(trans, root);
2826 }
2827
/*
 * State for one asynchronous delayed ref run.  It is allocated and queued
 * by btrfs_async_run_delayed_refs() and executed by
 * delayed_ref_async_start().
 */
2828 struct async_delayed_refs {
2829         struct btrfs_root *root;        /* root the worker joins a transaction on */
2830         int count;                      /* count handed to btrfs_run_delayed_refs() */
2831         int error;                      /* first error seen by the worker, 0 if none */
2832         int sync;                       /* non-zero: submitter waits on @wait and frees this */
2833         struct completion wait;         /* completed by the worker when @sync is set */
2834         struct btrfs_work work;         /* work item queued on extent_workers */
2835 };
2836
2837 static void delayed_ref_async_start(struct btrfs_work *work)
2838 {
2839         struct async_delayed_refs *async;
2840         struct btrfs_trans_handle *trans;
2841         int ret;
2842
2843         async = container_of(work, struct async_delayed_refs, work);
2844
2845         trans = btrfs_join_transaction(async->root);
2846         if (IS_ERR(trans)) {
2847                 async->error = PTR_ERR(trans);
2848                 goto done;
2849         }
2850
2851         /*
2852          * trans->sync means that when we call end_transaciton, we won't
2853          * wait on delayed refs
2854          */
2855         trans->sync = true;
2856         ret = btrfs_run_delayed_refs(trans, async->root, async->count);
2857         if (ret)
2858                 async->error = ret;
2859
2860         ret = btrfs_end_transaction(trans, async->root);
2861         if (ret && !async->error)
2862                 async->error = ret;
2863 done:
2864         if (async->sync)
2865                 complete(&async->wait);
2866         else
2867                 kfree(async);
2868 }
2869
2870 int btrfs_async_run_delayed_refs(struct btrfs_root *root,
2871                                  unsigned long count, int wait)
2872 {
2873         struct async_delayed_refs *async;
2874         int ret;
2875
2876         async = kmalloc(sizeof(*async), GFP_NOFS);
2877         if (!async)
2878                 return -ENOMEM;
2879
2880         async->root = root->fs_info->tree_root;
2881         async->count = count;
2882         async->error = 0;
2883         if (wait)
2884                 async->sync = 1;
2885         else
2886                 async->sync = 0;
2887         init_completion(&async->wait);
2888
2889         btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2890                         delayed_ref_async_start, NULL, NULL);
2891
2892         btrfs_queue_work(root->fs_info->extent_workers, &async->work);
2893
2894         if (wait) {
2895                 wait_for_completion(&async->wait);
2896                 ret = async->error;
2897                 kfree(async);
2898                 return ret;
2899         }
2900         return 0;
2901 }
2902
2903 /*
2904  * this starts processing the delayed reference count updates and
2905  * extent insertions we have queued up so far.  count can be
2906  * 0, which means to process everything in the tree at the start
2907  * of the run (but not newly added entries), or it can be some target
2908  * number you'd like to process.
2909  *
      * A count of 0 is turned into twice the number of entries queued at
      * call time.  A count of (unsigned long)-1 keeps re-running until
      * href_root is empty, creating pending block groups between passes.
      *
2910  * Returns 0 on success or if called with an aborted transaction
2911  * Returns <0 on error and aborts the transaction
2912  */
2913 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2914                            struct btrfs_root *root, unsigned long count)
2915 {
2916         struct rb_node *node;
2917         struct btrfs_delayed_ref_root *delayed_refs;
2918         struct btrfs_delayed_ref_head *head;
2919         int ret;
2920         int run_all = count == (unsigned long)-1;
             /* saved so the flag can be restored on the success path */
2921         bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
2922
2923         /* We'll clean this up in btrfs_cleanup_transaction */
2924         if (trans->aborted)
2925                 return 0;
2926
2927         if (root == root->fs_info->extent_root)
2928                 root = root->fs_info->tree_root;
2929
2930         delayed_refs = &trans->transaction->delayed_refs;
2931         if (count == 0)
2932                 count = atomic_read(&delayed_refs->num_entries) * 2;
2933
2934 again:
2935 #ifdef SCRAMBLE_DELAYED_REFS
2936         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2937 #endif
2938         trans->can_flush_pending_bgs = false;
2939         ret = __btrfs_run_delayed_refs(trans, root, count);
             /*
              * NOTE(review): this error return leaves
              * trans->can_flush_pending_bgs set to false; only the "out"
              * path restores the saved value.
              */
2940         if (ret < 0) {
2941                 btrfs_abort_transaction(trans, root, ret);
2942                 return ret;
2943         }
2944
2945         if (run_all) {
2946                 if (!list_empty(&trans->new_bgs))
2947                         btrfs_create_pending_block_groups(trans, root);
2948
2949                 spin_lock(&delayed_refs->lock);
2950                 node = rb_first(&delayed_refs->href_root);
2951                 if (!node) {
2952                         spin_unlock(&delayed_refs->lock);
2953                         goto out;
2954                 }
2955                 count = (unsigned long)-1;
2956
                     /*
                      * Heads remain.  For the first real head: pin it with a
                      * reference, drop the spinlock, wait for its mutex to be
                      * free, then start another pass.
                      */
2957                 while (node) {
2958                         head = rb_entry(node, struct btrfs_delayed_ref_head,
2959                                         href_node);
2960                         if (btrfs_delayed_ref_is_head(&head->node)) {
2961                                 struct btrfs_delayed_ref_node *ref;
2962
2963                                 ref = &head->node;
2964                                 atomic_inc(&ref->refs);
2965
2966                                 spin_unlock(&delayed_refs->lock);
2967                                 /*
2968                                  * Mutex was contended, block until it's
2969                                  * released and try again
2970                                  */
2971                                 mutex_lock(&head->mutex);
2972                                 mutex_unlock(&head->mutex);
2973
2974                                 btrfs_put_delayed_ref(ref);
2975                                 cond_resched();
2976                                 goto again;
2977                         } else {
2978                                 WARN_ON(1);
2979                         }
2980                         node = rb_next(node);
2981                 }
2982                 spin_unlock(&delayed_refs->lock);
2983                 cond_resched();
2984                 goto again;
2985         }
2986 out:
2987         assert_qgroups_uptodate(trans);
2988         trans->can_flush_pending_bgs = can_flush_pending_bgs;
2989         return 0;
2990 }
2991
2992 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2993                                 struct btrfs_root *root,
2994                                 u64 bytenr, u64 num_bytes, u64 flags,
2995                                 int level, int is_data)
2996 {
2997         struct btrfs_delayed_extent_op *extent_op;
2998         int ret;
2999
3000         extent_op = btrfs_alloc_delayed_extent_op();
3001         if (!extent_op)
3002                 return -ENOMEM;
3003
3004         extent_op->flags_to_set = flags;
3005         extent_op->update_flags = 1;
3006         extent_op->update_key = 0;
3007         extent_op->is_data = is_data ? 1 : 0;
3008         extent_op->level = level;
3009
3010         ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
3011                                           num_bytes, extent_op);
3012         if (ret)
3013                 btrfs_free_delayed_extent_op(extent_op);
3014         return ret;
3015 }
3016
3017 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
3018                                       struct btrfs_root *root,
3019                                       struct btrfs_path *path,
3020                                       u64 objectid, u64 offset, u64 bytenr)
3021 {
3022         struct btrfs_delayed_ref_head *head;
3023         struct btrfs_delayed_ref_node *ref;
3024         struct btrfs_delayed_data_ref *data_ref;
3025         struct btrfs_delayed_ref_root *delayed_refs;
3026         int ret = 0;
3027
3028         delayed_refs = &trans->transaction->delayed_refs;
3029         spin_lock(&delayed_refs->lock);
3030         head = btrfs_find_delayed_ref_head(trans, bytenr);
3031         if (!head) {
3032                 spin_unlock(&delayed_refs->lock);
3033                 return 0;
3034         }
3035
3036         if (!mutex_trylock(&head->mutex)) {
3037                 atomic_inc(&head->node.refs);
3038                 spin_unlock(&delayed_refs->lock);
3039
3040                 btrfs_release_path(path);
3041
3042                 /*
3043                  * Mutex was contended, block until it's released and let
3044                  * caller try again
3045                  */
3046                 mutex_lock(&head->mutex);
3047                 mutex_unlock(&head->mutex);
3048                 btrfs_put_delayed_ref(&head->node);
3049                 return -EAGAIN;
3050         }
3051         spin_unlock(&delayed_refs->lock);
3052
3053         spin_lock(&head->lock);
3054         list_for_each_entry(ref, &head->ref_list, list) {
3055                 /* If it's a shared ref we know a cross reference exists */
3056                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3057                         ret = 1;
3058                         break;
3059                 }
3060
3061                 data_ref = btrfs_delayed_node_to_data_ref(ref);
3062
3063                 /*
3064                  * If our ref doesn't match the one we're currently looking at
3065                  * then we have a cross reference.
3066                  */
3067                 if (data_ref->root != root->root_key.objectid ||
3068                     data_ref->objectid != objectid ||
3069                     data_ref->offset != offset) {
3070                         ret = 1;
3071                         break;
3072                 }
3073         }
3074         spin_unlock(&head->lock);
3075         mutex_unlock(&head->mutex);
3076         return ret;
3077 }
3078
3079 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
3080                                         struct btrfs_root *root,
3081                                         struct btrfs_path *path,
3082                                         u64 objectid, u64 offset, u64 bytenr)
3083 {
3084         struct btrfs_root *extent_root = root->fs_info->extent_root;
3085         struct extent_buffer *leaf;
3086         struct btrfs_extent_data_ref *ref;
3087         struct btrfs_extent_inline_ref *iref;
3088         struct btrfs_extent_item *ei;
3089         struct btrfs_key key;
3090         u32 item_size;
3091         int ret;
3092
3093         key.objectid = bytenr;
3094         key.offset = (u64)-1;
3095         key.type = BTRFS_EXTENT_ITEM_KEY;
3096
3097         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3098         if (ret < 0)
3099                 goto out;
3100         BUG_ON(ret == 0); /* Corruption */
3101
3102         ret = -ENOENT;
3103         if (path->slots[0] == 0)
3104                 goto out;
3105
3106         path->slots[0]--;
3107         leaf = path->nodes[0];
3108         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3109
3110         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3111                 goto out;
3112
3113         ret = 1;
3114         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3115 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3116         if (item_size < sizeof(*ei)) {
3117                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3118                 goto out;
3119         }
3120 #endif
3121         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3122
3123         if (item_size != sizeof(*ei) +
3124             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3125                 goto out;
3126
3127         if (btrfs_extent_generation(leaf, ei) <=
3128             btrfs_root_last_snapshot(&root->root_item))
3129                 goto out;
3130
3131         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3132         if (btrfs_extent_inline_ref_type(leaf, iref) !=
3133             BTRFS_EXTENT_DATA_REF_KEY)
3134                 goto out;
3135
3136         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3137         if (btrfs_extent_refs(leaf, ei) !=
3138             btrfs_extent_data_ref_count(leaf, ref) ||
3139             btrfs_extent_data_ref_root(leaf, ref) !=
3140             root->root_key.objectid ||
3141             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3142             btrfs_extent_data_ref_offset(leaf, ref) != offset)
3143                 goto out;
3144
3145         ret = 0;
3146 out:
3147         return ret;
3148 }
3149
3150 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
3151                           struct btrfs_root *root,
3152                           u64 objectid, u64 offset, u64 bytenr)
3153 {
3154         struct btrfs_path *path;
3155         int ret;
3156         int ret2;
3157
3158         path = btrfs_alloc_path();
3159         if (!path)
3160                 return -ENOENT;
3161
3162         do {
3163                 ret = check_committed_ref(trans, root, path, objectid,
3164                                           offset, bytenr);
3165                 if (ret && ret != -ENOENT)
3166                         goto out;
3167
3168                 ret2 = check_delayed_ref(trans, root, path, objectid,
3169                                          offset, bytenr);
3170         } while (ret2 == -EAGAIN);
3171
3172         if (ret2 && ret2 != -ENOENT) {
3173                 ret = ret2;
3174                 goto out;
3175         }
3176
3177         if (ret != -ENOENT || ret2 != -ENOENT)
3178                 ret = 0;
3179 out:
3180         btrfs_free_path(path);
3181         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3182                 WARN_ON(ret > 0);
3183         return ret;
3184 }
3185
3186 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3187                            struct btrfs_root *root,
3188                            struct extent_buffer *buf,
3189                            int full_backref, int inc)
3190 {
3191         u64 bytenr;
3192         u64 num_bytes;
3193         u64 parent;
3194         u64 ref_root;
3195         u32 nritems;
3196         struct btrfs_key key;
3197         struct btrfs_file_extent_item *fi;
3198         int i;
3199         int level;
3200         int ret = 0;
3201         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
3202                             u64, u64, u64, u64, u64, u64);
3203
3204
3205         if (btrfs_test_is_dummy_root(root))
3206                 return 0;
3207
3208         ref_root = btrfs_header_owner(buf);
3209         nritems = btrfs_header_nritems(buf);
3210         level = btrfs_header_level(buf);
3211
3212         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3213                 return 0;
3214
3215         if (inc)
3216                 process_func = btrfs_inc_extent_ref;
3217         else
3218                 process_func = btrfs_free_extent;
3219
3220         if (full_backref)
3221                 parent = buf->start;
3222         else
3223                 parent = 0;
3224
3225         for (i = 0; i < nritems; i++) {
3226                 if (level == 0) {
3227                         btrfs_item_key_to_cpu(buf, &key, i);
3228                         if (key.type != BTRFS_EXTENT_DATA_KEY)
3229                                 continue;
3230                         fi = btrfs_item_ptr(buf, i,
3231                                             struct btrfs_file_extent_item);
3232                         if (btrfs_file_extent_type(buf, fi) ==
3233                             BTRFS_FILE_EXTENT_INLINE)
3234                                 continue;
3235                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3236                         if (bytenr == 0)
3237                                 continue;
3238
3239                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3240                         key.offset -= btrfs_file_extent_offset(buf, fi);
3241                         ret = process_func(trans, root, bytenr, num_bytes,
3242                                            parent, ref_root, key.objectid,
3243                                            key.offset);
3244                         if (ret)
3245                                 goto fail;
3246                 } else {
3247                         bytenr = btrfs_node_blockptr(buf, i);
3248                         num_bytes = root->nodesize;
3249                         ret = process_func(trans, root, bytenr, num_bytes,
3250                                            parent, ref_root, level - 1, 0);
3251                         if (ret)
3252                                 goto fail;
3253                 }
3254         }
3255         return 0;
3256 fail:
3257         return ret;
3258 }
3259
/*
 * Add one reference to everything @buf points to.  Thin wrapper around
 * __btrfs_mod_ref() with inc = 1; returns whatever it returns.
 */
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct extent_buffer *buf, int full_backref)
{
        const int inc = 1;

        return __btrfs_mod_ref(trans, root, buf, full_backref, inc);
}
3265
/*
 * Drop one reference from everything @buf points to.  Thin wrapper around
 * __btrfs_mod_ref() with inc = 0; returns whatever it returns.
 */
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct extent_buffer *buf, int full_backref)
{
        const int inc = 0;

        return __btrfs_mod_ref(trans, root, buf, full_backref, inc);
}
3271
3272 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3273                                  struct btrfs_root *root,
3274                                  struct btrfs_path *path,
3275                                  struct btrfs_block_group_cache *cache)
3276 {
3277         int ret;
3278         struct btrfs_root *extent_root = root->fs_info->extent_root;
3279         unsigned long bi;
3280         struct extent_buffer *leaf;
3281
3282         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3283         if (ret) {
3284                 if (ret > 0)
3285                         ret = -ENOENT;
3286                 goto fail;
3287         }
3288
3289         leaf = path->nodes[0];
3290         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3291         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3292         btrfs_mark_buffer_dirty(leaf);
3293 fail:
3294         btrfs_release_path(path);
3295         return ret;
3296
3297 }
3298
3299 static struct btrfs_block_group_cache *
3300 next_block_group(struct btrfs_root *root,
3301                  struct btrfs_block_group_cache *cache)
3302 {
3303         struct rb_node *node;
3304
3305         spin_lock(&root->fs_info->block_group_cache_lock);
3306
3307         /* If our block group was removed, we need a full search. */
3308         if (RB_EMPTY_NODE(&cache->cache_node)) {
3309                 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3310
3311                 spin_unlock(&root->fs_info->block_group_cache_lock);
3312                 btrfs_put_block_group(cache);
3313                 cache = btrfs_lookup_first_block_group(root->fs_info,
3314                                                        next_bytenr);
3315                 return cache;
3316         }
3317         node = rb_next(&cache->cache_node);
3318         btrfs_put_block_group(cache);
3319         if (node) {
3320                 cache = rb_entry(node, struct btrfs_block_group_cache,
3321                                  cache_node);
3322                 btrfs_get_block_group(cache);
3323         } else
3324                 cache = NULL;
3325         spin_unlock(&root->fs_info->block_group_cache_lock);
3326         return cache;
3327 }
3328
3329 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3330                             struct btrfs_trans_handle *trans,
3331                             struct btrfs_path *path)
3332 {
3333         struct btrfs_root *root = block_group->fs_info->tree_root;
3334         struct inode *inode = NULL;
3335         u64 alloc_hint = 0;
3336         int dcs = BTRFS_DC_ERROR;
3337         u64 num_pages = 0;
3338         int retries = 0;
3339         int ret = 0;
3340
3341         /*
3342          * If this block group is smaller than 100 megs don't bother caching the
3343          * block group.
3344          */
3345         if (block_group->key.offset < (100 * 1024 * 1024)) {
3346                 spin_lock(&block_group->lock);
3347                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3348                 spin_unlock(&block_group->lock);
3349                 return 0;
3350         }
3351
3352         if (trans->aborted)
3353                 return 0;
3354 again:
3355         inode = lookup_free_space_inode(root, block_group, path);
3356         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3357                 ret = PTR_ERR(inode);
3358                 btrfs_release_path(path);
3359                 goto out;
3360         }
3361
3362         if (IS_ERR(inode)) {
3363                 BUG_ON(retries);
3364                 retries++;
3365
3366                 if (block_group->ro)
3367                         goto out_free;
3368
3369                 ret = create_free_space_inode(root, trans, block_group, path);
3370                 if (ret)
3371                         goto out_free;
3372                 goto again;
3373         }
3374
3375         /* We've already setup this transaction, go ahead and exit */
3376         if (block_group->cache_generation == trans->transid &&
3377             i_size_read(inode)) {
3378                 dcs = BTRFS_DC_SETUP;
3379                 goto out_put;
3380         }
3381
3382         /*
3383          * We want to set the generation to 0, that way if anything goes wrong
3384          * from here on out we know not to trust this cache when we load up next
3385          * time.
3386          */
3387         BTRFS_I(inode)->generation = 0;
3388         ret = btrfs_update_inode(trans, root, inode);
3389         if (ret) {
3390                 /*
3391                  * So theoretically we could recover from this, simply set the
3392                  * super cache generation to 0 so we know to invalidate the
3393                  * cache, but then we'd have to keep track of the block groups
3394                  * that fail this way so we know we _have_ to reset this cache
3395                  * before the next commit or risk reading stale cache.  So to
3396                  * limit our exposure to horrible edge cases lets just abort the
3397                  * transaction, this only happens in really bad situations
3398                  * anyway.
3399                  */
3400                 btrfs_abort_transaction(trans, root, ret);
3401                 goto out_put;
3402         }
3403         WARN_ON(ret);
3404
3405         if (i_size_read(inode) > 0) {
3406                 ret = btrfs_check_trunc_cache_free_space(root,
3407                                         &root->fs_info->global_block_rsv);
3408                 if (ret)
3409                         goto out_put;
3410
3411                 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
3412                 if (ret)
3413                         goto out_put;
3414         }
3415
3416         spin_lock(&block_group->lock);
3417         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3418             !btrfs_test_opt(root, SPACE_CACHE)) {
3419                 /*
3420                  * don't bother trying to write stuff out _if_
3421                  * a) we're not cached,
3422                  * b) we're with nospace_cache mount option.
3423                  */
3424                 dcs = BTRFS_DC_WRITTEN;
3425                 spin_unlock(&block_group->lock);
3426                 goto out_put;
3427         }
3428         spin_unlock(&block_group->lock);
3429
3430         /*
3431          * We hit an ENOSPC when setting up the cache in this transaction, just
3432          * skip doing the setup, we've already cleared the cache so we're safe.
3433          */
3434         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3435                 ret = -ENOSPC;
3436                 goto out_put;
3437         }
3438
3439         /*
3440          * Try to preallocate enough space based on how big the block group is.
3441          * Keep in mind this has to include any pinned space which could end up
3442          * taking up quite a bit since it's not folded into the other space
3443          * cache.
3444          */
3445         num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
3446         if (!num_pages)
3447                 num_pages = 1;
3448
3449         num_pages *= 16;
3450         num_pages *= PAGE_CACHE_SIZE;
3451
3452         ret = btrfs_check_data_free_space(inode, 0, num_pages);
3453         if (ret)
3454                 goto out_put;
3455
3456         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3457                                               num_pages, num_pages,
3458                                               &alloc_hint);
3459         /*
3460          * Our cache requires contiguous chunks so that we don't modify a bunch
3461          * of metadata or split extents when writing the cache out, which means
3462          * we can enospc if we are heavily fragmented in addition to just normal
3463          * out of space conditions.  So if we hit this just skip setting up any
3464          * other block groups for this transaction, maybe we'll unpin enough
3465          * space the next time around.
3466          */
3467         if (!ret)
3468                 dcs = BTRFS_DC_SETUP;
3469         else if (ret == -ENOSPC)
3470                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3471         btrfs_free_reserved_data_space(inode, 0, num_pages);
3472
3473 out_put:
3474         iput(inode);
3475 out_free:
3476         btrfs_release_path(path);
3477 out:
3478         spin_lock(&block_group->lock);
3479         if (!ret && dcs == BTRFS_DC_SETUP)
3480                 block_group->cache_generation = trans->transid;
3481         block_group->disk_cache_state = dcs;
3482         spin_unlock(&block_group->lock);
3483
3484         return ret;
3485 }
3486
3487 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3488                             struct btrfs_root *root)
3489 {
3490         struct btrfs_block_group_cache *cache, *tmp;
3491         struct btrfs_transaction *cur_trans = trans->transaction;
3492         struct btrfs_path *path;
3493
3494         if (list_empty(&cur_trans->dirty_bgs) ||
3495             !btrfs_test_opt(root, SPACE_CACHE))
3496                 return 0;
3497
3498         path = btrfs_alloc_path();
3499         if (!path)
3500                 return -ENOMEM;
3501
3502         /* Could add new block groups, use _safe just in case */
3503         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3504                                  dirty_list) {
3505                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3506                         cache_save_setup(cache, trans, path);
3507         }
3508
3509         btrfs_free_path(path);
3510         return 0;
3511 }
3512
3513 /*
3514  * transaction commit does final block group cache writeback during a
3515  * critical section where nothing is allowed to change the FS.  This is
3516  * required in order for the cache to actually match the block group,
3517  * but can introduce a lot of latency into the commit.
3518  *
3519  * So, btrfs_start_dirty_block_groups is here to kick off block group
3520  * cache IO.  There's a chance we'll have to redo some of it if the
3521  * block group changes again during the commit, but it greatly reduces
3522  * the commit latency by getting rid of the easy block groups while
3523  * we're still allowing others to join the commit.
3524  */
3525 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
3526                                    struct btrfs_root *root)
3527 {
3528         struct btrfs_block_group_cache *cache;
3529         struct btrfs_transaction *cur_trans = trans->transaction;
3530         int ret = 0;
3531         int should_put;
3532         struct btrfs_path *path = NULL;
3533         LIST_HEAD(dirty);
3534         struct list_head *io = &cur_trans->io_bgs;
3535         int num_started = 0;
3536         int loops = 0;
3537
3538         spin_lock(&cur_trans->dirty_bgs_lock);
3539         if (list_empty(&cur_trans->dirty_bgs)) {
3540                 spin_unlock(&cur_trans->dirty_bgs_lock);
3541                 return 0;
3542         }
3543         list_splice_init(&cur_trans->dirty_bgs, &dirty);
3544         spin_unlock(&cur_trans->dirty_bgs_lock);
3545
3546 again:
3547         /*
3548          * make sure all the block groups on our dirty list actually
3549          * exist
3550          */
3551         btrfs_create_pending_block_groups(trans, root);
3552
3553         if (!path) {
3554                 path = btrfs_alloc_path();
3555                 if (!path)
3556                         return -ENOMEM;
3557         }
3558
3559         /*
3560          * cache_write_mutex is here only to save us from balance or automatic
3561          * removal of empty block groups deleting this block group while we are
3562          * writing out the cache
3563          */
3564         mutex_lock(&trans->transaction->cache_write_mutex);
3565         while (!list_empty(&dirty)) {
3566                 cache = list_first_entry(&dirty,
3567                                          struct btrfs_block_group_cache,
3568                                          dirty_list);
3569                 /*
3570                  * this can happen if something re-dirties a block
3571                  * group that is already under IO.  Just wait for it to
3572                  * finish and then do it all again
3573                  */
3574                 if (!list_empty(&cache->io_list)) {
3575                         list_del_init(&cache->io_list);
3576                         btrfs_wait_cache_io(root, trans, cache,
3577                                             &cache->io_ctl, path,
3578                                             cache->key.objectid);
3579                         btrfs_put_block_group(cache);
3580                 }
3581
3582
3583                 /*
3584                  * btrfs_wait_cache_io uses the cache->dirty_list to decide
3585                  * if it should update the cache_state.  Don't delete
3586                  * until after we wait.
3587                  *
3588                  * Since we're not running in the commit critical section
3589                  * we need the dirty_bgs_lock to protect from update_block_group
3590                  */
3591                 spin_lock(&cur_trans->dirty_bgs_lock);
3592                 list_del_init(&cache->dirty_list);
3593                 spin_unlock(&cur_trans->dirty_bgs_lock);
3594
3595                 should_put = 1;
3596
3597                 cache_save_setup(cache, trans, path);
3598
3599                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3600                         cache->io_ctl.inode = NULL;
3601                         ret = btrfs_write_out_cache(root, trans, cache, path);
3602                         if (ret == 0 && cache->io_ctl.inode) {
3603                                 num_started++;
3604                                 should_put = 0;
3605
3606                                 /*
3607                                  * the cache_write_mutex is protecting
3608                                  * the io_list
3609                                  */
3610                                 list_add_tail(&cache->io_list, io);
3611                         } else {
3612                                 /*
3613                                  * if we failed to write the cache, the
3614                                  * generation will be bad and life goes on
3615                                  */
3616                                 ret = 0;
3617                         }
3618                 }
3619                 if (!ret) {
3620                         ret = write_one_cache_group(trans, root, path, cache);
3621                         /*
3622                          * Our block group might still be attached to the list
3623                          * of new block groups in the transaction handle of some
3624                          * other task (struct btrfs_trans_handle->new_bgs). This
3625                          * means its block group item isn't yet in the extent
3626                          * tree. If this happens ignore the error, as we will
3627                          * try again later in the critical section of the
3628                          * transaction commit.
3629                          */
3630                         if (ret == -ENOENT) {
3631                                 ret = 0;
3632                                 spin_lock(&cur_trans->dirty_bgs_lock);
3633                                 if (list_empty(&cache->dirty_list)) {
3634                                         list_add_tail(&cache->dirty_list,
3635                                                       &cur_trans->dirty_bgs);
3636                                         btrfs_get_block_group(cache);
3637                                 }
3638                                 spin_unlock(&cur_trans->dirty_bgs_lock);
3639                         } else if (ret) {
3640                                 btrfs_abort_transaction(trans, root, ret);
3641                         }
3642                 }
3643
3644                 /* if its not on the io list, we need to put the block group */
3645                 if (should_put)
3646                         btrfs_put_block_group(cache);
3647
3648                 if (ret)
3649                         break;
3650
3651                 /*
3652                  * Avoid blocking other tasks for too long. It might even save
3653                  * us from writing caches for block groups that are going to be
3654                  * removed.
3655                  */
3656                 mutex_unlock(&trans->transaction->cache_write_mutex);
3657                 mutex_lock(&trans->transaction->cache_write_mutex);
3658         }
3659         mutex_unlock(&trans->transaction->cache_write_mutex);
3660
3661         /*
3662          * go through delayed refs for all the stuff we've just kicked off
3663          * and then loop back (just once)
3664          */
3665         ret = btrfs_run_delayed_refs(trans, root, 0);
3666         if (!ret && loops == 0) {
3667                 loops++;
3668                 spin_lock(&cur_trans->dirty_bgs_lock);
3669                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3670                 /*
3671                  * dirty_bgs_lock protects us from concurrent block group
3672                  * deletes too (not just cache_write_mutex).
3673                  */
3674                 if (!list_empty(&dirty)) {
3675                         spin_unlock(&cur_trans->dirty_bgs_lock);
3676                         goto again;
3677                 }
3678                 spin_unlock(&cur_trans->dirty_bgs_lock);
3679         }
3680
3681         btrfs_free_path(path);
3682         return ret;
3683 }
3684
3685 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3686                                    struct btrfs_root *root)
3687 {
3688         struct btrfs_block_group_cache *cache;
3689         struct btrfs_transaction *cur_trans = trans->transaction;
3690         int ret = 0;
3691         int should_put;
3692         struct btrfs_path *path;
3693         struct list_head *io = &cur_trans->io_bgs;
3694         int num_started = 0;
3695
3696         path = btrfs_alloc_path();
3697         if (!path)
3698                 return -ENOMEM;
3699
3700         /*
3701          * We don't need the lock here since we are protected by the transaction
3702          * commit.  We want to do the cache_save_setup first and then run the
3703          * delayed refs to make sure we have the best chance at doing this all
3704          * in one shot.
3705          */
3706         while (!list_empty(&cur_trans->dirty_bgs)) {
3707                 cache = list_first_entry(&cur_trans->dirty_bgs,
3708                                          struct btrfs_block_group_cache,
3709                                          dirty_list);
3710
3711                 /*
3712                  * this can happen if cache_save_setup re-dirties a block
3713                  * group that is already under IO.  Just wait for it to
3714                  * finish and then do it all again
3715                  */
3716                 if (!list_empty(&cache->io_list)) {
3717                         list_del_init(&cache->io_list);
3718                         btrfs_wait_cache_io(root, trans, cache,
3719                                             &cache->io_ctl, path,
3720                                             cache->key.objectid);
3721                         btrfs_put_block_group(cache);
3722                 }
3723
3724                 /*
3725                  * don't remove from the dirty list until after we've waited
3726                  * on any pending IO
3727                  */
3728                 list_del_init(&cache->dirty_list);
3729                 should_put = 1;
3730
3731                 cache_save_setup(cache, trans, path);
3732
3733                 if (!ret)
3734                         ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
3735
3736                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3737                         cache->io_ctl.inode = NULL;
3738                         ret = btrfs_write_out_cache(root, trans, cache, path);
3739                         if (ret == 0 && cache->io_ctl.inode) {
3740                                 num_started++;
3741                                 should_put = 0;
3742                                 list_add_tail(&cache->io_list, io);
3743                         } else {
3744                                 /*
3745                                  * if we failed to write the cache, the
3746                                  * generation will be bad and life goes on
3747                                  */
3748                                 ret = 0;
3749                         }
3750                 }
3751                 if (!ret) {
3752                         ret = write_one_cache_group(trans, root, path, cache);
3753                         if (ret)
3754                                 btrfs_abort_transaction(trans, root, ret);
3755                 }
3756
3757                 /* if its not on the io list, we need to put the block group */
3758                 if (should_put)
3759                         btrfs_put_block_group(cache);
3760         }
3761
3762         while (!list_empty(io)) {
3763                 cache = list_first_entry(io, struct btrfs_block_group_cache,
3764                                          io_list);
3765                 list_del_init(&cache->io_list);
3766                 btrfs_wait_cache_io(root, trans, cache,
3767                                     &cache->io_ctl, path, cache->key.objectid);
3768                 btrfs_put_block_group(cache);
3769         }
3770
3771         btrfs_free_path(path);
3772         return ret;
3773 }
3774
3775 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3776 {
3777         struct btrfs_block_group_cache *block_group;
3778         int readonly = 0;
3779
3780         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3781         if (!block_group || block_group->ro)
3782                 readonly = 1;
3783         if (block_group)
3784                 btrfs_put_block_group(block_group);
3785         return readonly;
3786 }
3787
3788 static const char *alloc_name(u64 flags)
3789 {
3790         switch (flags) {
3791         case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3792                 return "mixed";
3793         case BTRFS_BLOCK_GROUP_METADATA:
3794                 return "metadata";
3795         case BTRFS_BLOCK_GROUP_DATA:
3796                 return "data";
3797         case BTRFS_BLOCK_GROUP_SYSTEM:
3798                 return "system";
3799         default:
3800                 WARN_ON(1);
3801                 return "invalid-combination";
3802         };
3803 }
3804
3805 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3806                              u64 total_bytes, u64 bytes_used,
3807                              struct btrfs_space_info **space_info)
3808 {
3809         struct btrfs_space_info *found;
3810         int i;
3811         int factor;
3812         int ret;
3813
3814         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3815                      BTRFS_BLOCK_GROUP_RAID10))
3816                 factor = 2;
3817         else
3818                 factor = 1;
3819
3820         found = __find_space_info(info, flags);
3821         if (found) {
3822                 spin_lock(&found->lock);
3823                 found->total_bytes += total_bytes;
3824                 found->disk_total += total_bytes * factor;
3825                 found->bytes_used += bytes_used;
3826                 found->disk_used += bytes_used * factor;
3827                 if (total_bytes > 0)
3828                         found->full = 0;
3829                 spin_unlock(&found->lock);
3830                 *space_info = found;
3831                 return 0;
3832         }
3833         found = kzalloc(sizeof(*found), GFP_NOFS);
3834         if (!found)
3835                 return -ENOMEM;
3836
3837         ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
3838         if (ret) {
3839                 kfree(found);
3840                 return ret;
3841         }
3842
3843         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3844                 INIT_LIST_HEAD(&found->block_groups[i]);
3845         init_rwsem(&found->groups_sem);
3846         spin_lock_init(&found->lock);
3847         found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3848         found->total_bytes = total_bytes;
3849         found->disk_total = total_bytes * factor;
3850         found->bytes_used = bytes_used;
3851         found->disk_used = bytes_used * factor;
3852         found->bytes_pinned = 0;
3853         found->bytes_reserved = 0;
3854         found->bytes_readonly = 0;
3855         found->bytes_may_use = 0;
3856         found->full = 0;
3857         found->max_extent_size = 0;
3858         found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3859         found->chunk_alloc = 0;
3860         found->flush = 0;
3861         init_waitqueue_head(&found->wait);
3862         INIT_LIST_HEAD(&found->ro_bgs);
3863
3864         ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3865                                     info->space_info_kobj, "%s",
3866                                     alloc_name(found->flags));
3867         if (ret) {
3868                 kfree(found);
3869                 return ret;
3870         }
3871
3872         *space_info = found;
3873         list_add_rcu(&found->list, &info->space_info);
3874         if (flags & BTRFS_BLOCK_GROUP_DATA)
3875                 info->data_sinfo = found;
3876
3877         return ret;
3878 }
3879
3880 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3881 {
3882         u64 extra_flags = chunk_to_extended(flags) &
3883                                 BTRFS_EXTENDED_PROFILE_MASK;
3884
3885         write_seqlock(&fs_info->profiles_lock);
3886         if (flags & BTRFS_BLOCK_GROUP_DATA)
3887                 fs_info->avail_data_alloc_bits |= extra_flags;
3888         if (flags & BTRFS_BLOCK_GROUP_METADATA)
3889                 fs_info->avail_metadata_alloc_bits |= extra_flags;
3890         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3891                 fs_info->avail_system_alloc_bits |= extra_flags;
3892         write_sequnlock(&fs_info->profiles_lock);
3893 }
3894
3895 /*
3896  * returns target flags in extended format or 0 if restripe for this
3897  * chunk_type is not in progress
3898  *
3899  * should be called with either volume_mutex or balance_lock held
3900  */
3901 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3902 {
3903         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3904         u64 target = 0;
3905
3906         if (!bctl)
3907                 return 0;
3908
3909         if (flags & BTRFS_BLOCK_GROUP_DATA &&
3910             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3911                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3912         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3913                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3914                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3915         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3916                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3917                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3918         }
3919
3920         return target;
3921 }
3922
3923 /*
3924  * @flags: available profiles in extended format (see ctree.h)
3925  *
3926  * Returns reduced profile in chunk format.  If profile changing is in
3927  * progress (either running or paused) picks the target profile (if it's
3928  * already available), otherwise falls back to plain reducing.
3929  */
3930 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3931 {
3932         u64 num_devices = root->fs_info->fs_devices->rw_devices;
3933         u64 target;
3934         u64 raid_type;
3935         u64 allowed = 0;
3936
3937         /*
3938          * see if restripe for this chunk_type is in progress, if so
3939          * try to reduce to the target profile
3940          */
3941         spin_lock(&root->fs_info->balance_lock);
3942         target = get_restripe_target(root->fs_info, flags);
3943         if (target) {
3944                 /* pick target profile only if it's already available */
3945                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3946                         spin_unlock(&root->fs_info->balance_lock);
3947                         return extended_to_chunk(target);
3948                 }
3949         }
3950         spin_unlock(&root->fs_info->balance_lock);
3951
3952         /* First, mask out the RAID levels which aren't possible */
3953         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3954                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
3955                         allowed |= btrfs_raid_group[raid_type];
3956         }
3957         allowed &= flags;
3958
3959         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
3960                 allowed = BTRFS_BLOCK_GROUP_RAID6;
3961         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
3962                 allowed = BTRFS_BLOCK_GROUP_RAID5;
3963         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
3964                 allowed = BTRFS_BLOCK_GROUP_RAID10;
3965         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
3966                 allowed = BTRFS_BLOCK_GROUP_RAID1;
3967         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
3968                 allowed = BTRFS_BLOCK_GROUP_RAID0;
3969
3970         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
3971
3972         return extended_to_chunk(flags | allowed);
3973 }
3974
3975 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
3976 {
3977         unsigned seq;
3978         u64 flags;
3979
3980         do {
3981                 flags = orig_flags;
3982                 seq = read_seqbegin(&root->fs_info->profiles_lock);
3983
3984                 if (flags & BTRFS_BLOCK_GROUP_DATA)
3985                         flags |= root->fs_info->avail_data_alloc_bits;
3986                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3987                         flags |= root->fs_info->avail_system_alloc_bits;
3988                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3989                         flags |= root->fs_info->avail_metadata_alloc_bits;
3990         } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3991
3992         return btrfs_reduce_alloc_profile(root, flags);
3993 }
3994
3995 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3996 {
3997         u64 flags;
3998         u64 ret;
3999
4000         if (data)
4001                 flags = BTRFS_BLOCK_GROUP_DATA;
4002         else if (root == root->fs_info->chunk_root)
4003                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
4004         else
4005                 flags = BTRFS_BLOCK_GROUP_METADATA;
4006
4007         ret = get_alloc_profile(root, flags);
4008         return ret;
4009 }
4010
/*
 * Reserve @bytes (rounded up to the sector size) of data space for @inode
 * by adding it to bytes_may_use of the data space_info.
 *
 * If the space_info has no room and is not marked full, a data chunk
 * allocation is attempted and the check is repeated.  If it is full, the
 * current transaction may be committed (bounded by need_commit, which is 0
 * for free space inodes) before the check is repeated.
 *
 * Returns -ENOSPC if the space cannot be found, or another negative errno
 * from joining/committing a transaction or from do_chunk_alloc().  On
 * success the value of 'ret' is returned.
 *
 * NOTE(review): 'ret' is not reset after a successful do_chunk_alloc(), which
 * returns 1 when it allocated a chunk, so the success return can be positive
 * rather than 0 - confirm that all callers only test for ret < 0.
 *
 * NOTE(review): when entered through the !data_sinfo path, an -ENOSPC from
 * do_chunk_alloc() jumps to commit_trans with the local data_sinfo still
 * NULL, and the code reached from there dereferences data_sinfo - confirm
 * whether that path is reachable.
 */
int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
{
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 used;
        int ret = 0;
        /* upper bound on the number of transaction commits we may trigger */
        int need_commit = 2;
        int have_pinned_space;

        /* make sure bytes are sectorsize aligned */
        bytes = ALIGN(bytes, root->sectorsize);

        /* never commit the transaction on behalf of a free space inode */
        if (btrfs_is_free_space_inode(inode)) {
                need_commit = 0;
                ASSERT(current->journal_info);
        }

        /* no data space_info yet: go straight to the chunk allocation */
        data_sinfo = fs_info->data_sinfo;
        if (!data_sinfo)
                goto alloc;

again:
        /* make sure we have enough space to handle the data first */
        spin_lock(&data_sinfo->lock);
        used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
                data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
                data_sinfo->bytes_may_use;

        if (used + bytes > data_sinfo->total_bytes) {
                struct btrfs_trans_handle *trans;

                /*
                 * if we don't have enough free bytes in this space then we need
                 * to alloc a new chunk.
                 */
                if (!data_sinfo->full) {
                        u64 alloc_target;

                        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
                        spin_unlock(&data_sinfo->lock);
alloc:
                        alloc_target = btrfs_get_alloc_profile(root, 1);
                        /*
                         * It is ugly that we don't call nolock join
                         * transaction for the free space inode case here.
                         * But it is safe because we only do the data space
                         * reservation for the free space cache in the
                         * transaction context, the common join transaction
                         * just increase the counter of the current transaction
                         * handler, doesn't try to acquire the trans_lock of
                         * the fs.
                         */
                        trans = btrfs_join_transaction(root);
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);

                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
                                             alloc_target,
                                             CHUNK_ALLOC_NO_FORCE);
                        btrfs_end_transaction(trans, root);
                        if (ret < 0) {
                                if (ret != -ENOSPC)
                                        return ret;
                                else {
                                        /*
                                         * a non-negative value makes the
                                         * commit_trans path commit
                                         */
                                        have_pinned_space = 1;
                                        goto commit_trans;
                                }
                        }

                        /* pick up the space_info if we came in without one */
                        if (!data_sinfo)
                                data_sinfo = fs_info->data_sinfo;

                        goto again;
                }

                /*
                 * If we don't have enough pinned space to deal with this
                 * allocation, and no removed chunk in current transaction,
                 * don't bother committing the transaction.
                 */
                have_pinned_space = percpu_counter_compare(
                        &data_sinfo->total_bytes_pinned,
                        used + bytes - data_sinfo->total_bytes);
                spin_unlock(&data_sinfo->lock);

                /* commit the current transaction and try again */
commit_trans:
                if (need_commit &&
                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
                        need_commit--;

                        if (need_commit > 0)
                                btrfs_wait_ordered_roots(fs_info, -1);

                        trans = btrfs_join_transaction(root);
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
                        if (have_pinned_space >= 0 ||
                            test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
                                     &trans->transaction->flags) ||
                            need_commit > 0) {
                                ret = btrfs_commit_transaction(trans, root);
                                if (ret)
                                        return ret;
                                /*
                                 * make sure that all running delayed iput are
                                 * done
                                 */
                                down_write(&root->fs_info->delayed_iput_sem);
                                up_write(&root->fs_info->delayed_iput_sem);
                                goto again;
                        } else {
                                btrfs_end_transaction(trans, root);
                        }
                }

                trace_btrfs_space_reservation(root->fs_info,
                                              "space_info:enospc",
                                              data_sinfo->flags, bytes, 1);
                return -ENOSPC;
        }
        /* enough room: account the reservation while still holding the lock */
        data_sinfo->bytes_may_use += bytes;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
                                      data_sinfo->flags, bytes, 1);
        spin_unlock(&data_sinfo->lock);

        return ret;
}
4140
4141 /*
4142  * New check_data_free_space() with ability for precious data reservation
4143  * Will replace old btrfs_check_data_free_space(), but for patch split,
4144  * add a new function first and then replace it.
4145  */
4146 int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
4147 {
4148         struct btrfs_root *root = BTRFS_I(inode)->root;
4149         int ret;
4150
4151         /* align the range */
4152         len = round_up(start + len, root->sectorsize) -
4153               round_down(start, root->sectorsize);
4154         start = round_down(start, root->sectorsize);
4155
4156         ret = btrfs_alloc_data_chunk_ondemand(inode, len);
4157         if (ret < 0)
4158                 return ret;
4159
4160         /*
4161          * Use new btrfs_qgroup_reserve_data to reserve precious data space
4162          *
4163          * TODO: Find a good method to avoid reserve data space for NOCOW
4164          * range, but don't impact performance on quota disable case.
4165          */
4166         ret = btrfs_qgroup_reserve_data(inode, start, len);
4167         return ret;
4168 }
4169
4170 /*
4171  * Called if we need to clear a data reservation for this inode
4172  * Normally in a error case.
4173  *
4174  * This one will *NOT* use accurate qgroup reserved space API, just for case
4175  * which we can't sleep and is sure it won't affect qgroup reserved space.
4176  * Like clear_bit_hook().
4177  */
4178 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4179                                             u64 len)
4180 {
4181         struct btrfs_root *root = BTRFS_I(inode)->root;
4182         struct btrfs_space_info *data_sinfo;
4183
4184         /* Make sure the range is aligned to sectorsize */
4185         len = round_up(start + len, root->sectorsize) -
4186               round_down(start, root->sectorsize);
4187         start = round_down(start, root->sectorsize);
4188
4189         data_sinfo = root->fs_info->data_sinfo;
4190         spin_lock(&data_sinfo->lock);
4191         if (WARN_ON(data_sinfo->bytes_may_use < len))
4192                 data_sinfo->bytes_may_use = 0;
4193         else
4194                 data_sinfo->bytes_may_use -= len;
4195         trace_btrfs_space_reservation(root->fs_info, "space_info",
4196                                       data_sinfo->flags, len, 0);
4197         spin_unlock(&data_sinfo->lock);
4198 }
4199
4200 /*
4201  * Called if we need to clear a data reservation for this inode
4202  * Normally in a error case.
4203  *
4204  * This one will handle the per-indoe data rsv map for accurate reserved
4205  * space framework.
4206  */
4207 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
4208 {
4209         btrfs_free_reserved_data_space_noquota(inode, start, len);
4210         btrfs_qgroup_free_data(inode, start, len);
4211 }
4212
4213 static void force_metadata_allocation(struct btrfs_fs_info *info)
4214 {
4215         struct list_head *head = &info->space_info;
4216         struct btrfs_space_info *found;
4217
4218         rcu_read_lock();
4219         list_for_each_entry_rcu(found, head, list) {
4220                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4221                         found->force_alloc = CHUNK_ALLOC_FORCE;
4222         }
4223         rcu_read_unlock();
4224 }
4225
4226 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4227 {
4228         return (global->size << 1);
4229 }
4230
4231 static int should_alloc_chunk(struct btrfs_root *root,
4232                               struct btrfs_space_info *sinfo, int force)
4233 {
4234         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4235         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
4236         u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
4237         u64 thresh;
4238
4239         if (force == CHUNK_ALLOC_FORCE)
4240                 return 1;
4241
4242         /*
4243          * We need to take into account the global rsv because for all intents
4244          * and purposes it's used space.  Don't worry about locking the
4245          * global_rsv, it doesn't change except when the transaction commits.
4246          */
4247         if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4248                 num_allocated += calc_global_rsv_need_space(global_rsv);
4249
4250         /*
4251          * in limited mode, we want to have some free space up to
4252          * about 1% of the FS size.
4253          */
4254         if (force == CHUNK_ALLOC_LIMITED) {
4255                 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
4256                 thresh = max_t(u64, 64 * 1024 * 1024,
4257                                div_factor_fine(thresh, 1));
4258
4259                 if (num_bytes - num_allocated < thresh)
4260                         return 1;
4261         }
4262
4263         if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
4264                 return 0;
4265         return 1;
4266 }
4267
4268 static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
4269 {
4270         u64 num_dev;
4271
4272         if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4273                     BTRFS_BLOCK_GROUP_RAID0 |
4274                     BTRFS_BLOCK_GROUP_RAID5 |
4275                     BTRFS_BLOCK_GROUP_RAID6))
4276                 num_dev = root->fs_info->fs_devices->rw_devices;
4277         else if (type & BTRFS_BLOCK_GROUP_RAID1)
4278                 num_dev = 2;
4279         else
4280                 num_dev = 1;    /* DUP or single */
4281
4282         return num_dev;
4283 }
4284
4285 /*
4286  * If @is_allocation is true, reserve space in the system space info necessary
4287  * for allocating a chunk, otherwise if it's false, reserve space necessary for
4288  * removing a chunk.
4289  */
4290 void check_system_chunk(struct btrfs_trans_handle *trans,
4291                         struct btrfs_root *root,
4292                         u64 type)
4293 {
4294         struct btrfs_space_info *info;
4295         u64 left;
4296         u64 thresh;
4297         int ret = 0;
4298         u64 num_devs;
4299
4300         /*
4301          * Needed because we can end up allocating a system chunk and for an
4302          * atomic and race free space reservation in the chunk block reserve.
4303          */
4304         ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
4305
4306         info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4307         spin_lock(&info->lock);
4308         left = info->total_bytes - info->bytes_used - info->bytes_pinned -
4309                 info->bytes_reserved - info->bytes_readonly -
4310                 info->bytes_may_use;
4311         spin_unlock(&info->lock);
4312
4313         num_devs = get_profile_num_devs(root, type);
4314
4315         /* num_devs device items to update and 1 chunk item to add or remove */
4316         thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
4317                 btrfs_calc_trans_metadata_size(root, 1);
4318
4319         if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
4320                 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
4321                         left, thresh, type);
4322                 dump_space_info(info, 0, 0);
4323         }
4324
4325         if (left < thresh) {
4326                 u64 flags;
4327
4328                 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
4329                 /*
4330                  * Ignore failure to create system chunk. We might end up not
4331                  * needing it, as we might not need to COW all nodes/leafs from
4332                  * the paths we visit in the chunk tree (they were already COWed
4333                  * or created in the current transaction for example).
4334                  */
4335                 ret = btrfs_alloc_chunk(trans, root, flags);
4336         }
4337
4338         if (!ret) {
4339                 ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
4340                                           &root->fs_info->chunk_block_rsv,
4341                                           thresh, BTRFS_RESERVE_NO_FLUSH);
4342                 if (!ret)
4343                         trans->chunk_bytes_reserved += thresh;
4344         }
4345 }
4346
/*
 * Allocate a new chunk of type @flags if the matching space_info needs one.
 *
 * @force is one of the CHUNK_ALLOC_* values; it is raised to the
 * space_info's own force_alloc if that is higher.  Only one task allocates
 * for a given space_info at a time: a task that finds chunk_alloc already
 * set waits on the chunk_mutex and then re-evaluates from scratch.
 *
 * Returns 1 if a chunk was allocated, 0 if no allocation was needed,
 * -ENOSPC if the space_info is full (or if this transaction handle is
 * already in the middle of a chunk allocation), or another negative errno
 * from btrfs_alloc_chunk().
 */
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 flags, int force)
{
        struct btrfs_space_info *space_info;
        struct btrfs_fs_info *fs_info = extent_root->fs_info;
        int wait_for_alloc = 0;
        int ret = 0;

        /* Don't re-enter if we're already allocating a chunk */
        if (trans->allocating_chunk)
                return -ENOSPC;

        /* create the space_info on first use */
        space_info = __find_space_info(extent_root->fs_info, flags);
        if (!space_info) {
                ret = update_space_info(extent_root->fs_info, flags,
                                        0, 0, &space_info);
                BUG_ON(ret); /* -ENOMEM */
        }
        BUG_ON(!space_info); /* Logic error */

again:
        spin_lock(&space_info->lock);
        if (force < space_info->force_alloc)
                force = space_info->force_alloc;
        /*
         * A full space_info cannot get a new chunk: report -ENOSPC if one
         * would have been wanted, 0 otherwise.
         */
        if (space_info->full) {
                if (should_alloc_chunk(extent_root, space_info, force))
                        ret = -ENOSPC;
                else
                        ret = 0;
                spin_unlock(&space_info->lock);
                return ret;
        }

        if (!should_alloc_chunk(extent_root, space_info, force)) {
                spin_unlock(&space_info->lock);
                return 0;
        } else if (space_info->chunk_alloc) {
                /* somebody else is allocating for this space_info */
                wait_for_alloc = 1;
        } else {
                /* we are the allocator; cleared again at the 'out' label */
                space_info->chunk_alloc = 1;
        }

        spin_unlock(&space_info->lock);

        mutex_lock(&fs_info->chunk_mutex);

        /*
         * The chunk_mutex is held throughout the entirety of a chunk
         * allocation, so once we've acquired the chunk_mutex we know that the
         * other guy is done and we need to recheck and see if we should
         * allocate.
         */
        if (wait_for_alloc) {
                mutex_unlock(&fs_info->chunk_mutex);
                wait_for_alloc = 0;
                goto again;
        }

        /* makes a nested do_chunk_alloc() on this handle return -ENOSPC */
        trans->allocating_chunk = true;

        /*
         * If we have mixed data/metadata chunks we want to make sure we keep
         * allocating mixed chunks instead of individual chunks.
         */
        if (btrfs_mixed_space_info(space_info))
                flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);

        /*
         * if we're doing a data chunk, go ahead and make sure that
         * we keep a reasonable number of metadata chunks allocated in the
         * FS as well.
         */
        if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
                fs_info->data_chunk_allocations++;
                if (!(fs_info->data_chunk_allocations %
                      fs_info->metadata_ratio))
                        force_metadata_allocation(fs_info);
        }

        /*
         * Check if we have enough space in SYSTEM chunk because we may need
         * to update devices.
         */
        check_system_chunk(trans, extent_root, flags);

        ret = btrfs_alloc_chunk(trans, extent_root, flags);
        trans->allocating_chunk = false;

        spin_lock(&space_info->lock);
        /* errors other than -ENOSPC are passed up without marking it full */
        if (ret < 0 && ret != -ENOSPC)
                goto out;
        if (ret)
                space_info->full = 1;
        else
                ret = 1;

        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
        space_info->chunk_alloc = 0;
        spin_unlock(&space_info->lock);
        mutex_unlock(&fs_info->chunk_mutex);
        /*
         * When we allocate a new chunk we reserve space in the chunk block
         * reserve to make sure we can COW nodes/leafs in the chunk tree or
         * add new nodes/leafs to it if we end up needing to do it when
         * inserting the chunk item and updating device items as part of the
         * second phase of chunk allocation, performed by
         * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
         * large number of new block groups to create in our transaction
         * handle's new_bgs list to avoid exhausting the chunk block reserve
         * in extreme cases - like having a single transaction create many new
         * block groups when starting to write out the free space caches of all
         * the block groups that were made dirty during the lifetime of the
         * transaction.
         */
        if (trans->can_flush_pending_bgs &&
            trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
                btrfs_create_pending_block_groups(trans, trans->root);
                btrfs_trans_release_chunk_metadata(trans);
        }
        return ret;
}
4469
4470 static int can_overcommit(struct btrfs_root *root,
4471                           struct btrfs_space_info *space_info, u64 bytes,
4472                           enum btrfs_reserve_flush_enum flush)
4473 {
4474         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4475         u64 profile = btrfs_get_alloc_profile(root, 0);
4476         u64 space_size;
4477         u64 avail;
4478         u64 used;
4479
4480         used = space_info->bytes_used + space_info->bytes_reserved +
4481                 space_info->bytes_pinned + space_info->bytes_readonly;
4482
4483         /*
4484          * We only want to allow over committing if we have lots of actual space
4485          * free, but if we don't have enough space to handle the global reserve
4486          * space then we could end up having a real enospc problem when trying
4487          * to allocate a chunk or some other such important allocation.
4488          */
4489         spin_lock(&global_rsv->lock);
4490         space_size = calc_global_rsv_need_space(global_rsv);
4491         spin_unlock(&global_rsv->lock);
4492         if (used + space_size >= space_info->total_bytes)
4493                 return 0;
4494
4495         used += space_info->bytes_may_use;
4496
4497         spin_lock(&root->fs_info->free_chunk_lock);
4498         avail = root->fs_info->free_chunk_space;
4499         spin_unlock(&root->fs_info->free_chunk_lock);
4500
4501         /*
4502          * If we have dup, raid1 or raid10 then only half of the free
4503          * space is actually useable.  For raid56, the space info used
4504          * doesn't include the parity drive, so we don't have to
4505          * change the math
4506          */
4507         if (profile & (BTRFS_BLOCK_GROUP_DUP |
4508                        BTRFS_BLOCK_GROUP_RAID1 |
4509                        BTRFS_BLOCK_GROUP_RAID10))
4510                 avail >>= 1;
4511
4512         /*
4513          * If we aren't flushing all things, let us overcommit up to
4514          * 1/2th of the space. If we can flush, don't let us overcommit
4515          * too much, let it overcommit up to 1/8 of the space.
4516          */
4517         if (flush == BTRFS_RESERVE_FLUSH_ALL)
4518                 avail >>= 3;
4519         else
4520                 avail >>= 1;
4521
4522         if (used + bytes < space_info->total_bytes + avail)
4523                 return 1;
4524         return 0;
4525 }
4526
4527 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
4528                                          unsigned long nr_pages, int nr_items)
4529 {
4530         struct super_block *sb = root->fs_info->sb;
4531
4532         if (down_read_trylock(&sb->s_umount)) {
4533                 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4534                 up_read(&sb->s_umount);
4535         } else {
4536                 /*
4537                  * We needn't worry the filesystem going from r/w to r/o though
4538                  * we don't acquire ->s_umount mutex, because the filesystem
4539                  * should guarantee the delalloc inodes list be empty after
4540                  * the filesystem is readonly(all dirty pages are written to
4541                  * the disk).
4542                  */
4543                 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
4544                 if (!current->journal_info)
4545                         btrfs_wait_ordered_roots(root->fs_info, nr_items);
4546         }
4547 }
4548
4549 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
4550 {
4551         u64 bytes;
4552         int nr;
4553
4554         bytes = btrfs_calc_trans_metadata_size(root, 1);
4555         nr = (int)div64_u64(to_reclaim, bytes);
4556         if (!nr)
4557                 nr = 1;
4558         return nr;
4559 }
4560
/*
 * Bytes of delalloc to flush per metadata item being reclaimed (256KiB);
 * shrink_delalloc() multiplies its item count by this.
 */
#define EXTENT_SIZE_PER_ITEM	(256 << 10)
4562
4563 /*
4564  * shrink metadata reservation for delalloc
4565  */
4566 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4567                             bool wait_ordered)
4568 {
4569         struct btrfs_block_rsv *block_rsv;
4570         struct btrfs_space_info *space_info;
4571         struct btrfs_trans_handle *trans;
4572         u64 delalloc_bytes;
4573         u64 max_reclaim;
4574         long time_left;
4575         unsigned long nr_pages;
4576         int loops;
4577         int items;
4578         enum btrfs_reserve_flush_enum flush;
4579
4580         /* Calc the number of the pages we need flush for space reservation */
4581         items = calc_reclaim_items_nr(root, to_reclaim);
4582         to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4583
4584         trans = (struct btrfs_trans_handle *)current->journal_info;
4585         block_rsv = &root->fs_info->delalloc_block_rsv;
4586         space_info = block_rsv->space_info;
4587
4588         delalloc_bytes = percpu_counter_sum_positive(
4589                                                 &root->fs_info->delalloc_bytes);
4590         if (delalloc_bytes == 0) {
4591                 if (trans)
4592                         return;
4593                 if (wait_ordered)
4594                         btrfs_wait_ordered_roots(root->fs_info, items);
4595                 return;
4596         }
4597
4598         loops = 0;
4599         while (delalloc_bytes && loops < 3) {
4600                 max_reclaim = min(delalloc_bytes, to_reclaim);
4601                 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
4602                 btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4603                 /*
4604                  * We need to wait for the async pages to actually start before
4605                  * we do anything.
4606                  */
4607                 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
4608                 if (!max_reclaim)
4609                         goto skip_async;
4610
4611                 if (max_reclaim <= nr_pages)
4612                         max_reclaim = 0;
4613                 else
4614                         max_reclaim -= nr_pages;
4615
4616                 wait_event(root->fs_info->async_submit_wait,
4617                            atomic_read(&root->fs_info->async_delalloc_pages) <=
4618                            (int)max_reclaim);
4619 skip_async:
4620                 if (!trans)
4621                         flush = BTRFS_RESERVE_FLUSH_ALL;
4622                 else
4623                         flush = BTRFS_RESERVE_NO_FLUSH;
4624                 spin_lock(&space_info->lock);
4625                 if (can_overcommit(root, space_info, orig, flush)) {
4626                         spin_unlock(&space_info->lock);
4627                         break;
4628                 }
4629                 spin_unlock(&space_info->lock);
4630
4631                 loops++;
4632                 if (wait_ordered && !trans) {
4633                         btrfs_wait_ordered_roots(root->fs_info, items);
4634                 } else {
4635                         time_left = schedule_timeout_killable(1);
4636                         if (time_left)
4637                                 break;
4638                 }
4639                 delalloc_bytes = percpu_counter_sum_positive(
4640                                                 &root->fs_info->delalloc_bytes);
4641         }
4642 }
4643
4644 /**
4645  * maybe_commit_transaction - possibly commit the transaction if its ok to
4646  * @root - the root we're allocating for
4647  * @bytes - the number of bytes we want to reserve
4648  * @force - force the commit
4649  *
4650  * This will check to make sure that committing the transaction will actually
4651  * get us somewhere and then commit the transaction if it does.  Otherwise it
4652  * will return -ENOSPC.
4653  */
4654 static int may_commit_transaction(struct btrfs_root *root,
4655                                   struct btrfs_space_info *space_info,
4656                                   u64 bytes, int force)
4657 {
4658         struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
4659         struct btrfs_trans_handle *trans;
4660
4661         trans = (struct btrfs_trans_handle *)current->journal_info;
4662         if (trans)
4663                 return -EAGAIN;
4664
4665         if (force)
4666                 goto commit;
4667
4668         /* See if there is enough pinned space to make this reservation */
4669         if (percpu_counter_compare(&space_info->total_bytes_pinned,
4670                                    bytes) >= 0)
4671                 goto commit;
4672
4673         /*
4674          * See if there is some space in the delayed insertion reservation for
4675          * this reservation.
4676          */
4677         if (space_info != delayed_rsv->space_info)
4678                 return -ENOSPC;
4679
4680         spin_lock(&delayed_rsv->lock);
4681         if (percpu_counter_compare(&space_info->total_bytes_pinned,
4682                                    bytes - delayed_rsv->size) >= 0) {
4683                 spin_unlock(&delayed_rsv->lock);
4684                 return -ENOSPC;
4685         }
4686         spin_unlock(&delayed_rsv->lock);
4687
4688 commit:
4689         trans = btrfs_join_transaction(root);
4690         if (IS_ERR(trans))
4691                 return -ENOSPC;
4692
4693         return btrfs_commit_transaction(trans, root);
4694 }
4695
/*
 * Steps tried, in increasing order, when flushing to make room for a
 * metadata reservation.  Numbering starts at 1; callers step through the
 * states with flush_state++.
 */
enum flush_state {
	FLUSH_DELAYED_ITEMS_NR = 1,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};
4704
4705 static int flush_space(struct btrfs_root *root,
4706                        struct btrfs_space_info *space_info, u64 num_bytes,
4707                        u64 orig_bytes, int state)
4708 {
4709         struct btrfs_trans_handle *trans;
4710         int nr;
4711         int ret = 0;
4712
4713         switch (state) {
4714         case FLUSH_DELAYED_ITEMS_NR:
4715         case FLUSH_DELAYED_ITEMS:
4716                 if (state == FLUSH_DELAYED_ITEMS_NR)
4717                         nr = calc_reclaim_items_nr(root, num_bytes) * 2;
4718                 else
4719                         nr = -1;
4720
4721                 trans = btrfs_join_transaction(root);
4722                 if (IS_ERR(trans)) {
4723                         ret = PTR_ERR(trans);
4724                         break;
4725                 }
4726                 ret = btrfs_run_delayed_items_nr(trans, root, nr);
4727                 btrfs_end_transaction(trans, root);
4728                 break;
4729         case FLUSH_DELALLOC:
4730         case FLUSH_DELALLOC_WAIT:
4731                 shrink_delalloc(root, num_bytes * 2, orig_bytes,
4732                                 state == FLUSH_DELALLOC_WAIT);
4733                 break;
4734         case ALLOC_CHUNK:
4735                 trans = btrfs_join_transaction(root);
4736                 if (IS_ERR(trans)) {
4737                         ret = PTR_ERR(trans);
4738                         break;
4739                 }
4740                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4741                                      btrfs_get_alloc_profile(root, 0),
4742                                      CHUNK_ALLOC_NO_FORCE);
4743                 btrfs_end_transaction(trans, root);
4744                 if (ret == -ENOSPC)
4745                         ret = 0;
4746                 break;
4747         case COMMIT_TRANS:
4748                 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4749                 break;
4750         default:
4751                 ret = -ENOSPC;
4752                 break;
4753         }
4754
4755         return ret;
4756 }
4757
4758 static inline u64
4759 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4760                                  struct btrfs_space_info *space_info)
4761 {
4762         u64 used;
4763         u64 expected;
4764         u64 to_reclaim;
4765
4766         to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
4767                                 16 * 1024 * 1024);
4768         spin_lock(&space_info->lock);
4769         if (can_overcommit(root, space_info, to_reclaim,
4770                            BTRFS_RESERVE_FLUSH_ALL)) {
4771                 to_reclaim = 0;
4772                 goto out;
4773         }
4774
4775         used = space_info->bytes_used + space_info->bytes_reserved +
4776                space_info->bytes_pinned + space_info->bytes_readonly +
4777                space_info->bytes_may_use;
4778         if (can_overcommit(root, space_info, 1024 * 1024,
4779                            BTRFS_RESERVE_FLUSH_ALL))
4780                 expected = div_factor_fine(space_info->total_bytes, 95);
4781         else
4782                 expected = div_factor_fine(space_info->total_bytes, 90);
4783
4784         if (used > expected)
4785                 to_reclaim = used - expected;
4786         else
4787                 to_reclaim = 0;
4788         to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4789                                      space_info->bytes_reserved);
4790 out:
4791         spin_unlock(&space_info->lock);
4792
4793         return to_reclaim;
4794 }
4795
4796 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4797                                         struct btrfs_fs_info *fs_info, u64 used)
4798 {
4799         u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4800
4801         /* If we're just plain full then async reclaim just slows us down. */
4802         if (space_info->bytes_used >= thresh)
4803                 return 0;
4804
4805         return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4806                 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4807 }
4808
4809 static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
4810                                        struct btrfs_fs_info *fs_info,
4811                                        int flush_state)
4812 {
4813         u64 used;
4814
4815         spin_lock(&space_info->lock);
4816         /*
4817          * We run out of space and have not got any free space via flush_space,
4818          * so don't bother doing async reclaim.
4819          */
4820         if (flush_state > COMMIT_TRANS && space_info->full) {
4821                 spin_unlock(&space_info->lock);
4822                 return 0;
4823         }
4824
4825         used = space_info->bytes_used + space_info->bytes_reserved +
4826                space_info->bytes_pinned + space_info->bytes_readonly +
4827                space_info->bytes_may_use;
4828         if (need_do_async_reclaim(space_info, fs_info, used)) {
4829                 spin_unlock(&space_info->lock);
4830                 return 1;
4831         }
4832         spin_unlock(&space_info->lock);
4833
4834         return 0;
4835 }
4836
4837 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4838 {
4839         struct btrfs_fs_info *fs_info;
4840         struct btrfs_space_info *space_info;
4841         u64 to_reclaim;
4842         int flush_state;
4843
4844         fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4845         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4846
4847         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
4848                                                       space_info);
4849         if (!to_reclaim)
4850                 return;
4851
4852         flush_state = FLUSH_DELAYED_ITEMS_NR;
4853         do {
4854                 flush_space(fs_info->fs_root, space_info, to_reclaim,
4855                             to_reclaim, flush_state);
4856                 flush_state++;
4857                 if (!btrfs_need_do_async_reclaim(space_info, fs_info,
4858                                                  flush_state))
4859                         return;
4860         } while (flush_state < COMMIT_TRANS);
4861 }
4862
4863 void btrfs_init_async_reclaim_work(struct work_struct *work)
4864 {
4865         INIT_WORK(work, btrfs_async_reclaim_metadata_space);
4866 }
4867
4868 /**
4869  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4870  * @root - the root we're allocating for
4871  * @block_rsv - the block_rsv we're allocating for
4872  * @orig_bytes - the number of bytes we want
4873  * @flush - whether or not we can flush to make our reservation
4874  *
4875  * This will reserve orig_bytes number of bytes from the space info associated
4876  * with the block_rsv.  If there is not enough space it will make an attempt to
4877  * flush out space to make room.  It will do this by flushing delalloc if
4878  * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
4879  * then no attempts to regain reservations will be made and this will fail if
4880  * there is not enough space already.
      *
      * Returns 0 once the bytes have been added to the space info's
      * bytes_may_use.  Returns -EAGAIN if another task is flushing and we hold
      * a transaction handle, -EINTR if killed while waiting for that flusher,
      * and otherwise -ENOSPC or the error of the last flush_space() step.
4881  */
4882 static int reserve_metadata_bytes(struct btrfs_root *root,
4883                                   struct btrfs_block_rsv *block_rsv,
4884                                   u64 orig_bytes,
4885                                   enum btrfs_reserve_flush_enum flush)
4886 {
4887         struct btrfs_space_info *space_info = block_rsv->space_info;
4888         u64 used;
4889         u64 num_bytes = orig_bytes;
4890         int flush_state = FLUSH_DELAYED_ITEMS_NR;
4891         int ret = 0;
4892         bool flushing = false;
4893
4894 again:
4895         ret = 0;
4896         spin_lock(&space_info->lock);
4897         /*
4898          * We only want to wait if somebody other than us is flushing and we
4899          * are actually allowed to flush all things.
4900          */
4901         while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
4902                space_info->flush) {
4903                 spin_unlock(&space_info->lock);
4904                 /*
4905                  * If we have a trans handle we can't wait because the flusher
4906                  * may have to commit the transaction, which would mean we would
4907                  * deadlock since we are waiting for the flusher to finish, but
4908                  * hold the current transaction open.
4909                  */
4910                 if (current->journal_info)
4911                         return -EAGAIN;
4912                 ret = wait_event_killable(space_info->wait, !space_info->flush);
4913                 /* Must have been killed, return */
4914                 if (ret)
4915                         return -EINTR;
4916
4917                 spin_lock(&space_info->lock);
4918         }
4919
4920         ret = -ENOSPC;
4921         used = space_info->bytes_used + space_info->bytes_reserved +
4922                 space_info->bytes_pinned + space_info->bytes_readonly +
4923                 space_info->bytes_may_use;
4924
4925         /*
4926          * The idea here is that if we've not already over-reserved the block
4927          * group then we can go ahead and save our reservation first and then
4928          * start flushing if we need to.  Otherwise if we've already
4929          * overcommitted lets start flushing stuff first and then come back and
4930          * try to make our reservation.
4931          */
4932         if (used <= space_info->total_bytes) {
4933                 if (used + orig_bytes <= space_info->total_bytes) {
4934                         space_info->bytes_may_use += orig_bytes;
4935                         trace_btrfs_space_reservation(root->fs_info,
4936                                 "space_info", space_info->flags, orig_bytes, 1);
4937                         ret = 0;
4938                 } else {
4939                         /*
4940                          * Ok set num_bytes to orig_bytes since we aren't
4941                          * overcommitted, this way we only try and reclaim what
4942                          * we need.
4943                          */
4944                         num_bytes = orig_bytes;
4945                 }
4946         } else {
4947                 /*
4948                  * Ok we're over committed, set num_bytes to the overcommitted
4949                  * amount plus twice the amount of bytes that we need for this
4950                  * reservation.
4951                  */
4952                 num_bytes = used - space_info->total_bytes +
4953                         (orig_bytes * 2);
4954         }
4955
             /* Still no room: take the reservation anyway if overcommit allows. */
4956         if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
4957                 space_info->bytes_may_use += orig_bytes;
4958                 trace_btrfs_space_reservation(root->fs_info, "space_info",
4959                                               space_info->flags, orig_bytes,
4960                                               1);
4961                 ret = 0;
4962         }
4963
4964         /*
4965          * Couldn't make our reservation, save our place so while we're trying
4966          * to reclaim space we can actually use it instead of somebody else
4967          * stealing it from us.
4968          *
4969          * We make the other tasks wait for the flush only when we can flush
4970          * all things.
4971          */
4972         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4973                 flushing = true;
4974                 space_info->flush = 1;
4975         } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
4976                 used += orig_bytes;
4977                 /*
4978                  * We will do the space reservation dance during log replay,
4979                  * which means we won't have fs_info->fs_root set, so don't do
4980                  * the async reclaim as we will panic.
4981                  */
4982                 if (!root->fs_info->log_root_recovering &&
4983                     need_do_async_reclaim(space_info, root->fs_info, used) &&
4984                     !work_busy(&root->fs_info->async_reclaim_work))
4985                         queue_work(system_unbound_wq,
4986                                    &root->fs_info->async_reclaim_work);
4987         }
4988         spin_unlock(&space_info->lock);
4989
4990         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4991                 goto out;
4992
             /* Run the current flush step, then advance to the next one. */
4993         ret = flush_space(root, space_info, num_bytes, orig_bytes,
4994                           flush_state);
4995         flush_state++;
4996
4997         /*
4998          * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
4999          * would happen. So skip delalloc flush.
5000          */
5001         if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
5002             (flush_state == FLUSH_DELALLOC ||
5003              flush_state == FLUSH_DELALLOC_WAIT))
5004                 flush_state = ALLOC_CHUNK;
5005
             /*
              * Retry the reservation if the flush step succeeded, or if this
              * flush mode still has flush states left to try.
              */
5006         if (!ret)
5007                 goto again;
5008         else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
5009                  flush_state < COMMIT_TRANS)
5010                 goto again;
5011         else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
5012                  flush_state <= COMMIT_TRANS)
5013                 goto again;
5014
5015 out:
             /*
              * While orphan cleanup is running on this root, fall back to
              * taking the bytes from the global block reserve.
              */
5016         if (ret == -ENOSPC &&
5017             unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5018                 struct btrfs_block_rsv *global_rsv =
5019                         &root->fs_info->global_block_rsv;
5020
5021                 if (block_rsv != global_rsv &&
5022                     !block_rsv_use_bytes(global_rsv, orig_bytes))
5023                         ret = 0;
5024         }
5025         if (ret == -ENOSPC)
5026                 trace_btrfs_space_reservation(root->fs_info,
5027                                               "space_info:enospc",
5028                                               space_info->flags, orig_bytes, 1);
             /* We were the flusher: clear the flag and wake up any waiters. */
5029         if (flushing) {
5030                 spin_lock(&space_info->lock);
5031                 space_info->flush = 0;
5032                 wake_up_all(&space_info->wait);
5033                 spin_unlock(&space_info->lock);
5034         }
5035         return ret;
5036 }
5037
5038 static struct btrfs_block_rsv *get_block_rsv(
5039                                         const struct btrfs_trans_handle *trans,
5040                                         const struct btrfs_root *root)
5041 {
5042         struct btrfs_block_rsv *block_rsv = NULL;
5043
5044         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5045             (root == root->fs_info->csum_root && trans->adding_csums) ||
5046              (root == root->fs_info->uuid_root))
5047                 block_rsv = trans->block_rsv;
5048
5049         if (!block_rsv)
5050                 block_rsv = root->block_rsv;
5051
5052         if (!block_rsv)
5053                 block_rsv = &root->fs_info->empty_block_rsv;
5054
5055         return block_rsv;
5056 }
5057
5058 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5059                                u64 num_bytes)
5060 {
5061         int ret = -ENOSPC;
5062         spin_lock(&block_rsv->lock);
5063         if (block_rsv->reserved >= num_bytes) {
5064                 block_rsv->reserved -= num_bytes;
5065                 if (block_rsv->reserved < block_rsv->size)
5066                         block_rsv->full = 0;
5067                 ret = 0;
5068         }
5069         spin_unlock(&block_rsv->lock);
5070         return ret;
5071 }
5072
5073 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5074                                 u64 num_bytes, int update_size)
5075 {
5076         spin_lock(&block_rsv->lock);
5077         block_rsv->reserved += num_bytes;
5078         if (update_size)
5079                 block_rsv->size += num_bytes;
5080         else if (block_rsv->reserved >= block_rsv->size)
5081                 block_rsv->full = 1;
5082         spin_unlock(&block_rsv->lock);
5083 }
5084
5085 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5086                              struct btrfs_block_rsv *dest, u64 num_bytes,
5087                              int min_factor)
5088 {
5089         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5090         u64 min_bytes;
5091
5092         if (global_rsv->space_info != dest->space_info)
5093                 return -ENOSPC;
5094
5095         spin_lock(&global_rsv->lock);
5096         min_bytes = div_factor(global_rsv->size, min_factor);
5097         if (global_rsv->reserved < min_bytes + num_bytes) {
5098                 spin_unlock(&global_rsv->lock);
5099                 return -ENOSPC;
5100         }
5101         global_rsv->reserved -= num_bytes;
5102         if (global_rsv->reserved < global_rsv->size)
5103                 global_rsv->full = 0;
5104         spin_unlock(&global_rsv->lock);
5105
5106         block_rsv_add_bytes(dest, num_bytes, 1);
5107         return 0;
5108 }
5109
5110 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5111                                     struct btrfs_block_rsv *block_rsv,
5112                                     struct btrfs_block_rsv *dest, u64 num_bytes)
5113 {
5114         struct btrfs_space_info *space_info = block_rsv->space_info;
5115
5116         spin_lock(&block_rsv->lock);
5117         if (num_bytes == (u64)-1)
5118                 num_bytes = block_rsv->size;
5119         block_rsv->size -= num_bytes;
5120         if (block_rsv->reserved >= block_rsv->size) {
5121                 num_bytes = block_rsv->reserved - block_rsv->size;
5122                 block_rsv->reserved = block_rsv->size;
5123                 block_rsv->full = 1;
5124         } else {
5125                 num_bytes = 0;
5126         }
5127         spin_unlock(&block_rsv->lock);
5128
5129         if (num_bytes > 0) {
5130                 if (dest) {
5131                         spin_lock(&dest->lock);
5132                         if (!dest->full) {
5133                                 u64 bytes_to_add;
5134
5135                                 bytes_to_add = dest->size - dest->reserved;
5136                                 bytes_to_add = min(num_bytes, bytes_to_add);
5137                                 dest->reserved += bytes_to_add;
5138                                 if (dest->reserved >= dest->size)
5139                                         dest->full = 1;
5140                                 num_bytes -= bytes_to_add;
5141                         }
5142                         spin_unlock(&dest->lock);
5143                 }
5144                 if (num_bytes) {
5145                         spin_lock(&space_info->lock);
5146                         space_info->bytes_may_use -= num_bytes;
5147                         trace_btrfs_space_reservation(fs_info, "space_info",
5148                                         space_info->flags, num_bytes, 0);
5149                         spin_unlock(&space_info->lock);
5150                 }
5151         }
5152 }
5153
5154 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
5155                                    struct btrfs_block_rsv *dst, u64 num_bytes)
5156 {
5157         int ret;
5158
5159         ret = block_rsv_use_bytes(src, num_bytes);
5160         if (ret)
5161                 return ret;
5162
5163         block_rsv_add_bytes(dst, num_bytes, 1);
5164         return 0;
5165 }
5166
5167 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5168 {
5169         memset(rsv, 0, sizeof(*rsv));
5170         spin_lock_init(&rsv->lock);
5171         rsv->type = type;
5172 }
5173
5174 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
5175                                               unsigned short type)
5176 {
5177         struct btrfs_block_rsv *block_rsv;
5178         struct btrfs_fs_info *fs_info = root->fs_info;
5179
5180         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5181         if (!block_rsv)
5182                 return NULL;
5183
5184         btrfs_init_block_rsv(block_rsv, type);
5185         block_rsv->space_info = __find_space_info(fs_info,
5186                                                   BTRFS_BLOCK_GROUP_METADATA);
5187         return block_rsv;
5188 }
5189
5190 void btrfs_free_block_rsv(struct btrfs_root *root,
5191                           struct btrfs_block_rsv *rsv)
5192 {
5193         if (!rsv)
5194                 return;
5195         btrfs_block_rsv_release(root, rsv, (u64)-1);
5196         kfree(rsv);
5197 }
5198
/*
 * Free @rsv without releasing any bytes it still holds.
 */
void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
{
	kfree(rsv);
}
5203
5204 int btrfs_block_rsv_add(struct btrfs_root *root,
5205                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5206                         enum btrfs_reserve_flush_enum flush)
5207 {
5208         int ret;
5209
5210         if (num_bytes == 0)
5211                 return 0;
5212
5213         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5214         if (!ret) {
5215                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
5216                 return 0;
5217         }
5218
5219         return ret;
5220 }
5221
5222 int btrfs_block_rsv_check(struct btrfs_root *root,
5223                           struct btrfs_block_rsv *block_rsv, int min_factor)
5224 {
5225         u64 num_bytes = 0;
5226         int ret = -ENOSPC;
5227
5228         if (!block_rsv)
5229                 return 0;
5230
5231         spin_lock(&block_rsv->lock);
5232         num_bytes = div_factor(block_rsv->size, min_factor);
5233         if (block_rsv->reserved >= num_bytes)
5234                 ret = 0;
5235         spin_unlock(&block_rsv->lock);
5236
5237         return ret;
5238 }
5239
5240 int btrfs_block_rsv_refill(struct btrfs_root *root,
5241                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5242                            enum btrfs_reserve_flush_enum flush)
5243 {
5244         u64 num_bytes = 0;
5245         int ret = -ENOSPC;
5246
5247         if (!block_rsv)
5248                 return 0;
5249
5250         spin_lock(&block_rsv->lock);
5251         num_bytes = min_reserved;
5252         if (block_rsv->reserved >= num_bytes)
5253                 ret = 0;
5254         else
5255                 num_bytes -= block_rsv->reserved;
5256         spin_unlock(&block_rsv->lock);
5257
5258         if (!ret)
5259                 return 0;
5260
5261         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5262         if (!ret) {
5263                 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5264                 return 0;
5265         }
5266
5267         return ret;
5268 }
5269
5270 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
5271                             struct btrfs_block_rsv *dst_rsv,
5272                             u64 num_bytes)
5273 {
5274         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
5275 }
5276
5277 void btrfs_block_rsv_release(struct btrfs_root *root,
5278                              struct btrfs_block_rsv *block_rsv,
5279                              u64 num_bytes)
5280 {
5281         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5282         if (global_rsv == block_rsv ||
5283             block_rsv->space_info != global_rsv->space_info)
5284                 global_rsv = NULL;
5285         block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
5286                                 num_bytes);
5287 }
5288
5289 /*
5290  * helper to calculate size of global block reservation.
5291  * the desired value is sum of space used by extent tree,
5292  * checksum tree and root tree
5293  */
5294 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
5295 {
5296         struct btrfs_space_info *sinfo;
5297         u64 num_bytes;
5298         u64 meta_used;
5299         u64 data_used;
5300         int csum_size = btrfs_super_csum_size(fs_info->super_copy);
5301
5302         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
5303         spin_lock(&sinfo->lock);
5304         data_used = sinfo->bytes_used;
5305         spin_unlock(&sinfo->lock);
5306
5307         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5308         spin_lock(&sinfo->lock);
5309         if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
5310                 data_used = 0;
5311         meta_used = sinfo->bytes_used;
5312         spin_unlock(&sinfo->lock);
5313
5314         num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
5315                     csum_size * 2;
5316         num_bytes += div_u64(data_used + meta_used, 50);
5317
5318         if (num_bytes * 3 > meta_used)
5319                 num_bytes = div_u64(meta_used, 3);
5320
5321         return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
5322 }
5323
5324 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5325 {
5326         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5327         struct btrfs_space_info *sinfo = block_rsv->space_info;
5328         u64 num_bytes;
5329
5330         num_bytes = calc_global_metadata_size(fs_info);
5331
5332         spin_lock(&sinfo->lock);
5333         spin_lock(&block_rsv->lock);
5334
5335         block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
5336
5337         num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
5338                     sinfo->bytes_reserved + sinfo->bytes_readonly +
5339                     sinfo->bytes_may_use;
5340
5341         if (sinfo->total_bytes > num_bytes) {
5342                 num_bytes = sinfo->total_bytes - num_bytes;
5343                 block_rsv->reserved += num_bytes;
5344                 sinfo->bytes_may_use += num_bytes;
5345                 trace_btrfs_space_reservation(fs_info, "space_info",
5346                                       sinfo->flags, num_bytes, 1);
5347         }
5348
5349         if (block_rsv->reserved >= block_rsv->size) {
5350                 num_bytes = block_rsv->reserved - block_rsv->size;
5351                 sinfo->bytes_may_use -= num_bytes;
5352                 trace_btrfs_space_reservation(fs_info, "space_info",
5353                                       sinfo->flags, num_bytes, 0);
5354                 block_rsv->reserved = block_rsv->size;
5355                 block_rsv->full = 1;
5356         }
5357
5358         spin_unlock(&block_rsv->lock);
5359         spin_unlock(&sinfo->lock);
5360 }
5361
5362 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5363 {
5364         struct btrfs_space_info *space_info;
5365
5366         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5367         fs_info->chunk_block_rsv.space_info = space_info;
5368
5369         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5370         fs_info->global_block_rsv.space_info = space_info;
5371         fs_info->delalloc_block_rsv.space_info = space_info;
5372         fs_info->trans_block_rsv.space_info = space_info;
5373         fs_info->empty_block_rsv.space_info = space_info;
5374         fs_info->delayed_block_rsv.space_info = space_info;
5375
5376         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5377         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5378         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5379         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5380         if (fs_info->quota_root)
5381                 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5382         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5383
5384         update_global_block_rsv(fs_info);
5385 }
5386
5387 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5388 {
5389         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5390                                 (u64)-1);
5391         WARN_ON(fs_info->delalloc_block_rsv.size > 0);
5392         WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
5393         WARN_ON(fs_info->trans_block_rsv.size > 0);
5394         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5395         WARN_ON(fs_info->chunk_block_rsv.size > 0);
5396         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5397         WARN_ON(fs_info->delayed_block_rsv.size > 0);
5398         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5399 }
5400
5401 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
5402                                   struct btrfs_root *root)
5403 {
5404         if (!trans->block_rsv)
5405                 return;
5406
5407         if (!trans->bytes_reserved)
5408                 return;
5409
5410         trace_btrfs_space_reservation(root->fs_info, "transaction",
5411                                       trans->transid, trans->bytes_reserved, 0);
5412         btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
5413         trans->bytes_reserved = 0;
5414 }
5415
5416 /*
5417  * To be called after all the new block groups attached to the transaction
5418  * handle have been created (btrfs_create_pending_block_groups()).
5419  */
5420 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5421 {
5422         struct btrfs_fs_info *fs_info = trans->root->fs_info;
5423
5424         if (!trans->chunk_bytes_reserved)
5425                 return;
5426
5427         WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5428
5429         block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5430                                 trans->chunk_bytes_reserved);
5431         trans->chunk_bytes_reserved = 0;
5432 }
5433
5434 /* Can only return 0 or -ENOSPC */
5435 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
5436                                   struct inode *inode)
5437 {
5438         struct btrfs_root *root = BTRFS_I(inode)->root;
5439         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
5440         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
5441
5442         /*
5443          * We need to hold space in order to delete our orphan item once we've
5444          * added it, so this takes the reservation so we can release it later
5445          * when we are truly done with the orphan item.
5446          */
5447         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5448         trace_btrfs_space_reservation(root->fs_info, "orphan",
5449                                       btrfs_ino(inode), num_bytes, 1);
5450         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
5451 }
5452
5453 void btrfs_orphan_release_metadata(struct inode *inode)
5454 {
5455         struct btrfs_root *root = BTRFS_I(inode)->root;
5456         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5457         trace_btrfs_space_reservation(root->fs_info, "orphan",
5458                                       btrfs_ino(inode), num_bytes, 0);
5459         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
5460 }
5461
5462 /*
5463  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5464  * root: the root of the parent directory
5465  * rsv: block reservation
5466  * items: the number of items that we need do reservation
5467  * qgroup_reserved: used to return the reserved size in qgroup
5468  *
5469  * This function is used to reserve the space for snapshot/subvolume
5470  * creation and deletion. Those operations are different with the
5471  * common file/directory operations, they change two fs/file trees
5472  * and root tree, the number of items that the qgroup reserves is
5473  * different with the free space reservation. So we can not use
5474  * the space reseravtion mechanism in start_transaction().
5475  */
5476 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5477                                      struct btrfs_block_rsv *rsv,
5478                                      int items,
5479                                      u64 *qgroup_reserved,
5480                                      bool use_global_rsv)
5481 {
5482         u64 num_bytes;
5483         int ret;
5484         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5485
5486         if (root->fs_info->quota_enabled) {
5487                 /* One for parent inode, two for dir entries */
5488                 num_bytes = 3 * root->nodesize;
5489                 ret = btrfs_qgroup_reserve_meta(root, num_bytes);
5490                 if (ret)
5491                         return ret;
5492         } else {
5493                 num_bytes = 0;
5494         }
5495
5496         *qgroup_reserved = num_bytes;
5497
5498         num_bytes = btrfs_calc_trans_metadata_size(root, items);
5499         rsv->space_info = __find_space_info(root->fs_info,
5500                                             BTRFS_BLOCK_GROUP_METADATA);
5501         ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5502                                   BTRFS_RESERVE_FLUSH_ALL);
5503
5504         if (ret == -ENOSPC && use_global_rsv)
5505                 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
5506
5507         if (ret && *qgroup_reserved)
5508                 btrfs_qgroup_free_meta(root, *qgroup_reserved);
5509
5510         return ret;
5511 }
5512
5513 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
5514                                       struct btrfs_block_rsv *rsv,
5515                                       u64 qgroup_reserved)
5516 {
5517         btrfs_block_rsv_release(root, rsv, (u64)-1);
5518 }
5519
5520 /**
5521  * drop_outstanding_extent - drop an outstanding extent
5522  * @inode: the inode we're dropping the extent for
5523  * @num_bytes: the number of bytes we're relaseing.
5524  *
5525  * This is called when we are freeing up an outstanding extent, either called
5526  * after an error or after an extent is written.  This will return the number of
5527  * reserved extents that need to be freed.  This must be called with
5528  * BTRFS_I(inode)->lock held.
5529  */
5530 static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
5531 {
5532         unsigned drop_inode_space = 0;
5533         unsigned dropped_extents = 0;
5534         unsigned num_extents = 0;
5535
5536         num_extents = (unsigned)div64_u64(num_bytes +
5537                                           BTRFS_MAX_EXTENT_SIZE - 1,
5538                                           BTRFS_MAX_EXTENT_SIZE);
5539         ASSERT(num_extents);
5540         ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
5541         BTRFS_I(inode)->outstanding_extents -= num_extents;
5542
5543         if (BTRFS_I(inode)->outstanding_extents == 0 &&
5544             test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5545                                &BTRFS_I(inode)->runtime_flags))
5546                 drop_inode_space = 1;
5547
5548         /*
5549          * If we have more or the same amount of outsanding extents than we have
5550          * reserved then we need to leave the reserved extents count alone.
5551          */
5552         if (BTRFS_I(inode)->outstanding_extents >=
5553             BTRFS_I(inode)->reserved_extents)
5554                 return drop_inode_space;
5555
5556         dropped_extents = BTRFS_I(inode)->reserved_extents -
5557                 BTRFS_I(inode)->outstanding_extents;
5558         BTRFS_I(inode)->reserved_extents -= dropped_extents;
5559         return dropped_extents + drop_inode_space;
5560 }
5561
5562 /**
5563  * calc_csum_metadata_size - return the amount of metada space that must be
5564  *      reserved/free'd for the given bytes.
5565  * @inode: the inode we're manipulating
5566  * @num_bytes: the number of bytes in question
5567  * @reserve: 1 if we are reserving space, 0 if we are freeing space
5568  *
5569  * This adjusts the number of csum_bytes in the inode and then returns the
5570  * correct amount of metadata that must either be reserved or freed.  We
5571  * calculate how many checksums we can fit into one leaf and then divide the
5572  * number of bytes that will need to be checksumed by this value to figure out
5573  * how many checksums will be required.  If we are adding bytes then the number
5574  * may go up and we will return the number of additional bytes that must be
5575  * reserved.  If it is going down we will return the number of bytes that must
5576  * be freed.
5577  *
5578  * This must be called with BTRFS_I(inode)->lock held.
5579  */
5580 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
5581                                    int reserve)
5582 {
5583         struct btrfs_root *root = BTRFS_I(inode)->root;
5584         u64 old_csums, num_csums;
5585
5586         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
5587             BTRFS_I(inode)->csum_bytes == 0)
5588                 return 0;
5589
5590         old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5591         if (reserve)
5592                 BTRFS_I(inode)->csum_bytes += num_bytes;
5593         else
5594                 BTRFS_I(inode)->csum_bytes -= num_bytes;
5595         num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5596
5597         /* No change, no need to reserve more */
5598         if (old_csums == num_csums)
5599                 return 0;
5600
5601         if (reserve)
5602                 return btrfs_calc_trans_metadata_size(root,
5603                                                       num_csums - old_csums);
5604
5605         return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
5606 }
5607
5608 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5609 {
5610         struct btrfs_root *root = BTRFS_I(inode)->root;
5611         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
5612         u64 to_reserve = 0;
5613         u64 csum_bytes;
5614         unsigned nr_extents = 0;
5615         int extra_reserve = 0;
5616         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5617         int ret = 0;
5618         bool delalloc_lock = true;
5619         u64 to_free = 0;
5620         unsigned dropped;
5621
5622         /* If we are a free space inode we need to not flush since we will be in
5623          * the middle of a transaction commit.  We also don't need the delalloc
5624          * mutex since we won't race with anybody.  We need this mostly to make
5625          * lockdep shut its filthy mouth.
5626          */
5627         if (btrfs_is_free_space_inode(inode)) {
5628                 flush = BTRFS_RESERVE_NO_FLUSH;
5629                 delalloc_lock = false;
5630         }
5631
5632         if (flush != BTRFS_RESERVE_NO_FLUSH &&
5633             btrfs_transaction_in_commit(root->fs_info))
5634                 schedule_timeout(1);
5635
5636         if (delalloc_lock)
5637                 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
5638
5639         num_bytes = ALIGN(num_bytes, root->sectorsize);
5640
5641         spin_lock(&BTRFS_I(inode)->lock);
5642         nr_extents = (unsigned)div64_u64(num_bytes +
5643                                          BTRFS_MAX_EXTENT_SIZE - 1,
5644                                          BTRFS_MAX_EXTENT_SIZE);
5645         BTRFS_I(inode)->outstanding_extents += nr_extents;
5646         nr_extents = 0;
5647
5648         if (BTRFS_I(inode)->outstanding_extents >
5649             BTRFS_I(inode)->reserved_extents)
5650                 nr_extents = BTRFS_I(inode)->outstanding_extents -
5651                         BTRFS_I(inode)->reserved_extents;
5652
5653         /*
5654          * Add an item to reserve for updating the inode when we complete the
5655          * delalloc io.
5656          */
5657         if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5658                       &BTRFS_I(inode)->runtime_flags)) {
5659                 nr_extents++;
5660                 extra_reserve = 1;
5661         }
5662
5663         to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
5664         to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
5665         csum_bytes = BTRFS_I(inode)->csum_bytes;
5666         spin_unlock(&BTRFS_I(inode)->lock);
5667
5668         if (root->fs_info->quota_enabled) {
5669                 ret = btrfs_qgroup_reserve_meta(root,
5670                                 nr_extents * root->nodesize);
5671                 if (ret)
5672                         goto out_fail;
5673         }
5674
5675         ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
5676         if (unlikely(ret)) {
5677                 btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
5678                 goto out_fail;
5679         }
5680
5681         spin_lock(&BTRFS_I(inode)->lock);
5682         if (extra_reserve) {
5683                 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5684                         &BTRFS_I(inode)->runtime_flags);
5685                 nr_extents--;
5686         }
5687         BTRFS_I(inode)->reserved_extents += nr_extents;
5688         spin_unlock(&BTRFS_I(inode)->lock);
5689
5690         if (delalloc_lock)
5691                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5692
5693         if (to_reserve)
5694                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5695                                               btrfs_ino(inode), to_reserve, 1);
5696         block_rsv_add_bytes(block_rsv, to_reserve, 1);
5697
5698         return 0;
5699
5700 out_fail:
5701         spin_lock(&BTRFS_I(inode)->lock);
5702         dropped = drop_outstanding_extent(inode, num_bytes);
5703         /*
5704          * If the inodes csum_bytes is the same as the original
5705          * csum_bytes then we know we haven't raced with any free()ers
5706          * so we can just reduce our inodes csum bytes and carry on.
5707          */
5708         if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
5709                 calc_csum_metadata_size(inode, num_bytes, 0);
5710         } else {
5711                 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
5712                 u64 bytes;
5713
5714                 /*
5715                  * This is tricky, but first we need to figure out how much we
5716                  * free'd from any free-ers that occured during this
5717                  * reservation, so we reset ->csum_bytes to the csum_bytes
5718                  * before we dropped our lock, and then call the free for the
5719                  * number of bytes that were freed while we were trying our
5720                  * reservation.
5721                  */
5722                 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
5723                 BTRFS_I(inode)->csum_bytes = csum_bytes;
5724                 to_free = calc_csum_metadata_size(inode, bytes, 0);
5725
5726
5727                 /*
5728                  * Now we need to see how much we would have freed had we not
5729                  * been making this reservation and our ->csum_bytes were not
5730                  * artificially inflated.
5731                  */
5732                 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
5733                 bytes = csum_bytes - orig_csum_bytes;
5734                 bytes = calc_csum_metadata_size(inode, bytes, 0);
5735
5736                 /*
5737                  * Now reset ->csum_bytes to what it should be.  If bytes is
5738                  * more than to_free then we would have free'd more space had we
5739                  * not had an artificially high ->csum_bytes, so we need to free
5740                  * the remainder.  If bytes is the same or less then we don't
5741                  * need to do anything, the other free-ers did the correct
5742                  * thing.
5743                  */
5744                 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
5745                 if (bytes > to_free)
5746                         to_free = bytes - to_free;
5747                 else
5748                         to_free = 0;
5749         }
5750         spin_unlock(&BTRFS_I(inode)->lock);
5751         if (dropped)
5752                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
5753
5754         if (to_free) {
5755                 btrfs_block_rsv_release(root, block_rsv, to_free);
5756                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5757                                               btrfs_ino(inode), to_free, 0);
5758         }
5759         if (delalloc_lock)
5760                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5761         return ret;
5762 }
5763
5764 /**
5765  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5766  * @inode: the inode to release the reservation for
5767  * @num_bytes: the number of bytes we're releasing
5768  *
5769  * This will release the metadata reservation for an inode.  This can be called
5770  * once we complete IO for a given set of bytes to release their metadata
5771  * reservations.
5772  */
5773 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5774 {
5775         struct btrfs_root *root = BTRFS_I(inode)->root;
5776         u64 to_free = 0;
5777         unsigned dropped;
5778
5779         num_bytes = ALIGN(num_bytes, root->sectorsize);
5780         spin_lock(&BTRFS_I(inode)->lock);
5781         dropped = drop_outstanding_extent(inode, num_bytes);
5782
5783         if (num_bytes)
5784                 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
5785         spin_unlock(&BTRFS_I(inode)->lock);
5786         if (dropped > 0)
5787                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
5788
5789         if (btrfs_test_is_dummy_root(root))
5790                 return;
5791
5792         trace_btrfs_space_reservation(root->fs_info, "delalloc",
5793                                       btrfs_ino(inode), to_free, 0);
5794
5795         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
5796                                 to_free);
5797 }
5798
5799 /**
5800  * btrfs_delalloc_reserve_space - reserve data and metadata space for
5801  * delalloc
5802  * @inode: inode we're writing to
5803  * @start: start range we are writing to
5804  * @len: how long the range we are writing to
5805  *
5806  * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
5807  *
5808  * This will do the following things
5809  *
5810  * o reserve space in data space info for num bytes
5811  *   and reserve precious corresponding qgroup space
5812  *   (Done in check_data_free_space)
5813  *
5814  * o reserve space for metadata space, based on the number of outstanding
5815  *   extents and how much csums will be needed
5816  *   also reserve metadata space in a per root over-reserve method.
5817  * o add to the inodes->delalloc_bytes
5818  * o add it to the fs_info's delalloc inodes list.
5819  *   (Above 3 all done in delalloc_reserve_metadata)
5820  *
5821  * Return 0 for success
5822  * Return <0 for error(-ENOSPC or -EQUOT)
5823  */
5824 int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
5825 {
5826         int ret;
5827
5828         ret = btrfs_check_data_free_space(inode, start, len);
5829         if (ret < 0)
5830                 return ret;
5831         ret = btrfs_delalloc_reserve_metadata(inode, len);
5832         if (ret < 0)
5833                 btrfs_free_reserved_data_space(inode, start, len);
5834         return ret;
5835 }
5836
5837 /**
5838  * btrfs_delalloc_release_space - release data and metadata space for delalloc
5839  * @inode: inode we're releasing space for
5840  * @start: start position of the space already reserved
5841  * @len: the len of the space already reserved
5842  *
5843  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
5844  * called in the case that we don't need the metadata AND data reservations
5845  * anymore.  So if there is an error or we insert an inline extent.
5846  *
5847  * This function will release the metadata space that was not used and will
5848  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5849  * list if there are no delalloc bytes left.
5850  * Also it will handle the qgroup reserved space.
5851  */
5852 void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
5853 {
5854         btrfs_delalloc_release_metadata(inode, len);
5855         btrfs_free_reserved_data_space(inode, start, len);
5856 }
5857
5858 static int update_block_group(struct btrfs_trans_handle *trans,
5859                               struct btrfs_root *root, u64 bytenr,
5860                               u64 num_bytes, int alloc)
5861 {
5862         struct btrfs_block_group_cache *cache = NULL;
5863         struct btrfs_fs_info *info = root->fs_info;
5864         u64 total = num_bytes;
5865         u64 old_val;
5866         u64 byte_in_group;
5867         int factor;
5868
5869         /* block accounting for super block */
5870         spin_lock(&info->delalloc_root_lock);
5871         old_val = btrfs_super_bytes_used(info->super_copy);
5872         if (alloc)
5873                 old_val += num_bytes;
5874         else
5875                 old_val -= num_bytes;
5876         btrfs_set_super_bytes_used(info->super_copy, old_val);
5877         spin_unlock(&info->delalloc_root_lock);
5878
5879         while (total) {
5880                 cache = btrfs_lookup_block_group(info, bytenr);
5881                 if (!cache)
5882                         return -ENOENT;
5883                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
5884                                     BTRFS_BLOCK_GROUP_RAID1 |
5885                                     BTRFS_BLOCK_GROUP_RAID10))
5886                         factor = 2;
5887                 else
5888                         factor = 1;
5889                 /*
5890                  * If this block group has free space cache written out, we
5891                  * need to make sure to load it if we are removing space.  This
5892                  * is because we need the unpinning stage to actually add the
5893                  * space back to the block group, otherwise we will leak space.
5894                  */
5895                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
5896                         cache_block_group(cache, 1);
5897
5898                 byte_in_group = bytenr - cache->key.objectid;
5899                 WARN_ON(byte_in_group > cache->key.offset);
5900
5901                 spin_lock(&cache->space_info->lock);
5902                 spin_lock(&cache->lock);
5903
5904                 if (btrfs_test_opt(root, SPACE_CACHE) &&
5905                     cache->disk_cache_state < BTRFS_DC_CLEAR)
5906                         cache->disk_cache_state = BTRFS_DC_CLEAR;
5907
5908                 old_val = btrfs_block_group_used(&cache->item);
5909                 num_bytes = min(total, cache->key.offset - byte_in_group);
5910                 if (alloc) {
5911                         old_val += num_bytes;
5912                         btrfs_set_block_group_used(&cache->item, old_val);
5913                         cache->reserved -= num_bytes;
5914                         cache->space_info->bytes_reserved -= num_bytes;
5915                         cache->space_info->bytes_used += num_bytes;
5916                         cache->space_info->disk_used += num_bytes * factor;
5917                         spin_unlock(&cache->lock);
5918                         spin_unlock(&cache->space_info->lock);
5919                 } else {
5920                         old_val -= num_bytes;
5921                         btrfs_set_block_group_used(&cache->item, old_val);
5922                         cache->pinned += num_bytes;
5923                         cache->space_info->bytes_pinned += num_bytes;
5924                         cache->space_info->bytes_used -= num_bytes;
5925                         cache->space_info->disk_used -= num_bytes * factor;
5926                         spin_unlock(&cache->lock);
5927                         spin_unlock(&cache->space_info->lock);
5928
5929                         set_extent_dirty(info->pinned_extents,
5930                                          bytenr, bytenr + num_bytes - 1,
5931                                          GFP_NOFS | __GFP_NOFAIL);
5932                 }
5933
5934                 spin_lock(&trans->transaction->dirty_bgs_lock);
5935                 if (list_empty(&cache->dirty_list)) {
5936                         list_add_tail(&cache->dirty_list,
5937                                       &trans->transaction->dirty_bgs);
5938                                 trans->transaction->num_dirty_bgs++;
5939                         btrfs_get_block_group(cache);
5940                 }
5941                 spin_unlock(&trans->transaction->dirty_bgs_lock);
5942
5943                 /*
5944                  * No longer have used bytes in this block group, queue it for
5945                  * deletion. We do this after adding the block group to the
5946                  * dirty list to avoid races between cleaner kthread and space
5947                  * cache writeout.
5948                  */
5949                 if (!alloc && old_val == 0) {
5950                         spin_lock(&info->unused_bgs_lock);
5951                         if (list_empty(&cache->bg_list)) {
5952                                 btrfs_get_block_group(cache);
5953                                 list_add_tail(&cache->bg_list,
5954                                               &info->unused_bgs);
5955                         }
5956                         spin_unlock(&info->unused_bgs_lock);
5957                 }
5958
5959                 btrfs_put_block_group(cache);
5960                 total -= num_bytes;
5961                 bytenr += num_bytes;
5962         }
5963         return 0;
5964 }
5965
5966 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
5967 {
5968         struct btrfs_block_group_cache *cache;
5969         u64 bytenr;
5970
5971         spin_lock(&root->fs_info->block_group_cache_lock);
5972         bytenr = root->fs_info->first_logical_byte;
5973         spin_unlock(&root->fs_info->block_group_cache_lock);
5974
5975         if (bytenr < (u64)-1)
5976                 return bytenr;
5977
5978         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
5979         if (!cache)
5980                 return 0;
5981
5982         bytenr = cache->key.objectid;
5983         btrfs_put_block_group(cache);
5984
5985         return bytenr;
5986 }
5987
5988 static int pin_down_extent(struct btrfs_root *root,
5989                            struct btrfs_block_group_cache *cache,
5990                            u64 bytenr, u64 num_bytes, int reserved)
5991 {
5992         spin_lock(&cache->space_info->lock);
5993         spin_lock(&cache->lock);
5994         cache->pinned += num_bytes;
5995         cache->space_info->bytes_pinned += num_bytes;
5996         if (reserved) {
5997                 cache->reserved -= num_bytes;
5998                 cache->space_info->bytes_reserved -= num_bytes;
5999         }
6000         spin_unlock(&cache->lock);
6001         spin_unlock(&cache->space_info->lock);
6002
6003         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
6004                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6005         if (reserved)
6006                 trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
6007         return 0;
6008 }
6009
6010 /*
6011  * this function must be called within transaction
6012  */
6013 int btrfs_pin_extent(struct btrfs_root *root,
6014                      u64 bytenr, u64 num_bytes, int reserved)
6015 {
6016         struct btrfs_block_group_cache *cache;
6017
6018         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
6019         BUG_ON(!cache); /* Logic error */
6020
6021         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
6022
6023         btrfs_put_block_group(cache);
6024         return 0;
6025 }
6026
6027 /*
6028  * this function must be called within transaction
6029  */
6030 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
6031                                     u64 bytenr, u64 num_bytes)
6032 {
6033         struct btrfs_block_group_cache *cache;
6034         int ret;
6035
6036         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
6037         if (!cache)
6038                 return -EINVAL;
6039
6040         /*
6041          * pull in the free space cache (if any) so that our pin
6042          * removes the free space from the cache.  We have load_only set
6043          * to one because the slow code to read in the free extents does check
6044          * the pinned extents.
6045          */
6046         cache_block_group(cache, 1);
6047
6048         pin_down_extent(root, cache, bytenr, num_bytes, 0);
6049
6050         /* remove us from the free space cache (if we're there at all) */
6051         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6052         btrfs_put_block_group(cache);
6053         return ret;
6054 }
6055
6056 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
6057 {
6058         int ret;
6059         struct btrfs_block_group_cache *block_group;
6060         struct btrfs_caching_control *caching_ctl;
6061
6062         block_group = btrfs_lookup_block_group(root->fs_info, start);
6063         if (!block_group)
6064                 return -EINVAL;
6065
6066         cache_block_group(block_group, 0);
6067         caching_ctl = get_caching_control(block_group);
6068
6069         if (!caching_ctl) {
6070                 /* Logic error */
6071                 BUG_ON(!block_group_cache_done(block_group));
6072                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6073         } else {
6074                 mutex_lock(&caching_ctl->mutex);
6075
6076                 if (start >= caching_ctl->progress) {
6077                         ret = add_excluded_extent(root, start, num_bytes);
6078                 } else if (start + num_bytes <= caching_ctl->progress) {
6079                         ret = btrfs_remove_free_space(block_group,
6080                                                       start, num_bytes);
6081                 } else {
6082                         num_bytes = caching_ctl->progress - start;
6083                         ret = btrfs_remove_free_space(block_group,
6084                                                       start, num_bytes);
6085                         if (ret)
6086                                 goto out_lock;
6087
6088                         num_bytes = (start + num_bytes) -
6089                                 caching_ctl->progress;
6090                         start = caching_ctl->progress;
6091                         ret = add_excluded_extent(root, start, num_bytes);
6092                 }
6093 out_lock:
6094                 mutex_unlock(&caching_ctl->mutex);
6095                 put_caching_control(caching_ctl);
6096         }
6097         btrfs_put_block_group(block_group);
6098         return ret;
6099 }
6100
6101 int btrfs_exclude_logged_extents(struct btrfs_root *log,
6102                                  struct extent_buffer *eb)
6103 {
6104         struct btrfs_file_extent_item *item;
6105         struct btrfs_key key;
6106         int found_type;
6107         int i;
6108
6109         if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
6110                 return 0;
6111
6112         for (i = 0; i < btrfs_header_nritems(eb); i++) {
6113                 btrfs_item_key_to_cpu(eb, &key, i);
6114                 if (key.type != BTRFS_EXTENT_DATA_KEY)
6115                         continue;
6116                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6117                 found_type = btrfs_file_extent_type(eb, item);
6118                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
6119                         continue;
6120                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6121                         continue;
6122                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6123                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6124                 __exclude_logged_extent(log, key.objectid, key.offset);
6125         }
6126
6127         return 0;
6128 }
6129
6130 /**
6131  * btrfs_update_reserved_bytes - update the block_group and space info counters
6132  * @cache:      The cache we are manipulating
6133  * @num_bytes:  The number of bytes in question
6134  * @reserve:    One of the reservation enums
6135  * @delalloc:   The blocks are allocated for the delalloc write
6136  *
6137  * This is called by the allocator when it reserves space, or by somebody who is
6138  * freeing space that was never actually used on disk.  For example if you
6139  * reserve some space for a new leaf in transaction A and before transaction A
6140  * commits you free that leaf, you call this with reserve set to 0 in order to
6141  * clear the reservation.
6142  *
6143  * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
6144  * ENOSPC accounting.  For data we handle the reservation through clearing the
6145  * delalloc bits in the io_tree.  We have to do this since we could end up
6146  * allocating less disk space for the amount of data we have reserved in the
6147  * case of compression.
6148  *
6149  * If this is a reservation and the block group has become read only we cannot
6150  * make the reservation and return -EAGAIN, otherwise this function always
6151  * succeeds.
6152  */
6153 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
6154                                        u64 num_bytes, int reserve, int delalloc)
6155 {
6156         struct btrfs_space_info *space_info = cache->space_info;
6157         int ret = 0;
6158
6159         spin_lock(&space_info->lock);
6160         spin_lock(&cache->lock);
6161         if (reserve != RESERVE_FREE) {
6162                 if (cache->ro) {
6163                         ret = -EAGAIN;
6164                 } else {
6165                         cache->reserved += num_bytes;
6166                         space_info->bytes_reserved += num_bytes;
6167                         if (reserve == RESERVE_ALLOC) {
6168                                 trace_btrfs_space_reservation(cache->fs_info,
6169                                                 "space_info", space_info->flags,
6170                                                 num_bytes, 0);
6171                                 space_info->bytes_may_use -= num_bytes;
6172                         }
6173
6174                         if (delalloc)
6175                                 cache->delalloc_bytes += num_bytes;
6176                 }
6177         } else {
6178                 if (cache->ro)
6179                         space_info->bytes_readonly += num_bytes;
6180                 cache->reserved -= num_bytes;
6181                 space_info->bytes_reserved -= num_bytes;
6182
6183                 if (delalloc)
6184                         cache->delalloc_bytes -= num_bytes;
6185         }
6186         spin_unlock(&cache->lock);
6187         spin_unlock(&space_info->lock);
6188         return ret;
6189 }
6190
6191 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
6192                                 struct btrfs_root *root)
6193 {
6194         struct btrfs_fs_info *fs_info = root->fs_info;
6195         struct btrfs_caching_control *next;
6196         struct btrfs_caching_control *caching_ctl;
6197         struct btrfs_block_group_cache *cache;
6198
6199         down_write(&fs_info->commit_root_sem);
6200
6201         list_for_each_entry_safe(caching_ctl, next,
6202                                  &fs_info->caching_block_groups, list) {
6203                 cache = caching_ctl->block_group;
6204                 if (block_group_cache_done(cache)) {
6205                         cache->last_byte_to_unpin = (u64)-1;
6206                         list_del_init(&caching_ctl->list);
6207                         put_caching_control(caching_ctl);
6208                 } else {
6209                         cache->last_byte_to_unpin = caching_ctl->progress;
6210                 }
6211         }
6212
6213         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6214                 fs_info->pinned_extents = &fs_info->freed_extents[1];
6215         else
6216                 fs_info->pinned_extents = &fs_info->freed_extents[0];
6217
6218         up_write(&fs_info->commit_root_sem);
6219
6220         update_global_block_rsv(fs_info);
6221 }
6222
6223 /*
6224  * Returns the free cluster for the given space info and sets empty_cluster to
6225  * what it should be based on the mount options.
6226  */
6227 static struct btrfs_free_cluster *
6228 fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
6229                    u64 *empty_cluster)
6230 {
6231         struct btrfs_free_cluster *ret = NULL;
6232         bool ssd = btrfs_test_opt(root, SSD);
6233
6234         *empty_cluster = 0;
6235         if (btrfs_mixed_space_info(space_info))
6236                 return ret;
6237
6238         if (ssd)
6239                 *empty_cluster = 2 * 1024 * 1024;
6240         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6241                 ret = &root->fs_info->meta_alloc_cluster;
6242                 if (!ssd)
6243                         *empty_cluster = 64 * 1024;
6244         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
6245                 ret = &root->fs_info->data_alloc_cluster;
6246         }
6247
6248         return ret;
6249 }
6250
6251 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
6252                               const bool return_free_space)
6253 {
6254         struct btrfs_fs_info *fs_info = root->fs_info;
6255         struct btrfs_block_group_cache *cache = NULL;
6256         struct btrfs_space_info *space_info;
6257         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6258         struct btrfs_free_cluster *cluster = NULL;
6259         u64 len;
6260         u64 total_unpinned = 0;
6261         u64 empty_cluster = 0;
6262         bool readonly;
6263
6264         while (start <= end) {
6265                 readonly = false;
6266                 if (!cache ||
6267                     start >= cache->key.objectid + cache->key.offset) {
6268                         if (cache)
6269                                 btrfs_put_block_group(cache);
6270                         total_unpinned = 0;
6271                         cache = btrfs_lookup_block_group(fs_info, start);
6272                         BUG_ON(!cache); /* Logic error */
6273
6274                         cluster = fetch_cluster_info(root,
6275                                                      cache->space_info,
6276                                                      &empty_cluster);
6277                         empty_cluster <<= 1;
6278                 }
6279
6280                 len = cache->key.objectid + cache->key.offset - start;
6281                 len = min(len, end + 1 - start);
6282
6283                 if (start < cache->last_byte_to_unpin) {
6284                         len = min(len, cache->last_byte_to_unpin - start);
6285                         if (return_free_space)
6286                                 btrfs_add_free_space(cache, start, len);
6287                 }
6288
6289                 start += len;
6290                 total_unpinned += len;
6291                 space_info = cache->space_info;
6292
6293                 /*
6294                  * If this space cluster has been marked as fragmented and we've
6295                  * unpinned enough in this block group to potentially allow a
6296                  * cluster to be created inside of it go ahead and clear the
6297                  * fragmented check.
6298                  */
6299                 if (cluster && cluster->fragmented &&
6300                     total_unpinned > empty_cluster) {
6301                         spin_lock(&cluster->lock);
6302                         cluster->fragmented = 0;
6303                         spin_unlock(&cluster->lock);
6304                 }
6305
6306                 spin_lock(&space_info->lock);
6307                 spin_lock(&cache->lock);
6308                 cache->pinned -= len;
6309                 space_info->bytes_pinned -= len;
6310                 space_info->max_extent_size = 0;
6311                 percpu_counter_add(&space_info->total_bytes_pinned, -len);
6312                 if (cache->ro) {
6313                         space_info->bytes_readonly += len;
6314                         readonly = true;
6315                 }
6316                 spin_unlock(&cache->lock);
6317                 if (!readonly && global_rsv->space_info == space_info) {
6318                         spin_lock(&global_rsv->lock);
6319                         if (!global_rsv->full) {
6320                                 len = min(len, global_rsv->size -
6321                                           global_rsv->reserved);
6322                                 global_rsv->reserved += len;
6323                                 space_info->bytes_may_use += len;
6324                                 if (global_rsv->reserved >= global_rsv->size)
6325                                         global_rsv->full = 1;
6326                         }
6327                         spin_unlock(&global_rsv->lock);
6328                 }
6329                 spin_unlock(&space_info->lock);
6330         }
6331
6332         if (cache)
6333                 btrfs_put_block_group(cache);
6334         return 0;
6335 }
6336
6337 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
6338                                struct btrfs_root *root)
6339 {
6340         struct btrfs_fs_info *fs_info = root->fs_info;
6341         struct btrfs_block_group_cache *block_group, *tmp;
6342         struct list_head *deleted_bgs;
6343         struct extent_io_tree *unpin;
6344         u64 start;
6345         u64 end;
6346         int ret;
6347
6348         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6349                 unpin = &fs_info->freed_extents[1];
6350         else
6351                 unpin = &fs_info->freed_extents[0];
6352
6353         while (!trans->aborted) {
6354                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
6355                 ret = find_first_extent_bit(unpin, 0, &start, &end,
6356                                             EXTENT_DIRTY, NULL);
6357                 if (ret) {
6358                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6359                         break;
6360                 }
6361
6362                 if (btrfs_test_opt(root, DISCARD))
6363                         ret = btrfs_discard_extent(root, start,
6364                                                    end + 1 - start, NULL);
6365
6366                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
6367                 unpin_extent_range(root, start, end, true);
6368                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6369                 cond_resched();
6370         }
6371
6372         /*
6373          * Transaction is finished.  We don't need the lock anymore.  We
6374          * do need to clean up the block groups in case of a transaction
6375          * abort.
6376          */
6377         deleted_bgs = &trans->transaction->deleted_bgs;
6378         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6379                 u64 trimmed = 0;
6380
6381                 ret = -EROFS;
6382                 if (!trans->aborted)
6383                         ret = btrfs_discard_extent(root,
6384                                                    block_group->key.objectid,
6385                                                    block_group->key.offset,
6386                                                    &trimmed);
6387
6388                 list_del_init(&block_group->bg_list);
6389                 btrfs_put_block_group_trimming(block_group);
6390                 btrfs_put_block_group(block_group);
6391
6392                 if (ret) {
6393                         const char *errstr = btrfs_decode_error(ret);
6394                         btrfs_warn(fs_info,
6395                                    "Discard failed while removing blockgroup: errno=%d %s\n",
6396                                    ret, errstr);
6397                 }
6398         }
6399
6400         return 0;
6401 }
6402
6403 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
6404                              u64 owner, u64 root_objectid)
6405 {
6406         struct btrfs_space_info *space_info;
6407         u64 flags;
6408
6409         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6410                 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
6411                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
6412                 else
6413                         flags = BTRFS_BLOCK_GROUP_METADATA;
6414         } else {
6415                 flags = BTRFS_BLOCK_GROUP_DATA;
6416         }
6417
6418         space_info = __find_space_info(fs_info, flags);
6419         BUG_ON(!space_info); /* Logic bug */
6420         percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
6421 }
6422
6423
6424 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6425                                 struct btrfs_root *root,
6426                                 struct btrfs_delayed_ref_node *node, u64 parent,
6427                                 u64 root_objectid, u64 owner_objectid,
6428                                 u64 owner_offset, int refs_to_drop,
6429                                 struct btrfs_delayed_extent_op *extent_op)
6430 {
6431         struct btrfs_key key;
6432         struct btrfs_path *path;
6433         struct btrfs_fs_info *info = root->fs_info;
6434         struct btrfs_root *extent_root = info->extent_root;
6435         struct extent_buffer *leaf;
6436         struct btrfs_extent_item *ei;
6437         struct btrfs_extent_inline_ref *iref;
6438         int ret;
6439         int is_data;
6440         int extent_slot = 0;
6441         int found_extent = 0;
6442         int num_to_del = 1;
6443         u32 item_size;
6444         u64 refs;
6445         u64 bytenr = node->bytenr;
6446         u64 num_bytes = node->num_bytes;
6447         int last_ref = 0;
6448         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6449                                                  SKINNY_METADATA);
6450
6451         path = btrfs_alloc_path();
6452         if (!path)
6453                 return -ENOMEM;
6454
6455         path->reada = 1;
6456         path->leave_spinning = 1;
6457
6458         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6459         BUG_ON(!is_data && refs_to_drop != 1);
6460
6461         if (is_data)
6462                 skinny_metadata = 0;
6463
6464         ret = lookup_extent_backref(trans, extent_root, path, &iref,
6465                                     bytenr, num_bytes, parent,
6466                                     root_objectid, owner_objectid,
6467                                     owner_offset);
6468         if (ret == 0) {
6469                 extent_slot = path->slots[0];
6470                 while (extent_slot >= 0) {
6471                         btrfs_item_key_to_cpu(path->nodes[0], &key,
6472                                               extent_slot);
6473                         if (key.objectid != bytenr)
6474                                 break;
6475                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6476                             key.offset == num_bytes) {
6477                                 found_extent = 1;
6478                                 break;
6479                         }
6480                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
6481                             key.offset == owner_objectid) {
6482                                 found_extent = 1;
6483                                 break;
6484                         }
6485                         if (path->slots[0] - extent_slot > 5)
6486                                 break;
6487                         extent_slot--;
6488                 }
6489 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6490                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
6491                 if (found_extent && item_size < sizeof(*ei))
6492                         found_extent = 0;
6493 #endif
6494                 if (!found_extent) {
6495                         BUG_ON(iref);
6496                         ret = remove_extent_backref(trans, extent_root, path,
6497                                                     NULL, refs_to_drop,
6498                                                     is_data, &last_ref);
6499                         if (ret) {
6500                                 btrfs_abort_transaction(trans, extent_root, ret);
6501                                 goto out;
6502                         }
6503                         btrfs_release_path(path);
6504                         path->leave_spinning = 1;
6505
6506                         key.objectid = bytenr;
6507                         key.type = BTRFS_EXTENT_ITEM_KEY;
6508                         key.offset = num_bytes;
6509
6510                         if (!is_data && skinny_metadata) {
6511                                 key.type = BTRFS_METADATA_ITEM_KEY;
6512                                 key.offset = owner_objectid;
6513                         }
6514
6515                         ret = btrfs_search_slot(trans, extent_root,
6516                                                 &key, path, -1, 1);
6517                         if (ret > 0 && skinny_metadata && path->slots[0]) {
6518                                 /*
6519                                  * Couldn't find our skinny metadata item,
6520                                  * see if we have ye olde extent item.
6521                                  */
6522                                 path->slots[0]--;
6523                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
6524                                                       path->slots[0]);
6525                                 if (key.objectid == bytenr &&
6526                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
6527                                     key.offset == num_bytes)
6528                                         ret = 0;
6529                         }
6530
6531                         if (ret > 0 && skinny_metadata) {
6532                                 skinny_metadata = false;
6533                                 key.objectid = bytenr;
6534                                 key.type = BTRFS_EXTENT_ITEM_KEY;
6535                                 key.offset = num_bytes;
6536                                 btrfs_release_path(path);
6537                                 ret = btrfs_search_slot(trans, extent_root,
6538                                                         &key, path, -1, 1);
6539                         }
6540
6541                         if (ret) {
6542                                 btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6543                                         ret, bytenr);
6544                                 if (ret > 0)
6545                                         btrfs_print_leaf(extent_root,
6546                                                          path->nodes[0]);
6547                         }
6548                         if (ret < 0) {
6549                                 btrfs_abort_transaction(trans, extent_root, ret);
6550                                 goto out;
6551                         }
6552                         extent_slot = path->slots[0];
6553                 }
6554         } else if (WARN_ON(ret == -ENOENT)) {
6555                 btrfs_print_leaf(extent_root, path->nodes[0]);
6556                 btrfs_err(info,
6557                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
6558                         bytenr, parent, root_objectid, owner_objectid,
6559                         owner_offset);
6560                 btrfs_abort_transaction(trans, extent_root, ret);
6561                 goto out;
6562         } else {
6563                 btrfs_abort_transaction(trans, extent_root, ret);
6564                 goto out;
6565         }
6566
6567         leaf = path->nodes[0];
6568         item_size = btrfs_item_size_nr(leaf, extent_slot);
6569 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6570         if (item_size < sizeof(*ei)) {
6571                 BUG_ON(found_extent || extent_slot != path->slots[0]);
6572                 ret = convert_extent_item_v0(trans, extent_root, path,
6573                                              owner_objectid, 0);
6574                 if (ret < 0) {
6575                         btrfs_abort_transaction(trans, extent_root, ret);
6576                         goto out;
6577                 }
6578
6579                 btrfs_release_path(path);
6580                 path->leave_spinning = 1;
6581
6582                 key.objectid = bytenr;
6583                 key.type = BTRFS_EXTENT_ITEM_KEY;
6584                 key.offset = num_bytes;
6585
6586                 ret = btrfs_search_slot(trans, extent_root, &key, path,
6587                                         -1, 1);
6588                 if (ret) {
6589                         btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6590                                 ret, bytenr);
6591                         btrfs_print_leaf(extent_root, path->nodes[0]);
6592                 }
6593                 if (ret < 0) {
6594                         btrfs_abort_transaction(trans, extent_root, ret);
6595                         goto out;
6596                 }
6597
6598                 extent_slot = path->slots[0];
6599                 leaf = path->nodes[0];
6600                 item_size = btrfs_item_size_nr(leaf, extent_slot);
6601         }
6602 #endif
6603         BUG_ON(item_size < sizeof(*ei));
6604         ei = btrfs_item_ptr(leaf, extent_slot,
6605                             struct btrfs_extent_item);
6606         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
6607             key.type == BTRFS_EXTENT_ITEM_KEY) {
6608                 struct btrfs_tree_block_info *bi;
6609                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
6610                 bi = (struct btrfs_tree_block_info *)(ei + 1);
6611                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
6612         }
6613
6614         refs = btrfs_extent_refs(leaf, ei);
6615         if (refs < refs_to_drop) {
6616                 btrfs_err(info, "trying to drop %d refs but we only have %Lu "
6617                           "for bytenr %Lu", refs_to_drop, refs, bytenr);
6618                 ret = -EINVAL;
6619                 btrfs_abort_transaction(trans, extent_root, ret);
6620                 goto out;
6621         }
6622         refs -= refs_to_drop;
6623
6624         if (refs > 0) {
6625                 if (extent_op)
6626                         __run_delayed_extent_op(extent_op, leaf, ei);
6627                 /*
6628                  * In the case of inline back ref, reference count will
6629                  * be updated by remove_extent_backref
6630                  */
6631                 if (iref) {
6632                         BUG_ON(!found_extent);
6633                 } else {
6634                         btrfs_set_extent_refs(leaf, ei, refs);
6635                         btrfs_mark_buffer_dirty(leaf);
6636                 }
6637                 if (found_extent) {
6638                         ret = remove_extent_backref(trans, extent_root, path,
6639                                                     iref, refs_to_drop,
6640                                                     is_data, &last_ref);
6641                         if (ret) {
6642                                 btrfs_abort_transaction(trans, extent_root, ret);
6643                                 goto out;
6644                         }
6645                 }
6646                 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
6647                                  root_objectid);
6648         } else {
6649                 if (found_extent) {
6650                         BUG_ON(is_data && refs_to_drop !=
6651                                extent_data_ref_count(path, iref));
6652                         if (iref) {
6653                                 BUG_ON(path->slots[0] != extent_slot);
6654                         } else {
6655                                 BUG_ON(path->slots[0] != extent_slot + 1);
6656                                 path->slots[0] = extent_slot;
6657                                 num_to_del = 2;
6658                         }
6659                 }
6660
6661                 last_ref = 1;
6662                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
6663                                       num_to_del);
6664                 if (ret) {
6665                         btrfs_abort_transaction(trans, extent_root, ret);
6666                         goto out;
6667                 }
6668                 btrfs_release_path(path);
6669
6670                 if (is_data) {
6671                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
6672                         if (ret) {
6673                                 btrfs_abort_transaction(trans, extent_root, ret);
6674                                 goto out;
6675                         }
6676                 }
6677
6678                 ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
6679                                              num_bytes);
6680                 if (ret) {
6681                         btrfs_abort_transaction(trans, extent_root, ret);
6682                         goto out;
6683                 }
6684
6685                 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
6686                 if (ret) {
6687                         btrfs_abort_transaction(trans, extent_root, ret);
6688                         goto out;
6689                 }
6690         }
6691         btrfs_release_path(path);
6692
6693 out:
6694         btrfs_free_path(path);
6695         return ret;
6696 }
6697
/*
 * when we free a block, it is possible (and likely) that we free the last
 * delayed ref for that extent as well.  This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes it from the tree.
 */
6704 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
6705                                       struct btrfs_root *root, u64 bytenr)
6706 {
6707         struct btrfs_delayed_ref_head *head;
6708         struct btrfs_delayed_ref_root *delayed_refs;
6709         int ret = 0;
6710
6711         delayed_refs = &trans->transaction->delayed_refs;
6712         spin_lock(&delayed_refs->lock);
6713         head = btrfs_find_delayed_ref_head(trans, bytenr);
6714         if (!head)
6715                 goto out_delayed_unlock;
6716
6717         spin_lock(&head->lock);
6718         if (!list_empty(&head->ref_list))
6719                 goto out;
6720
6721         if (head->extent_op) {
6722                 if (!head->must_insert_reserved)
6723                         goto out;
6724                 btrfs_free_delayed_extent_op(head->extent_op);
6725                 head->extent_op = NULL;
6726         }
6727
6728         /*
6729          * waiting for the lock here would deadlock.  If someone else has it
6730          * locked they are already in the process of dropping it anyway
6731          */
6732         if (!mutex_trylock(&head->mutex))
6733                 goto out;
6734
6735         /*
6736          * at this point we have a head with no other entries.  Go
6737          * ahead and process it.
6738          */
6739         head->node.in_tree = 0;
6740         rb_erase(&head->href_node, &delayed_refs->href_root);
6741
6742         atomic_dec(&delayed_refs->num_entries);
6743
6744         /*
6745          * we don't take a ref on the node because we're removing it from the
6746          * tree, so we just steal the ref the tree was holding.
6747          */
6748         delayed_refs->num_heads--;
6749         if (head->processing == 0)
6750                 delayed_refs->num_heads_ready--;
6751         head->processing = 0;
6752         spin_unlock(&head->lock);
6753         spin_unlock(&delayed_refs->lock);
6754
6755         BUG_ON(head->extent_op);
6756         if (head->must_insert_reserved)
6757                 ret = 1;
6758
6759         mutex_unlock(&head->mutex);
6760         btrfs_put_delayed_ref(&head->node);
6761         return ret;
6762 out:
6763         spin_unlock(&head->lock);
6764
6765 out_delayed_unlock:
6766         spin_unlock(&delayed_refs->lock);
6767         return 0;
6768 }
6769
6770 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6771                            struct btrfs_root *root,
6772                            struct extent_buffer *buf,
6773                            u64 parent, int last_ref)
6774 {
             /*
              * Free tree block 'buf' on behalf of 'root'.
              *
              * For every root except the tree-log root a BTRFS_DROP_DELAYED_REF
              * tree ref is queued for the block; failing to queue it hits BUG_ON.
              * When 'last_ref' is zero nothing else is done.
              *
              * Otherwise, if the block's header generation equals trans->transid
              * (and, for non-log roots, check_ref_cleanup() returned non-zero):
              *  - a block flagged BTRFS_HEADER_FLAG_WRITTEN is pinned with
              *    pin_down_extent();
              *  - any other block is given back with btrfs_add_free_space() and
              *    btrfs_update_reserved_bytes(..., RESERVE_FREE, ...), and 'pin'
              *    is cleared.
              *
              * Every path that reaches 'out' with 'pin' still set calls
              * add_pinned_bytes(); EXTENT_BUFFER_CORRUPT is cleared last.
              */
6775         int pin = 1;
6776         int ret;
6777
6778         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6779                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6780                                         buf->start, buf->len,
6781                                         parent, root->root_key.objectid,
6782                                         btrfs_header_level(buf),
6783                                         BTRFS_DROP_DELAYED_REF, NULL);
6784                 BUG_ON(ret); /* -ENOMEM */
6785         }
6786
6787         if (!last_ref)
6788                 return; /* skips the accounting and flag clearing at 'out' */
6789
6790         if (btrfs_header_generation(buf) == trans->transid) {
6791                 struct btrfs_block_group_cache *cache;
6792
6793                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6794                         ret = check_ref_cleanup(trans, root, buf->start);
6795                         if (!ret)
6796                                 goto out;
6797                 }
6798
                     /*
                      * NOTE(review): the lookup result is used without a NULL
                      * check; presumably a block group always covers buf->start
                      * here -- confirm.
                      */
6799                 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
6800
6801                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
6802                         pin_down_extent(root, cache, buf->start, buf->len, 1);
6803                         btrfs_put_block_group(cache);
6804                         goto out;
6805                 }
6806
6807                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
6808
6809                 btrfs_add_free_space(cache, buf->start, buf->len);
6810                 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
6811                 btrfs_put_block_group(cache);
6812                 trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
6813                 pin = 0;
6814         }
6815 out:
6816         if (pin)
6817                 add_pinned_bytes(root->fs_info, buf->len,
6818                                  btrfs_header_level(buf),
6819                                  root->root_key.objectid);
6820
6821         /*
6822          * Deleting the buffer, clear the corrupt flag since it doesn't matter
6823          * anymore.
6824          */
6825         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
6826 }
6827
6828 /*
      * Entry point for freeing the extent described by bytenr/num_bytes.
      *
      * A dummy root (btrfs_test_is_dummy_root()) is a no-op that returns 0.
      * Otherwise add_pinned_bytes() is called first, followed by one of:
      *  - root_objectid == BTRFS_TREE_LOG_OBJECTID: the extent is pinned with
      *    btrfs_pin_extent() (its return value is not checked) and 0 is
      *    returned;
      *  - owner < BTRFS_FIRST_FREE_OBJECTID: a BTRFS_DROP_DELAYED_REF tree ref
      *    is queued, with owner cast to int;
      *  - anything else: a BTRFS_DROP_DELAYED_REF data ref is queued using
      *    owner and offset.
      * In the last two cases the helper's return value is passed through.
      *
      * Can return -ENOMEM
      */
6829 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6830                       u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
6831                       u64 owner, u64 offset)
6832 {
6833         int ret;
6834         struct btrfs_fs_info *fs_info = root->fs_info;
6835
6836         if (btrfs_test_is_dummy_root(root))
6837                 return 0;
6838
6839         add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
6840
6841         /*
6842          * tree log blocks never actually go into the extent allocation
6843          * tree, just update pinning info and exit early.
6844          */
6845         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
6846                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
6847                 /* unlocks the pinned mutex */
                     /*
                      * NOTE(review): no mutex is taken or released in this
                      * function; the remark above may be stale -- confirm.
                      */
6848                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
6849                 ret = 0;
6850         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6851                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
6852                                         num_bytes,
6853                                         parent, root_objectid, (int)owner,
6854                                         BTRFS_DROP_DELAYED_REF, NULL);
6855         } else {
6856                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
6857                                                 num_bytes,
6858                                                 parent, root_objectid, owner,
6859                                                 offset, 0,
6860                                                 BTRFS_DROP_DELAYED_REF, NULL);
6861         }
6862         return ret;
6863 }
6864
6865 /*
6866  * when we wait for progress in the block group caching, it's because
6867  * our allocation attempt failed at least once.  So, we must sleep
6868  * and let some progress happen before we try again.
6869  *
6870  * This function will sleep at least once waiting for new free space to
6871  * show up, and then it will check the block group free space numbers
6872  * for our min num_bytes.  Another option is to have it go ahead
6873  * and look in the rbtree for a free extent of a given size, but this
6874  * is a good start.
6875  *
      * Concretely: if get_caching_control() returns NULL for the block group
      * this returns right away.  Otherwise it waits on caching_ctl->wait until
      * either block_group_cache_done(cache) is true or the block group's
      * free_space counter has reached num_bytes, and then calls
      * put_caching_control().
      *
      * NOTE(review): the "sleep at least once" wording above may be stronger
      * than what wait_event() does when its condition already holds -- confirm
      * against the wait_event() documentation.
      *
6876  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
6877  * any of the information in this block group.
6878  */
6879 static noinline void
6880 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
6881                                 u64 num_bytes)
6882 {
6883         struct btrfs_caching_control *caching_ctl;
6884
6885         caching_ctl = get_caching_control(cache);
6886         if (!caching_ctl)
6887                 return;
6888
6889         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
6890                    (cache->free_space_ctl->free_space >= num_bytes));
6891
6892         put_caching_control(caching_ctl);
6893 }
6894
6895 static noinline int
6896 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
6897 {
             /*
              * Wait until block_group_cache_done(cache) is true.
              *
              * Returns -EIO if cache->cached is BTRFS_CACHE_ERROR, 0 otherwise.
              * When get_caching_control() returns NULL there is nothing to wait
              * on and only the error state is reported.
              */
6898         struct btrfs_caching_control *caching_ctl;
6899         int ret = 0;
6900
6901         caching_ctl = get_caching_control(cache);
6902         if (!caching_ctl)
6903                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
6904
6905         wait_event(caching_ctl->wait, block_group_cache_done(cache));
6906         if (cache->cached == BTRFS_CACHE_ERROR)
6907                 ret = -EIO;
6908         put_caching_control(caching_ctl);
6909         return ret;
6910 }
6911
6912 int __get_raid_index(u64 flags)
6913 {
             /*
              * Map the block group profile bits in 'flags' to a BTRFS_RAID_*
              * index.  The bits are tested in the order RAID10, RAID1, DUP,
              * RAID0, RAID5, RAID6 and the first one that is set decides the
              * result; if none of them is set the result is BTRFS_RAID_SINGLE.
              */
6914         if (flags & BTRFS_BLOCK_GROUP_RAID10)
6915                 return BTRFS_RAID_RAID10;
6916         else if (flags & BTRFS_BLOCK_GROUP_RAID1)
6917                 return BTRFS_RAID_RAID1;
6918         else if (flags & BTRFS_BLOCK_GROUP_DUP)
6919                 return BTRFS_RAID_DUP;
6920         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
6921                 return BTRFS_RAID_RAID0;
6922         else if (flags & BTRFS_BLOCK_GROUP_RAID5)
6923                 return BTRFS_RAID_RAID5;
6924         else if (flags & BTRFS_BLOCK_GROUP_RAID6)
6925                 return BTRFS_RAID_RAID6;
6926
6927         return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
6928 }
6929
6930 int get_block_group_index(struct btrfs_block_group_cache *cache)
6931 {
             /* same mapping as __get_raid_index(), applied to cache->flags */
6932         return __get_raid_index(cache->flags);
6933 }
6934
6935 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
             /* printable names, indexed by the BTRFS_RAID_* values */
6936         [BTRFS_RAID_RAID10]     = "raid10",
6937         [BTRFS_RAID_RAID1]      = "raid1",
6938         [BTRFS_RAID_DUP]        = "dup",
6939         [BTRFS_RAID_RAID0]      = "raid0",
6940         [BTRFS_RAID_SINGLE]     = "single",
6941         [BTRFS_RAID_RAID5]      = "raid5",
6942         [BTRFS_RAID_RAID6]      = "raid6",
6943 };
6944
6945 static const char *get_raid_name(enum btrfs_raid_types type)
6946 {
             /*
              * Look up the name of 'type' in btrfs_raid_type_names.
              * Returns NULL when 'type' is BTRFS_NR_RAID_TYPES or larger.
              */
6947         if (type >= BTRFS_NR_RAID_TYPES)
6948                 return NULL;
6949
6950         return btrfs_raid_type_names[type];
6951 }
6952
6953 enum btrfs_loop_type {
             /*
              * Stages that find_free_extent() steps its 'loop' variable
              * through, in increasing order.
              */
             /* search partially cached block groups, kicking caching kthreads */
6954         LOOP_CACHING_NOWAIT = 0,
             /* search everything, and wait if the block group is caching */
6955         LOOP_CACHING_WAIT = 1,
             /* force a chunk allocation and try again */
6956         LOOP_ALLOC_CHUNK = 2,
             /* set empty_size and empty_cluster to 0 and try again */
6957         LOOP_NO_EMPTY_SIZE = 3,
6958 };
6959
6960 static inline void
6961 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
6962                        int delalloc)
6963 {
             /*
              * Take cache->data_rwsem for read when 'delalloc' is non-zero;
              * otherwise do nothing.  No reference is taken here.
              */
6964         if (delalloc)
6965                 down_read(&cache->data_rwsem);
6966 }
6967
6968 static inline void
6969 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
6970                        int delalloc)
6971 {
             /*
              * Call btrfs_get_block_group() on 'cache' and, when 'delalloc' is
              * non-zero, also take cache->data_rwsem for read.
              * btrfs_release_block_group() undoes both steps.
              */
6972         btrfs_get_block_group(cache);
6973         if (delalloc)
6974                 down_read(&cache->data_rwsem);
6975 }
6976
6977 static struct btrfs_block_group_cache *
6978 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
6979                    struct btrfs_free_cluster *cluster,
6980                    int delalloc)
6981 {
             /*
              * Return the block group 'cluster' currently points at, or NULL if
              * it has none.
              *
              * cluster->refill_lock is taken here and is still held on every
              * return, including the NULL return; the caller has to drop it.
              *
              * If the cluster's block group is 'block_group' itself it is
              * returned as is, without calling btrfs_get_block_group() and
              * without touching its data_rwsem.  Any other block group is
              * returned after btrfs_get_block_group() and, when 'delalloc' is
              * non-zero, with its data_rwsem held for read.
              *
              * The rwsem is first tried without blocking.  If that fails,
              * refill_lock is dropped, down_read() is allowed to block, and the
              * function starts over with 'locked' set so it can check that the
              * cluster still points at the same block group; if it does not,
              * the rwsem and the block group are released and the lookup is
              * redone.
              */
6982         struct btrfs_block_group_cache *used_bg;
6983         bool locked = false;
6984 again:
6985         spin_lock(&cluster->refill_lock);
6986         if (locked) {
                     /* used_bg was assigned on the previous pass */
6987                 if (used_bg == cluster->block_group)
6988                         return used_bg;
6989
6990                 up_read(&used_bg->data_rwsem);
6991                 btrfs_put_block_group(used_bg);
6992         }
6993
6994         used_bg = cluster->block_group;
6995         if (!used_bg)
6996                 return NULL;
6997
6998         if (used_bg == block_group)
6999                 return used_bg;
7000
7001         btrfs_get_block_group(used_bg);
7002
7003         if (!delalloc)
7004                 return used_bg;
7005
7006         if (down_read_trylock(&used_bg->data_rwsem))
7007                 return used_bg;
7008
             /* don't block on the rwsem while holding the refill spinlock */
7009         spin_unlock(&cluster->refill_lock);
7010         down_read(&used_bg->data_rwsem);
7011         locked = true;
7012         goto again;
7013 }
7014
7015 static inline void
7016 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7017                          int delalloc)
7018 {
             /*
              * Release cache->data_rwsem (read side) when 'delalloc' is
              * non-zero, then call btrfs_put_block_group() on 'cache'.
              */
7019         if (delalloc)
7020                 up_read(&cache->data_rwsem);
7021         btrfs_put_block_group(cache);
7022 }
7023
7024 /*
7025  * walks the btree of allocated extents and finds a hole of a given size.
7026  * The key ins is changed to record the hole:
7027  * ins->objectid == start position
7028  * ins->type == BTRFS_EXTENT_ITEM_KEY
7029  * ins->offset == the size of the hole.
7030  * Any available blocks before search_start are skipped.
7031  *
7032  * If there is no suitable free space, we will record the max size of
7033  * the free space extent currently.
7034  */
7035 static noinline int find_free_extent(struct btrfs_root *orig_root,
7036                                      u64 num_bytes, u64 empty_size,
7037                                      u64 hint_byte, struct btrfs_key *ins,
7038                                      u64 flags, int delalloc)
7039 {
7040         int ret = 0;
7041         struct btrfs_root *root = orig_root->fs_info->extent_root;
7042         struct btrfs_free_cluster *last_ptr = NULL;
7043         struct btrfs_block_group_cache *block_group = NULL;
7044         u64 search_start = 0;
7045         u64 max_extent_size = 0;
7046         u64 empty_cluster = 0;
7047         struct btrfs_space_info *space_info;
7048         int loop = 0;
7049         int index = __get_raid_index(flags);
7050         int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
7051                 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
7052         bool failed_cluster_refill = false;
7053         bool failed_alloc = false;
7054         bool use_cluster = true;
7055         bool have_caching_bg = false;
7056         bool orig_have_caching_bg = false;
7057         bool full_search = false;
7058
7059         WARN_ON(num_bytes < root->sectorsize);
7060         ins->type = BTRFS_EXTENT_ITEM_KEY;
7061         ins->objectid = 0;
7062         ins->offset = 0;
7063
7064         trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
7065
7066         space_info = __find_space_info(root->fs_info, flags);
7067         if (!space_info) {
7068                 btrfs_err(root->fs_info, "No space info for %llu", flags);
7069                 return -ENOSPC;
7070         }
7071
7072         /*
7073          * If our free space is heavily fragmented we may not be able to make
7074          * big contiguous allocations, so instead of doing the expensive search
7075          * for free space, simply return ENOSPC with our max_extent_size so we
7076          * can go ahead and search for a more manageable chunk.
7077          *
7078          * If our max_extent_size is large enough for our allocation simply
7079          * disable clustering since we will likely not be able to find enough
7080          * space to create a cluster and induce latency trying.
7081          */
7082         if (unlikely(space_info->max_extent_size)) {
7083                 spin_lock(&space_info->lock);
7084                 if (space_info->max_extent_size &&
7085                     num_bytes > space_info->max_extent_size) {
7086                         ins->offset = space_info->max_extent_size;
7087                         spin_unlock(&space_info->lock);
7088                         return -ENOSPC;
7089                 } else if (space_info->max_extent_size) {
7090                         use_cluster = false;
7091                 }
7092                 spin_unlock(&space_info->lock);
7093         }
7094
7095         last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
7096         if (last_ptr) {
7097                 spin_lock(&last_ptr->lock);
7098                 if (last_ptr->block_group)
7099                         hint_byte = last_ptr->window_start;
7100                 if (last_ptr->fragmented) {
7101                         /*
7102                          * We still set window_start so we can keep track of the
7103                          * last place we found an allocation to try and save
7104                          * some time.
7105                          */
7106                         hint_byte = last_ptr->window_start;
7107                         use_cluster = false;
7108                 }
7109                 spin_unlock(&last_ptr->lock);
7110         }
7111
7112         search_start = max(search_start, first_logical_byte(root, 0));
7113         search_start = max(search_start, hint_byte);
7114         if (search_start == hint_byte) {
7115                 block_group = btrfs_lookup_block_group(root->fs_info,
7116                                                        search_start);
7117                 /*
7118                  * we don't want to use the block group if it doesn't match our
7119                  * allocation bits, or if its not cached.
7120                  *
7121                  * However if we are re-searching with an ideal block group
7122                  * picked out then we don't care that the block group is cached.
7123                  */
7124                 if (block_group && block_group_bits(block_group, flags) &&
7125                     block_group->cached != BTRFS_CACHE_NO) {
7126                         down_read(&space_info->groups_sem);
7127                         if (list_empty(&block_group->list) ||
7128                             block_group->ro) {
7129                                 /*
7130                                  * someone is removing this block group,
7131                                  * we can't jump into the have_block_group
7132                                  * target because our list pointers are not
7133                                  * valid
7134                                  */
7135                                 btrfs_put_block_group(block_group);
7136                                 up_read(&space_info->groups_sem);
7137                         } else {
7138                                 index = get_block_group_index(block_group);
7139                                 btrfs_lock_block_group(block_group, delalloc);
7140                                 goto have_block_group;
7141                         }
7142                 } else if (block_group) {
7143                         btrfs_put_block_group(block_group);
7144                 }
7145         }
7146 search:
7147         have_caching_bg = false;
7148         if (index == 0 || index == __get_raid_index(flags))
7149                 full_search = true;
7150         down_read(&space_info->groups_sem);
7151         list_for_each_entry(block_group, &space_info->block_groups[index],
7152                             list) {
7153                 u64 offset;
7154                 int cached;
7155
7156                 btrfs_grab_block_group(block_group, delalloc);
7157                 search_start = block_group->key.objectid;
7158
7159                 /*
7160                  * this can happen if we end up cycling through all the
7161                  * raid types, but we want to make sure we only allocate
7162                  * for the proper type.
7163                  */
7164                 if (!block_group_bits(block_group, flags)) {
7165                     u64 extra = BTRFS_BLOCK_GROUP_DUP |
7166                                 BTRFS_BLOCK_GROUP_RAID1 |
7167                                 BTRFS_BLOCK_GROUP_RAID5 |
7168                                 BTRFS_BLOCK_GROUP_RAID6 |
7169                                 BTRFS_BLOCK_GROUP_RAID10;
7170
7171                         /*
7172                          * if they asked for extra copies and this block group
7173                          * doesn't provide them, bail.  This does allow us to
7174                          * fill raid0 from raid1.
7175                          */
7176                         if ((flags & extra) && !(block_group->flags & extra))
7177                                 goto loop;
7178                 }
7179
7180 have_block_group:
7181                 cached = block_group_cache_done(block_group);
7182                 if (unlikely(!cached)) {
7183                         have_caching_bg = true;
7184                         ret = cache_block_group(block_group, 0);
7185                         BUG_ON(ret < 0);
7186                         ret = 0;
7187                 }
7188
7189                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7190                         goto loop;
7191                 if (unlikely(block_group->ro))
7192                         goto loop;
7193
7194                 /*
7195                  * Ok we want to try and use the cluster allocator, so
7196                  * lets look there
7197                  */
7198                 if (last_ptr && use_cluster) {
7199                         struct btrfs_block_group_cache *used_block_group;
7200                         unsigned long aligned_cluster;
7201                         /*
7202                          * the refill lock keeps out other
7203                          * people trying to start a new cluster
7204                          */
7205                         used_block_group = btrfs_lock_cluster(block_group,
7206                                                               last_ptr,
7207                                                               delalloc);
7208                         if (!used_block_group)
7209                                 goto refill_cluster;
7210
7211                         if (used_block_group != block_group &&
7212                             (used_block_group->ro ||
7213                              !block_group_bits(used_block_group, flags)))
7214                                 goto release_cluster;
7215
7216                         offset = btrfs_alloc_from_cluster(used_block_group,
7217                                                 last_ptr,
7218                                                 num_bytes,
7219                                                 used_block_group->key.objectid,
7220                                                 &max_extent_size);
7221                         if (offset) {
7222                                 /* we have a block, we're done */
7223                                 spin_unlock(&last_ptr->refill_lock);
7224                                 trace_btrfs_reserve_extent_cluster(root,
7225                                                 used_block_group,
7226                                                 search_start, num_bytes);
7227                                 if (used_block_group != block_group) {
7228                                         btrfs_release_block_group(block_group,
7229                                                                   delalloc);
7230                                         block_group = used_block_group;
7231                                 }
7232                                 goto checks;
7233                         }
7234
7235                         WARN_ON(last_ptr->block_group != used_block_group);
7236 release_cluster:
7237                         /* If we are on LOOP_NO_EMPTY_SIZE, we can't
7238                          * set up a new clusters, so lets just skip it
7239                          * and let the allocator find whatever block
7240                          * it can find.  If we reach this point, we
7241                          * will have tried the cluster allocator
7242                          * plenty of times and not have found
7243                          * anything, so we are likely way too
7244                          * fragmented for the clustering stuff to find
7245                          * anything.
7246                          *
7247                          * However, if the cluster is taken from the
7248                          * current block group, release the cluster
7249                          * first, so that we stand a better chance of
7250                          * succeeding in the unclustered
7251                          * allocation.  */
7252                         if (loop >= LOOP_NO_EMPTY_SIZE &&
7253                             used_block_group != block_group) {
7254                                 spin_unlock(&last_ptr->refill_lock);
7255                                 btrfs_release_block_group(used_block_group,
7256                                                           delalloc);
7257                                 goto unclustered_alloc;
7258                         }
7259
7260                         /*
7261                          * this cluster didn't work out, free it and
7262                          * start over
7263                          */
7264                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7265
7266                         if (used_block_group != block_group)
7267                                 btrfs_release_block_group(used_block_group,
7268                                                           delalloc);
7269 refill_cluster:
7270                         if (loop >= LOOP_NO_EMPTY_SIZE) {
7271                                 spin_unlock(&last_ptr->refill_lock);
7272                                 goto unclustered_alloc;
7273                         }
7274
7275                         aligned_cluster = max_t(unsigned long,
7276                                                 empty_cluster + empty_size,
7277                                               block_group->full_stripe_len);
7278
7279                         /* allocate a cluster in this block group */
7280                         ret = btrfs_find_space_cluster(root, block_group,
7281                                                        last_ptr, search_start,
7282                                                        num_bytes,
7283                                                        aligned_cluster);
7284                         if (ret == 0) {
7285                                 /*
7286                                  * now pull our allocation out of this
7287                                  * cluster
7288                                  */
7289                                 offset = btrfs_alloc_from_cluster(block_group,
7290                                                         last_ptr,
7291                                                         num_bytes,
7292                                                         search_start,
7293                                                         &max_extent_size);
7294                                 if (offset) {
7295                                         /* we found one, proceed */
7296                                         spin_unlock(&last_ptr->refill_lock);
7297                                         trace_btrfs_reserve_extent_cluster(root,
7298                                                 block_group, search_start,
7299                                                 num_bytes);
7300                                         goto checks;
7301                                 }
7302                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
7303                                    && !failed_cluster_refill) {
7304                                 spin_unlock(&last_ptr->refill_lock);
7305
7306                                 failed_cluster_refill = true;
7307                                 wait_block_group_cache_progress(block_group,
7308                                        num_bytes + empty_cluster + empty_size);
7309                                 goto have_block_group;
7310                         }
7311
7312                         /*
7313                          * at this point we either didn't find a cluster
7314                          * or we weren't able to allocate a block from our
7315                          * cluster.  Free the cluster we've been trying
7316                          * to use, and go to the next block group
7317                          */
7318                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7319                         spin_unlock(&last_ptr->refill_lock);
7320                         goto loop;
7321                 }
7322
7323 unclustered_alloc:
7324                 /*
7325                  * We are doing an unclustered alloc, set the fragmented flag so
7326                  * we don't bother trying to setup a cluster again until we get
7327                  * more space.
7328                  */
7329                 if (unlikely(last_ptr)) {
7330                         spin_lock(&last_ptr->lock);
7331                         last_ptr->fragmented = 1;
7332                         spin_unlock(&last_ptr->lock);
7333                 }
7334                 spin_lock(&block_group->free_space_ctl->tree_lock);
7335                 if (cached &&
7336                     block_group->free_space_ctl->free_space <
7337                     num_bytes + empty_cluster + empty_size) {
7338                         if (block_group->free_space_ctl->free_space >
7339                             max_extent_size)
7340                                 max_extent_size =
7341                                         block_group->free_space_ctl->free_space;
7342                         spin_unlock(&block_group->free_space_ctl->tree_lock);
7343                         goto loop;
7344                 }
7345                 spin_unlock(&block_group->free_space_ctl->tree_lock);
7346
7347                 offset = btrfs_find_space_for_alloc(block_group, search_start,
7348                                                     num_bytes, empty_size,
7349                                                     &max_extent_size);
7350                 /*
7351                  * If we didn't find a chunk, and we haven't failed on this
7352                  * block group before, and this block group is in the middle of
7353                  * caching and we are ok with waiting, then go ahead and wait
7354                  * for progress to be made, and set failed_alloc to true.
7355                  *
7356                  * If failed_alloc is true then we've already waited on this
7357                  * block group once and should move on to the next block group.
7358                  */
7359                 if (!offset && !failed_alloc && !cached &&
7360                     loop > LOOP_CACHING_NOWAIT) {
7361                         wait_block_group_cache_progress(block_group,
7362                                                 num_bytes + empty_size);
7363                         failed_alloc = true;
7364                         goto have_block_group;
7365                 } else if (!offset) {
7366                         goto loop;
7367                 }
7368 checks:
7369                 search_start = ALIGN(offset, root->stripesize);
7370
7371                 /* move on to the next group */
7372                 if (search_start + num_bytes >
7373                     block_group->key.objectid + block_group->key.offset) {
7374                         btrfs_add_free_space(block_group, offset, num_bytes);
7375                         goto loop;
7376                 }
7377
7378                 if (offset < search_start)
7379                         btrfs_add_free_space(block_group, offset,
7380                                              search_start - offset);
7381                 BUG_ON(offset > search_start);
7382
7383                 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
7384                                                   alloc_type, delalloc);
7385                 if (ret == -EAGAIN) {
7386                         btrfs_add_free_space(block_group, offset, num_bytes);
7387                         goto loop;
7388                 }
7389
7390                 /* we are all good, lets return */
7391                 ins->objectid = search_start;
7392                 ins->offset = num_bytes;
7393
7394                 trace_btrfs_reserve_extent(orig_root, block_group,
7395                                            search_start, num_bytes);
7396                 btrfs_release_block_group(block_group, delalloc);
7397                 break;
7398 loop:
7399                 failed_cluster_refill = false;
7400                 failed_alloc = false;
7401                 BUG_ON(index != get_block_group_index(block_group));
7402                 btrfs_release_block_group(block_group, delalloc);
7403         }
7404         up_read(&space_info->groups_sem);
7405
7406         if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7407                 && !orig_have_caching_bg)
7408                 orig_have_caching_bg = true;
7409
7410         if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7411                 goto search;
7412
7413         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7414                 goto search;
7415
7416         /*
7417          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7418          *                      caching kthreads as we move along
7419          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7420          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7421          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7422          *                      again
7423          */
7424         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7425                 index = 0;
7426                 if (loop == LOOP_CACHING_NOWAIT) {
7427                         /*
7428                          * We want to skip the LOOP_CACHING_WAIT step if we
7429                          * don't have any uncached bgs and we've already done a
7430                          * full search through.
7431                          */
7432                         if (orig_have_caching_bg || !full_search)
7433                                 loop = LOOP_CACHING_WAIT;
7434                         else
7435                                 loop = LOOP_ALLOC_CHUNK;
7436                 } else {
7437                         loop++;
7438                 }
7439
7440                 if (loop == LOOP_ALLOC_CHUNK) {
7441                         struct btrfs_trans_handle *trans;
7442                         int exist = 0;
7443
7444                         trans = current->journal_info;
7445                         if (trans)
7446                                 exist = 1;
7447                         else
7448                                 trans = btrfs_join_transaction(root);
7449
7450                         if (IS_ERR(trans)) {
7451                                 ret = PTR_ERR(trans);
7452                                 goto out;
7453                         }
7454
7455                         ret = do_chunk_alloc(trans, root, flags,
7456                                              CHUNK_ALLOC_FORCE);
7457
7458                         /*
7459                          * If we can't allocate a new chunk we've already looped
7460                          * through at least once, move on to the NO_EMPTY_SIZE
7461                          * case.
7462                          */
7463                         if (ret == -ENOSPC)
7464                                 loop = LOOP_NO_EMPTY_SIZE;
7465
7466                         /*
7467                          * Do not bail out on ENOSPC since we
7468                          * can do more things.
7469                          */
7470                         if (ret < 0 && ret != -ENOSPC)
7471                                 btrfs_abort_transaction(trans,
7472                                                         root, ret);
7473                         else
7474                                 ret = 0;
7475                         if (!exist)
7476                                 btrfs_end_transaction(trans, root);
7477                         if (ret)
7478                                 goto out;
7479                 }
7480
7481                 if (loop == LOOP_NO_EMPTY_SIZE) {
7482                         /*
7483                          * Don't loop again if we already have no empty_size and
7484                          * no empty_cluster.
7485                          */
7486                         if (empty_size == 0 &&
7487                             empty_cluster == 0) {
7488                                 ret = -ENOSPC;
7489                                 goto out;
7490                         }
7491                         empty_size = 0;
7492                         empty_cluster = 0;
7493                 }
7494
7495                 goto search;
7496         } else if (!ins->objectid) {
7497                 ret = -ENOSPC;
7498         } else if (ins->objectid) {
7499                 if (!use_cluster && last_ptr) {
7500                         spin_lock(&last_ptr->lock);
7501                         last_ptr->window_start = ins->objectid;
7502                         spin_unlock(&last_ptr->lock);
7503                 }
7504                 ret = 0;
7505         }
7506 out:
7507         if (ret == -ENOSPC) {
7508                 spin_lock(&space_info->lock);
7509                 space_info->max_extent_size = max_extent_size;
7510                 spin_unlock(&space_info->lock);
7511                 ins->offset = max_extent_size;
7512         }
7513         return ret;
7514 }
7515
7516 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
7517                             int dump_block_groups)
7518 {
7519         struct btrfs_block_group_cache *cache;
7520         int index = 0;
7521
7522         spin_lock(&info->lock);
7523         printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
7524                info->flags,
7525                info->total_bytes - info->bytes_used - info->bytes_pinned -
7526                info->bytes_reserved - info->bytes_readonly,
7527                (info->full) ? "" : "not ");
7528         printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
7529                "reserved=%llu, may_use=%llu, readonly=%llu\n",
7530                info->total_bytes, info->bytes_used, info->bytes_pinned,
7531                info->bytes_reserved, info->bytes_may_use,
7532                info->bytes_readonly);
7533         spin_unlock(&info->lock);
7534
7535         if (!dump_block_groups)
7536                 return;
7537
7538         down_read(&info->groups_sem);
7539 again:
7540         list_for_each_entry(cache, &info->block_groups[index], list) {
7541                 spin_lock(&cache->lock);
7542                 printk(KERN_INFO "BTRFS: "
7543                            "block group %llu has %llu bytes, "
7544                            "%llu used %llu pinned %llu reserved %s\n",
7545                        cache->key.objectid, cache->key.offset,
7546                        btrfs_block_group_used(&cache->item), cache->pinned,
7547                        cache->reserved, cache->ro ? "[readonly]" : "");
7548                 btrfs_dump_free_space(cache, bytes);
7549                 spin_unlock(&cache->lock);
7550         }
7551         if (++index < BTRFS_NR_RAID_TYPES)
7552                 goto again;
7553         up_read(&info->groups_sem);
7554 }
7555
7556 int btrfs_reserve_extent(struct btrfs_root *root,
7557                          u64 num_bytes, u64 min_alloc_size,
7558                          u64 empty_size, u64 hint_byte,
7559                          struct btrfs_key *ins, int is_data, int delalloc)
7560 {
7561         bool final_tried = num_bytes == min_alloc_size;
7562         u64 flags;
7563         int ret;
7564
7565         flags = btrfs_get_alloc_profile(root, is_data);
7566 again:
7567         WARN_ON(num_bytes < root->sectorsize);
7568         ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
7569                                flags, delalloc);
7570
7571         if (ret == -ENOSPC) {
7572                 if (!final_tried && ins->offset) {
7573                         num_bytes = min(num_bytes >> 1, ins->offset);
7574                         num_bytes = round_down(num_bytes, root->sectorsize);
7575                         num_bytes = max(num_bytes, min_alloc_size);
7576                         if (num_bytes == min_alloc_size)
7577                                 final_tried = true;
7578                         goto again;
7579                 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
7580                         struct btrfs_space_info *sinfo;
7581
7582                         sinfo = __find_space_info(root->fs_info, flags);
7583                         btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
7584                                 flags, num_bytes);
7585                         if (sinfo)
7586                                 dump_space_info(sinfo, num_bytes, 1);
7587                 }
7588         }
7589
7590         return ret;
7591 }
7592
7593 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
7594                                         u64 start, u64 len,
7595                                         int pin, int delalloc)
7596 {
7597         struct btrfs_block_group_cache *cache;
7598         int ret = 0;
7599
7600         cache = btrfs_lookup_block_group(root->fs_info, start);
7601         if (!cache) {
7602                 btrfs_err(root->fs_info, "Unable to find block group for %llu",
7603                         start);
7604                 return -ENOSPC;
7605         }
7606
7607         if (pin)
7608                 pin_down_extent(root, cache, start, len, 1);
7609         else {
7610                 if (btrfs_test_opt(root, DISCARD))
7611                         ret = btrfs_discard_extent(root, start, len, NULL);
7612                 btrfs_add_free_space(cache, start, len);
7613                 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
7614         }
7615
7616         btrfs_put_block_group(cache);
7617
7618         trace_btrfs_reserved_extent_free(root, start, len);
7619
7620         return ret;
7621 }
7622
7623 int btrfs_free_reserved_extent(struct btrfs_root *root,
7624                                u64 start, u64 len, int delalloc)
7625 {
7626         return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
7627 }
7628
7629 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
7630                                        u64 start, u64 len)
7631 {
7632         return __btrfs_free_reserved_extent(root, start, len, 1, 0);
7633 }
7634
7635 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7636                                       struct btrfs_root *root,
7637                                       u64 parent, u64 root_objectid,
7638                                       u64 flags, u64 owner, u64 offset,
7639                                       struct btrfs_key *ins, int ref_mod)
7640 {
7641         int ret;
7642         struct btrfs_fs_info *fs_info = root->fs_info;
7643         struct btrfs_extent_item *extent_item;
7644         struct btrfs_extent_inline_ref *iref;
7645         struct btrfs_path *path;
7646         struct extent_buffer *leaf;
7647         int type;
7648         u32 size;
7649
7650         if (parent > 0)
7651                 type = BTRFS_SHARED_DATA_REF_KEY;
7652         else
7653                 type = BTRFS_EXTENT_DATA_REF_KEY;
7654
7655         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7656
7657         path = btrfs_alloc_path();
7658         if (!path)
7659                 return -ENOMEM;
7660
7661         path->leave_spinning = 1;
7662         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7663                                       ins, size);
7664         if (ret) {
7665                 btrfs_free_path(path);
7666                 return ret;
7667         }
7668
7669         leaf = path->nodes[0];
7670         extent_item = btrfs_item_ptr(leaf, path->slots[0],
7671                                      struct btrfs_extent_item);
7672         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
7673         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7674         btrfs_set_extent_flags(leaf, extent_item,
7675                                flags | BTRFS_EXTENT_FLAG_DATA);
7676
7677         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7678         btrfs_set_extent_inline_ref_type(leaf, iref, type);
7679         if (parent > 0) {
7680                 struct btrfs_shared_data_ref *ref;
7681                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
7682                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7683                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
7684         } else {
7685                 struct btrfs_extent_data_ref *ref;
7686                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
7687                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
7688                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
7689                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
7690                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
7691         }
7692
7693         btrfs_mark_buffer_dirty(path->nodes[0]);
7694         btrfs_free_path(path);
7695
7696         ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
7697                                           ins->offset);
7698         if (ret)
7699                 return ret;
7700
7701         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
7702         if (ret) { /* -ENOENT, logic error */
7703                 btrfs_err(fs_info, "update block group failed for %llu %llu",
7704                         ins->objectid, ins->offset);
7705                 BUG();
7706         }
7707         trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
7708         return ret;
7709 }
7710
7711 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7712                                      struct btrfs_root *root,
7713                                      u64 parent, u64 root_objectid,
7714                                      u64 flags, struct btrfs_disk_key *key,
7715                                      int level, struct btrfs_key *ins)
7716 {
7717         int ret;
7718         struct btrfs_fs_info *fs_info = root->fs_info;
7719         struct btrfs_extent_item *extent_item;
7720         struct btrfs_tree_block_info *block_info;
7721         struct btrfs_extent_inline_ref *iref;
7722         struct btrfs_path *path;
7723         struct extent_buffer *leaf;
7724         u32 size = sizeof(*extent_item) + sizeof(*iref);
7725         u64 num_bytes = ins->offset;
7726         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7727                                                  SKINNY_METADATA);
7728
7729         if (!skinny_metadata)
7730                 size += sizeof(*block_info);
7731
7732         path = btrfs_alloc_path();
7733         if (!path) {
7734                 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7735                                                    root->nodesize);
7736                 return -ENOMEM;
7737         }
7738
7739         path->leave_spinning = 1;
7740         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7741                                       ins, size);
7742         if (ret) {
7743                 btrfs_free_path(path);
7744                 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7745                                                    root->nodesize);
7746                 return ret;
7747         }
7748
7749         leaf = path->nodes[0];
7750         extent_item = btrfs_item_ptr(leaf, path->slots[0],
7751                                      struct btrfs_extent_item);
7752         btrfs_set_extent_refs(leaf, extent_item, 1);
7753         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7754         btrfs_set_extent_flags(leaf, extent_item,
7755                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
7756
7757         if (skinny_metadata) {
7758                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7759                 num_bytes = root->nodesize;
7760         } else {
7761                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
7762                 btrfs_set_tree_block_key(leaf, block_info, key);
7763                 btrfs_set_tree_block_level(leaf, block_info, level);
7764                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
7765         }
7766
7767         if (parent > 0) {
7768                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
7769                 btrfs_set_extent_inline_ref_type(leaf, iref,
7770                                                  BTRFS_SHARED_BLOCK_REF_KEY);
7771                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7772         } else {
7773                 btrfs_set_extent_inline_ref_type(leaf, iref,
7774                                                  BTRFS_TREE_BLOCK_REF_KEY);
7775                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
7776         }
7777
7778         btrfs_mark_buffer_dirty(leaf);
7779         btrfs_free_path(path);
7780
7781         ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
7782                                           num_bytes);
7783         if (ret)
7784                 return ret;
7785
7786         ret = update_block_group(trans, root, ins->objectid, root->nodesize,
7787                                  1);
7788         if (ret) { /* -ENOENT, logic error */
7789                 btrfs_err(fs_info, "update block group failed for %llu %llu",
7790                         ins->objectid, ins->offset);
7791                 BUG();
7792         }
7793
7794         trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
7795         return ret;
7796 }
7797
7798 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7799                                      struct btrfs_root *root,
7800                                      u64 root_objectid, u64 owner,
7801                                      u64 offset, u64 ram_bytes,
7802                                      struct btrfs_key *ins)
7803 {
7804         int ret;
7805
7806         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
7807
7808         ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
7809                                          ins->offset, 0,
7810                                          root_objectid, owner, offset,
7811                                          ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
7812                                          NULL);
7813         return ret;
7814 }
7815
7816 /*
7817  * this is used by the tree logging recovery code.  It records that
7818  * an extent has been allocated and makes sure to clear the free
7819  * space cache bits as well
7820  */
7821 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
7822                                    struct btrfs_root *root,
7823                                    u64 root_objectid, u64 owner, u64 offset,
7824                                    struct btrfs_key *ins)
7825 {
7826         int ret;
7827         struct btrfs_block_group_cache *block_group;
7828
7829         /*
7830          * Mixed block groups will exclude before processing the log so we only
7831          * need to do the exlude dance if this fs isn't mixed.
7832          */
7833         if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
7834                 ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
7835                 if (ret)
7836                         return ret;
7837         }
7838
7839         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
7840         if (!block_group)
7841                 return -EINVAL;
7842
7843         ret = btrfs_update_reserved_bytes(block_group, ins->offset,
7844                                           RESERVE_ALLOC_NO_ACCOUNT, 0);
7845         BUG_ON(ret); /* logic error */
7846         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
7847                                          0, owner, offset, ins, 1);
7848         btrfs_put_block_group(block_group);
7849         return ret;
7850 }
7851
7852 static struct extent_buffer *
7853 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7854                       u64 bytenr, int level)
7855 {
7856         struct extent_buffer *buf;
7857
7858         buf = btrfs_find_create_tree_block(root, bytenr);
7859         if (!buf)
7860                 return ERR_PTR(-ENOMEM);
7861         btrfs_set_header_generation(buf, trans->transid);
7862         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
7863         btrfs_tree_lock(buf);
7864         clean_tree_block(trans, root->fs_info, buf);
7865         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
7866
7867         btrfs_set_lock_blocking(buf);
7868         btrfs_set_buffer_uptodate(buf);
7869
7870         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
7871                 buf->log_index = root->log_transid % 2;
7872                 /*
7873                  * we allow two log transactions at a time, use different
7874                  * EXENT bit to differentiate dirty pages.
7875                  */
7876                 if (buf->log_index == 0)
7877                         set_extent_dirty(&root->dirty_log_pages, buf->start,
7878                                         buf->start + buf->len - 1, GFP_NOFS);
7879                 else
7880                         set_extent_new(&root->dirty_log_pages, buf->start,
7881                                         buf->start + buf->len - 1, GFP_NOFS);
7882         } else {
7883                 buf->log_index = -1;
7884                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
7885                          buf->start + buf->len - 1, GFP_NOFS);
7886         }
7887         trans->blocks_used++;
7888         /* this returns a buffer locked for blocking */
7889         return buf;
7890 }
7891
7892 static struct btrfs_block_rsv *
7893 use_block_rsv(struct btrfs_trans_handle *trans,
7894               struct btrfs_root *root, u32 blocksize)
7895 {
7896         struct btrfs_block_rsv *block_rsv;
7897         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
7898         int ret;
7899         bool global_updated = false;
7900
7901         block_rsv = get_block_rsv(trans, root);
7902
7903         if (unlikely(block_rsv->size == 0))
7904                 goto try_reserve;
7905 again:
7906         ret = block_rsv_use_bytes(block_rsv, blocksize);
7907         if (!ret)
7908                 return block_rsv;
7909
7910         if (block_rsv->failfast)
7911                 return ERR_PTR(ret);
7912
7913         if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
7914                 global_updated = true;
7915                 update_global_block_rsv(root->fs_info);
7916                 goto again;
7917         }
7918
7919         if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
7920                 static DEFINE_RATELIMIT_STATE(_rs,
7921                                 DEFAULT_RATELIMIT_INTERVAL * 10,
7922                                 /*DEFAULT_RATELIMIT_BURST*/ 1);
7923                 if (__ratelimit(&_rs))
7924                         WARN(1, KERN_DEBUG
7925                                 "BTRFS: block rsv returned %d\n", ret);
7926         }
7927 try_reserve:
7928         ret = reserve_metadata_bytes(root, block_rsv, blocksize,
7929                                      BTRFS_RESERVE_NO_FLUSH);
7930         if (!ret)
7931                 return block_rsv;
7932         /*
7933          * If we couldn't reserve metadata bytes try and use some from
7934          * the global reserve if its space type is the same as the global
7935          * reservation.
7936          */
7937         if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
7938             block_rsv->space_info == global_rsv->space_info) {
7939                 ret = block_rsv_use_bytes(global_rsv, blocksize);
7940                 if (!ret)
7941                         return global_rsv;
7942         }
7943         return ERR_PTR(ret);
7944 }
7945
7946 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
7947                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
7948 {
7949         block_rsv_add_bytes(block_rsv, blocksize, 0);
7950         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
7951 }
7952
7953 /*
7954  * finds a free extent and does all the dirty work required for allocation
7955  * returns the tree buffer or an ERR_PTR on error.
7956  */
7957 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7958                                         struct btrfs_root *root,
7959                                         u64 parent, u64 root_objectid,
7960                                         struct btrfs_disk_key *key, int level,
7961                                         u64 hint, u64 empty_size)
7962 {
7963         struct btrfs_key ins;
7964         struct btrfs_block_rsv *block_rsv;
7965         struct extent_buffer *buf;
7966         struct btrfs_delayed_extent_op *extent_op;
7967         u64 flags = 0;
7968         int ret;
7969         u32 blocksize = root->nodesize;
7970         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7971                                                  SKINNY_METADATA);
7972
7973         if (btrfs_test_is_dummy_root(root)) {
7974                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7975                                             level);
7976                 if (!IS_ERR(buf))
7977                         root->alloc_bytenr += blocksize;
7978                 return buf;
7979         }
7980
7981         block_rsv = use_block_rsv(trans, root, blocksize);
7982         if (IS_ERR(block_rsv))
7983                 return ERR_CAST(block_rsv);
7984
7985         ret = btrfs_reserve_extent(root, blocksize, blocksize,
7986                                    empty_size, hint, &ins, 0, 0);
7987         if (ret)
7988                 goto out_unuse;
7989
7990         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
7991         if (IS_ERR(buf)) {
7992                 ret = PTR_ERR(buf);
7993                 goto out_free_reserved;
7994         }
7995
7996         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
7997                 if (parent == 0)
7998                         parent = ins.objectid;
7999                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8000         } else
8001                 BUG_ON(parent > 0);
8002
8003         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8004                 extent_op = btrfs_alloc_delayed_extent_op();
8005                 if (!extent_op) {
8006                         ret = -ENOMEM;
8007                         goto out_free_buf;
8008                 }
8009                 if (key)
8010                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
8011                 else
8012                         memset(&extent_op->key, 0, sizeof(extent_op->key));
8013                 extent_op->flags_to_set = flags;
8014                 if (skinny_metadata)
8015                         extent_op->update_key = 0;
8016                 else
8017                         extent_op->update_key = 1;
8018                 extent_op->update_flags = 1;
8019                 extent_op->is_data = 0;
8020                 extent_op->level = level;
8021
8022                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
8023                                                  ins.objectid, ins.offset,
8024                                                  parent, root_objectid, level,
8025                                                  BTRFS_ADD_DELAYED_EXTENT,
8026                                                  extent_op);
8027                 if (ret)
8028                         goto out_free_delayed;
8029         }
8030         return buf;
8031
8032 out_free_delayed:
8033         btrfs_free_delayed_extent_op(extent_op);
8034 out_free_buf:
8035         free_extent_buffer(buf);
8036 out_free_reserved:
8037         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
8038 out_unuse:
8039         unuse_block_rsv(root->fs_info, block_rsv, blocksize);
8040         return ERR_PTR(ret);
8041 }
8042
8043 struct walk_control {
8044         u64 refs[BTRFS_MAX_LEVEL];
8045         u64 flags[BTRFS_MAX_LEVEL];
8046         struct btrfs_key update_progress;
8047         int stage;
8048         int level;
8049         int shared_level;
8050         int update_ref;
8051         int keep_locks;
8052         int reada_slot;
8053         int reada_count;
8054         int for_reloc;
8055 };
8056
8057 #define DROP_REFERENCE  1
8058 #define UPDATE_BACKREF  2
8059
8060 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8061                                      struct btrfs_root *root,
8062                                      struct walk_control *wc,
8063                                      struct btrfs_path *path)
8064 {
8065         u64 bytenr;
8066         u64 generation;
8067         u64 refs;
8068         u64 flags;
8069         u32 nritems;
8070         u32 blocksize;
8071         struct btrfs_key key;
8072         struct extent_buffer *eb;
8073         int ret;
8074         int slot;
8075         int nread = 0;
8076
8077         if (path->slots[wc->level] < wc->reada_slot) {
8078                 wc->reada_count = wc->reada_count * 2 / 3;
8079                 wc->reada_count = max(wc->reada_count, 2);
8080         } else {
8081                 wc->reada_count = wc->reada_count * 3 / 2;
8082                 wc->reada_count = min_t(int, wc->reada_count,
8083                                         BTRFS_NODEPTRS_PER_BLOCK(root));
8084         }
8085
8086         eb = path->nodes[wc->level];
8087         nritems = btrfs_header_nritems(eb);
8088         blocksize = root->nodesize;
8089
8090         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8091                 if (nread >= wc->reada_count)
8092                         break;
8093
8094                 cond_resched();
8095                 bytenr = btrfs_node_blockptr(eb, slot);
8096                 generation = btrfs_node_ptr_generation(eb, slot);
8097
8098                 if (slot == path->slots[wc->level])
8099                         goto reada;
8100
8101                 if (wc->stage == UPDATE_BACKREF &&
8102                     generation <= root->root_key.offset)
8103                         continue;
8104
8105                 /* We don't lock the tree block, it's OK to be racy here */
8106                 ret = btrfs_lookup_extent_info(trans, root, bytenr,
8107                                                wc->level - 1, 1, &refs,
8108                                                &flags);
8109                 /* We don't care about errors in readahead. */
8110                 if (ret < 0)
8111                         continue;
8112                 BUG_ON(refs == 0);
8113
8114                 if (wc->stage == DROP_REFERENCE) {
8115                         if (refs == 1)
8116                                 goto reada;
8117
8118                         if (wc->level == 1 &&
8119                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8120                                 continue;
8121                         if (!wc->update_ref ||
8122                             generation <= root->root_key.offset)
8123                                 continue;
8124                         btrfs_node_key_to_cpu(eb, &key, slot);
8125                         ret = btrfs_comp_cpu_keys(&key,
8126                                                   &wc->update_progress);
8127                         if (ret < 0)
8128                                 continue;
8129                 } else {
8130                         if (wc->level == 1 &&
8131                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8132                                 continue;
8133                 }
8134 reada:
8135                 readahead_tree_block(root, bytenr);
8136                 nread++;
8137         }
8138         wc->reada_slot = slot;
8139 }
8140
8141 /*
8142  * These may not be seen by the usual inc/dec ref code so we have to
8143  * add them here.
8144  */
8145 static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
8146                                      struct btrfs_root *root, u64 bytenr,
8147                                      u64 num_bytes)
8148 {
8149         struct btrfs_qgroup_extent_record *qrecord;
8150         struct btrfs_delayed_ref_root *delayed_refs;
8151
8152         qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
8153         if (!qrecord)
8154                 return -ENOMEM;
8155
8156         qrecord->bytenr = bytenr;
8157         qrecord->num_bytes = num_bytes;
8158         qrecord->old_roots = NULL;
8159
8160         delayed_refs = &trans->transaction->delayed_refs;
8161         spin_lock(&delayed_refs->lock);
8162         if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
8163                 kfree(qrecord);
8164         spin_unlock(&delayed_refs->lock);
8165
8166         return 0;
8167 }
8168
8169 static int account_leaf_items(struct btrfs_trans_handle *trans,
8170                               struct btrfs_root *root,
8171                               struct extent_buffer *eb)
8172 {
8173         int nr = btrfs_header_nritems(eb);
8174         int i, extent_type, ret;
8175         struct btrfs_key key;
8176         struct btrfs_file_extent_item *fi;
8177         u64 bytenr, num_bytes;
8178
8179         /* We can be called directly from walk_up_proc() */
8180         if (!root->fs_info->quota_enabled)
8181                 return 0;
8182
8183         for (i = 0; i < nr; i++) {
8184                 btrfs_item_key_to_cpu(eb, &key, i);
8185
8186                 if (key.type != BTRFS_EXTENT_DATA_KEY)
8187                         continue;
8188
8189                 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
8190                 /* filter out non qgroup-accountable extents  */
8191                 extent_type = btrfs_file_extent_type(eb, fi);
8192
8193                 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
8194                         continue;
8195
8196                 bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
8197                 if (!bytenr)
8198                         continue;
8199
8200                 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
8201
8202                 ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
8203                 if (ret)
8204                         return ret;
8205         }
8206         return 0;
8207 }
8208
8209 /*
8210  * Walk up the tree from the bottom, freeing leaves and any interior
8211  * nodes which have had all slots visited. If a node (leaf or
8212  * interior) is freed, the node above it will have it's slot
8213  * incremented. The root node will never be freed.
8214  *
8215  * At the end of this function, we should have a path which has all
8216  * slots incremented to the next position for a search. If we need to
8217  * read a new node it will be NULL and the node above it will have the
8218  * correct slot selected for a later read.
8219  *
8220  * If we increment the root nodes slot counter past the number of
8221  * elements, 1 is returned to signal completion of the search.
8222  */
8223 static int adjust_slots_upwards(struct btrfs_root *root,
8224                                 struct btrfs_path *path, int root_level)
8225 {
8226         int level = 0;
8227         int nr, slot;
8228         struct extent_buffer *eb;
8229
8230         if (root_level == 0)
8231                 return 1;
8232
8233         while (level <= root_level) {
8234                 eb = path->nodes[level];
8235                 nr = btrfs_header_nritems(eb);
8236                 path->slots[level]++;
8237                 slot = path->slots[level];
8238                 if (slot >= nr || level == 0) {
8239                         /*
8240                          * Don't free the root -  we will detect this
8241                          * condition after our loop and return a
8242                          * positive value for caller to stop walking the tree.
8243                          */
8244                         if (level != root_level) {
8245                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8246                                 path->locks[level] = 0;
8247
8248                                 free_extent_buffer(eb);
8249                                 path->nodes[level] = NULL;
8250                                 path->slots[level] = 0;
8251                         }
8252                 } else {
8253                         /*
8254                          * We have a valid slot to walk back down
8255                          * from. Stop here so caller can process these
8256                          * new nodes.
8257                          */
8258                         break;
8259                 }
8260
8261                 level++;
8262         }
8263
8264         eb = path->nodes[root_level];
8265         if (path->slots[root_level] >= btrfs_header_nritems(eb))
8266                 return 1;
8267
8268         return 0;
8269 }
8270
8271 /*
8272  * root_eb is the subtree root and is locked before this function is called.
8273  */
8274 static int account_shared_subtree(struct btrfs_trans_handle *trans,
8275                                   struct btrfs_root *root,
8276                                   struct extent_buffer *root_eb,
8277                                   u64 root_gen,
8278                                   int root_level)
8279 {
8280         int ret = 0;
8281         int level;
8282         struct extent_buffer *eb = root_eb;
8283         struct btrfs_path *path = NULL;
8284
8285         BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
8286         BUG_ON(root_eb == NULL);
8287
8288         if (!root->fs_info->quota_enabled)
8289                 return 0;
8290
8291         if (!extent_buffer_uptodate(root_eb)) {
8292                 ret = btrfs_read_buffer(root_eb, root_gen);
8293                 if (ret)
8294                         goto out;
8295         }
8296
8297         if (root_level == 0) {
8298                 ret = account_leaf_items(trans, root, root_eb);
8299                 goto out;
8300         }
8301
8302         path = btrfs_alloc_path();
8303         if (!path)
8304                 return -ENOMEM;
8305
8306         /*
8307          * Walk down the tree.  Missing extent blocks are filled in as
8308          * we go. Metadata is accounted every time we read a new
8309          * extent block.
8310          *
8311          * When we reach a leaf, we account for file extent items in it,
8312          * walk back up the tree (adjusting slot pointers as we go)
8313          * and restart the search process.
8314          */
8315         extent_buffer_get(root_eb); /* For path */
8316         path->nodes[root_level] = root_eb;
8317         path->slots[root_level] = 0;
8318         path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
8319 walk_down:
8320         level = root_level;
8321         while (level >= 0) {
8322                 if (path->nodes[level] == NULL) {
8323                         int parent_slot;
8324                         u64 child_gen;
8325                         u64 child_bytenr;
8326
8327                         /* We need to get child blockptr/gen from
8328                          * parent before we can read it. */
8329                         eb = path->nodes[level + 1];
8330                         parent_slot = path->slots[level + 1];
8331                         child_bytenr = btrfs_node_blockptr(eb, parent_slot);
8332                         child_gen = btrfs_node_ptr_generation(eb, parent_slot);
8333
8334                         eb = read_tree_block(root, child_bytenr, child_gen);
8335                         if (IS_ERR(eb)) {
8336                                 ret = PTR_ERR(eb);
8337                                 goto out;
8338                         } else if (!extent_buffer_uptodate(eb)) {
8339                                 free_extent_buffer(eb);
8340                                 ret = -EIO;
8341                                 goto out;
8342                         }
8343
8344                         path->nodes[level] = eb;
8345                         path->slots[level] = 0;
8346
8347                         btrfs_tree_read_lock(eb);
8348                         btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
8349                         path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
8350
8351                         ret = record_one_subtree_extent(trans, root, child_bytenr,
8352                                                         root->nodesize);
8353                         if (ret)
8354                                 goto out;
8355                 }
8356
8357                 if (level == 0) {
8358                         ret = account_leaf_items(trans, root, path->nodes[level]);
8359                         if (ret)
8360                                 goto out;
8361
8362                         /* Nonzero return here means we completed our search */
8363                         ret = adjust_slots_upwards(root, path, root_level);
8364                         if (ret)
8365                                 break;
8366
8367                         /* Restart search with new slots */
8368                         goto walk_down;
8369                 }
8370
8371                 level--;
8372         }
8373
8374         ret = 0;
8375 out:
8376         btrfs_free_path(path);
8377
8378         return ret;
8379 }
8380
8381 /*
8382  * helper to process tree block while walking down the tree.
8383  *
8384  * when wc->stage == UPDATE_BACKREF, this function updates
8385  * back refs for pointers in the block.
8386  *
8387  * NOTE: return value 1 means we should stop walking down.
8388  */
8389 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8390                                    struct btrfs_root *root,
8391                                    struct btrfs_path *path,
8392                                    struct walk_control *wc, int lookup_info)
8393 {
8394         int level = wc->level;
8395         struct extent_buffer *eb = path->nodes[level];
8396         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8397         int ret;
8398
8399         if (wc->stage == UPDATE_BACKREF &&
8400             btrfs_header_owner(eb) != root->root_key.objectid)
8401                 return 1;
8402
8403         /*
8404          * when reference count of tree block is 1, it won't increase
8405          * again. once full backref flag is set, we never clear it.
8406          */
8407         if (lookup_info &&
8408             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8409              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8410                 BUG_ON(!path->locks[level]);
8411                 ret = btrfs_lookup_extent_info(trans, root,
8412                                                eb->start, level, 1,
8413                                                &wc->refs[level],
8414                                                &wc->flags[level]);
8415                 BUG_ON(ret == -ENOMEM);
8416                 if (ret)
8417                         return ret;
8418                 BUG_ON(wc->refs[level] == 0);
8419         }
8420
8421         if (wc->stage == DROP_REFERENCE) {
8422                 if (wc->refs[level] > 1)
8423                         return 1;
8424
8425                 if (path->locks[level] && !wc->keep_locks) {
8426                         btrfs_tree_unlock_rw(eb, path->locks[level]);
8427                         path->locks[level] = 0;
8428                 }
8429                 return 0;
8430         }
8431
8432         /* wc->stage == UPDATE_BACKREF */
8433         if (!(wc->flags[level] & flag)) {
8434                 BUG_ON(!path->locks[level]);
8435                 ret = btrfs_inc_ref(trans, root, eb, 1);
8436                 BUG_ON(ret); /* -ENOMEM */
8437                 ret = btrfs_dec_ref(trans, root, eb, 0);
8438                 BUG_ON(ret); /* -ENOMEM */
8439                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
8440                                                   eb->len, flag,
8441                                                   btrfs_header_level(eb), 0);
8442                 BUG_ON(ret); /* -ENOMEM */
8443                 wc->flags[level] |= flag;
8444         }
8445
8446         /*
8447          * the block is shared by multiple trees, so it's not good to
8448          * keep the tree lock
8449          */
8450         if (path->locks[level] && level > 0) {
8451                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8452                 path->locks[level] = 0;
8453         }
8454         return 0;
8455 }
8456
8457 /*
8458  * helper to process tree block pointer.
8459  *
8460  * when wc->stage == DROP_REFERENCE, this function checks
8461  * reference count of the block pointed to. if the block
8462  * is shared and we need update back refs for the subtree
8463  * rooted at the block, this function changes wc->stage to
8464  * UPDATE_BACKREF. if the block is shared and there is no
8465  * need to update back, this function drops the reference
8466  * to the block.
8467  *
8468  * NOTE: return value 1 means we should stop walking down.
8469  */
8470 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8471                                  struct btrfs_root *root,
8472                                  struct btrfs_path *path,
8473                                  struct walk_control *wc, int *lookup_info)
8474 {
8475         u64 bytenr;
8476         u64 generation;
8477         u64 parent;
8478         u32 blocksize;
8479         struct btrfs_key key;
8480         struct extent_buffer *next;
8481         int level = wc->level;
8482         int reada = 0;
8483         int ret = 0;
8484         bool need_account = false;
8485
8486         generation = btrfs_node_ptr_generation(path->nodes[level],
8487                                                path->slots[level]);
8488         /*
8489          * if the lower level block was created before the snapshot
8490          * was created, we know there is no need to update back refs
8491          * for the subtree
8492          */
8493         if (wc->stage == UPDATE_BACKREF &&
8494             generation <= root->root_key.offset) {
8495                 *lookup_info = 1;
8496                 return 1;
8497         }
8498
8499         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8500         blocksize = root->nodesize;
8501
8502         next = btrfs_find_tree_block(root->fs_info, bytenr);
8503         if (!next) {
8504                 next = btrfs_find_create_tree_block(root, bytenr);
8505                 if (!next)
8506                         return -ENOMEM;
8507                 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8508                                                level - 1);
8509                 reada = 1;
8510         }
8511         btrfs_tree_lock(next);
8512         btrfs_set_lock_blocking(next);
8513
8514         ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
8515                                        &wc->refs[level - 1],
8516                                        &wc->flags[level - 1]);
8517         if (ret < 0) {
8518                 btrfs_tree_unlock(next);
8519                 return ret;
8520         }
8521
8522         if (unlikely(wc->refs[level - 1] == 0)) {
8523                 btrfs_err(root->fs_info, "Missing references.");
8524                 BUG();
8525         }
8526         *lookup_info = 0;
8527
8528         if (wc->stage == DROP_REFERENCE) {
8529                 if (wc->refs[level - 1] > 1) {
8530                         need_account = true;
8531                         if (level == 1 &&
8532                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8533                                 goto skip;
8534
8535                         if (!wc->update_ref ||
8536                             generation <= root->root_key.offset)
8537                                 goto skip;
8538
8539                         btrfs_node_key_to_cpu(path->nodes[level], &key,
8540                                               path->slots[level]);
8541                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8542                         if (ret < 0)
8543                                 goto skip;
8544
8545                         wc->stage = UPDATE_BACKREF;
8546                         wc->shared_level = level - 1;
8547                 }
8548         } else {
8549                 if (level == 1 &&
8550                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8551                         goto skip;
8552         }
8553
8554         if (!btrfs_buffer_uptodate(next, generation, 0)) {
8555                 btrfs_tree_unlock(next);
8556                 free_extent_buffer(next);
8557                 next = NULL;
8558                 *lookup_info = 1;
8559         }
8560
8561         if (!next) {
8562                 if (reada && level == 1)
8563                         reada_walk_down(trans, root, wc, path);
8564                 next = read_tree_block(root, bytenr, generation);
8565                 if (IS_ERR(next)) {
8566                         return PTR_ERR(next);
8567                 } else if (!extent_buffer_uptodate(next)) {
8568                         free_extent_buffer(next);
8569                         return -EIO;
8570                 }
8571                 btrfs_tree_lock(next);
8572                 btrfs_set_lock_blocking(next);
8573         }
8574
8575         level--;
8576         BUG_ON(level != btrfs_header_level(next));
8577         path->nodes[level] = next;
8578         path->slots[level] = 0;
8579         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8580         wc->level = level;
8581         if (wc->level == 1)
8582                 wc->reada_slot = 0;
8583         return 0;
8584 skip:
8585         wc->refs[level - 1] = 0;
8586         wc->flags[level - 1] = 0;
8587         if (wc->stage == DROP_REFERENCE) {
8588                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8589                         parent = path->nodes[level]->start;
8590                 } else {
8591                         BUG_ON(root->root_key.objectid !=
8592                                btrfs_header_owner(path->nodes[level]));
8593                         parent = 0;
8594                 }
8595
8596                 if (need_account) {
8597                         ret = account_shared_subtree(trans, root, next,
8598                                                      generation, level - 1);
8599                         if (ret) {
8600                                 btrfs_err_rl(root->fs_info,
8601                                         "Error "
8602                                         "%d accounting shared subtree. Quota "
8603                                         "is out of sync, rescan required.",
8604                                         ret);
8605                         }
8606                 }
8607                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
8608                                 root->root_key.objectid, level - 1, 0);
8609                 BUG_ON(ret); /* -ENOMEM */
8610         }
8611         btrfs_tree_unlock(next);
8612         free_extent_buffer(next);
8613         *lookup_info = 1;
8614         return 1;
8615 }
8616
8617 /*
8618  * helper to process tree block while walking up the tree.
8619  *
8620  * when wc->stage == DROP_REFERENCE, this function drops
8621  * reference count on the block.
8622  *
8623  * when wc->stage == UPDATE_BACKREF, this function changes
8624  * wc->stage back to DROP_REFERENCE if we changed wc->stage
8625  * to UPDATE_BACKREF previously while processing the block.
8626  *
8627  * NOTE: return value 1 means we should stop walking up.
8628  */
8629 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8630                                  struct btrfs_root *root,
8631                                  struct btrfs_path *path,
8632                                  struct walk_control *wc)
8633 {
8634         int ret;
8635         int level = wc->level;
8636         struct extent_buffer *eb = path->nodes[level];
8637         u64 parent = 0;
8638
8639         if (wc->stage == UPDATE_BACKREF) {
8640                 BUG_ON(wc->shared_level < level);
8641                 if (level < wc->shared_level)
8642                         goto out;
8643
8644                 ret = find_next_key(path, level + 1, &wc->update_progress);
8645                 if (ret > 0)
8646                         wc->update_ref = 0;
8647
8648                 wc->stage = DROP_REFERENCE;
8649                 wc->shared_level = -1;
8650                 path->slots[level] = 0;
8651
8652                 /*
8653                  * check reference count again if the block isn't locked.
8654                  * we should start walking down the tree again if reference
8655                  * count is one.
8656                  */
8657                 if (!path->locks[level]) {
8658                         BUG_ON(level == 0);
8659                         btrfs_tree_lock(eb);
8660                         btrfs_set_lock_blocking(eb);
8661                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8662
8663                         ret = btrfs_lookup_extent_info(trans, root,
8664                                                        eb->start, level, 1,
8665                                                        &wc->refs[level],
8666                                                        &wc->flags[level]);
8667                         if (ret < 0) {
8668                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8669                                 path->locks[level] = 0;
8670                                 return ret;
8671                         }
8672                         BUG_ON(wc->refs[level] == 0);
8673                         if (wc->refs[level] == 1) {
8674                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8675                                 path->locks[level] = 0;
8676                                 return 1;
8677                         }
8678                 }
8679         }
8680
8681         /* wc->stage == DROP_REFERENCE */
8682         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
8683
8684         if (wc->refs[level] == 1) {
8685                 if (level == 0) {
8686                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8687                                 ret = btrfs_dec_ref(trans, root, eb, 1);
8688                         else
8689                                 ret = btrfs_dec_ref(trans, root, eb, 0);
8690                         BUG_ON(ret); /* -ENOMEM */
8691                         ret = account_leaf_items(trans, root, eb);
8692                         if (ret) {
8693                                 btrfs_err_rl(root->fs_info,
8694                                         "error "
8695                                         "%d accounting leaf items. Quota "
8696                                         "is out of sync, rescan required.",
8697                                         ret);
8698                         }
8699                 }
8700                 /* make block locked assertion in clean_tree_block happy */
8701                 if (!path->locks[level] &&
8702                     btrfs_header_generation(eb) == trans->transid) {
8703                         btrfs_tree_lock(eb);
8704                         btrfs_set_lock_blocking(eb);
8705                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8706                 }
8707                 clean_tree_block(trans, root->fs_info, eb);
8708         }
8709
8710         if (eb == root->node) {
8711                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8712                         parent = eb->start;
8713                 else
8714                         BUG_ON(root->root_key.objectid !=
8715                                btrfs_header_owner(eb));
8716         } else {
8717                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8718                         parent = path->nodes[level + 1]->start;
8719                 else
8720                         BUG_ON(root->root_key.objectid !=
8721                                btrfs_header_owner(path->nodes[level + 1]));
8722         }
8723
8724         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
8725 out:
8726         wc->refs[level] = 0;
8727         wc->flags[level] = 0;
8728         return 0;
8729 }
8730
8731 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
8732                                    struct btrfs_root *root,
8733                                    struct btrfs_path *path,
8734                                    struct walk_control *wc)
8735 {
8736         int level = wc->level;
8737         int lookup_info = 1;
8738         int ret;
8739
8740         while (level >= 0) {
8741                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
8742                 if (ret > 0)
8743                         break;
8744
8745                 if (level == 0)
8746                         break;
8747
8748                 if (path->slots[level] >=
8749                     btrfs_header_nritems(path->nodes[level]))
8750                         break;
8751
8752                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
8753                 if (ret > 0) {
8754                         path->slots[level]++;
8755                         continue;
8756                 } else if (ret < 0)
8757                         return ret;
8758                 level = wc->level;
8759         }
8760         return 0;
8761 }
8762
8763 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
8764                                  struct btrfs_root *root,
8765                                  struct btrfs_path *path,
8766                                  struct walk_control *wc, int max_level)
8767 {
8768         int level = wc->level;
8769         int ret;
8770
8771         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
8772         while (level < max_level && path->nodes[level]) {
8773                 wc->level = level;
8774                 if (path->slots[level] + 1 <
8775                     btrfs_header_nritems(path->nodes[level])) {
8776                         path->slots[level]++;
8777                         return 0;
8778                 } else {
8779                         ret = walk_up_proc(trans, root, path, wc);
8780                         if (ret > 0)
8781                                 return 0;
8782
8783                         if (path->locks[level]) {
8784                                 btrfs_tree_unlock_rw(path->nodes[level],
8785                                                      path->locks[level]);
8786                                 path->locks[level] = 0;
8787                         }
8788                         free_extent_buffer(path->nodes[level]);
8789                         path->nodes[level] = NULL;
8790                         level++;
8791                 }
8792         }
8793         return 1;
8794 }
8795
8796 /*
8797  * drop a subvolume tree.
8798  *
8799  * this function traverses the tree, freeing any blocks that are only
8800  * referenced by the tree.
8801  *
8802  * when a shared tree block is found, this function decreases its
8803  * reference count by one. if update_ref is true, this function
8804  * also makes sure backrefs for the shared block and all lower level
8805  * blocks are properly updated.
8806  *
8807  * If called with for_reloc == 0, may exit early with -EAGAIN
8808  */
8809 int btrfs_drop_snapshot(struct btrfs_root *root,
8810                          struct btrfs_block_rsv *block_rsv, int update_ref,
8811                          int for_reloc)
8812 {
8813         struct btrfs_path *path;
8814         struct btrfs_trans_handle *trans;
8815         struct btrfs_root *tree_root = root->fs_info->tree_root;
8816         struct btrfs_root_item *root_item = &root->root_item;
8817         struct walk_control *wc;
8818         struct btrfs_key key;
8819         int err = 0;
8820         int ret;
8821         int level;
8822         bool root_dropped = false;
8823
8824         btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
8825
8826         path = btrfs_alloc_path();
8827         if (!path) {
8828                 err = -ENOMEM;
8829                 goto out;
8830         }
8831
8832         wc = kzalloc(sizeof(*wc), GFP_NOFS);
8833         if (!wc) {
8834                 btrfs_free_path(path);
8835                 err = -ENOMEM;
8836                 goto out;
8837         }
8838
8839         trans = btrfs_start_transaction(tree_root, 0);
8840         if (IS_ERR(trans)) {
8841                 err = PTR_ERR(trans);
8842                 goto out_free;
8843         }
8844
8845         if (block_rsv)
8846                 trans->block_rsv = block_rsv;
8847
8848         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
8849                 level = btrfs_header_level(root->node);
8850                 path->nodes[level] = btrfs_lock_root_node(root);
8851                 btrfs_set_lock_blocking(path->nodes[level]);
8852                 path->slots[level] = 0;
8853                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8854                 memset(&wc->update_progress, 0,
8855                        sizeof(wc->update_progress));
8856         } else {
8857                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
8858                 memcpy(&wc->update_progress, &key,
8859                        sizeof(wc->update_progress));
8860
8861                 level = root_item->drop_level;
8862                 BUG_ON(level == 0);
8863                 path->lowest_level = level;
8864                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
8865                 path->lowest_level = 0;
8866                 if (ret < 0) {
8867                         err = ret;
8868                         goto out_end_trans;
8869                 }
8870                 WARN_ON(ret > 0);
8871
8872                 /*
8873                  * unlock our path, this is safe because only this
8874                  * function is allowed to delete this snapshot
8875                  */
8876                 btrfs_unlock_up_safe(path, 0);
8877
8878                 level = btrfs_header_level(root->node);
8879                 while (1) {
8880                         btrfs_tree_lock(path->nodes[level]);
8881                         btrfs_set_lock_blocking(path->nodes[level]);
8882                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8883
8884                         ret = btrfs_lookup_extent_info(trans, root,
8885                                                 path->nodes[level]->start,
8886                                                 level, 1, &wc->refs[level],
8887                                                 &wc->flags[level]);
8888                         if (ret < 0) {
8889                                 err = ret;
8890                                 goto out_end_trans;
8891                         }
8892                         BUG_ON(wc->refs[level] == 0);
8893
8894                         if (level == root_item->drop_level)
8895                                 break;
8896
8897                         btrfs_tree_unlock(path->nodes[level]);
8898                         path->locks[level] = 0;
8899                         WARN_ON(wc->refs[level] != 1);
8900                         level--;
8901                 }
8902         }
8903
8904         wc->level = level;
8905         wc->shared_level = -1;
8906         wc->stage = DROP_REFERENCE;
8907         wc->update_ref = update_ref;
8908         wc->keep_locks = 0;
8909         wc->for_reloc = for_reloc;
8910         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
8911
8912         while (1) {
8913
8914                 ret = walk_down_tree(trans, root, path, wc);
8915                 if (ret < 0) {
8916                         err = ret;
8917                         break;
8918                 }
8919
8920                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
8921                 if (ret < 0) {
8922                         err = ret;
8923                         break;
8924                 }
8925
8926                 if (ret > 0) {
8927                         BUG_ON(wc->stage != DROP_REFERENCE);
8928                         break;
8929                 }
8930
8931                 if (wc->stage == DROP_REFERENCE) {
8932                         level = wc->level;
8933                         btrfs_node_key(path->nodes[level],
8934                                        &root_item->drop_progress,
8935                                        path->slots[level]);
8936                         root_item->drop_level = level;
8937                 }
8938
8939                 BUG_ON(wc->level == 0);
8940                 if (btrfs_should_end_transaction(trans, tree_root) ||
8941                     (!for_reloc && btrfs_need_cleaner_sleep(root))) {
8942                         ret = btrfs_update_root(trans, tree_root,
8943                                                 &root->root_key,
8944                                                 root_item);
8945                         if (ret) {
8946                                 btrfs_abort_transaction(trans, tree_root, ret);
8947                                 err = ret;
8948                                 goto out_end_trans;
8949                         }
8950
8951                         btrfs_end_transaction_throttle(trans, tree_root);
8952                         if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
8953                                 pr_debug("BTRFS: drop snapshot early exit\n");
8954                                 err = -EAGAIN;
8955                                 goto out_free;
8956                         }
8957
8958                         trans = btrfs_start_transaction(tree_root, 0);
8959                         if (IS_ERR(trans)) {
8960                                 err = PTR_ERR(trans);
8961                                 goto out_free;
8962                         }
8963                         if (block_rsv)
8964                                 trans->block_rsv = block_rsv;
8965                 }
8966         }
8967         btrfs_release_path(path);
8968         if (err)
8969                 goto out_end_trans;
8970
8971         ret = btrfs_del_root(trans, tree_root, &root->root_key);
8972         if (ret) {
8973                 btrfs_abort_transaction(trans, tree_root, ret);
8974                 goto out_end_trans;
8975         }
8976
8977         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
8978                 ret = btrfs_find_root(tree_root, &root->root_key, path,
8979                                       NULL, NULL);
8980                 if (ret < 0) {
8981                         btrfs_abort_transaction(trans, tree_root, ret);
8982                         err = ret;
8983                         goto out_end_trans;
8984                 } else if (ret > 0) {
8985                         /* if we fail to delete the orphan item this time
8986                          * around, it'll get picked up the next time.
8987                          *
8988                          * The most common failure here is just -ENOENT.
8989                          */
8990                         btrfs_del_orphan_item(trans, tree_root,
8991                                               root->root_key.objectid);
8992                 }
8993         }
8994
8995         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
8996                 btrfs_add_dropped_root(trans, root);
8997         } else {
8998                 free_extent_buffer(root->node);
8999                 free_extent_buffer(root->commit_root);
9000                 btrfs_put_fs_root(root);
9001         }
9002         root_dropped = true;
9003 out_end_trans:
9004         btrfs_end_transaction_throttle(trans, tree_root);
9005 out_free:
9006         kfree(wc);
9007         btrfs_free_path(path);
9008 out:
9009         /*
9010          * So if we need to stop dropping the snapshot for whatever reason we
9011          * need to make sure to add it back to the dead root list so that we
9012          * keep trying to do the work later.  This also cleans up roots if we
9013          * don't have it in the radix (like when we recover after a power fail
9014          * or unmount) so we don't leak memory.
9015          */
9016         if (!for_reloc && root_dropped == false)
9017                 btrfs_add_dead_root(root);
9018         if (err && err != -EAGAIN)
9019                 btrfs_std_error(root->fs_info, err, NULL);
9020         return err;
9021 }
9022
9023 /*
9024  * drop subtree rooted at tree block 'node'.
9025  *
9026  * NOTE: this function will unlock and release tree block 'node'
9027  * only used by relocation code
9028  */
9029 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9030                         struct btrfs_root *root,
9031                         struct extent_buffer *node,
9032                         struct extent_buffer *parent)
9033 {
9034         struct btrfs_path *path;
9035         struct walk_control *wc;
9036         int level;
9037         int parent_level;
9038         int ret = 0;
9039         int wret;
9040
9041         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9042
9043         path = btrfs_alloc_path();
9044         if (!path)
9045                 return -ENOMEM;
9046
9047         wc = kzalloc(sizeof(*wc), GFP_NOFS);
9048         if (!wc) {
9049                 btrfs_free_path(path);
9050                 return -ENOMEM;
9051         }
9052
9053         btrfs_assert_tree_locked(parent);
9054         parent_level = btrfs_header_level(parent);
9055         extent_buffer_get(parent);
9056         path->nodes[parent_level] = parent;
9057         path->slots[parent_level] = btrfs_header_nritems(parent);
9058
9059         btrfs_assert_tree_locked(node);
9060         level = btrfs_header_level(node);
9061         path->nodes[level] = node;
9062         path->slots[level] = 0;
9063         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9064
9065         wc->refs[parent_level] = 1;
9066         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9067         wc->level = level;
9068         wc->shared_level = -1;
9069         wc->stage = DROP_REFERENCE;
9070         wc->update_ref = 0;
9071         wc->keep_locks = 1;
9072         wc->for_reloc = 1;
9073         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
9074
9075         while (1) {
9076                 wret = walk_down_tree(trans, root, path, wc);
9077                 if (wret < 0) {
9078                         ret = wret;
9079                         break;
9080                 }
9081
9082                 wret = walk_up_tree(trans, root, path, wc, parent_level);
9083                 if (wret < 0)
9084                         ret = wret;
9085                 if (wret != 0)
9086                         break;
9087         }
9088
9089         kfree(wc);
9090         btrfs_free_path(path);
9091         return ret;
9092 }
9093
9094 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
9095 {
9096         u64 num_devices;
9097         u64 stripped;
9098
9099         /*
9100          * if restripe for this chunk_type is on pick target profile and
9101          * return, otherwise do the usual balance
9102          */
9103         stripped = get_restripe_target(root->fs_info, flags);
9104         if (stripped)
9105                 return extended_to_chunk(stripped);
9106
9107         num_devices = root->fs_info->fs_devices->rw_devices;
9108
9109         stripped = BTRFS_BLOCK_GROUP_RAID0 |
9110                 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9111                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9112
9113         if (num_devices == 1) {
9114                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9115                 stripped = flags & ~stripped;
9116
9117                 /* turn raid0 into single device chunks */
9118                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
9119                         return stripped;
9120
9121                 /* turn mirroring into duplication */
9122                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9123                              BTRFS_BLOCK_GROUP_RAID10))
9124                         return stripped | BTRFS_BLOCK_GROUP_DUP;
9125         } else {
9126                 /* they already had raid on here, just return */
9127                 if (flags & stripped)
9128                         return flags;
9129
9130                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9131                 stripped = flags & ~stripped;
9132
9133                 /* switch duplicated blocks with raid1 */
9134                 if (flags & BTRFS_BLOCK_GROUP_DUP)
9135                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
9136
9137                 /* this is drive concat, leave it alone */
9138         }
9139
9140         return flags;
9141 }
9142
9143 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9144 {
9145         struct btrfs_space_info *sinfo = cache->space_info;
9146         u64 num_bytes;
9147         u64 min_allocable_bytes;
9148         int ret = -ENOSPC;
9149
9150         /*
9151          * We need some metadata space and system metadata space for
9152          * allocating chunks in some corner cases until we force to set
9153          * it to be readonly.
9154          */
9155         if ((sinfo->flags &
9156              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9157             !force)
9158                 min_allocable_bytes = 1 * 1024 * 1024;
9159         else
9160                 min_allocable_bytes = 0;
9161
9162         spin_lock(&sinfo->lock);
9163         spin_lock(&cache->lock);
9164
9165         if (cache->ro) {
9166                 cache->ro++;
9167                 ret = 0;
9168                 goto out;
9169         }
9170
9171         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9172                     cache->bytes_super - btrfs_block_group_used(&cache->item);
9173
9174         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
9175             sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
9176             min_allocable_bytes <= sinfo->total_bytes) {
9177                 sinfo->bytes_readonly += num_bytes;
9178                 cache->ro++;
9179                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9180                 ret = 0;
9181         }
9182 out:
9183         spin_unlock(&cache->lock);
9184         spin_unlock(&sinfo->lock);
9185         return ret;
9186 }
9187
9188 int btrfs_inc_block_group_ro(struct btrfs_root *root,
9189                              struct btrfs_block_group_cache *cache)
9190
9191 {
9192         struct btrfs_trans_handle *trans;
9193         u64 alloc_flags;
9194         int ret;
9195
9196 again:
9197         trans = btrfs_join_transaction(root);
9198         if (IS_ERR(trans))
9199                 return PTR_ERR(trans);
9200
9201         /*
9202          * we're not allowed to set block groups readonly after the dirty
9203          * block groups cache has started writing.  If it already started,
9204          * back off and let this transaction commit
9205          */
9206         mutex_lock(&root->fs_info->ro_block_group_mutex);
9207         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9208                 u64 transid = trans->transid;
9209
9210                 mutex_unlock(&root->fs_info->ro_block_group_mutex);
9211                 btrfs_end_transaction(trans, root);
9212
9213                 ret = btrfs_wait_for_commit(root, transid);
9214                 if (ret)
9215                         return ret;
9216                 goto again;
9217         }
9218
9219         /*
9220          * if we are changing raid levels, try to allocate a corresponding
9221          * block group with the new raid level.
9222          */
9223         alloc_flags = update_block_group_flags(root, cache->flags);
9224         if (alloc_flags != cache->flags) {
9225                 ret = do_chunk_alloc(trans, root, alloc_flags,
9226                                      CHUNK_ALLOC_FORCE);
9227                 /*
9228                  * ENOSPC is allowed here, we may have enough space
9229                  * already allocated at the new raid level to
9230                  * carry on
9231                  */
9232                 if (ret == -ENOSPC)
9233                         ret = 0;
9234                 if (ret < 0)
9235                         goto out;
9236         }
9237
9238         ret = inc_block_group_ro(cache, 0);
9239         if (!ret)
9240                 goto out;
9241         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
9242         ret = do_chunk_alloc(trans, root, alloc_flags,
9243                              CHUNK_ALLOC_FORCE);
9244         if (ret < 0)
9245                 goto out;
9246         ret = inc_block_group_ro(cache, 0);
9247 out:
9248         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9249                 alloc_flags = update_block_group_flags(root, cache->flags);
9250                 lock_chunks(root->fs_info->chunk_root);
9251                 check_system_chunk(trans, root, alloc_flags);
9252                 unlock_chunks(root->fs_info->chunk_root);
9253         }
9254         mutex_unlock(&root->fs_info->ro_block_group_mutex);
9255
9256         btrfs_end_transaction(trans, root);
9257         return ret;
9258 }
9259
9260 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
9261                             struct btrfs_root *root, u64 type)
9262 {
9263         u64 alloc_flags = get_alloc_profile(root, type);
9264         return do_chunk_alloc(trans, root, alloc_flags,
9265                               CHUNK_ALLOC_FORCE);
9266 }
9267
9268 /*
9269  * helper to account the unused space of all the readonly block group in the
9270  * space_info. takes mirrors into account.
9271  */
9272 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9273 {
9274         struct btrfs_block_group_cache *block_group;
9275         u64 free_bytes = 0;
9276         int factor;
9277
9278         /* It's df, we don't care if it's racey */
9279         if (list_empty(&sinfo->ro_bgs))
9280                 return 0;
9281
9282         spin_lock(&sinfo->lock);
9283         list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9284                 spin_lock(&block_group->lock);
9285
9286                 if (!block_group->ro) {
9287                         spin_unlock(&block_group->lock);
9288                         continue;
9289                 }
9290
9291                 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
9292                                           BTRFS_BLOCK_GROUP_RAID10 |
9293                                           BTRFS_BLOCK_GROUP_DUP))
9294                         factor = 2;
9295                 else
9296                         factor = 1;
9297
9298                 free_bytes += (block_group->key.offset -
9299                                btrfs_block_group_used(&block_group->item)) *
9300                                factor;
9301
9302                 spin_unlock(&block_group->lock);
9303         }
9304         spin_unlock(&sinfo->lock);
9305
9306         return free_bytes;
9307 }
9308
9309 void btrfs_dec_block_group_ro(struct btrfs_root *root,
9310                               struct btrfs_block_group_cache *cache)
9311 {
9312         struct btrfs_space_info *sinfo = cache->space_info;
9313         u64 num_bytes;
9314
9315         BUG_ON(!cache->ro);
9316
9317         spin_lock(&sinfo->lock);
9318         spin_lock(&cache->lock);
9319         if (!--cache->ro) {
9320                 num_bytes = cache->key.offset - cache->reserved -
9321                             cache->pinned - cache->bytes_super -
9322                             btrfs_block_group_used(&cache->item);
9323                 sinfo->bytes_readonly -= num_bytes;
9324                 list_del_init(&cache->ro_list);
9325         }
9326         spin_unlock(&cache->lock);
9327         spin_unlock(&sinfo->lock);
9328 }
9329
9330 /*
9331  * checks to see if its even possible to relocate this block group.
9332  *
9333  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
9334  * ok to go ahead and try.
9335  */
9336 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
9337 {
9338         struct btrfs_block_group_cache *block_group;
9339         struct btrfs_space_info *space_info;
9340         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
9341         struct btrfs_device *device;
9342         struct btrfs_trans_handle *trans;
9343         u64 min_free;
9344         u64 dev_min = 1;
9345         u64 dev_nr = 0;
9346         u64 target;
9347         int index;
9348         int full = 0;
9349         int ret = 0;
9350
9351         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
9352
9353         /* odd, couldn't find the block group, leave it alone */
9354         if (!block_group)
9355                 return -1;
9356
9357         min_free = btrfs_block_group_used(&block_group->item);
9358
9359         /* no bytes used, we're good */
9360         if (!min_free)
9361                 goto out;
9362
9363         space_info = block_group->space_info;
9364         spin_lock(&space_info->lock);
9365
9366         full = space_info->full;
9367
9368         /*
9369          * if this is the last block group we have in this space, we can't
9370          * relocate it unless we're able to allocate a new chunk below.
9371          *
9372          * Otherwise, we need to make sure we have room in the space to handle
9373          * all of the extents from this block group.  If we can, we're good
9374          */
9375         if ((space_info->total_bytes != block_group->key.offset) &&
9376             (space_info->bytes_used + space_info->bytes_reserved +
9377              space_info->bytes_pinned + space_info->bytes_readonly +
9378              min_free < space_info->total_bytes)) {
9379                 spin_unlock(&space_info->lock);
9380                 goto out;
9381         }
9382         spin_unlock(&space_info->lock);
9383
9384         /*
9385          * ok we don't have enough space, but maybe we have free space on our
9386          * devices to allocate new chunks for relocation, so loop through our
9387          * alloc devices and guess if we have enough space.  if this block
9388          * group is going to be restriped, run checks against the target
9389          * profile instead of the current one.
9390          */
9391         ret = -1;
9392
9393         /*
9394          * index:
9395          *      0: raid10
9396          *      1: raid1
9397          *      2: dup
9398          *      3: raid0
9399          *      4: single
9400          */
9401         target = get_restripe_target(root->fs_info, block_group->flags);
9402         if (target) {
9403                 index = __get_raid_index(extended_to_chunk(target));
9404         } else {
9405                 /*
9406                  * this is just a balance, so if we were marked as full
9407                  * we know there is no space for a new chunk
9408                  */
9409                 if (full)
9410                         goto out;
9411
9412                 index = get_block_group_index(block_group);
9413         }
9414
9415         if (index == BTRFS_RAID_RAID10) {
9416                 dev_min = 4;
9417                 /* Divide by 2 */
9418                 min_free >>= 1;
9419         } else if (index == BTRFS_RAID_RAID1) {
9420                 dev_min = 2;
9421         } else if (index == BTRFS_RAID_DUP) {
9422                 /* Multiply by 2 */
9423                 min_free <<= 1;
9424         } else if (index == BTRFS_RAID_RAID0) {
9425                 dev_min = fs_devices->rw_devices;
9426                 min_free = div64_u64(min_free, dev_min);
9427         }
9428
9429         /* We need to do this so that we can look at pending chunks */
9430         trans = btrfs_join_transaction(root);
9431         if (IS_ERR(trans)) {
9432                 ret = PTR_ERR(trans);
9433                 goto out;
9434         }
9435
9436         mutex_lock(&root->fs_info->chunk_mutex);
9437         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9438                 u64 dev_offset;
9439
9440                 /*
9441                  * check to make sure we can actually find a chunk with enough
9442                  * space to fit our block group in.
9443                  */
9444                 if (device->total_bytes > device->bytes_used + min_free &&
9445                     !device->is_tgtdev_for_dev_replace) {
9446                         ret = find_free_dev_extent(trans, device, min_free,
9447                                                    &dev_offset, NULL);
9448                         if (!ret)
9449                                 dev_nr++;
9450
9451                         if (dev_nr >= dev_min)
9452                                 break;
9453
9454                         ret = -1;
9455                 }
9456         }
9457         mutex_unlock(&root->fs_info->chunk_mutex);
9458         btrfs_end_transaction(trans, root);
9459 out:
9460         btrfs_put_block_group(block_group);
9461         return ret;
9462 }
9463
9464 static int find_first_block_group(struct btrfs_root *root,
9465                 struct btrfs_path *path, struct btrfs_key *key)
9466 {
9467         int ret = 0;
9468         struct btrfs_key found_key;
9469         struct extent_buffer *leaf;
9470         int slot;
9471
9472         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9473         if (ret < 0)
9474                 goto out;
9475
9476         while (1) {
9477                 slot = path->slots[0];
9478                 leaf = path->nodes[0];
9479                 if (slot >= btrfs_header_nritems(leaf)) {
9480                         ret = btrfs_next_leaf(root, path);
9481                         if (ret == 0)
9482                                 continue;
9483                         if (ret < 0)
9484                                 goto out;
9485                         break;
9486                 }
9487                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
9488
9489                 if (found_key.objectid >= key->objectid &&
9490                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9491                         ret = 0;
9492                         goto out;
9493                 }
9494                 path->slots[0]++;
9495         }
9496 out:
9497         return ret;
9498 }
9499
9500 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9501 {
9502         struct btrfs_block_group_cache *block_group;
9503         u64 last = 0;
9504
9505         while (1) {
9506                 struct inode *inode;
9507
9508                 block_group = btrfs_lookup_first_block_group(info, last);
9509                 while (block_group) {
9510                         spin_lock(&block_group->lock);
9511                         if (block_group->iref)
9512                                 break;
9513                         spin_unlock(&block_group->lock);
9514                         block_group = next_block_group(info->tree_root,
9515                                                        block_group);
9516                 }
9517                 if (!block_group) {
9518                         if (last == 0)
9519                                 break;
9520                         last = 0;
9521                         continue;
9522                 }
9523
9524                 inode = block_group->inode;
9525                 block_group->iref = 0;
9526                 block_group->inode = NULL;
9527                 spin_unlock(&block_group->lock);
9528                 iput(inode);
9529                 last = block_group->key.objectid + block_group->key.offset;
9530                 btrfs_put_block_group(block_group);
9531         }
9532 }
9533
9534 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9535 {
9536         struct btrfs_block_group_cache *block_group;
9537         struct btrfs_space_info *space_info;
9538         struct btrfs_caching_control *caching_ctl;
9539         struct rb_node *n;
9540
9541         down_write(&info->commit_root_sem);
9542         while (!list_empty(&info->caching_block_groups)) {
9543                 caching_ctl = list_entry(info->caching_block_groups.next,
9544                                          struct btrfs_caching_control, list);
9545                 list_del(&caching_ctl->list);
9546                 put_caching_control(caching_ctl);
9547         }
9548         up_write(&info->commit_root_sem);
9549
9550         spin_lock(&info->unused_bgs_lock);
9551         while (!list_empty(&info->unused_bgs)) {
9552                 block_group = list_first_entry(&info->unused_bgs,
9553                                                struct btrfs_block_group_cache,
9554                                                bg_list);
9555                 list_del_init(&block_group->bg_list);
9556                 btrfs_put_block_group(block_group);
9557         }
9558         spin_unlock(&info->unused_bgs_lock);
9559
9560         spin_lock(&info->block_group_cache_lock);
9561         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9562                 block_group = rb_entry(n, struct btrfs_block_group_cache,
9563                                        cache_node);
9564                 rb_erase(&block_group->cache_node,
9565                          &info->block_group_cache_tree);
9566                 RB_CLEAR_NODE(&block_group->cache_node);
9567                 spin_unlock(&info->block_group_cache_lock);
9568
9569                 down_write(&block_group->space_info->groups_sem);
9570                 list_del(&block_group->list);
9571                 up_write(&block_group->space_info->groups_sem);
9572
9573                 if (block_group->cached == BTRFS_CACHE_STARTED)
9574                         wait_block_group_cache_done(block_group);
9575
9576                 /*
9577                  * We haven't cached this block group, which means we could
9578                  * possibly have excluded extents on this block group.
9579                  */
9580                 if (block_group->cached == BTRFS_CACHE_NO ||
9581                     block_group->cached == BTRFS_CACHE_ERROR)
9582                         free_excluded_extents(info->extent_root, block_group);
9583
9584                 btrfs_remove_free_space_cache(block_group);
9585                 btrfs_put_block_group(block_group);
9586
9587                 spin_lock(&info->block_group_cache_lock);
9588         }
9589         spin_unlock(&info->block_group_cache_lock);
9590
9591         /* now that all the block groups are freed, go through and
9592          * free all the space_info structs.  This is only called during
9593          * the final stages of unmount, and so we know nobody is
9594          * using them.  We call synchronize_rcu() once before we start,
9595          * just to be on the safe side.
9596          */
9597         synchronize_rcu();
9598
9599         release_global_block_rsv(info);
9600
9601         while (!list_empty(&info->space_info)) {
9602                 int i;
9603
9604                 space_info = list_entry(info->space_info.next,
9605                                         struct btrfs_space_info,
9606                                         list);
9607                 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
9608                         if (WARN_ON(space_info->bytes_pinned > 0 ||
9609                             space_info->bytes_reserved > 0 ||
9610                             space_info->bytes_may_use > 0)) {
9611                                 dump_space_info(space_info, 0, 0);
9612                         }
9613                 }
9614                 list_del(&space_info->list);
9615                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9616                         struct kobject *kobj;
9617                         kobj = space_info->block_group_kobjs[i];
9618                         space_info->block_group_kobjs[i] = NULL;
9619                         if (kobj) {
9620                                 kobject_del(kobj);
9621                                 kobject_put(kobj);
9622                         }
9623                 }
9624                 kobject_del(&space_info->kobj);
9625                 kobject_put(&space_info->kobj);
9626         }
9627         return 0;
9628 }
9629
9630 static void __link_block_group(struct btrfs_space_info *space_info,
9631                                struct btrfs_block_group_cache *cache)
9632 {
9633         int index = get_block_group_index(cache);
9634         bool first = false;
9635
9636         down_write(&space_info->groups_sem);
9637         if (list_empty(&space_info->block_groups[index]))
9638                 first = true;
9639         list_add_tail(&cache->list, &space_info->block_groups[index]);
9640         up_write(&space_info->groups_sem);
9641
9642         if (first) {
9643                 struct raid_kobject *rkobj;
9644                 int ret;
9645
9646                 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9647                 if (!rkobj)
9648                         goto out_err;
9649                 rkobj->raid_type = index;
9650                 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9651                 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9652                                   "%s", get_raid_name(index));
9653                 if (ret) {
9654                         kobject_put(&rkobj->kobj);
9655                         goto out_err;
9656                 }
9657                 space_info->block_group_kobjs[index] = &rkobj->kobj;
9658         }
9659
9660         return;
9661 out_err:
9662         pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
9663 }
9664
/*
 * Allocate an in-memory block group keyed by (@start,
 * BTRFS_BLOCK_GROUP_ITEM_KEY, @size) together with its free_space_ctl, and
 * put every embedded lock, list head and counter into its initial state.
 * The reference count of the new object starts at 1.
 *
 * Returns the new block group, or NULL when either allocation fails.
 */
9665 static struct btrfs_block_group_cache *
9666 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
9667 {
9668         struct btrfs_block_group_cache *bg;
9669
9670         bg = kzalloc(sizeof(*bg), GFP_NOFS);
9671         if (bg == NULL)
9672                 return NULL;
9673
9674         bg->free_space_ctl = kzalloc(sizeof(*bg->free_space_ctl),
9675                                      GFP_NOFS);
9676         if (bg->free_space_ctl == NULL) {
9677                 kfree(bg);
9678                 return NULL;
9679         }
9680
9681         bg->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9682         bg->key.objectid = start;
9683         bg->key.offset = size;
9684
9685         bg->fs_info = root->fs_info;
9686         bg->sectorsize = root->sectorsize;
9687         bg->full_stripe_len = btrfs_full_stripe_len(root,
9688                                                &root->fs_info->mapping_tree,
9689                                                start);
9690         set_free_space_tree_thresholds(bg);
9691
9692         atomic_set(&bg->count, 1);
9693         spin_lock_init(&bg->lock);
9694         init_rwsem(&bg->data_rwsem);
9695         INIT_LIST_HEAD(&bg->list);
9696         INIT_LIST_HEAD(&bg->cluster_list);
9697         INIT_LIST_HEAD(&bg->bg_list);
9698         INIT_LIST_HEAD(&bg->ro_list);
9699         INIT_LIST_HEAD(&bg->dirty_list);
9700         INIT_LIST_HEAD(&bg->io_list);
9701         btrfs_init_free_space_ctl(bg);
9702         atomic_set(&bg->trimming, 0);
9703         mutex_init(&bg->free_space_lock);
9704
9705         return bg;
9706 }
9707
/*
 * Build the in-memory block group caches from the block group items stored
 * in the extent tree (fs_info->extent_root).
 *
 * For every item located by find_first_block_group() a cache is created, the
 * on-disk item is copied into it, the super stripes are excluded, and the
 * cache is added via btrfs_add_block_group_cache(), accounted in its
 * space_info and linked into that space_info with __link_block_group().
 * Afterwards, for space infos whose allocation profile contains RAID10,
 * RAID1, RAID5, RAID6 or DUP, the RAID0 and SINGLE block groups are marked
 * read-only, and the global block reserve is initialised.
 *
 * @root is only used to reach fs_info; it is replaced by the extent root.
 *
 * Returns 0 on success, -ENOMEM if the path or a cache cannot be allocated,
 * or the error returned by one of the helpers.  Block groups that were fully
 * set up before a failure are not torn down here.
 */
9708 int btrfs_read_block_groups(struct btrfs_root *root)
9709 {
9710         struct btrfs_path *path;
9711         int ret;
9712         struct btrfs_block_group_cache *cache;
9713         struct btrfs_fs_info *info = root->fs_info;
9714         struct btrfs_space_info *space_info;
9715         struct btrfs_key key;
9716         struct btrfs_key found_key;
9717         struct extent_buffer *leaf;
9718         int need_clear = 0;
9719         u64 cache_gen;
9720
             /* Every search below runs against the extent tree. */
9721         root = info->extent_root;
9722         key.objectid = 0;
9723         key.offset = 0;
9724         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9725         path = btrfs_alloc_path();
9726         if (!path)
9727                 return -ENOMEM;
9728         path->reada = 1;
9729
             /*
              * The existing space cache is cleared when SPACE_CACHE is enabled
              * but the super block generation differs from the recorded cache
              * generation, or when CLEAR_CACHE was requested.
              */
9730         cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
9731         if (btrfs_test_opt(root, SPACE_CACHE) &&
9732             btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
9733                 need_clear = 1;
9734         if (btrfs_test_opt(root, CLEAR_CACHE))
9735                 need_clear = 1;
9736
9737         while (1) {
                     /* A positive return ends the scan without an error. */
9738                 ret = find_first_block_group(root, path, &key);
9739                 if (ret > 0)
9740                         break;
9741                 if (ret != 0)
9742                         goto error;
9743
9744                 leaf = path->nodes[0];
9745                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
9746
9747                 cache = btrfs_create_block_group_cache(root, found_key.objectid,
9748                                                        found_key.offset);
9749                 if (!cache) {
9750                         ret = -ENOMEM;
9751                         goto error;
9752                 }
9753
9754                 if (need_clear) {
9755                         /*
9756                          * When we mount with old space cache, we need to
9757                          * set BTRFS_DC_CLEAR and set dirty flag.
9758                          *
9759                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
9760                          *    truncate the old free space cache inode and
9761                          *    setup a new one.
9762                          * b) Setting 'dirty flag' makes sure that we flush
9763                          *    the new space cache info onto disk.
9764                          *
                              * NOTE(review): only BTRFS_DC_CLEAR is set in this
                              * function; no dirty flag is set here. Confirm where
                              * (b) is handled.
                              */
9765                         if (btrfs_test_opt(root, SPACE_CACHE))
9766                                 cache->disk_cache_state = BTRFS_DC_CLEAR;
9767                 }
9768
                     /* Copy the on-disk block group item into the cache. */
9769                 read_extent_buffer(leaf, &cache->item,
9770                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
9771                                    sizeof(cache->item));
9772                 cache->flags = btrfs_block_group_flags(&cache->item);
9773
                     /* The next search resumes right after this block group. */
9774                 key.objectid = found_key.objectid + found_key.offset;
9775                 btrfs_release_path(path);
9776
9777                 /*
9778                  * We need to exclude the super stripes now so that the space
9779                  * info has super bytes accounted for, otherwise we'll think
9780                  * we have more space than we actually do.
9781                  */
9782                 ret = exclude_super_stripes(root, cache);
9783                 if (ret) {
9784                         /*
9785                          * We may have excluded something, so call this just in
9786                          * case.
9787                          */
9788                         free_excluded_extents(root, cache);
9789                         btrfs_put_block_group(cache);
9790                         goto error;
9791                 }
9792
9793                 /*
9794                  * check for two cases, either we are full, and therefore
9795                  * don't need to bother with the caching work since we won't
9796                  * find any space, or we are empty, and we can just add all
9797                  * the space in and be done with it.  This saves us a lot of
9798                  * time, particularly in the full case.
9799                  */
9800                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
9801                         cache->last_byte_to_unpin = (u64)-1;
9802                         cache->cached = BTRFS_CACHE_FINISHED;
9803                         free_excluded_extents(root, cache);
9804                 } else if (btrfs_block_group_used(&cache->item) == 0) {
9805                         cache->last_byte_to_unpin = (u64)-1;
9806                         cache->cached = BTRFS_CACHE_FINISHED;
9807                         add_new_free_space(cache, root->fs_info,
9808                                            found_key.objectid,
9809                                            found_key.objectid +
9810                                            found_key.offset);
9811                         free_excluded_extents(root, cache);
9812                 }
9813
9814                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
9815                 if (ret) {
9816                         btrfs_remove_free_space_cache(cache);
9817                         btrfs_put_block_group(cache);
9818                         goto error;
9819                 }
9820
9821                 ret = update_space_info(info, cache->flags, found_key.offset,
9822                                         btrfs_block_group_used(&cache->item),
9823                                         &space_info);
9824                 if (ret) {
                             /*
                              * Take the cache back out of the block group rbtree
                              * before dropping it.
                              */
9825                         btrfs_remove_free_space_cache(cache);
9826                         spin_lock(&info->block_group_cache_lock);
9827                         rb_erase(&cache->cache_node,
9828                                  &info->block_group_cache_tree);
9829                         RB_CLEAR_NODE(&cache->cache_node);
9830                         spin_unlock(&info->block_group_cache_lock);
9831                         btrfs_put_block_group(cache);
9832                         goto error;
9833                 }
9834
                     /* Super stripe bytes are accounted as read-only space. */
9835                 cache->space_info = space_info;
9836                 spin_lock(&cache->space_info->lock);
9837                 cache->space_info->bytes_readonly += cache->bytes_super;
9838                 spin_unlock(&cache->space_info->lock);
9839
9840                 __link_block_group(space_info, cache);
9841
                     /*
                      * Block groups on read-only chunks are marked read-only;
                      * otherwise an empty block group is queued on
                      * info->unused_bgs with an extra reference held for the
                      * list.
                      */
9842                 set_avail_alloc_bits(root->fs_info, cache->flags);
9843                 if (btrfs_chunk_readonly(root, cache->key.objectid)) {
9844                         inc_block_group_ro(cache, 1);
9845                 } else if (btrfs_block_group_used(&cache->item) == 0) {
9846                         spin_lock(&info->unused_bgs_lock);
9847                         /* Should always be true but just in case. */
9848                         if (list_empty(&cache->bg_list)) {
9849                                 btrfs_get_block_group(cache);
9850                                 list_add_tail(&cache->bg_list,
9851                                               &info->unused_bgs);
9852                         }
9853                         spin_unlock(&info->unused_bgs_lock);
9854                 }
9855         }
9856
9857         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
9858                 if (!(get_alloc_profile(root, space_info->flags) &
9859                       (BTRFS_BLOCK_GROUP_RAID10 |
9860                        BTRFS_BLOCK_GROUP_RAID1 |
9861                        BTRFS_BLOCK_GROUP_RAID5 |
9862                        BTRFS_BLOCK_GROUP_RAID6 |
9863                        BTRFS_BLOCK_GROUP_DUP)))
9864                         continue;
9865                 /*
9866                  * avoid allocating from un-mirrored block group if there are
9867                  * mirrored block groups.
9868                  */
9869                 list_for_each_entry(cache,
9870                                 &space_info->block_groups[BTRFS_RAID_RAID0],
9871                                 list)
9872                         inc_block_group_ro(cache, 1);
9873                 list_for_each_entry(cache,
9874                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
9875                                 list)
9876                         inc_block_group_ro(cache, 1);
9877         }
9878
9879         init_global_block_rsv(info);
9880         ret = 0;
9881 error:
             /* Shared exit: the path is freed on success and on failure. */
9882         btrfs_free_path(path);
9883         return ret;
9884 }
9885
/*
 * Walk trans->new_bgs and, for each queued block group, insert its block
 * group item into the extent tree, finish the chunk allocation and add its
 * free space.  Every entry is unlinked from the list, also after an error.
 *
 * trans->can_flush_pending_bgs is forced to false while the list is processed
 * and restored to its previous value afterwards.
 *
 * A failing helper aborts the transaction.  The status of
 * btrfs_insert_item() is overwritten by btrfs_finish_chunk_alloc(), so only
 * a failure of the latter makes the following iterations skip their work.
 */
9886 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
9887                                        struct btrfs_root *root)
9888 {
9889         struct btrfs_block_group_cache *bg, *next_bg;
9890         struct btrfs_root *extent_root = root->fs_info->extent_root;
9891         struct btrfs_block_group_item bg_item;
9892         struct btrfs_key bg_key;
9893         int err = 0;
9894         bool saved_can_flush = trans->can_flush_pending_bgs;
9895
9896         trans->can_flush_pending_bgs = false;
9897         list_for_each_entry_safe(bg, next_bg, &trans->new_bgs, bg_list) {
9898                 if (err)
9899                         goto unlink;
9900
9901                 spin_lock(&bg->lock);
9902                 memcpy(&bg_item, &bg->item, sizeof(bg_item));
9903                 memcpy(&bg_key, &bg->key, sizeof(bg_key));
9904                 spin_unlock(&bg->lock);
9905
9906                 err = btrfs_insert_item(trans, extent_root, &bg_key, &bg_item,
9907                                         sizeof(bg_item));
9908                 if (err)
9909                         btrfs_abort_transaction(trans, extent_root, err);
9910                 err = btrfs_finish_chunk_alloc(trans, extent_root,
9911                                                bg_key.objectid, bg_key.offset);
9912                 if (err)
9913                         btrfs_abort_transaction(trans, extent_root, err);
9914                 add_block_group_free_space(trans, root->fs_info, bg);
9915                 /* the transaction was already aborted if that failed. */
9916 unlink:
9917                 list_del_init(&bg->bg_list);
9918         }
9919         trans->can_flush_pending_bgs = saved_can_flush;
9920 }
9921
/*
 * Create the in-memory block group covering @size bytes starting at
 * @chunk_offset and queue it on trans->new_bgs.
 *
 * @bytes_used, @chunk_objectid and @type are stored in the block group item;
 * @type also becomes the cache's flags.  The new block group is marked as
 * fully cached, gets its free space added (minus the excluded super
 * stripes), is added via btrfs_add_block_group_cache(), accounted in its
 * space_info and linked into it with __link_block_group().
 *
 * Returns 0 on success, -ENOMEM if the cache cannot be allocated, or the
 * error returned by one of the helpers; on failure the cache is released.
 */
9922 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
9923                            struct btrfs_root *root, u64 bytes_used,
9924                            u64 type, u64 chunk_objectid, u64 chunk_offset,
9925                            u64 size)
9926 {
9927         int ret;
9928         struct btrfs_root *extent_root;
9929         struct btrfs_block_group_cache *cache;
9930
9931         extent_root = root->fs_info->extent_root;
9932
9933         btrfs_set_log_full_commit(root->fs_info, trans);
9934
9935         cache = btrfs_create_block_group_cache(root, chunk_offset, size);
9936         if (!cache)
9937                 return -ENOMEM;
9938
9939         btrfs_set_block_group_used(&cache->item, bytes_used);
9940         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
9941         btrfs_set_block_group_flags(&cache->item, type);
9942
             /* Mark the block group as fully cached from the start. */
9943         cache->flags = type;
9944         cache->last_byte_to_unpin = (u64)-1;
9945         cache->cached = BTRFS_CACHE_FINISHED;
9946         cache->needs_free_space = 1;
9947         ret = exclude_super_stripes(root, cache);
9948         if (ret) {
9949                 /*
9950                  * We may have excluded something, so call this just in
9951                  * case.
9952                  */
9953                 free_excluded_extents(root, cache);
9954                 btrfs_put_block_group(cache);
9955                 return ret;
9956         }
9957
9958         add_new_free_space(cache, root->fs_info, chunk_offset,
9959                            chunk_offset + size);
9960
9961         free_excluded_extents(root, cache);
9962
9963 #ifdef CONFIG_BTRFS_DEBUG
             /*
              * Debug-only: when fragmentation is requested for this block
              * group, count half of the currently unused bytes as used before
              * fragmenting its free space.
              */
9964         if (btrfs_should_fragment_free_space(root, cache)) {
9965                 u64 new_bytes_used = size - bytes_used;
9966
9967                 bytes_used += new_bytes_used >> 1;
9968                 fragment_free_space(root, cache);
9969         }
9970 #endif
9971         /*
9972          * Call to ensure the corresponding space_info object is created and
9973          * assigned to our block group, but don't update its counters just yet.
9974          * We want our bg to be added to the rbtree with its ->space_info set.
9975          */
9976         ret = update_space_info(root->fs_info, cache->flags, 0, 0,
9977                                 &cache->space_info);
9978         if (ret) {
9979                 btrfs_remove_free_space_cache(cache);
9980                 btrfs_put_block_group(cache);
9981                 return ret;
9982         }
9983
9984         ret = btrfs_add_block_group_cache(root->fs_info, cache);
9985         if (ret) {
9986                 btrfs_remove_free_space_cache(cache);
9987                 btrfs_put_block_group(cache);
9988                 return ret;
9989         }
9990
9991         /*
9992          * Now that our block group has its ->space_info set and is inserted in
9993          * the rbtree, update the space info's counters.
9994          */
9995         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
9996                                 &cache->space_info);
9997         if (ret) {
                     /* Take the cache back out of the rbtree before dropping it. */
9998                 btrfs_remove_free_space_cache(cache);
9999                 spin_lock(&root->fs_info->block_group_cache_lock);
10000                 rb_erase(&cache->cache_node,
10001                          &root->fs_info->block_group_cache_tree);
10002                 RB_CLEAR_NODE(&cache->cache_node);
10003                 spin_unlock(&root->fs_info->block_group_cache_lock);
10004                 btrfs_put_block_group(cache);
10005                 return ret;
10006         }
10007         update_global_block_rsv(root->fs_info);
10008
              /* Super stripe bytes are accounted as read-only space. */
10009         spin_lock(&cache->space_info->lock);
10010         cache->space_info->bytes_readonly += cache->bytes_super;
10011         spin_unlock(&cache->space_info->lock);
10012
10013         __link_block_group(cache->space_info, cache);
10014
              /* Queue the new block group on the transaction's new_bgs list. */
10015         list_add_tail(&cache->bg_list, &trans->new_bgs);
10016
10017         set_avail_alloc_bits(extent_root->fs_info, type);
10018
10019         return 0;
10020 }
10021
/*
 * Clear the extended profile bits derived from @flags in each of
 * fs_info's avail_{data,metadata,system}_alloc_bits masks whose block group
 * type bit is set in @flags.  The masks are modified with the profiles_lock
 * seqlock held for writing.
 */
10022 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10023 {
10024         const u64 keep_mask = ~(chunk_to_extended(flags) &
10025                                 BTRFS_EXTENDED_PROFILE_MASK);
10026
10027         write_seqlock(&fs_info->profiles_lock);
10028         if (flags & BTRFS_BLOCK_GROUP_DATA)
10029                 fs_info->avail_data_alloc_bits &= keep_mask;
10030         if (flags & BTRFS_BLOCK_GROUP_METADATA)
10031                 fs_info->avail_metadata_alloc_bits &= keep_mask;
10032         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10033                 fs_info->avail_system_alloc_bits &= keep_mask;
10034         write_sequnlock(&fs_info->profiles_lock);
10035 }
10036
/*
 * Remove the block group starting at @group_start: detach it from the
 * in-memory structures (allocation clusters, dirty/io lists, block group
 * rbtree, space_info lists and counters, caching control), orphan its free
 * space inode, and delete its free space item from the tree of tree roots
 * and its block group item from the extent tree.
 *
 * The block group must exist and be read-only; both are enforced with
 * BUG_ON().  @em is the extent map of the chunk: it is removed from the
 * mapping tree, unless the block group is being trimmed, in which case it is
 * moved to fs_info->pinned_chunks instead.
 *
 * Returns 0 on success or a negative errno.  The reference taken by
 * btrfs_lookup_block_group() is dropped on every exit path; the reference
 * held for the rbtree is dropped once remove_block_group_free_space() has
 * succeeded.
 */
10037 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10038                              struct btrfs_root *root, u64 group_start,
10039                              struct extent_map *em)
10040 {
10041         struct btrfs_path *path;
10042         struct btrfs_block_group_cache *block_group;
10043         struct btrfs_free_cluster *cluster;
10044         struct btrfs_root *tree_root = root->fs_info->tree_root;
10045         struct btrfs_key key;
10046         struct inode *inode;
10047         struct kobject *kobj = NULL;
10048         int ret;
10049         int index;
10050         int factor;
10051         struct btrfs_caching_control *caching_ctl = NULL;
10052         bool remove_em;
10053
10054         root = root->fs_info->extent_root;
10055
10056         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
10057         BUG_ON(!block_group);
10058         BUG_ON(!block_group->ro);
10059
10060         /*
10061          * Free the reserved super bytes from this block group before
10062          * remove it.
10063          */
10064         free_excluded_extents(root, block_group);
10065
10066         memcpy(&key, &block_group->key, sizeof(key));
10067         index = get_block_group_index(block_group);
              /* DUP, RAID1 and RAID10 account twice the size in disk_total. */
10068         if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
10069                                   BTRFS_BLOCK_GROUP_RAID1 |
10070                                   BTRFS_BLOCK_GROUP_RAID10))
10071                 factor = 2;
10072         else
10073                 factor = 1;
10074
10075         /* make sure this block group isn't part of an allocation cluster */
10076         cluster = &root->fs_info->data_alloc_cluster;
10077         spin_lock(&cluster->refill_lock);
10078         btrfs_return_cluster_to_free_space(block_group, cluster);
10079         spin_unlock(&cluster->refill_lock);
10080
10081         /*
10082          * make sure this block group isn't part of a metadata
10083          * allocation cluster
10084          */
10085         cluster = &root->fs_info->meta_alloc_cluster;
10086         spin_lock(&cluster->refill_lock);
10087         btrfs_return_cluster_to_free_space(block_group, cluster);
10088         spin_unlock(&cluster->refill_lock);
10089
10090         path = btrfs_alloc_path();
10091         if (!path) {
10092                 ret = -ENOMEM;
10093                 goto out_put_group;
10094         }
10095
10096         /*
10097          * get the inode first so any iput calls done for the io_list
10098          * aren't the final iput (no unlinks allowed now)
10099          */
10100         inode = lookup_free_space_inode(tree_root, block_group, path);
10101
10102         mutex_lock(&trans->transaction->cache_write_mutex);
10103         /*
10104          * make sure our free space cache IO is done before remove the
10105          * free space inode
10106          */
10107         spin_lock(&trans->transaction->dirty_bgs_lock);
10108         if (!list_empty(&block_group->io_list)) {
10109                 list_del_init(&block_group->io_list);
10110
10111                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10112
                      /* The spinlock is released while waiting for the cache IO. */
10113                 spin_unlock(&trans->transaction->dirty_bgs_lock);
10114                 btrfs_wait_cache_io(root, trans, block_group,
10115                                     &block_group->io_ctl, path,
10116                                     block_group->key.objectid);
10117                 btrfs_put_block_group(block_group);
10118                 spin_lock(&trans->transaction->dirty_bgs_lock);
10119         }
10120
10121         if (!list_empty(&block_group->dirty_list)) {
10122                 list_del_init(&block_group->dirty_list);
10123                 btrfs_put_block_group(block_group);
10124         }
10125         spin_unlock(&trans->transaction->dirty_bgs_lock);
10126         mutex_unlock(&trans->transaction->cache_write_mutex);
10127
10128         if (!IS_ERR(inode)) {
10129                 ret = btrfs_orphan_add(trans, inode);
10130                 if (ret) {
10131                         btrfs_add_delayed_iput(inode);
10132                         goto out_put_group;
10133                 }
10134                 clear_nlink(inode);
10135                 /* One for the block groups ref */
10136                 spin_lock(&block_group->lock);
10137                 if (block_group->iref) {
10138                         block_group->iref = 0;
10139                         block_group->inode = NULL;
10140                         spin_unlock(&block_group->lock);
10141                         iput(inode);
10142                 } else {
10143                         spin_unlock(&block_group->lock);
10144                 }
10145                 /* One for our lookup ref */
10146                 btrfs_add_delayed_iput(inode);
10147         }
10148
              /* Delete the free space item, if there is one, from the tree root. */
10149         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10150         key.offset = block_group->key.objectid;
10151         key.type = 0;
10152
10153         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10154         if (ret < 0)
10155                 goto out_put_group;
10156         if (ret > 0)
10157                 btrfs_release_path(path);
10158         if (ret == 0) {
10159                 ret = btrfs_del_item(trans, tree_root, path);
10160                 if (ret)
10161                         goto out_put_group;
10162                 btrfs_release_path(path);
10163         }
10164
10165         spin_lock(&root->fs_info->block_group_cache_lock);
10166         rb_erase(&block_group->cache_node,
10167                  &root->fs_info->block_group_cache_tree);
10168         RB_CLEAR_NODE(&block_group->cache_node);
10169
10170         if (root->fs_info->first_logical_byte == block_group->key.objectid)
10171                 root->fs_info->first_logical_byte = (u64)-1;
10172         spin_unlock(&root->fs_info->block_group_cache_lock);
10173
10174         down_write(&block_group->space_info->groups_sem);
10175         /*
10176          * we must use list_del_init so people can check to see if they
10177          * are still on the list after taking the semaphore
10178          */
10179         list_del_init(&block_group->list);
10180         if (list_empty(&block_group->space_info->block_groups[index])) {
10181                 kobj = block_group->space_info->block_group_kobjs[index];
10182                 block_group->space_info->block_group_kobjs[index] = NULL;
10183                 clear_avail_alloc_bits(root->fs_info, block_group->flags);
10184         }
10185         up_write(&block_group->space_info->groups_sem);
              /* The kobject is torn down after groups_sem has been released. */
10186         if (kobj) {
10187                 kobject_del(kobj);
10188                 kobject_put(kobj);
10189         }
10190
10191         if (block_group->has_caching_ctl)
10192                 caching_ctl = get_caching_control(block_group);
10193         if (block_group->cached == BTRFS_CACHE_STARTED)
10194                 wait_block_group_cache_done(block_group);
10195         if (block_group->has_caching_ctl) {
10196                 down_write(&root->fs_info->commit_root_sem);
10197                 if (!caching_ctl) {
10198                         struct btrfs_caching_control *ctl;
10199
10200                         list_for_each_entry(ctl,
10201                                     &root->fs_info->caching_block_groups, list)
10202                                 if (ctl->block_group == block_group) {
10203                                         caching_ctl = ctl;
10204                                         atomic_inc(&caching_ctl->count);
10205                                         break;
10206                                 }
10207                 }
10208                 if (caching_ctl)
10209                         list_del_init(&caching_ctl->list);
10210                 up_write(&root->fs_info->commit_root_sem);
10211                 if (caching_ctl) {
10212                         /* Once for the caching bgs list and once for us. */
10213                         put_caching_control(caching_ctl);
10214                         put_caching_control(caching_ctl);
10215                 }
10216         }
10217
10218         spin_lock(&trans->transaction->dirty_bgs_lock);
10219         if (!list_empty(&block_group->dirty_list)) {
10220                 WARN_ON(1);
10221         }
10222         if (!list_empty(&block_group->io_list)) {
10223                 WARN_ON(1);
10224         }
10225         spin_unlock(&trans->transaction->dirty_bgs_lock);
10226         btrfs_remove_free_space_cache(block_group);
10227
10228         spin_lock(&block_group->space_info->lock);
10229         list_del_init(&block_group->ro_list);
10230
10231         if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
10232                 WARN_ON(block_group->space_info->total_bytes
10233                         < block_group->key.offset);
10234                 WARN_ON(block_group->space_info->bytes_readonly
10235                         < block_group->key.offset);
10236                 WARN_ON(block_group->space_info->disk_total
10237                         < block_group->key.offset * factor);
10238         }
10239         block_group->space_info->total_bytes -= block_group->key.offset;
10240         block_group->space_info->bytes_readonly -= block_group->key.offset;
10241         block_group->space_info->disk_total -= block_group->key.offset * factor;
10242
10243         spin_unlock(&block_group->space_info->lock);
10244
              /* Restore the block group's own key; it was reused for the free space item. */
10245         memcpy(&key, &block_group->key, sizeof(key));
10246
10247         lock_chunks(root);
10248         if (!list_empty(&em->list)) {
10249                 /* We're in the transaction->pending_chunks list. */
10250                 free_extent_map(em);
10251         }
10252         spin_lock(&block_group->lock);
10253         block_group->removed = 1;
10254         /*
10255          * At this point trimming can't start on this block group, because we
10256          * removed the block group from the tree fs_info->block_group_cache_tree
10257          * so no one can't find it anymore and even if someone already got this
10258          * block group before we removed it from the rbtree, they have already
10259          * incremented block_group->trimming - if they didn't, they won't find
10260          * any free space entries because we already removed them all when we
10261          * called btrfs_remove_free_space_cache().
10262          *
10263          * And we must not remove the extent map from the fs_info->mapping_tree
10264          * to prevent the same logical address range and physical device space
10265          * ranges from being reused for a new block group. This is because our
10266          * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10267          * completely transactionless, so while it is trimming a range the
10268          * currently running transaction might finish and a new one start,
10269          * allowing for new block groups to be created that can reuse the same
10270          * physical device locations unless we take this special care.
10271          *
10272          * There may also be an implicit trim operation if the file system
10273          * is mounted with -odiscard. The same protections must remain
10274          * in place until the extents have been discarded completely when
10275          * the transaction commit has completed.
10276          */
10277         remove_em = (atomic_read(&block_group->trimming) == 0);
10278         /*
10279          * Make sure a trimmer task always sees the em in the pinned_chunks list
10280          * if it sees block_group->removed == 1 (needs to lock block_group->lock
10281          * before checking block_group->removed).
10282          */
10283         if (!remove_em) {
10284                 /*
10285                  * Our em might be in trans->transaction->pending_chunks which
10286                  * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10287                  * and so is the fs_info->pinned_chunks list.
10288                  *
10289                  * So at this point we must be holding the chunk_mutex to avoid
10290                  * any races with chunk allocation (more specifically at
10291                  * volumes.c:contains_pending_extent()), to ensure it always
10292                  * sees the em, either in the pending_chunks list or in the
10293                  * pinned_chunks list.
10294                  */
10295                 list_move_tail(&em->list, &root->fs_info->pinned_chunks);
10296         }
10297         spin_unlock(&block_group->lock);
10298
10299         if (remove_em) {
10300                 struct extent_map_tree *em_tree;
10301
10302                 em_tree = &root->fs_info->mapping_tree.map_tree;
10303                 write_lock(&em_tree->lock);
10304                 /*
10305                  * The em might be in the pending_chunks list, so make sure the
10306                  * chunk mutex is locked, since remove_extent_mapping() will
10307                  * delete us from that list.
10308                  */
10309                 remove_extent_mapping(em_tree, em);
10310                 write_unlock(&em_tree->lock);
10311                 /* once for the tree */
10312                 free_extent_map(em);
10313         }
10314
10315         unlock_chunks(root);
10316
10317         ret = remove_block_group_free_space(trans, root->fs_info, block_group);
10318         if (ret)
10319                 goto out_put_group;
10320
10321         /* Once for the block groups rbtree */
10322         btrfs_put_block_group(block_group);
10323
              /* Finally delete the block group item itself from the extent tree. */
10324         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10325         if (ret > 0)
10326                 ret = -EIO;
10327         if (ret < 0)
10328                 goto out_put_group;
10329
10330         ret = btrfs_del_item(trans, root, path);
10331 out_put_group:
              /*
               * Once for the lookup reference taken at the top.  Dropping it
               * here, on the common exit path, keeps the error paths above
               * from leaking it.
               */
              btrfs_put_block_group(block_group);
10332         btrfs_free_path(path);
10333         return ret;
10334 }
10335
/*
 * Start a transaction on the extent root with enough metadata units
 * reserved to remove the block group whose chunk starts at @chunk_offset.
 * The amount is 3 plus the number of stripes of the chunk's map.
 *
 * Returns the result of btrfs_start_transaction_fallback_global_rsv().
 */
10336 struct btrfs_trans_handle *
10337 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10338                                      const u64 chunk_offset)
10339 {
10340         struct extent_map_tree *chunk_maps = &fs_info->mapping_tree.map_tree;
10341         struct map_lookup *stripes;
10342         struct extent_map *em;
10343         unsigned int units;
10344
10345         read_lock(&chunk_maps->lock);
10346         em = lookup_extent_mapping(chunk_maps, chunk_offset, 1);
10347         read_unlock(&chunk_maps->lock);
10348         ASSERT(em && em->start == chunk_offset);
10349
10350         /*
10351          * Removing a block group (btrfs_remove_chunk() together with
10352          * btrfs_remove_block_group()) consumes 3 + N units of the metadata
10353          * space info:
10354          *
10355          * - 1 unit to add the orphan item of the free space inode (tree of
10356          *   tree roots);
10357          * - 1 unit to delete the block group item (extent tree);
10358          * - 1 unit to delete the free space item (tree of tree roots);
10359          * - N units to delete the N device extent items, one for each
10360          *   stripe (device tree).
10361          *
10362          * The chunk tree has to be updated too (one or more device items
10363          * are updated and one chunk item is removed).  That takes units
10364          * from the system space info, which are not reserved here:
10365          * btrfs_remove_chunk() reserves them through a call to
10366          * check_system_chunk().
10367          *
10368          */
10369         stripes = (struct map_lookup *)em->bdev;
10370         units = 3 + stripes->num_stripes;
10371         free_extent_map(em);
10372
10373         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10374                                                            units, 1);
10375 }
10376
10377 /*
10378  * Process the unused_bgs list and remove any that don't have any allocated
10379  * space inside of them.
10380  */
10381 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10382 {
10383         struct btrfs_block_group_cache *block_group;
10384         struct btrfs_space_info *space_info;
10385         struct btrfs_root *root = fs_info->extent_root;
10386         struct btrfs_trans_handle *trans;
10387         int ret = 0;
10388
10389         if (!fs_info->open)
10390                 return;
10391
10392         spin_lock(&fs_info->unused_bgs_lock);
10393         while (!list_empty(&fs_info->unused_bgs)) {
10394                 u64 start, end;
10395                 int trimming;
10396
10397                 block_group = list_first_entry(&fs_info->unused_bgs,
10398                                                struct btrfs_block_group_cache,
10399                                                bg_list);
10400                 list_del_init(&block_group->bg_list);
10401
10402                 space_info = block_group->space_info;
10403
10404                 if (ret || btrfs_mixed_space_info(space_info)) {
10405                         btrfs_put_block_group(block_group);
10406                         continue;
10407                 }
10408                 spin_unlock(&fs_info->unused_bgs_lock);
10409
10410                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
10411
10412                 /* Don't want to race with allocators so take the groups_sem */
10413                 down_write(&space_info->groups_sem);
10414                 spin_lock(&block_group->lock);
10415                 if (block_group->reserved ||
10416                     btrfs_block_group_used(&block_group->item) ||
10417                     block_group->ro ||
10418                     list_is_singular(&block_group->list)) {
10419                         /*
10420                          * We want to bail if we made new allocations or have
10421                          * outstanding allocations in this block group.  We do
10422                          * the ro check in case balance is currently acting on
10423                          * this block group.
10424                          */
10425                         spin_unlock(&block_group->lock);
10426                         up_write(&space_info->groups_sem);
10427                         goto next;
10428                 }
10429                 spin_unlock(&block_group->lock);
10430
10431                 /* We don't want to force the issue, only flip if it's ok. */
10432                 ret = inc_block_group_ro(block_group, 0);
10433                 up_write(&space_info->groups_sem);
10434                 if (ret < 0) {
10435                         ret = 0;
10436                         goto next;
10437                 }
10438
10439                 /*
10440                  * Want to do this before we do anything else so we can recover
10441                  * properly if we fail to join the transaction.
10442                  */
10443                 trans = btrfs_start_trans_remove_block_group(fs_info,
10444                                                      block_group->key.objectid);
10445                 if (IS_ERR(trans)) {
10446                         btrfs_dec_block_group_ro(root, block_group);
10447                         ret = PTR_ERR(trans);
10448                         goto next;
10449                 }
10450
10451                 /*
10452                  * We could have pending pinned extents for this block group,
10453                  * just delete them, we don't care about them anymore.
10454                  */
10455                 start = block_group->key.objectid;
10456                 end = start + block_group->key.offset - 1;
10457                 /*
10458                  * Hold the unused_bg_unpin_mutex lock to avoid racing with
10459                  * btrfs_finish_extent_commit(). If we are at transaction N,
10460                  * another task might be running finish_extent_commit() for the
10461                  * previous transaction N - 1, and have seen a range belonging
10462                  * to the block group in freed_extents[] before we were able to
10463                  * clear the whole block group range from freed_extents[]. This
10464                  * means that task can lookup for the block group after we
10465                  * unpinned it from freed_extents[] and removed it, leading to
10466                  * a BUG_ON() at btrfs_unpin_extent_range().
10467                  */
10468                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
10469                 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10470                                   EXTENT_DIRTY, GFP_NOFS);
10471                 if (ret) {
10472                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10473                         btrfs_dec_block_group_ro(root, block_group);
10474                         goto end_trans;
10475                 }
10476                 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10477                                   EXTENT_DIRTY, GFP_NOFS);
10478                 if (ret) {
10479                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10480                         btrfs_dec_block_group_ro(root, block_group);
10481                         goto end_trans;
10482                 }
10483                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10484
10485                 /* Reset pinned so btrfs_put_block_group doesn't complain */
10486                 spin_lock(&space_info->lock);
10487                 spin_lock(&block_group->lock);
10488
10489                 space_info->bytes_pinned -= block_group->pinned;
10490                 space_info->bytes_readonly += block_group->pinned;
10491                 percpu_counter_add(&space_info->total_bytes_pinned,
10492                                    -block_group->pinned);
10493                 block_group->pinned = 0;
10494
10495                 spin_unlock(&block_group->lock);
10496                 spin_unlock(&space_info->lock);
10497
10498                 /* DISCARD can flip during remount */
10499                 trimming = btrfs_test_opt(root, DISCARD);
10500
10501                 /* Implicit trim during transaction commit. */
10502                 if (trimming)
10503                         btrfs_get_block_group_trimming(block_group);
10504
10505                 /*
10506                  * Btrfs_remove_chunk will abort the transaction if things go
10507                  * horribly wrong.
10508                  */
10509                 ret = btrfs_remove_chunk(trans, root,
10510                                          block_group->key.objectid);
10511
10512                 if (ret) {
10513                         if (trimming)
10514                                 btrfs_put_block_group_trimming(block_group);
10515                         goto end_trans;
10516                 }
10517
10518                 /*
10519                  * If we're not mounted with -odiscard, we can just forget
10520                  * about this block group. Otherwise we'll need to wait
10521                  * until transaction commit to do the actual discard.
10522                  */
10523                 if (trimming) {
10524                         WARN_ON(!list_empty(&block_group->bg_list));
10525                         spin_lock(&trans->transaction->deleted_bgs_lock);
10526                         list_move(&block_group->bg_list,
10527                                   &trans->transaction->deleted_bgs);
10528                         spin_unlock(&trans->transaction->deleted_bgs_lock);
10529                         btrfs_get_block_group(block_group);
10530                 }
10531 end_trans:
10532                 btrfs_end_transaction(trans, root);
10533 next:
10534                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10535                 btrfs_put_block_group(block_group);
10536                 spin_lock(&fs_info->unused_bgs_lock);
10537         }
10538         spin_unlock(&fs_info->unused_bgs_lock);
10539 }
10540
10541 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10542 {
10543         struct btrfs_space_info *space_info;
10544         struct btrfs_super_block *disk_super;
10545         u64 features;
10546         u64 flags;
10547         int mixed = 0;
10548         int ret;
10549
10550         disk_super = fs_info->super_copy;
10551         if (!btrfs_super_root(disk_super))
10552                 return 1;
10553
10554         features = btrfs_super_incompat_flags(disk_super);
10555         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10556                 mixed = 1;
10557
10558         flags = BTRFS_BLOCK_GROUP_SYSTEM;
10559         ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10560         if (ret)
10561                 goto out;
10562
10563         if (mixed) {
10564                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10565                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10566         } else {
10567                 flags = BTRFS_BLOCK_GROUP_METADATA;
10568                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10569                 if (ret)
10570                         goto out;
10571
10572                 flags = BTRFS_BLOCK_GROUP_DATA;
10573                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10574         }
10575 out:
10576         return ret;
10577 }
10578
10579 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
10580 {
10581         return unpin_extent_range(root, start, end, false);
10582 }
10583
10584 /*
10585  * It used to be that old block groups would be left around forever.
10586  * Iterating over them would be enough to trim unused space.  Since we
10587  * now automatically remove them, we also need to iterate over unallocated
10588  * space.
10589  *
10590  * We don't want a transaction for this since the discard may take a
10591  * substantial amount of time.  We don't require that a transaction be
10592  * running, but we do need to take a running transaction into account
10593  * to ensure that we're not discarding chunks that were released in
10594  * the current transaction.
10595  *
10596  * Holding the chunks lock will prevent other threads from allocating
10597  * or releasing chunks, but it won't prevent a running transaction
10598  * from committing and releasing the memory that the pending chunks
10599  * list head uses.  For that, we need to take a reference to the
10600  * transaction.
10601  */
10602 static int btrfs_trim_free_extents(struct btrfs_device *device,
10603                                    u64 minlen, u64 *trimmed)
10604 {
10605         u64 start = 0, len = 0;
10606         int ret;
10607
10608         *trimmed = 0;
10609
10610         /* Not writeable = nothing to do. */
10611         if (!device->writeable)
10612                 return 0;
10613
10614         /* No free space = nothing to do. */
10615         if (device->total_bytes <= device->bytes_used)
10616                 return 0;
10617
10618         ret = 0;
10619
10620         while (1) {
10621                 struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
10622                 struct btrfs_transaction *trans;
10623                 u64 bytes;
10624
10625                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
10626                 if (ret)
10627                         return ret;
10628
10629                 down_read(&fs_info->commit_root_sem);
10630
10631                 spin_lock(&fs_info->trans_lock);
10632                 trans = fs_info->running_transaction;
10633                 if (trans)
10634                         atomic_inc(&trans->use_count);
10635                 spin_unlock(&fs_info->trans_lock);
10636
10637                 ret = find_free_dev_extent_start(trans, device, minlen, start,
10638                                                  &start, &len);
10639                 if (trans)
10640                         btrfs_put_transaction(trans);
10641
10642                 if (ret) {
10643                         up_read(&fs_info->commit_root_sem);
10644                         mutex_unlock(&fs_info->chunk_mutex);
10645                         if (ret == -ENOSPC)
10646                                 ret = 0;
10647                         break;
10648                 }
10649
10650                 ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
10651                 up_read(&fs_info->commit_root_sem);
10652                 mutex_unlock(&fs_info->chunk_mutex);
10653
10654                 if (ret)
10655                         break;
10656
10657                 start += len;
10658                 *trimmed += bytes;
10659
10660                 if (fatal_signal_pending(current)) {
10661                         ret = -ERESTARTSYS;
10662                         break;
10663                 }
10664
10665                 cond_resched();
10666         }
10667
10668         return ret;
10669 }
10670
10671 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
10672 {
10673         struct btrfs_fs_info *fs_info = root->fs_info;
10674         struct btrfs_block_group_cache *cache = NULL;
10675         struct btrfs_device *device;
10676         struct list_head *devices;
10677         u64 group_trimmed;
10678         u64 start;
10679         u64 end;
10680         u64 trimmed = 0;
10681         u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
10682         int ret = 0;
10683
10684         /*
10685          * try to trim all FS space, our block group may start from non-zero.
10686          */
10687         if (range->len == total_bytes)
10688                 cache = btrfs_lookup_first_block_group(fs_info, range->start);
10689         else
10690                 cache = btrfs_lookup_block_group(fs_info, range->start);
10691
10692         while (cache) {
10693                 if (cache->key.objectid >= (range->start + range->len)) {
10694                         btrfs_put_block_group(cache);
10695                         break;
10696                 }
10697
10698                 start = max(range->start, cache->key.objectid);
10699                 end = min(range->start + range->len,
10700                                 cache->key.objectid + cache->key.offset);
10701
10702                 if (end - start >= range->minlen) {
10703                         if (!block_group_cache_done(cache)) {
10704                                 ret = cache_block_group(cache, 0);
10705                                 if (ret) {
10706                                         btrfs_put_block_group(cache);
10707                                         break;
10708                                 }
10709                                 ret = wait_block_group_cache_done(cache);
10710                                 if (ret) {
10711                                         btrfs_put_block_group(cache);
10712                                         break;
10713                                 }
10714                         }
10715                         ret = btrfs_trim_block_group(cache,
10716                                                      &group_trimmed,
10717                                                      start,
10718                                                      end,
10719                                                      range->minlen);
10720
10721                         trimmed += group_trimmed;
10722                         if (ret) {
10723                                 btrfs_put_block_group(cache);
10724                                 break;
10725                         }
10726                 }
10727
10728                 cache = next_block_group(fs_info->tree_root, cache);
10729         }
10730
10731         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
10732         devices = &root->fs_info->fs_devices->alloc_list;
10733         list_for_each_entry(device, devices, dev_alloc_list) {
10734                 ret = btrfs_trim_free_extents(device, range->minlen,
10735                                               &group_trimmed);
10736                 if (ret)
10737                         break;
10738
10739                 trimmed += group_trimmed;
10740         }
10741         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
10742
10743         range->len = trimmed;
10744         return ret;
10745 }
10746
10747 /*
10748  * btrfs_{start,end}_write_no_snapshoting() are similar to
10749  * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
10750  * data into the page cache through nocow before the subvolume is snapshoted,
10751  * but flush the data into disk after the snapshot creation, or to prevent
10752  * operations while snapshoting is ongoing and that cause the snapshot to be
10753  * inconsistent (writes followed by expanding truncates for example).
10754  */
10755 void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
10756 {
10757         percpu_counter_dec(&root->subv_writers->counter);
10758         /*
10759          * Make sure counter is updated before we wake up waiters.
10760          */
10761         smp_mb();
10762         if (waitqueue_active(&root->subv_writers->wait))
10763                 wake_up(&root->subv_writers->wait);
10764 }
10765
10766 int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
10767 {
10768         if (atomic_read(&root->will_be_snapshoted))
10769                 return 0;
10770
10771         percpu_counter_inc(&root->subv_writers->counter);
10772         /*
10773          * Make sure counter is updated before we check for snapshot creation.
10774          */
10775         smp_mb();
10776         if (atomic_read(&root->will_be_snapshoted)) {
10777                 btrfs_end_write_no_snapshoting(root);
10778                 return 0;
10779         }
10780         return 1;
10781 }