fs/btrfs/extent-tree.c

   1 /*
   2  * Copyright (C) 2007 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18 #include <linux/sched.h>
  19 #include <linux/pagemap.h>
  20 #include <linux/writeback.h>
  21 #include <linux/blkdev.h>
  22 #include <linux/sort.h>
  23 #include <linux/rcupdate.h>
  24 #include <linux/kthread.h>
  25 #include <linux/slab.h>
  26 #include <linux/ratelimit.h>
  27 #include "compat.h"
  28 #include "hash.h"
  29 #include "ctree.h"
  30 #include "disk-io.h"
  31 #include "print-tree.h"
  32 #include "transaction.h"
  33 #include "volumes.h"
  34 #include "locking.h"
  35 #include "free-space-cache.h"
  36 #include "math.h"
  37
  38 #undef SCRAMBLE_DELAYED_REFS
  39
  40 /*
  41  * control flags for do_chunk_alloc's force field
  42  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  43  * if we really need one.
  44  *
  45  * CHUNK_ALLOC_LIMITED means to only try and allocate one
  46  * if we have very few chunks already allocated.  This is
  47  * used as part of the clustering code to help make sure
  48  * we have a good pool of storage to cluster in, without
  49  * filling the FS with empty chunks
  50  *
  51  * CHUNK_ALLOC_FORCE means it must try to allocate one
  52  *
  53  */
  54 enum {
  55         CHUNK_ALLOC_NO_FORCE = 0,
  56         CHUNK_ALLOC_LIMITED = 1,
  57         CHUNK_ALLOC_FORCE = 2,
  58 };
  59
  60 /*
  61  * Control how reservations are dealt with.
  62  *
  63  * RESERVE_FREE - freeing a reservation.
  64  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
  65  *   ENOSPC accounting
  66  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
  67  *   bytes_may_use as the ENOSPC accounting is done elsewhere
  68  */
  69 enum {
  70         RESERVE_FREE = 0,
  71         RESERVE_ALLOC = 1,
  72         RESERVE_ALLOC_NO_ACCOUNT = 2,
  73 };
  74
  75 static int update_block_group(struct btrfs_root *root,
  76                               u64 bytenr, u64 num_bytes, int alloc);
  77 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  78                                 struct btrfs_root *root,
  79                                 u64 bytenr, u64 num_bytes, u64 parent,
  80                                 u64 root_objectid, u64 owner_objectid,
  81                                 u64 owner_offset, int refs_to_drop,
  82                                 struct btrfs_delayed_extent_op *extra_op);
  83 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  84                                     struct extent_buffer *leaf,
  85                                     struct btrfs_extent_item *ei);
  86 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  87                                       struct btrfs_root *root,
  88                                       u64 parent, u64 root_objectid,
  89                                       u64 flags, u64 owner, u64 offset,
  90                                       struct btrfs_key *ins, int ref_mod);
  91 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  92                                      struct btrfs_root *root,
  93                                      u64 parent, u64 root_objectid,
  94                                      u64 flags, struct btrfs_disk_key *key,
  95                                      int level, struct btrfs_key *ins);
  96 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
  97                           struct btrfs_root *extent_root, u64 flags,
  98                           int force);
  99 static int find_next_key(struct btrfs_path *path, int level,
 100                          struct btrfs_key *key);
 101 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 102                             int dump_block_groups);
 103 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 104                                        u64 num_bytes, int reserve);
 105
 106 static noinline int
 107 block_group_cache_done(struct btrfs_block_group_cache *cache)
 108 {
 109         smp_mb();
 110         return cache->cached == BTRFS_CACHE_FINISHED;
 111 }
 112
 113 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 114 {
 115         return (cache->flags & bits) == bits;
 116 }
 117
 118 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 119 {
 120         atomic_inc(&cache->count);
 121 }
 122
 123 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 124 {
 125         if (atomic_dec_and_test(&cache->count)) {
 126                 WARN_ON(cache->pinned > 0);
 127                 WARN_ON(cache->reserved > 0);
 128                 kfree(cache->free_space_ctl);
 129                 kfree(cache);
 130         }
 131 }
 132
 133 /*
 134  * this adds the block group to the fs_info rb tree for the block group
 135  * cache
 136  */
 137 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 138                                 struct btrfs_block_group_cache *block_group)
 139 {
 140         struct rb_node **p;
 141         struct rb_node *parent = NULL;
 142         struct btrfs_block_group_cache *cache;
 143
 144         spin_lock(&info->block_group_cache_lock);
 145         p = &info->block_group_cache_tree.rb_node;
 146
 147         while (*p) {
 148                 parent = *p;
 149                 cache = rb_entry(parent, struct btrfs_block_group_cache,
 150                                  cache_node);
 151                 if (block_group->key.objectid < cache->key.objectid) {
 152                         p = &(*p)->rb_left;
 153                 } else if (block_group->key.objectid > cache->key.objectid) {
 154                         p = &(*p)->rb_right;
 155                 } else {
 156                         spin_unlock(&info->block_group_cache_lock);
 157                         return -EEXIST;
 158                 }
 159         }
 160
 161         rb_link_node(&block_group->cache_node, parent, p);
 162         rb_insert_color(&block_group->cache_node,
 163                         &info->block_group_cache_tree);
 164
 165         if (info->first_logical_byte > block_group->key.objectid)
 166                 info->first_logical_byte = block_group->key.objectid;
 167
 168         spin_unlock(&info->block_group_cache_lock);
 169
 170         return 0;
 171 }
 172
 173 /*
 174  * This will return the block group at or after bytenr if contains is 0, else
 175  * it will return the block group that contains the bytenr
 176  */
 177 static struct btrfs_block_group_cache *
 178 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 179                               int contains)
 180 {
 181         struct btrfs_block_group_cache *cache, *ret = NULL;
 182         struct rb_node *n;
 183         u64 end, start;
 184
 185         spin_lock(&info->block_group_cache_lock);
 186         n = info->block_group_cache_tree.rb_node;
 187
 188         while (n) {
 189                 cache = rb_entry(n, struct btrfs_block_group_cache,
 190                                  cache_node);
 191                 end = cache->key.objectid + cache->key.offset - 1;
 192                 start = cache->key.objectid;
 193
 194                 if (bytenr < start) {
 195                         if (!contains && (!ret || start < ret->key.objectid))
 196                                 ret = cache;
 197                         n = n->rb_left;
 198                 } else if (bytenr > start) {
 199                         if (contains && bytenr <= end) {
 200                                 ret = cache;
 201                                 break;
 202                         }
 203                         n = n->rb_right;
 204                 } else {
 205                         ret = cache;
 206                         break;
 207                 }
 208         }
 209         if (ret) {
 210                 btrfs_get_block_group(ret);
 211                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
 212                         info->first_logical_byte = ret->key.objectid;
 213         }
 214         spin_unlock(&info->block_group_cache_lock);
 215
 216         return ret;
 217 }
 218
 219 static int add_excluded_extent(struct btrfs_root *root,
 220                                u64 start, u64 num_bytes)
 221 {
 222         u64 end = start + num_bytes - 1;
 223         set_extent_bits(&root->fs_info->freed_extents[0],
 224                         start, end, EXTENT_UPTODATE, GFP_NOFS);
 225         set_extent_bits(&root->fs_info->freed_extents[1],
 226                         start, end, EXTENT_UPTODATE, GFP_NOFS);
 227         return 0;
 228 }
 229
 230 static void free_excluded_extents(struct btrfs_root *root,
 231                                   struct btrfs_block_group_cache *cache)
 232 {
 233         u64 start, end;
 234
 235         start = cache->key.objectid;
 236         end = start + cache->key.offset - 1;
 237
 238         clear_extent_bits(&root->fs_info->freed_extents[0],
 239                           start, end, EXTENT_UPTODATE, GFP_NOFS);
 240         clear_extent_bits(&root->fs_info->freed_extents[1],
 241                           start, end, EXTENT_UPTODATE, GFP_NOFS);
 242 }
 243
 244 static int exclude_super_stripes(struct btrfs_root *root,
 245                                  struct btrfs_block_group_cache *cache)
 246 {
 247         u64 bytenr;
 248         u64 *logical;
 249         int stripe_len;
 250         int i, nr, ret;
 251
 252         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 253                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 254                 cache->bytes_super += stripe_len;
 255                 ret = add_excluded_extent(root, cache->key.objectid,
 256                                           stripe_len);
 257                 BUG_ON(ret); /* -ENOMEM */
 258         }
 259
 260         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 261                 bytenr = btrfs_sb_offset(i);
 262                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
 263                                        cache->key.objectid, bytenr,
 264                                        0, &logical, &nr, &stripe_len);
 265                 BUG_ON(ret); /* -ENOMEM */
 266
 267                 while (nr--) {
 268                         cache->bytes_super += stripe_len;
 269                         ret = add_excluded_extent(root, logical[nr],
 270                                                   stripe_len);
 271                         BUG_ON(ret); /* -ENOMEM */
 272                 }
 273
 274                 kfree(logical);
 275         }
 276         return 0;
 277 }
 278
 279 static struct btrfs_caching_control *
 280 get_caching_control(struct btrfs_block_group_cache *cache)
 281 {
 282         struct btrfs_caching_control *ctl;
 283
 284         spin_lock(&cache->lock);
 285         if (cache->cached != BTRFS_CACHE_STARTED) {
 286                 spin_unlock(&cache->lock);
 287                 return NULL;
 288         }
 289
 290         /* We're loading it the fast way, so we don't have a caching_ctl. */
 291         if (!cache->caching_ctl) {
 292                 spin_unlock(&cache->lock);
 293                 return NULL;
 294         }
 295
 296         ctl = cache->caching_ctl;
 297         atomic_inc(&ctl->count);
 298         spin_unlock(&cache->lock);
 299         return ctl;
 300 }
 301
 302 static void put_caching_control(struct btrfs_caching_control *ctl)
 303 {
 304         if (atomic_dec_and_test(&ctl->count))
 305                 kfree(ctl);
 306 }
 307
 308 /*
 309  * this is only called by cache_block_group, since we could have freed extents
 310  * we need to check the pinned_extents for any extents that can't be used yet
 311  * since their free space will be released as soon as the transaction commits.
 312  */
 313 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 314                               struct btrfs_fs_info *info, u64 start, u64 end)
 315 {
 316         u64 extent_start, extent_end, size, total_added = 0;
 317         int ret;
 318
 319         while (start < end) {
 320                 ret = find_first_extent_bit(info->pinned_extents, start,
 321                                             &extent_start, &extent_end,
 322                                             EXTENT_DIRTY | EXTENT_UPTODATE,
 323                                             NULL);
 324                 if (ret)
 325                         break;
 326
 327                 if (extent_start <= start) {
 328                         start = extent_end + 1;
 329                 } else if (extent_start > start && extent_start < end) {
 330                         size = extent_start - start;
 331                         total_added += size;
 332                         ret = btrfs_add_free_space(block_group, start,
 333                                                    size);
 334                         BUG_ON(ret); /* -ENOMEM or logic error */
 335                         start = extent_end + 1;
 336                 } else {
 337                         break;
 338                 }
 339         }
 340
 341         if (start < end) {
 342                 size = end - start;
 343                 total_added += size;
 344                 ret = btrfs_add_free_space(block_group, start, size);
 345                 BUG_ON(ret); /* -ENOMEM or logic error */
 346         }
 347
 348         return total_added;
 349 }
 350
 351 static noinline void caching_thread(struct btrfs_work *work)
 352 {
 353         struct btrfs_block_group_cache *block_group;
 354         struct btrfs_fs_info *fs_info;
 355         struct btrfs_caching_control *caching_ctl;
 356         struct btrfs_root *extent_root;
 357         struct btrfs_path *path;
 358         struct extent_buffer *leaf;
 359         struct btrfs_key key;
 360         u64 total_found = 0;
 361         u64 last = 0;
 362         u32 nritems;
 363         int ret = 0;
 364
 365         caching_ctl = container_of(work, struct btrfs_caching_control, work);
 366         block_group = caching_ctl->block_group;
 367         fs_info = block_group->fs_info;
 368         extent_root = fs_info->extent_root;
 369
 370         path = btrfs_alloc_path();
 371         if (!path)
 372                 goto out;
 373
 374         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 375
 376         /*
 377          * We don't want to deadlock with somebody trying to allocate a new
 378          * extent for the extent root while also trying to search the extent
 379          * root to add free space.  So we skip locking and search the commit
 380          * root, since its read-only
 381          */
 382         path->skip_locking = 1;
 383         path->search_commit_root = 1;
 384         path->reada = 1;
 385
 386         key.objectid = last;
 387         key.offset = 0;
 388         key.type = BTRFS_EXTENT_ITEM_KEY;
 389 again:
 390         mutex_lock(&caching_ctl->mutex);
 391         /* need to make sure the commit_root doesn't disappear */
 392         down_read(&fs_info->extent_commit_sem);
 393
 394         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 395         if (ret < 0)
 396                 goto err;
 397
 398         leaf = path->nodes[0];
 399         nritems = btrfs_header_nritems(leaf);
 400
 401         while (1) {
 402                 if (btrfs_fs_closing(fs_info) > 1) {
 403                         last = (u64)-1;
 404                         break;
 405                 }
 406
 407                 if (path->slots[0] < nritems) {
 408                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 409                 } else {
 410                         ret = find_next_key(path, 0, &key);
 411                         if (ret)
 412                                 break;
 413
 414                         if (need_resched() ||
 415                             btrfs_next_leaf(extent_root, path)) {
 416                                 caching_ctl->progress = last;
 417                                 btrfs_release_path(path);
 418                                 up_read(&fs_info->extent_commit_sem);
 419                                 mutex_unlock(&caching_ctl->mutex);
 420                                 cond_resched();
 421                                 goto again;
 422                         }
 423                         leaf = path->nodes[0];
 424                         nritems = btrfs_header_nritems(leaf);
 425                         continue;
 426                 }
 427
 428                 if (key.objectid < block_group->key.objectid) {
 429                         path->slots[0]++;
 430                         continue;
 431                 }
 432
 433                 if (key.objectid >= block_group->key.objectid +
 434                     block_group->key.offset)
 435                         break;
 436
 437                 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
 438                         total_found += add_new_free_space(block_group,
 439                                                           fs_info, last,
 440                                                           key.objectid);
 441                         last = key.objectid + key.offset;
 442
 443                         if (total_found > (1024 * 1024 * 2)) {
 444                                 total_found = 0;
 445                                 wake_up(&caching_ctl->wait);
 446                         }
 447                 }
 448                 path->slots[0]++;
 449         }
 450         ret = 0;
 451
 452         total_found += add_new_free_space(block_group, fs_info, last,
 453                                           block_group->key.objectid +
 454                                           block_group->key.offset);
 455         caching_ctl->progress = (u64)-1;
 456
 457         spin_lock(&block_group->lock);
 458         block_group->caching_ctl = NULL;
 459         block_group->cached = BTRFS_CACHE_FINISHED;
 460         spin_unlock(&block_group->lock);
 461
 462 err:
 463         btrfs_free_path(path);
 464         up_read(&fs_info->extent_commit_sem);
 465
 466         free_excluded_extents(extent_root, block_group);
 467
 468         mutex_unlock(&caching_ctl->mutex);
 469 out:
 470         wake_up(&caching_ctl->wait);
 471
 472         put_caching_control(caching_ctl);
 473         btrfs_put_block_group(block_group);
 474 }
 475
 476 static int cache_block_group(struct btrfs_block_group_cache *cache,
 477                              int load_cache_only)
 478 {
 479         DEFINE_WAIT(wait);
 480         struct btrfs_fs_info *fs_info = cache->fs_info;
 481         struct btrfs_caching_control *caching_ctl;
 482         int ret = 0;
 483
 484         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 485         if (!caching_ctl)
 486                 return -ENOMEM;
 487
 488         INIT_LIST_HEAD(&caching_ctl->list);
 489         mutex_init(&caching_ctl->mutex);
 490         init_waitqueue_head(&caching_ctl->wait);
 491         caching_ctl->block_group = cache;
 492         caching_ctl->progress = cache->key.objectid;
 493         atomic_set(&caching_ctl->count, 1);
 494         caching_ctl->work.func = caching_thread;
 495
 496         spin_lock(&cache->lock);
 497         /*
 498          * This should be a rare occasion, but this could happen I think in the
 499          * case where one thread starts to load the space cache info, and then
 500          * some other thread starts a transaction commit which tries to do an
 501          * allocation while the other thread is still loading the space cache
 502          * info.  The previous loop should have kept us from choosing this block
 503          * group, but if we've moved to the state where we will wait on caching
 504          * block groups we need to first check if we're doing a fast load here,
 505          * so we can wait for it to finish, otherwise we could end up allocating
 506          * from a block group who's cache gets evicted for one reason or
 507          * another.
 508          */
 509         while (cache->cached == BTRFS_CACHE_FAST) {
 510                 struct btrfs_caching_control *ctl;
 511
 512                 ctl = cache->caching_ctl;
 513                 atomic_inc(&ctl->count);
 514                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 515                 spin_unlock(&cache->lock);
 516
 517                 schedule();
 518
 519                 finish_wait(&ctl->wait, &wait);
 520                 put_caching_control(ctl);
 521                 spin_lock(&cache->lock);
 522         }
 523
 524         if (cache->cached != BTRFS_CACHE_NO) {
 525                 spin_unlock(&cache->lock);
 526                 kfree(caching_ctl);
 527                 return 0;
 528         }
 529         WARN_ON(cache->caching_ctl);
 530         cache->caching_ctl = caching_ctl;
 531         cache->cached = BTRFS_CACHE_FAST;
 532         spin_unlock(&cache->lock);
 533
 534         if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
 535                 ret = load_free_space_cache(fs_info, cache);
 536
 537                 spin_lock(&cache->lock);
 538                 if (ret == 1) {
 539                         cache->caching_ctl = NULL;
 540                         cache->cached = BTRFS_CACHE_FINISHED;
 541                         cache->last_byte_to_unpin = (u64)-1;
 542                 } else {
 543                         if (load_cache_only) {
 544                                 cache->caching_ctl = NULL;
 545                                 cache->cached = BTRFS_CACHE_NO;
 546                         } else {
 547                                 cache->cached = BTRFS_CACHE_STARTED;
 548                         }
 549                 }
 550                 spin_unlock(&cache->lock);
 551                 wake_up(&caching_ctl->wait);
 552                 if (ret == 1) {
 553                         put_caching_control(caching_ctl);
 554                         free_excluded_extents(fs_info->extent_root, cache);
 555                         return 0;
 556                 }
 557         } else {
 558                 /*
 559                  * We are not going to do the fast caching, set cached to the
 560                  * appropriate value and wakeup any waiters.
 561                  */
 562                 spin_lock(&cache->lock);
 563                 if (load_cache_only) {
 564                         cache->caching_ctl = NULL;
 565                         cache->cached = BTRFS_CACHE_NO;
 566                 } else {
 567                         cache->cached = BTRFS_CACHE_STARTED;
 568                 }
 569                 spin_unlock(&cache->lock);
 570                 wake_up(&caching_ctl->wait);
 571         }
 572
 573         if (load_cache_only) {
 574                 put_caching_control(caching_ctl);
 575                 return 0;
 576         }
 577
 578         down_write(&fs_info->extent_commit_sem);
 579         atomic_inc(&caching_ctl->count);
 580         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 581         up_write(&fs_info->extent_commit_sem);
 582
 583         btrfs_get_block_group(cache);
 584
 585         btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
 586
 587         return ret;
 588 }
 589
 590 /*
 591  * return the block group that starts at or after bytenr
 592  */
 593 static struct btrfs_block_group_cache *
 594 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 595 {
 596         struct btrfs_block_group_cache *cache;
 597
 598         cache = block_group_cache_tree_search(info, bytenr, 0);
 599
 600         return cache;
 601 }
 602
 603 /*
 604  * return the block group that contains the given bytenr
 605  */
 606 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 607                                                  struct btrfs_fs_info *info,
 608                                                  u64 bytenr)
 609 {
 610         struct btrfs_block_group_cache *cache;
 611
 612         cache = block_group_cache_tree_search(info, bytenr, 1);
 613
 614         return cache;
 615 }
 616
 617 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 618                                                   u64 flags)
 619 {
 620         struct list_head *head = &info->space_info;
 621         struct btrfs_space_info *found;
 622
 623         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 624
 625         rcu_read_lock();
 626         list_for_each_entry_rcu(found, head, list) {
 627                 if (found->flags & flags) {
 628                         rcu_read_unlock();
 629                         return found;
 630                 }
 631         }
 632         rcu_read_unlock();
 633         return NULL;
 634 }
 635
 636 /*
 637  * after adding space to the filesystem, we need to clear the full flags
 638  * on all the space infos.
 639  */
 640 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 641 {
 642         struct list_head *head = &info->space_info;
 643         struct btrfs_space_info *found;
 644
 645         rcu_read_lock();
 646         list_for_each_entry_rcu(found, head, list)
 647                 found->full = 0;
 648         rcu_read_unlock();
 649 }
 650
 651 u64 btrfs_find_block_group(struct btrfs_root *root,
 652                            u64 search_start, u64 search_hint, int owner)
 653 {
 654         struct btrfs_block_group_cache *cache;
 655         u64 used;
 656         u64 last = max(search_hint, search_start);
 657         u64 group_start = 0;
 658         int full_search = 0;
 659         int factor = 9;
 660         int wrapped = 0;
 661 again:
 662         while (1) {
 663                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
 664                 if (!cache)
 665                         break;
 666
 667                 spin_lock(&cache->lock);
 668                 last = cache->key.objectid + cache->key.offset;
 669                 used = btrfs_block_group_used(&cache->item);
 670
 671                 if ((full_search || !cache->ro) &&
 672                     block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
 673                         if (used + cache->pinned + cache->reserved <
 674                             div_factor(cache->key.offset, factor)) {
 675                                 group_start = cache->key.objectid;
 676                                 spin_unlock(&cache->lock);
 677                                 btrfs_put_block_group(cache);
 678                                 goto found;
 679                         }
 680                 }
 681                 spin_unlock(&cache->lock);
 682                 btrfs_put_block_group(cache);
 683                 cond_resched();
 684         }
 685         if (!wrapped) {
 686                 last = search_start;
 687                 wrapped = 1;
 688                 goto again;
 689         }
 690         if (!full_search && factor < 10) {
 691                 last = search_start;
 692                 full_search = 1;
 693                 factor = 10;
 694                 goto again;
 695         }
 696 found:
 697         return group_start;
 698 }
 699
 700 /* simple helper to search for an existing extent at a given offset */
 701 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 702 {
 703         int ret;
 704         struct btrfs_key key;
 705         struct btrfs_path *path;
 706
 707         path = btrfs_alloc_path();
 708         if (!path)
 709                 return -ENOMEM;
 710
 711         key.objectid = start;
 712         key.offset = len;
 713         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 714         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 715                                 0, 0);
 716         btrfs_free_path(path);
 717         return ret;
 718 }
 719
 720 /*
 721  * helper function to lookup reference count and flags of extent.
 722  *
 723  * the head node for delayed ref is used to store the sum of all the
 724  * reference count modifications queued up in the rbtree. the head
 725  * node may also store the extent flags to set. This way you can check
 726  * to see what the reference count and extent flags would be if all of
 727  * the delayed refs are not processed.
 728  */
 729 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 730                              struct btrfs_root *root, u64 bytenr,
 731                              u64 num_bytes, u64 *refs, u64 *flags)
 732 {
 733         struct btrfs_delayed_ref_head *head;
 734         struct btrfs_delayed_ref_root *delayed_refs;
 735         struct btrfs_path *path;
 736         struct btrfs_extent_item *ei;
 737         struct extent_buffer *leaf;
 738         struct btrfs_key key;
 739         u32 item_size;
 740         u64 num_refs;
 741         u64 extent_flags;
 742         int ret;
 743
 744         path = btrfs_alloc_path();
 745         if (!path)
 746                 return -ENOMEM;
 747
 748         key.objectid = bytenr;
 749         key.type = BTRFS_EXTENT_ITEM_KEY;
 750         key.offset = num_bytes;
 751         if (!trans) {
 752                 path->skip_locking = 1;
 753                 path->search_commit_root = 1;
 754         }
 755 again:
 756         ret = btrfs_search_slot(trans, root->fs_info->extent_root,
 757                                 &key, path, 0, 0);
 758         if (ret < 0)
 759                 goto out_free;
 760
 761         if (ret == 0) {
 762                 leaf = path->nodes[0];
 763                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 764                 if (item_size >= sizeof(*ei)) {
 765                         ei = btrfs_item_ptr(leaf, path->slots[0],
 766                                             struct btrfs_extent_item);
 767                         num_refs = btrfs_extent_refs(leaf, ei);
 768                         extent_flags = btrfs_extent_flags(leaf, ei);
 769                 } else {
 770 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 771                         struct btrfs_extent_item_v0 *ei0;
 772                         BUG_ON(item_size != sizeof(*ei0));
 773                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
 774                                              struct btrfs_extent_item_v0);
 775                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
 776                         /* FIXME: this isn't correct for data */
 777                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 778 #else
 779                         BUG();
 780 #endif
 781                 }
 782                 BUG_ON(num_refs == 0);
 783         } else {
 784                 num_refs = 0;
 785                 extent_flags = 0;
 786                 ret = 0;
 787         }
 788
 789         if (!trans)
 790                 goto out;
 791
 792         delayed_refs = &trans->transaction->delayed_refs;
 793         spin_lock(&delayed_refs->lock);
 794         head = btrfs_find_delayed_ref_head(trans, bytenr);
 795         if (head) {
 796                 if (!mutex_trylock(&head->mutex)) {
 797                         atomic_inc(&head->node.refs);
 798                         spin_unlock(&delayed_refs->lock);
 799
 800                         btrfs_release_path(path);
 801
 802                         /*
 803                          * Mutex was contended, block until it's released and try
 804                          * again
 805                          */
 806                         mutex_lock(&head->mutex);
 807                         mutex_unlock(&head->mutex);
 808                         btrfs_put_delayed_ref(&head->node);
 809                         goto again;
 810                 }
 811                 if (head->extent_op && head->extent_op->update_flags)
 812                         extent_flags |= head->extent_op->flags_to_set;
 813                 else
 814                         BUG_ON(num_refs == 0);
 815
 816                 num_refs += head->node.ref_mod;
 817                 mutex_unlock(&head->mutex);
 818         }
 819         spin_unlock(&delayed_refs->lock);
 820 out:
 821         WARN_ON(num_refs == 0);
 822         if (refs)
 823                 *refs = num_refs;
 824         if (flags)
 825                 *flags = extent_flags;
 826 out_free:
 827         btrfs_free_path(path);
 828         return ret;
 829 }
 830
 831 /*
 832  * Back reference rules.  Back refs have three main goals:
 833  *
 834  * 1) differentiate between all holders of references to an extent so that
 835  *    when a reference is dropped we can make sure it was a valid reference
 836  *    before freeing the extent.
 837  *
 838  * 2) Provide enough information to quickly find the holders of an extent
 839  *    if we notice a given block is corrupted or bad.
 840  *
 841  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 842  *    maintenance.  This is actually the same as #2, but with a slightly
 843  *    different use case.
 844  *
 845  * There are two kinds of back refs. The implicit back refs is optimized
 846  * for pointers in non-shared tree blocks. For a given pointer in a block,
 847  * back refs of this kind provide information about the block's owner tree
 848  * and the pointer's key. These information allow us to find the block by
 849  * b-tree searching. The full back refs is for pointers in tree blocks not
 850  * referenced by their owner trees. The location of tree block is recorded
 851  * in the back refs. Actually the full back refs is generic, and can be
 852  * used in all cases the implicit back refs is used. The major shortcoming
 853  * of the full back refs is its overhead. Every time a tree block gets
 854  * COWed, we have to update back refs entry for all pointers in it.
 855  *
 856  * For a newly allocated tree block, we use implicit back refs for
 857  * pointers in it. This means most tree related operations only involve
 858  * implicit back refs. For a tree block created in old transaction, the
 859  * only way to drop a reference to it is COW it. So we can detect the
 860  * event that tree block loses its owner tree's reference and do the
 861  * back refs conversion.
 862  *
 863  * When a tree block is COW'd through a tree, there are four cases:
 864  *
 865  * The reference count of the block is one and the tree is the block's
 866  * owner tree. Nothing to do in this case.
 867  *
 868  * The reference count of the block is one and the tree is not the
 869  * block's owner tree. In this case, full back refs is used for pointers
 870  * in the block. Remove these full back refs, add implicit back refs for
 871  * every pointers in the new block.
 872  *
 873  * The reference count of the block is greater than one and the tree is
 874  * the block's owner tree. In this case, implicit back refs is used for
 875  * pointers in the block. Add full back refs for every pointers in the
 876  * block, increase lower level extents' reference counts. The original
 877  * implicit back refs are entailed to the new block.
 878  *
 879  * The reference count of the block is greater than one and the tree is
 880  * not the block's owner tree. Add implicit back refs for every pointer in
 881  * the new block, increase lower level extents' reference count.
 882  *
 883  * Back Reference Key composing:
 884  *
 885  * The key objectid corresponds to the first byte in the extent,
 886  * The key type is used to differentiate between types of back refs.
 887  * There are different meanings of the key offset for different types
 888  * of back refs.
 889  *
 890  * File extents can be referenced by:
 891  *
 892  * - multiple snapshots, subvolumes, or different generations in one subvol
 893  * - different files inside a single subvolume
 894  * - different offsets inside a file (bookend extents in file.c)
 895  *
 896  * The extent ref structure for the implicit back refs has fields for:
 897  *
 898  * - Objectid of the subvolume root
 899  * - objectid of the file holding the reference
 900  * - original offset in the file
 901  * - how many bookend extents
 902  *
 903  * The key offset for the implicit back refs is hash of the first
 904  * three fields.
 905  *
 906  * The extent ref structure for the full back refs has field for:
 907  *
 908  * - number of pointers in the tree leaf
 909  *
 910  * The key offset for the implicit back refs is the first byte of
 911  * the tree leaf
 912  *
 913  * When a file extent is allocated, The implicit back refs is used.
 914  * the fields are filled in:
 915  *
 916  *     (root_key.objectid, inode objectid, offset in file, 1)
 917  *
 918  * When a file extent is removed file truncation, we find the
 919  * corresponding implicit back refs and check the following fields:
 920  *
 921  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 922  *
 923  * Btree extents can be referenced by:
 924  *
 925  * - Different subvolumes
 926  *
 927  * Both the implicit back refs and the full back refs for tree blocks
 928  * only consist of key. The key offset for the implicit back refs is
 929  * objectid of block's owner tree. The key offset for the full back refs
 930  * is the first byte of parent block.
 931  *
 932  * When implicit back refs is used, information about the lowest key and
 933  * level of the tree block are required. These information are stored in
 934  * tree block info structure.
 935  */
 936
 937 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 938 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
 939                                   struct btrfs_root *root,
 940                                   struct btrfs_path *path,
 941                                   u64 owner, u32 extra_size)
 942 {
 943         struct btrfs_extent_item *item;
 944         struct btrfs_extent_item_v0 *ei0;
 945         struct btrfs_extent_ref_v0 *ref0;
 946         struct btrfs_tree_block_info *bi;
 947         struct extent_buffer *leaf;
 948         struct btrfs_key key;
 949         struct btrfs_key found_key;
 950         u32 new_size = sizeof(*item);
 951         u64 refs;
 952         int ret;
 953
 954         leaf = path->nodes[0];
 955         BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
 956
 957         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 958         ei0 = btrfs_item_ptr(leaf, path->slots[0],
 959                              struct btrfs_extent_item_v0);
 960         refs = btrfs_extent_refs_v0(leaf, ei0);
 961
 962         if (owner == (u64)-1) {
 963                 while (1) {
 964                         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 965                                 ret = btrfs_next_leaf(root, path);
 966                                 if (ret < 0)
 967                                         return ret;
 968                                 BUG_ON(ret > 0); /* Corruption */
 969                                 leaf = path->nodes[0];
 970                         }
 971                         btrfs_item_key_to_cpu(leaf, &found_key,
 972                                               path->slots[0]);
 973                         BUG_ON(key.objectid != found_key.objectid);
 974                         if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
 975                                 path->slots[0]++;
 976                                 continue;
 977                         }
 978                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
 979                                               struct btrfs_extent_ref_v0);
 980                         owner = btrfs_ref_objectid_v0(leaf, ref0);
 981                         break;
 982                 }
 983         }
 984         btrfs_release_path(path);
 985
 986         if (owner < BTRFS_FIRST_FREE_OBJECTID)
 987                 new_size += sizeof(*bi);
 988
 989         new_size -= sizeof(*ei0);
 990         ret = btrfs_search_slot(trans, root, &key, path,
 991                                 new_size + extra_size, 1);
 992         if (ret < 0)
 993                 return ret;
 994         BUG_ON(ret); /* Corruption */
 995
 996         btrfs_extend_item(trans, root, path, new_size);
 997
 998         leaf = path->nodes[0];
 999         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1000         btrfs_set_extent_refs(leaf, item, refs);
1001         /* FIXME: get real generation */
1002         btrfs_set_extent_generation(leaf, item, 0);
1003         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1004                 btrfs_set_extent_flags(leaf, item,
1005                                        BTRFS_EXTENT_FLAG_TREE_BLOCK |
1006                                        BTRFS_BLOCK_FLAG_FULL_BACKREF);
1007                 bi = (struct btrfs_tree_block_info *)(item + 1);
1008                 /* FIXME: get first key of the block */
1009                 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1010                 btrfs_set_tree_block_level(leaf, bi, (int)owner);
1011         } else {
1012                 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1013         }
1014         btrfs_mark_buffer_dirty(leaf);
1015         return 0;
1016 }
1017 #endif
1018
1019 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1020 {
1021         u32 high_crc = ~(u32)0;
1022         u32 low_crc = ~(u32)0;
1023         __le64 lenum;
1024
1025         lenum = cpu_to_le64(root_objectid);
1026         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1027         lenum = cpu_to_le64(owner);
1028         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1029         lenum = cpu_to_le64(offset);
1030         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1031
1032         return ((u64)high_crc << 31) ^ (u64)low_crc;
1033 }
1034
1035 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1036                                      struct btrfs_extent_data_ref *ref)
1037 {
1038         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1039                                     btrfs_extent_data_ref_objectid(leaf, ref),
1040                                     btrfs_extent_data_ref_offset(leaf, ref));
1041 }
1042
1043 static int match_extent_data_ref(struct extent_buffer *leaf,
1044                                  struct btrfs_extent_data_ref *ref,
1045                                  u64 root_objectid, u64 owner, u64 offset)
1046 {
1047         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1048             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1049             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1050                 return 0;
1051         return 1;
1052 }
1053
1054 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1055                                            struct btrfs_root *root,
1056                                            struct btrfs_path *path,
1057                                            u64 bytenr, u64 parent,
1058                                            u64 root_objectid,
1059                                            u64 owner, u64 offset)
1060 {
1061         struct btrfs_key key;
1062         struct btrfs_extent_data_ref *ref;
1063         struct extent_buffer *leaf;
1064         u32 nritems;
1065         int ret;
1066         int recow;
1067         int err = -ENOENT;
1068
1069         key.objectid = bytenr;
1070         if (parent) {
1071                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1072                 key.offset = parent;
1073         } else {
1074                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1075                 key.offset = hash_extent_data_ref(root_objectid,
1076                                                   owner, offset);
1077         }
1078 again:
1079         recow = 0;
1080         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1081         if (ret < 0) {
1082                 err = ret;
1083                 goto fail;
1084         }
1085
1086         if (parent) {
1087                 if (!ret)
1088                         return 0;
1089 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1090                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1091                 btrfs_release_path(path);
1092                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1093                 if (ret < 0) {
1094                         err = ret;
1095                         goto fail;
1096                 }
1097                 if (!ret)
1098                         return 0;
1099 #endif
1100                 goto fail;
1101         }
1102
1103         leaf = path->nodes[0];
1104         nritems = btrfs_header_nritems(leaf);
1105         while (1) {
1106                 if (path->slots[0] >= nritems) {
1107                         ret = btrfs_next_leaf(root, path);
1108                         if (ret < 0)
1109                                 err = ret;
1110                         if (ret)
1111                                 goto fail;
1112
1113                         leaf = path->nodes[0];
1114                         nritems = btrfs_header_nritems(leaf);
1115                         recow = 1;
1116                 }
1117
1118                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1119                 if (key.objectid != bytenr ||
1120                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1121                         goto fail;
1122
1123                 ref = btrfs_item_ptr(leaf, path->slots[0],
1124                                      struct btrfs_extent_data_ref);
1125
1126                 if (match_extent_data_ref(leaf, ref, root_objectid,
1127                                           owner, offset)) {
1128                         if (recow) {
1129                                 btrfs_release_path(path);
1130                                 goto again;
1131                         }
1132                         err = 0;
1133                         break;
1134                 }
1135                 path->slots[0]++;
1136         }
1137 fail:
1138         return err;
1139 }
1140
1141 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1142                                            struct btrfs_root *root,
1143                                            struct btrfs_path *path,
1144                                            u64 bytenr, u64 parent,
1145                                            u64 root_objectid, u64 owner,
1146                                            u64 offset, int refs_to_add)
1147 {
1148         struct btrfs_key key;
1149         struct extent_buffer *leaf;
1150         u32 size;
1151         u32 num_refs;
1152         int ret;
1153
1154         key.objectid = bytenr;
1155         if (parent) {
1156                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1157                 key.offset = parent;
1158                 size = sizeof(struct btrfs_shared_data_ref);
1159         } else {
1160                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1161                 key.offset = hash_extent_data_ref(root_objectid,
1162                                                   owner, offset);
1163                 size = sizeof(struct btrfs_extent_data_ref);
1164         }
1165
1166         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1167         if (ret && ret != -EEXIST)
1168                 goto fail;
1169
1170         leaf = path->nodes[0];
1171         if (parent) {
1172                 struct btrfs_shared_data_ref *ref;
1173                 ref = btrfs_item_ptr(leaf, path->slots[0],
1174                                      struct btrfs_shared_data_ref);
1175                 if (ret == 0) {
1176                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1177                 } else {
1178                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1179                         num_refs += refs_to_add;
1180                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1181                 }
1182         } else {
1183                 struct btrfs_extent_data_ref *ref;
1184                 while (ret == -EEXIST) {
1185                         ref = btrfs_item_ptr(leaf, path->slots[0],
1186                                              struct btrfs_extent_data_ref);
1187                         if (match_extent_data_ref(leaf, ref, root_objectid,
1188                                                   owner, offset))
1189                                 break;
1190                         btrfs_release_path(path);
1191                         key.offset++;
1192                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1193                                                       size);
1194                         if (ret && ret != -EEXIST)
1195                                 goto fail;
1196
1197                         leaf = path->nodes[0];
1198                 }
1199                 ref = btrfs_item_ptr(leaf, path->slots[0],
1200                                      struct btrfs_extent_data_ref);
1201                 if (ret == 0) {
1202                         btrfs_set_extent_data_ref_root(leaf, ref,
1203                                                        root_objectid);
1204                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1205                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1206                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1207                 } else {
1208                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1209                         num_refs += refs_to_add;
1210                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1211                 }
1212         }
1213         btrfs_mark_buffer_dirty(leaf);
1214         ret = 0;
1215 fail:
1216         btrfs_release_path(path);
1217         return ret;
1218 }
1219
1220 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1221                                            struct btrfs_root *root,
1222                                            struct btrfs_path *path,
1223                                            int refs_to_drop)
1224 {
1225         struct btrfs_key key;
1226         struct btrfs_extent_data_ref *ref1 = NULL;
1227         struct btrfs_shared_data_ref *ref2 = NULL;
1228         struct extent_buffer *leaf;
1229         u32 num_refs = 0;
1230         int ret = 0;
1231
1232         leaf = path->nodes[0];
1233         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1234
1235         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1236                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1237                                       struct btrfs_extent_data_ref);
1238                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1239         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1240                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1241                                       struct btrfs_shared_data_ref);
1242                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1243 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1244         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1245                 struct btrfs_extent_ref_v0 *ref0;
1246                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1247                                       struct btrfs_extent_ref_v0);
1248                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1249 #endif
1250         } else {
1251                 BUG();
1252         }
1253
1254         BUG_ON(num_refs < refs_to_drop);
1255         num_refs -= refs_to_drop;
1256
1257         if (num_refs == 0) {
1258                 ret = btrfs_del_item(trans, root, path);
1259         } else {
1260                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1261                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1262                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1263                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1264 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1265                 else {
1266                         struct btrfs_extent_ref_v0 *ref0;
1267                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1268                                         struct btrfs_extent_ref_v0);
1269                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1270                 }
1271 #endif
1272                 btrfs_mark_buffer_dirty(leaf);
1273         }
1274         return ret;
1275 }
1276
1277 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1278                                           struct btrfs_path *path,
1279                                           struct btrfs_extent_inline_ref *iref)
1280 {
1281         struct btrfs_key key;
1282         struct extent_buffer *leaf;
1283         struct btrfs_extent_data_ref *ref1;
1284         struct btrfs_shared_data_ref *ref2;
1285         u32 num_refs = 0;
1286
1287         leaf = path->nodes[0];
1288         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1289         if (iref) {
1290                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1291                     BTRFS_EXTENT_DATA_REF_KEY) {
1292                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1293                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1294                 } else {
1295                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1296                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1297                 }
1298         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1299                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1300                                       struct btrfs_extent_data_ref);
1301                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1302         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1303                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1304                                       struct btrfs_shared_data_ref);
1305                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1306 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1307         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1308                 struct btrfs_extent_ref_v0 *ref0;
1309                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1310                                       struct btrfs_extent_ref_v0);
1311                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1312 #endif
1313         } else {
1314                 WARN_ON(1);
1315         }
1316         return num_refs;
1317 }
1318
1319 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1320                                           struct btrfs_root *root,
1321                                           struct btrfs_path *path,
1322                                           u64 bytenr, u64 parent,
1323                                           u64 root_objectid)
1324 {
1325         struct btrfs_key key;
1326         int ret;
1327
1328         key.objectid = bytenr;
1329         if (parent) {
1330                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1331                 key.offset = parent;
1332         } else {
1333                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1334                 key.offset = root_objectid;
1335         }
1336
1337         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1338         if (ret > 0)
1339                 ret = -ENOENT;
1340 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1341         if (ret == -ENOENT && parent) {
1342                 btrfs_release_path(path);
1343                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1344                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1345                 if (ret > 0)
1346                         ret = -ENOENT;
1347         }
1348 #endif
1349         return ret;
1350 }
1351
1352 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1353                                           struct btrfs_root *root,
1354                                           struct btrfs_path *path,
1355                                           u64 bytenr, u64 parent,
1356                                           u64 root_objectid)
1357 {
1358         struct btrfs_key key;
1359         int ret;
1360
1361         key.objectid = bytenr;
1362         if (parent) {
1363                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1364                 key.offset = parent;
1365         } else {
1366                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1367                 key.offset = root_objectid;
1368         }
1369
1370         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1371         btrfs_release_path(path);
1372         return ret;
1373 }
1374
1375 static inline int extent_ref_type(u64 parent, u64 owner)
1376 {
1377         int type;
1378         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1379                 if (parent > 0)
1380                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1381                 else
1382                         type = BTRFS_TREE_BLOCK_REF_KEY;
1383         } else {
1384                 if (parent > 0)
1385                         type = BTRFS_SHARED_DATA_REF_KEY;
1386                 else
1387                         type = BTRFS_EXTENT_DATA_REF_KEY;
1388         }
1389         return type;
1390 }
1391
1392 static int find_next_key(struct btrfs_path *path, int level,
1393                          struct btrfs_key *key)
1394
1395 {
1396         for (; level < BTRFS_MAX_LEVEL; level++) {
1397                 if (!path->nodes[level])
1398                         break;
1399                 if (path->slots[level] + 1 >=
1400                     btrfs_header_nritems(path->nodes[level]))
1401                         continue;
1402                 if (level == 0)
1403                         btrfs_item_key_to_cpu(path->nodes[level], key,
1404                                               path->slots[level] + 1);
1405                 else
1406                         btrfs_node_key_to_cpu(path->nodes[level], key,
1407                                               path->slots[level] + 1);
1408                 return 0;
1409         }
1410         return 1;
1411 }
1412
1413 /*
1414  * look for inline back ref. if back ref is found, *ref_ret is set
1415  * to the address of inline back ref, and 0 is returned.
1416  *
1417  * if back ref isn't found, *ref_ret is set to the address where it
1418  * should be inserted, and -ENOENT is returned.
1419  *
1420  * if insert is true and there are too many inline back refs, the path
1421  * points to the extent item, and -EAGAIN is returned.
1422  *
1423  * NOTE: inline back refs are ordered in the same way that back ref
1424  *       items in the tree are ordered.
1425  */
1426 static noinline_for_stack
1427 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1428                                  struct btrfs_root *root,
1429                                  struct btrfs_path *path,
1430                                  struct btrfs_extent_inline_ref **ref_ret,
1431                                  u64 bytenr, u64 num_bytes,
1432                                  u64 parent, u64 root_objectid,
1433                                  u64 owner, u64 offset, int insert)
1434 {
1435         struct btrfs_key key;
1436         struct extent_buffer *leaf;
1437         struct btrfs_extent_item *ei;
1438         struct btrfs_extent_inline_ref *iref;
1439         u64 flags;
1440         u64 item_size;
1441         unsigned long ptr;
1442         unsigned long end;
1443         int extra_size;
1444         int type;
1445         int want;
1446         int ret;
1447         int err = 0;
1448
1449         key.objectid = bytenr;
1450         key.type = BTRFS_EXTENT_ITEM_KEY;
1451         key.offset = num_bytes;
1452
1453         want = extent_ref_type(parent, owner);
1454         if (insert) {
1455                 extra_size = btrfs_extent_inline_ref_size(want);
1456                 path->keep_locks = 1;
1457         } else
1458                 extra_size = -1;
1459         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1460         if (ret < 0) {
1461                 err = ret;
1462                 goto out;
1463         }
1464         if (ret && !insert) {
1465                 err = -ENOENT;
1466                 goto out;
1467         }
1468         BUG_ON(ret); /* Corruption */
1469
1470         leaf = path->nodes[0];
1471         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1472 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1473         if (item_size < sizeof(*ei)) {
1474                 if (!insert) {
1475                         err = -ENOENT;
1476                         goto out;
1477                 }
1478                 ret = convert_extent_item_v0(trans, root, path, owner,
1479                                              extra_size);
1480                 if (ret < 0) {
1481                         err = ret;
1482                         goto out;
1483                 }
1484                 leaf = path->nodes[0];
1485                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1486         }
1487 #endif
1488         BUG_ON(item_size < sizeof(*ei));
1489
1490         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1491         flags = btrfs_extent_flags(leaf, ei);
1492
1493         ptr = (unsigned long)(ei + 1);
1494         end = (unsigned long)ei + item_size;
1495
1496         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1497                 ptr += sizeof(struct btrfs_tree_block_info);
1498                 BUG_ON(ptr > end);
1499         } else {
1500                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1501         }
1502
1503         err = -ENOENT;
1504         while (1) {
1505                 if (ptr >= end) {
1506                         WARN_ON(ptr > end);
1507                         break;
1508                 }
1509                 iref = (struct btrfs_extent_inline_ref *)ptr;
1510                 type = btrfs_extent_inline_ref_type(leaf, iref);
1511                 if (want < type)
1512                         break;
1513                 if (want > type) {
1514                         ptr += btrfs_extent_inline_ref_size(type);
1515                         continue;
1516                 }
1517
1518                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1519                         struct btrfs_extent_data_ref *dref;
1520                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1521                         if (match_extent_data_ref(leaf, dref, root_objectid,
1522                                                   owner, offset)) {
1523                                 err = 0;
1524                                 break;
1525                         }
1526                         if (hash_extent_data_ref_item(leaf, dref) <
1527                             hash_extent_data_ref(root_objectid, owner, offset))
1528                                 break;
1529                 } else {
1530                         u64 ref_offset;
1531                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1532                         if (parent > 0) {
1533                                 if (parent == ref_offset) {
1534                                         err = 0;
1535                                         break;
1536                                 }
1537                                 if (ref_offset < parent)
1538                                         break;
1539                         } else {
1540                                 if (root_objectid == ref_offset) {
1541                                         err = 0;
1542                                         break;
1543                                 }
1544                                 if (ref_offset < root_objectid)
1545                                         break;
1546                         }
1547                 }
1548                 ptr += btrfs_extent_inline_ref_size(type);
1549         }
1550         if (err == -ENOENT && insert) {
1551                 if (item_size + extra_size >=
1552                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1553                         err = -EAGAIN;
1554                         goto out;
1555                 }
1556                 /*
1557                  * To add new inline back ref, we have to make sure
1558                  * there is no corresponding back ref item.
1559                  * For simplicity, we just do not add new inline back
1560                  * ref if there is any kind of item for this block
1561                  */
1562                 if (find_next_key(path, 0, &key) == 0 &&
1563                     key.objectid == bytenr &&
1564                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1565                         err = -EAGAIN;
1566                         goto out;
1567                 }
1568         }
1569         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1570 out:
1571         if (insert) {
1572                 path->keep_locks = 0;
1573                 btrfs_unlock_up_safe(path, 1);
1574         }
1575         return err;
1576 }
1577
1578 /*
1579  * helper to add new inline back ref
1580  */
1581 static noinline_for_stack
1582 void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1583                                  struct btrfs_root *root,
1584                                  struct btrfs_path *path,
1585                                  struct btrfs_extent_inline_ref *iref,
1586                                  u64 parent, u64 root_objectid,
1587                                  u64 owner, u64 offset, int refs_to_add,
1588                                  struct btrfs_delayed_extent_op *extent_op)
1589 {
1590         struct extent_buffer *leaf;
1591         struct btrfs_extent_item *ei;
1592         unsigned long ptr;
1593         unsigned long end;
1594         unsigned long item_offset;
1595         u64 refs;
1596         int size;
1597         int type;
1598
1599         leaf = path->nodes[0];
1600         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1601         item_offset = (unsigned long)iref - (unsigned long)ei;
1602
1603         type = extent_ref_type(parent, owner);
1604         size = btrfs_extent_inline_ref_size(type);
1605
1606         btrfs_extend_item(trans, root, path, size);
1607
1608         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1609         refs = btrfs_extent_refs(leaf, ei);
1610         refs += refs_to_add;
1611         btrfs_set_extent_refs(leaf, ei, refs);
1612         if (extent_op)
1613                 __run_delayed_extent_op(extent_op, leaf, ei);
1614
1615         ptr = (unsigned long)ei + item_offset;
1616         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1617         if (ptr < end - size)
1618                 memmove_extent_buffer(leaf, ptr + size, ptr,
1619                                       end - size - ptr);
1620
1621         iref = (struct btrfs_extent_inline_ref *)ptr;
1622         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1623         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1624                 struct btrfs_extent_data_ref *dref;
1625                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1626                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1627                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1628                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1629                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1630         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1631                 struct btrfs_shared_data_ref *sref;
1632                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1633                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1634                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1635         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1636                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1637         } else {
1638                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1639         }
1640         btrfs_mark_buffer_dirty(leaf);
1641 }
1642
1643 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1644                                  struct btrfs_root *root,
1645                                  struct btrfs_path *path,
1646                                  struct btrfs_extent_inline_ref **ref_ret,
1647                                  u64 bytenr, u64 num_bytes, u64 parent,
1648                                  u64 root_objectid, u64 owner, u64 offset)
1649 {
1650         int ret;
1651
1652         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1653                                            bytenr, num_bytes, parent,
1654                                            root_objectid, owner, offset, 0);
1655         if (ret != -ENOENT)
1656                 return ret;
1657
1658         btrfs_release_path(path);
1659         *ref_ret = NULL;
1660
1661         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1662                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1663                                             root_objectid);
1664         } else {
1665                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1666                                              root_objectid, owner, offset);
1667         }
1668         return ret;
1669 }
1670
1671 /*
1672  * helper to update/remove inline back ref
1673  */
1674 static noinline_for_stack
1675 void update_inline_extent_backref(struct btrfs_trans_handle *trans,
1676                                   struct btrfs_root *root,
1677                                   struct btrfs_path *path,
1678                                   struct btrfs_extent_inline_ref *iref,
1679                                   int refs_to_mod,
1680                                   struct btrfs_delayed_extent_op *extent_op)
1681 {
1682         struct extent_buffer *leaf;
1683         struct btrfs_extent_item *ei;
1684         struct btrfs_extent_data_ref *dref = NULL;
1685         struct btrfs_shared_data_ref *sref = NULL;
1686         unsigned long ptr;
1687         unsigned long end;
1688         u32 item_size;
1689         int size;
1690         int type;
1691         u64 refs;
1692
1693         leaf = path->nodes[0];
1694         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1695         refs = btrfs_extent_refs(leaf, ei);
1696         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1697         refs += refs_to_mod;
1698         btrfs_set_extent_refs(leaf, ei, refs);
1699         if (extent_op)
1700                 __run_delayed_extent_op(extent_op, leaf, ei);
1701
1702         type = btrfs_extent_inline_ref_type(leaf, iref);
1703
1704         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1705                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1706                 refs = btrfs_extent_data_ref_count(leaf, dref);
1707         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1708                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1709                 refs = btrfs_shared_data_ref_count(leaf, sref);
1710         } else {
1711                 refs = 1;
1712                 BUG_ON(refs_to_mod != -1);
1713         }
1714
1715         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1716         refs += refs_to_mod;
1717
1718         if (refs > 0) {
1719                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1720                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1721                 else
1722                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1723         } else {
1724                 size =  btrfs_extent_inline_ref_size(type);
1725                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1726                 ptr = (unsigned long)iref;
1727                 end = (unsigned long)ei + item_size;
1728                 if (ptr + size < end)
1729                         memmove_extent_buffer(leaf, ptr, ptr + size,
1730                                               end - ptr - size);
1731                 item_size -= size;
1732                 btrfs_truncate_item(trans, root, path, item_size, 1);
1733         }
1734         btrfs_mark_buffer_dirty(leaf);
1735 }
1736
1737 static noinline_for_stack
1738 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1739                                  struct btrfs_root *root,
1740                                  struct btrfs_path *path,
1741                                  u64 bytenr, u64 num_bytes, u64 parent,
1742                                  u64 root_objectid, u64 owner,
1743                                  u64 offset, int refs_to_add,
1744                                  struct btrfs_delayed_extent_op *extent_op)
1745 {
1746         struct btrfs_extent_inline_ref *iref;
1747         int ret;
1748
1749         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1750                                            bytenr, num_bytes, parent,
1751                                            root_objectid, owner, offset, 1);
1752         if (ret == 0) {
1753                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1754                 update_inline_extent_backref(trans, root, path, iref,
1755                                              refs_to_add, extent_op);
1756         } else if (ret == -ENOENT) {
1757                 setup_inline_extent_backref(trans, root, path, iref, parent,
1758                                             root_objectid, owner, offset,
1759                                             refs_to_add, extent_op);
1760                 ret = 0;
1761         }
1762         return ret;
1763 }
1764
1765 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1766                                  struct btrfs_root *root,
1767                                  struct btrfs_path *path,
1768                                  u64 bytenr, u64 parent, u64 root_objectid,
1769                                  u64 owner, u64 offset, int refs_to_add)
1770 {
1771         int ret;
1772         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1773                 BUG_ON(refs_to_add != 1);
1774                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1775                                             parent, root_objectid);
1776         } else {
1777                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1778                                              parent, root_objectid,
1779                                              owner, offset, refs_to_add);
1780         }
1781         return ret;
1782 }
1783
1784 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1785                                  struct btrfs_root *root,
1786                                  struct btrfs_path *path,
1787                                  struct btrfs_extent_inline_ref *iref,
1788                                  int refs_to_drop, int is_data)
1789 {
1790         int ret = 0;
1791
1792         BUG_ON(!is_data && refs_to_drop != 1);
1793         if (iref) {
1794                 update_inline_extent_backref(trans, root, path, iref,
1795                                              -refs_to_drop, NULL);
1796         } else if (is_data) {
1797                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1798         } else {
1799                 ret = btrfs_del_item(trans, root, path);
1800         }
1801         return ret;
1802 }
1803
1804 static int btrfs_issue_discard(struct block_device *bdev,
1805                                 u64 start, u64 len)
1806 {
1807         return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1808 }
1809
1810 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1811                                 u64 num_bytes, u64 *actual_bytes)
1812 {
1813         int ret;
1814         u64 discarded_bytes = 0;
1815         struct btrfs_bio *bbio = NULL;
1816
1817
1818         /* Tell the block device(s) that the sectors can be discarded */
1819         ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1820                               bytenr, &num_bytes, &bbio, 0);
1821         /* Error condition is -ENOMEM */
1822         if (!ret) {
1823                 struct btrfs_bio_stripe *stripe = bbio->stripes;
1824                 int i;
1825
1826
1827                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1828                         if (!stripe->dev->can_discard)
1829                                 continue;
1830
1831                         ret = btrfs_issue_discard(stripe->dev->bdev,
1832                                                   stripe->physical,
1833                                                   stripe->length);
1834                         if (!ret)
1835                                 discarded_bytes += stripe->length;
1836                         else if (ret != -EOPNOTSUPP)
1837                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1838
1839                         /*
1840                          * Just in case we get back EOPNOTSUPP for some reason,
1841                          * just ignore the return value so we don't screw up
1842                          * people calling discard_extent.
1843                          */
1844                         ret = 0;
1845                 }
1846                 kfree(bbio);
1847         }
1848
1849         if (actual_bytes)
1850                 *actual_bytes = discarded_bytes;
1851
1852
1853         return ret;
1854 }
1855
1856 /* Can return -ENOMEM */
1857 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1858                          struct btrfs_root *root,
1859                          u64 bytenr, u64 num_bytes, u64 parent,
1860                          u64 root_objectid, u64 owner, u64 offset, int for_cow)
1861 {
1862         int ret;
1863         struct btrfs_fs_info *fs_info = root->fs_info;
1864
1865         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1866                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1867
1868         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1869                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1870                                         num_bytes,
1871                                         parent, root_objectid, (int)owner,
1872                                         BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1873         } else {
1874                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1875                                         num_bytes,
1876                                         parent, root_objectid, owner, offset,
1877                                         BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1878         }
1879         return ret;
1880 }
1881
1882 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1883                                   struct btrfs_root *root,
1884                                   u64 bytenr, u64 num_bytes,
1885                                   u64 parent, u64 root_objectid,
1886                                   u64 owner, u64 offset, int refs_to_add,
1887                                   struct btrfs_delayed_extent_op *extent_op)
1888 {
1889         struct btrfs_path *path;
1890         struct extent_buffer *leaf;
1891         struct btrfs_extent_item *item;
1892         u64 refs;
1893         int ret;
1894         int err = 0;
1895
1896         path = btrfs_alloc_path();
1897         if (!path)
1898                 return -ENOMEM;
1899
1900         path->reada = 1;
1901         path->leave_spinning = 1;
1902         /* this will setup the path even if it fails to insert the back ref */
1903         ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1904                                            path, bytenr, num_bytes, parent,
1905                                            root_objectid, owner, offset,
1906                                            refs_to_add, extent_op);
1907         if (ret == 0)
1908                 goto out;
1909
1910         if (ret != -EAGAIN) {
1911                 err = ret;
1912                 goto out;
1913         }
1914
1915         leaf = path->nodes[0];
1916         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1917         refs = btrfs_extent_refs(leaf, item);
1918         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1919         if (extent_op)
1920                 __run_delayed_extent_op(extent_op, leaf, item);
1921
1922         btrfs_mark_buffer_dirty(leaf);
1923         btrfs_release_path(path);
1924
1925         path->reada = 1;
1926         path->leave_spinning = 1;
1927
1928         /* now insert the actual backref */
1929         ret = insert_extent_backref(trans, root->fs_info->extent_root,
1930                                     path, bytenr, parent, root_objectid,
1931                                     owner, offset, refs_to_add);
1932         if (ret)
1933                 btrfs_abort_transaction(trans, root, ret);
1934 out:
1935         btrfs_free_path(path);
1936         return err;
1937 }
1938
1939 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1940                                 struct btrfs_root *root,
1941                                 struct btrfs_delayed_ref_node *node,
1942                                 struct btrfs_delayed_extent_op *extent_op,
1943                                 int insert_reserved)
1944 {
1945         int ret = 0;
1946         struct btrfs_delayed_data_ref *ref;
1947         struct btrfs_key ins;
1948         u64 parent = 0;
1949         u64 ref_root = 0;
1950         u64 flags = 0;
1951
1952         ins.objectid = node->bytenr;
1953         ins.offset = node->num_bytes;
1954         ins.type = BTRFS_EXTENT_ITEM_KEY;
1955
1956         ref = btrfs_delayed_node_to_data_ref(node);
1957         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1958                 parent = ref->parent;
1959         else
1960                 ref_root = ref->root;
1961
1962         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1963                 if (extent_op) {
1964                         BUG_ON(extent_op->update_key);
1965                         flags |= extent_op->flags_to_set;
1966                 }
1967                 ret = alloc_reserved_file_extent(trans, root,
1968                                                  parent, ref_root, flags,
1969                                                  ref->objectid, ref->offset,
1970                                                  &ins, node->ref_mod);
1971         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1972                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1973                                              node->num_bytes, parent,
1974                                              ref_root, ref->objectid,
1975                                              ref->offset, node->ref_mod,
1976                                              extent_op);
1977         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1978                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1979                                           node->num_bytes, parent,
1980                                           ref_root, ref->objectid,
1981                                           ref->offset, node->ref_mod,
1982                                           extent_op);
1983         } else {
1984                 BUG();
1985         }
1986         return ret;
1987 }
1988
1989 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1990                                     struct extent_buffer *leaf,
1991                                     struct btrfs_extent_item *ei)
1992 {
1993         u64 flags = btrfs_extent_flags(leaf, ei);
1994         if (extent_op->update_flags) {
1995                 flags |= extent_op->flags_to_set;
1996                 btrfs_set_extent_flags(leaf, ei, flags);
1997         }
1998
1999         if (extent_op->update_key) {
2000                 struct btrfs_tree_block_info *bi;
2001                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2002                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2003                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2004         }
2005 }
2006
2007 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2008                                  struct btrfs_root *root,
2009                                  struct btrfs_delayed_ref_node *node,
2010                                  struct btrfs_delayed_extent_op *extent_op)
2011 {
2012         struct btrfs_key key;
2013         struct btrfs_path *path;
2014         struct btrfs_extent_item *ei;
2015         struct extent_buffer *leaf;
2016         u32 item_size;
2017         int ret;
2018         int err = 0;
2019
2020         if (trans->aborted)
2021                 return 0;
2022
2023         path = btrfs_alloc_path();
2024         if (!path)
2025                 return -ENOMEM;
2026
2027         key.objectid = node->bytenr;
2028         key.type = BTRFS_EXTENT_ITEM_KEY;
2029         key.offset = node->num_bytes;
2030
2031         path->reada = 1;
2032         path->leave_spinning = 1;
2033         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2034                                 path, 0, 1);
2035         if (ret < 0) {
2036                 err = ret;
2037                 goto out;
2038         }
2039         if (ret > 0) {
2040                 err = -EIO;
2041                 goto out;
2042         }
2043
2044         leaf = path->nodes[0];
2045         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2046 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2047         if (item_size < sizeof(*ei)) {
2048                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2049                                              path, (u64)-1, 0);
2050                 if (ret < 0) {
2051                         err = ret;
2052                         goto out;
2053                 }
2054                 leaf = path->nodes[0];
2055                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2056         }
2057 #endif
2058         BUG_ON(item_size < sizeof(*ei));
2059         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2060         __run_delayed_extent_op(extent_op, leaf, ei);
2061
2062         btrfs_mark_buffer_dirty(leaf);
2063 out:
2064         btrfs_free_path(path);
2065         return err;
2066 }
2067
2068 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2069                                 struct btrfs_root *root,
2070                                 struct btrfs_delayed_ref_node *node,
2071                                 struct btrfs_delayed_extent_op *extent_op,
2072                                 int insert_reserved)
2073 {
2074         int ret = 0;
2075         struct btrfs_delayed_tree_ref *ref;
2076         struct btrfs_key ins;
2077         u64 parent = 0;
2078         u64 ref_root = 0;
2079
2080         ins.objectid = node->bytenr;
2081         ins.offset = node->num_bytes;
2082         ins.type = BTRFS_EXTENT_ITEM_KEY;
2083
2084         ref = btrfs_delayed_node_to_tree_ref(node);
2085         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2086                 parent = ref->parent;
2087         else
2088                 ref_root = ref->root;
2089
2090         BUG_ON(node->ref_mod != 1);
2091         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2092                 BUG_ON(!extent_op || !extent_op->update_flags ||
2093                        !extent_op->update_key);
2094                 ret = alloc_reserved_tree_block(trans, root,
2095                                                 parent, ref_root,
2096                                                 extent_op->flags_to_set,
2097                                                 &extent_op->key,
2098                                                 ref->level, &ins);
2099         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2100                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2101                                              node->num_bytes, parent, ref_root,
2102                                              ref->level, 0, 1, extent_op);
2103         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2104                 ret = __btrfs_free_extent(trans, root, node->bytenr,
2105                                           node->num_bytes, parent, ref_root,
2106                                           ref->level, 0, 1, extent_op);
2107         } else {
2108                 BUG();
2109         }
2110         return ret;
2111 }
2112
2113 /* helper function to actually process a single delayed ref entry */
2114 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2115                                struct btrfs_root *root,
2116                                struct btrfs_delayed_ref_node *node,
2117                                struct btrfs_delayed_extent_op *extent_op,
2118                                int insert_reserved)
2119 {
2120         int ret = 0;
2121
2122         if (trans->aborted)
2123                 return 0;
2124
2125         if (btrfs_delayed_ref_is_head(node)) {
2126                 struct btrfs_delayed_ref_head *head;
2127                 /*
2128                  * we've hit the end of the chain and we were supposed
2129                  * to insert this extent into the tree.  But, it got
2130                  * deleted before we ever needed to insert it, so all
2131                  * we have to do is clean up the accounting
2132                  */
2133                 BUG_ON(extent_op);
2134                 head = btrfs_delayed_node_to_head(node);
2135                 if (insert_reserved) {
2136                         btrfs_pin_extent(root, node->bytenr,
2137                                          node->num_bytes, 1);
2138                         if (head->is_data) {
2139                                 ret = btrfs_del_csums(trans, root,
2140                                                       node->bytenr,
2141                                                       node->num_bytes);
2142                         }
2143                 }
2144                 return ret;
2145         }
2146
2147         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2148             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2149                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2150                                            insert_reserved);
2151         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2152                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2153                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2154                                            insert_reserved);
2155         else
2156                 BUG();
2157         return ret;
2158 }
2159
2160 static noinline struct btrfs_delayed_ref_node *
2161 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2162 {
2163         struct rb_node *node;
2164         struct btrfs_delayed_ref_node *ref;
2165         int action = BTRFS_ADD_DELAYED_REF;
2166 again:
2167         /*
2168          * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2169          * this prevents ref count from going down to zero when
2170          * there still are pending delayed ref.
2171          */
2172         node = rb_prev(&head->node.rb_node);
2173         while (1) {
2174                 if (!node)
2175                         break;
2176                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
2177                                 rb_node);
2178                 if (ref->bytenr != head->node.bytenr)
2179                         break;
2180                 if (ref->action == action)
2181                         return ref;
2182                 node = rb_prev(node);
2183         }
2184         if (action == BTRFS_ADD_DELAYED_REF) {
2185                 action = BTRFS_DROP_DELAYED_REF;
2186                 goto again;
2187         }
2188         return NULL;
2189 }
2190
2191 /*
2192  * Returns 0 on success or if called with an already aborted transaction.
2193  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2194  */
2195 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2196                                        struct btrfs_root *root,
2197                                        struct list_head *cluster)
2198 {
2199         struct btrfs_delayed_ref_root *delayed_refs;
2200         struct btrfs_delayed_ref_node *ref;
2201         struct btrfs_delayed_ref_head *locked_ref = NULL;
2202         struct btrfs_delayed_extent_op *extent_op;
2203         struct btrfs_fs_info *fs_info = root->fs_info;
2204         int ret;
2205         int count = 0;
2206         int must_insert_reserved = 0;
2207
2208         delayed_refs = &trans->transaction->delayed_refs;
2209         while (1) {
2210                 if (!locked_ref) {
2211                         /* pick a new head ref from the cluster list */
2212                         if (list_empty(cluster))
2213                                 break;
2214
2215                         locked_ref = list_entry(cluster->next,
2216                                      struct btrfs_delayed_ref_head, cluster);
2217
2218                         /* grab the lock that says we are going to process
2219                          * all the refs for this head */
2220                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2221
2222                         /*
2223                          * we may have dropped the spin lock to get the head
2224                          * mutex lock, and that might have given someone else
2225                          * time to free the head.  If that's true, it has been
2226                          * removed from our list and we can move on.
2227                          */
2228                         if (ret == -EAGAIN) {
2229                                 locked_ref = NULL;
2230                                 count++;
2231                                 continue;
2232                         }
2233                 }
2234
2235                 /*
2236                  * We need to try and merge add/drops of the same ref since we
2237                  * can run into issues with relocate dropping the implicit ref
2238                  * and then it being added back again before the drop can
2239                  * finish.  If we merged anything we need to re-loop so we can
2240                  * get a good ref.
2241                  */
2242                 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2243                                          locked_ref);
2244
2245                 /*
2246                  * locked_ref is the head node, so we have to go one
2247                  * node back for any delayed ref updates
2248                  */
2249                 ref = select_delayed_ref(locked_ref);
2250
2251                 if (ref && ref->seq &&
2252                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2253                         /*
2254                          * there are still refs with lower seq numbers in the
2255                          * process of being added. Don't run this ref yet.
2256                          */
2257                         list_del_init(&locked_ref->cluster);
2258                         btrfs_delayed_ref_unlock(locked_ref);
2259                         locked_ref = NULL;
2260                         delayed_refs->num_heads_ready++;
2261                         spin_unlock(&delayed_refs->lock);
2262                         cond_resched();
2263                         spin_lock(&delayed_refs->lock);
2264                         continue;
2265                 }
2266
2267                 /*
2268                  * record the must insert reserved flag before we
2269                  * drop the spin lock.
2270                  */
2271                 must_insert_reserved = locked_ref->must_insert_reserved;
2272                 locked_ref->must_insert_reserved = 0;
2273
2274                 extent_op = locked_ref->extent_op;
2275                 locked_ref->extent_op = NULL;
2276
2277                 if (!ref) {
2278                         /* All delayed refs have been processed, Go ahead
2279                          * and send the head node to run_one_delayed_ref,
2280                          * so that any accounting fixes can happen
2281                          */
2282                         ref = &locked_ref->node;
2283
2284                         if (extent_op && must_insert_reserved) {
2285                                 btrfs_free_delayed_extent_op(extent_op);
2286                                 extent_op = NULL;
2287                         }
2288
2289                         if (extent_op) {
2290                                 spin_unlock(&delayed_refs->lock);
2291
2292                                 ret = run_delayed_extent_op(trans, root,
2293                                                             ref, extent_op);
2294                                 btrfs_free_delayed_extent_op(extent_op);
2295
2296                                 if (ret) {
2297                                         printk(KERN_DEBUG
2298                                                "btrfs: run_delayed_extent_op "
2299                                                "returned %d\n", ret);
2300                                         spin_lock(&delayed_refs->lock);
2301                                         btrfs_delayed_ref_unlock(locked_ref);
2302                                         return ret;
2303                                 }
2304
2305                                 goto next;
2306                         }
2307                 }
2308
2309                 ref->in_tree = 0;
2310                 rb_erase(&ref->rb_node, &delayed_refs->root);
2311                 delayed_refs->num_entries--;
2312                 if (!btrfs_delayed_ref_is_head(ref)) {
2313                         /*
2314                          * when we play the delayed ref, also correct the
2315                          * ref_mod on head
2316                          */
2317                         switch (ref->action) {
2318                         case BTRFS_ADD_DELAYED_REF:
2319                         case BTRFS_ADD_DELAYED_EXTENT:
2320                                 locked_ref->node.ref_mod -= ref->ref_mod;
2321                                 break;
2322                         case BTRFS_DROP_DELAYED_REF:
2323                                 locked_ref->node.ref_mod += ref->ref_mod;
2324                                 break;
2325                         default:
2326                                 WARN_ON(1);
2327                         }
2328                 }
2329                 spin_unlock(&delayed_refs->lock);
2330
2331                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2332                                           must_insert_reserved);
2333
2334                 btrfs_free_delayed_extent_op(extent_op);
2335                 if (ret) {
2336                         btrfs_delayed_ref_unlock(locked_ref);
2337                         btrfs_put_delayed_ref(ref);
2338                         printk(KERN_DEBUG
2339                                "btrfs: run_one_delayed_ref returned %d\n", ret);
2340                         spin_lock(&delayed_refs->lock);
2341                         return ret;
2342                 }
2343
2344                 /*
2345                  * If this node is a head, that means all the refs in this head
2346                  * have been dealt with, and we will pick the next head to deal
2347                  * with, so we must unlock the head and drop it from the cluster
2348                  * list before we release it.
2349                  */
2350                 if (btrfs_delayed_ref_is_head(ref)) {
2351                         list_del_init(&locked_ref->cluster);
2352                         btrfs_delayed_ref_unlock(locked_ref);
2353                         locked_ref = NULL;
2354                 }
2355                 btrfs_put_delayed_ref(ref);
2356                 count++;
2357 next:
2358                 cond_resched();
2359                 spin_lock(&delayed_refs->lock);
2360         }
2361         return count;
2362 }
2363
2364 #ifdef SCRAMBLE_DELAYED_REFS
2365 /*
2366  * Normally delayed refs get processed in ascending bytenr order. This
2367  * correlates in most cases to the order added. To expose dependencies on this
2368  * order, we start to process the tree in the middle instead of the beginning
2369  */
2370 static u64 find_middle(struct rb_root *root)
2371 {
2372         struct rb_node *n = root->rb_node;
2373         struct btrfs_delayed_ref_node *entry;
2374         int alt = 1;
2375         u64 middle;
2376         u64 first = 0, last = 0;
2377
2378         n = rb_first(root);
2379         if (n) {
2380                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2381                 first = entry->bytenr;
2382         }
2383         n = rb_last(root);
2384         if (n) {
2385                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2386                 last = entry->bytenr;
2387         }
2388         n = root->rb_node;
2389
2390         while (n) {
2391                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2392                 WARN_ON(!entry->in_tree);
2393
2394                 middle = entry->bytenr;
2395
2396                 if (alt)
2397                         n = n->rb_left;
2398                 else
2399                         n = n->rb_right;
2400
2401                 alt = 1 - alt;
2402         }
2403         return middle;
2404 }
2405 #endif
2406
2407 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2408                                          struct btrfs_fs_info *fs_info)
2409 {
2410         struct qgroup_update *qgroup_update;
2411         int ret = 0;
2412
2413         if (list_empty(&trans->qgroup_ref_list) !=
2414             !trans->delayed_ref_elem.seq) {
2415                 /* list without seq or seq without list */
2416                 printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
2417                         list_empty(&trans->qgroup_ref_list) ? "" : " not",
2418                         trans->delayed_ref_elem.seq);
2419                 BUG();
2420         }
2421
2422         if (!trans->delayed_ref_elem.seq)
2423                 return 0;
2424
2425         while (!list_empty(&trans->qgroup_ref_list)) {
2426                 qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2427                                                  struct qgroup_update, list);
2428                 list_del(&qgroup_update->list);
2429                 if (!ret)
2430                         ret = btrfs_qgroup_account_ref(
2431                                         trans, fs_info, qgroup_update->node,
2432                                         qgroup_update->extent_op);
2433                 kfree(qgroup_update);
2434         }
2435
2436         btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2437
2438         return ret;
2439 }
2440
2441 /*
2442  * this starts processing the delayed reference count updates and
2443  * extent insertions we have queued up so far.  count can be
2444  * 0, which means to process everything in the tree at the start
2445  * of the run (but not newly added entries), or it can be some target
2446  * number you'd like to process.
2447  *
2448  * Returns 0 on success or if called with an aborted transaction
2449  * Returns <0 on error and aborts the transaction
2450  */
2451 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2452                            struct btrfs_root *root, unsigned long count)
2453 {
2454         struct rb_node *node;
2455         struct btrfs_delayed_ref_root *delayed_refs;
2456         struct btrfs_delayed_ref_node *ref;
2457         struct list_head cluster;
2458         int ret;
2459         u64 delayed_start;
2460         int run_all = count == (unsigned long)-1;
2461         int run_most = 0;
2462         int loops;
2463
2464         /* We'll clean this up in btrfs_cleanup_transaction */
2465         if (trans->aborted)
2466                 return 0;
2467
2468         if (root == root->fs_info->extent_root)
2469                 root = root->fs_info->tree_root;
2470
2471         btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2472
2473         delayed_refs = &trans->transaction->delayed_refs;
2474         INIT_LIST_HEAD(&cluster);
2475 again:
2476         loops = 0;
2477         spin_lock(&delayed_refs->lock);
2478
2479 #ifdef SCRAMBLE_DELAYED_REFS
2480         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2481 #endif
2482
2483         if (count == 0) {
2484                 count = delayed_refs->num_entries * 2;
2485                 run_most = 1;
2486         }
2487         while (1) {
2488                 if (!(run_all || run_most) &&
2489                     delayed_refs->num_heads_ready < 64)
2490                         break;
2491
2492                 /*
2493                  * go find something we can process in the rbtree.  We start at
2494                  * the beginning of the tree, and then build a cluster
2495                  * of refs to process starting at the first one we are able to
2496                  * lock
2497                  */
2498                 delayed_start = delayed_refs->run_delayed_start;
2499                 ret = btrfs_find_ref_cluster(trans, &cluster,
2500                                              delayed_refs->run_delayed_start);
2501                 if (ret)
2502                         break;
2503
2504                 ret = run_clustered_refs(trans, root, &cluster);
2505                 if (ret < 0) {
2506                         btrfs_release_ref_cluster(&cluster);
2507                         spin_unlock(&delayed_refs->lock);
2508                         btrfs_abort_transaction(trans, root, ret);
2509                         return ret;
2510                 }
2511
2512                 count -= min_t(unsigned long, ret, count);
2513
2514                 if (count == 0)
2515                         break;
2516
2517                 if (delayed_start >= delayed_refs->run_delayed_start) {
2518                         if (loops == 0) {
2519                                 /*
2520                                  * btrfs_find_ref_cluster looped. let's do one
2521                                  * more cycle. if we don't run any delayed ref
2522                                  * during that cycle (because we can't because
2523                                  * all of them are blocked), bail out.
2524                                  */
2525                                 loops = 1;
2526                         } else {
2527                                 /*
2528                                  * no runnable refs left, stop trying
2529                                  */
2530                                 BUG_ON(run_all);
2531                                 break;
2532                         }
2533                 }
2534                 if (ret) {
2535                         /* refs were run, let's reset staleness detection */
2536                         loops = 0;
2537                 }
2538         }
2539
2540         if (run_all) {
2541                 if (!list_empty(&trans->new_bgs)) {
2542                         spin_unlock(&delayed_refs->lock);
2543                         btrfs_create_pending_block_groups(trans, root);
2544                         spin_lock(&delayed_refs->lock);
2545                 }
2546
2547                 node = rb_first(&delayed_refs->root);
2548                 if (!node)
2549                         goto out;
2550                 count = (unsigned long)-1;
2551
2552                 while (node) {
2553                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
2554                                        rb_node);
2555                         if (btrfs_delayed_ref_is_head(ref)) {
2556                                 struct btrfs_delayed_ref_head *head;
2557
2558                                 head = btrfs_delayed_node_to_head(ref);
2559                                 atomic_inc(&ref->refs);
2560
2561                                 spin_unlock(&delayed_refs->lock);
2562                                 /*
2563                                  * Mutex was contended, block until it's
2564                                  * released and try again
2565                                  */
2566                                 mutex_lock(&head->mutex);
2567                                 mutex_unlock(&head->mutex);
2568
2569                                 btrfs_put_delayed_ref(ref);
2570                                 cond_resched();
2571                                 goto again;
2572                         }
2573                         node = rb_next(node);
2574                 }
2575                 spin_unlock(&delayed_refs->lock);
2576                 schedule_timeout(1);
2577                 goto again;
2578         }
2579 out:
2580         spin_unlock(&delayed_refs->lock);
2581         assert_qgroups_uptodate(trans);
2582         return 0;
2583 }
2584
2585 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2586                                 struct btrfs_root *root,
2587                                 u64 bytenr, u64 num_bytes, u64 flags,
2588                                 int is_data)
2589 {
2590         struct btrfs_delayed_extent_op *extent_op;
2591         int ret;
2592
2593         extent_op = btrfs_alloc_delayed_extent_op();
2594         if (!extent_op)
2595                 return -ENOMEM;
2596
2597         extent_op->flags_to_set = flags;
2598         extent_op->update_flags = 1;
2599         extent_op->update_key = 0;
2600         extent_op->is_data = is_data ? 1 : 0;
2601
2602         ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2603                                           num_bytes, extent_op);
2604         if (ret)
2605                 btrfs_free_delayed_extent_op(extent_op);
2606         return ret;
2607 }
2608
2609 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2610                                       struct btrfs_root *root,
2611                                       struct btrfs_path *path,
2612                                       u64 objectid, u64 offset, u64 bytenr)
2613 {
2614         struct btrfs_delayed_ref_head *head;
2615         struct btrfs_delayed_ref_node *ref;
2616         struct btrfs_delayed_data_ref *data_ref;
2617         struct btrfs_delayed_ref_root *delayed_refs;
2618         struct rb_node *node;
2619         int ret = 0;
2620
2621         ret = -ENOENT;
2622         delayed_refs = &trans->transaction->delayed_refs;
2623         spin_lock(&delayed_refs->lock);
2624         head = btrfs_find_delayed_ref_head(trans, bytenr);
2625         if (!head)
2626                 goto out;
2627
2628         if (!mutex_trylock(&head->mutex)) {
2629                 atomic_inc(&head->node.refs);
2630                 spin_unlock(&delayed_refs->lock);
2631
2632                 btrfs_release_path(path);
2633
2634                 /*
2635                  * Mutex was contended, block until it's released and let
2636                  * caller try again
2637                  */
2638                 mutex_lock(&head->mutex);
2639                 mutex_unlock(&head->mutex);
2640                 btrfs_put_delayed_ref(&head->node);
2641                 return -EAGAIN;
2642         }
2643
2644         node = rb_prev(&head->node.rb_node);
2645         if (!node)
2646                 goto out_unlock;
2647
2648         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2649
2650         if (ref->bytenr != bytenr)
2651                 goto out_unlock;
2652
2653         ret = 1;
2654         if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2655                 goto out_unlock;
2656
2657         data_ref = btrfs_delayed_node_to_data_ref(ref);
2658
2659         node = rb_prev(node);
2660         if (node) {
2661                 int seq = ref->seq;
2662
2663                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2664                 if (ref->bytenr == bytenr && ref->seq == seq)
2665                         goto out_unlock;
2666         }
2667
2668         if (data_ref->root != root->root_key.objectid ||
2669             data_ref->objectid != objectid || data_ref->offset != offset)
2670                 goto out_unlock;
2671
2672         ret = 0;
2673 out_unlock:
2674         mutex_unlock(&head->mutex);
2675 out:
2676         spin_unlock(&delayed_refs->lock);
2677         return ret;
2678 }
2679
2680 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2681                                         struct btrfs_root *root,
2682                                         struct btrfs_path *path,
2683                                         u64 objectid, u64 offset, u64 bytenr)
2684 {
2685         struct btrfs_root *extent_root = root->fs_info->extent_root;
2686         struct extent_buffer *leaf;
2687         struct btrfs_extent_data_ref *ref;
2688         struct btrfs_extent_inline_ref *iref;
2689         struct btrfs_extent_item *ei;
2690         struct btrfs_key key;
2691         u32 item_size;
2692         int ret;
2693
2694         key.objectid = bytenr;
2695         key.offset = (u64)-1;
2696         key.type = BTRFS_EXTENT_ITEM_KEY;
2697
2698         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2699         if (ret < 0)
2700                 goto out;
2701         BUG_ON(ret == 0); /* Corruption */
2702
2703         ret = -ENOENT;
2704         if (path->slots[0] == 0)
2705                 goto out;
2706
2707         path->slots[0]--;
2708         leaf = path->nodes[0];
2709         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2710
2711         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2712                 goto out;
2713
2714         ret = 1;
2715         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2716 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2717         if (item_size < sizeof(*ei)) {
2718                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2719                 goto out;
2720         }
2721 #endif
2722         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2723
2724         if (item_size != sizeof(*ei) +
2725             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2726                 goto out;
2727
2728         if (btrfs_extent_generation(leaf, ei) <=
2729             btrfs_root_last_snapshot(&root->root_item))
2730                 goto out;
2731
2732         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2733         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2734             BTRFS_EXTENT_DATA_REF_KEY)
2735                 goto out;
2736
2737         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2738         if (btrfs_extent_refs(leaf, ei) !=
2739             btrfs_extent_data_ref_count(leaf, ref) ||
2740             btrfs_extent_data_ref_root(leaf, ref) !=
2741             root->root_key.objectid ||
2742             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2743             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2744                 goto out;
2745
2746         ret = 0;
2747 out:
2748         return ret;
2749 }
2750
2751 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2752                           struct btrfs_root *root,
2753                           u64 objectid, u64 offset, u64 bytenr)
2754 {
2755         struct btrfs_path *path;
2756         int ret;
2757         int ret2;
2758
2759         path = btrfs_alloc_path();
2760         if (!path)
2761                 return -ENOENT;
2762
2763         do {
2764                 ret = check_committed_ref(trans, root, path, objectid,
2765                                           offset, bytenr);
2766                 if (ret && ret != -ENOENT)
2767                         goto out;
2768
2769                 ret2 = check_delayed_ref(trans, root, path, objectid,
2770                                          offset, bytenr);
2771         } while (ret2 == -EAGAIN);
2772
2773         if (ret2 && ret2 != -ENOENT) {
2774                 ret = ret2;
2775                 goto out;
2776         }
2777
2778         if (ret != -ENOENT || ret2 != -ENOENT)
2779                 ret = 0;
2780 out:
2781         btrfs_free_path(path);
2782         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2783                 WARN_ON(ret > 0);
2784         return ret;
2785 }
2786
2787 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2788                            struct btrfs_root *root,
2789                            struct extent_buffer *buf,
2790                            int full_backref, int inc, int for_cow)
2791 {
2792         u64 bytenr;
2793         u64 num_bytes;
2794         u64 parent;
2795         u64 ref_root;
2796         u32 nritems;
2797         struct btrfs_key key;
2798         struct btrfs_file_extent_item *fi;
2799         int i;
2800         int level;
2801         int ret = 0;
2802         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2803                             u64, u64, u64, u64, u64, u64, int);
2804
2805         ref_root = btrfs_header_owner(buf);
2806         nritems = btrfs_header_nritems(buf);
2807         level = btrfs_header_level(buf);
2808
2809         if (!root->ref_cows && level == 0)
2810                 return 0;
2811
2812         if (inc)
2813                 process_func = btrfs_inc_extent_ref;
2814         else
2815                 process_func = btrfs_free_extent;
2816
2817         if (full_backref)
2818                 parent = buf->start;
2819         else
2820                 parent = 0;
2821
2822         for (i = 0; i < nritems; i++) {
2823                 if (level == 0) {
2824                         btrfs_item_key_to_cpu(buf, &key, i);
2825                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2826                                 continue;
2827                         fi = btrfs_item_ptr(buf, i,
2828                                             struct btrfs_file_extent_item);
2829                         if (btrfs_file_extent_type(buf, fi) ==
2830                             BTRFS_FILE_EXTENT_INLINE)
2831                                 continue;
2832                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2833                         if (bytenr == 0)
2834                                 continue;
2835
2836                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2837                         key.offset -= btrfs_file_extent_offset(buf, fi);
2838                         ret = process_func(trans, root, bytenr, num_bytes,
2839                                            parent, ref_root, key.objectid,
2840                                            key.offset, for_cow);
2841                         if (ret)
2842                                 goto fail;
2843                 } else {
2844                         bytenr = btrfs_node_blockptr(buf, i);
2845                         num_bytes = btrfs_level_size(root, level - 1);
2846                         ret = process_func(trans, root, bytenr, num_bytes,
2847                                            parent, ref_root, level - 1, 0,
2848                                            for_cow);
2849                         if (ret)
2850                                 goto fail;
2851                 }
2852         }
2853         return 0;
2854 fail:
2855         return ret;
2856 }
2857
2858 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2859                   struct extent_buffer *buf, int full_backref, int for_cow)
2860 {
2861         return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2862 }
2863
2864 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2865                   struct extent_buffer *buf, int full_backref, int for_cow)
2866 {
2867         return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2868 }
2869
2870 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2871                                  struct btrfs_root *root,
2872                                  struct btrfs_path *path,
2873                                  struct btrfs_block_group_cache *cache)
2874 {
2875         int ret;
2876         struct btrfs_root *extent_root = root->fs_info->extent_root;
2877         unsigned long bi;
2878         struct extent_buffer *leaf;
2879
2880         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2881         if (ret < 0)
2882                 goto fail;
2883         BUG_ON(ret); /* Corruption */
2884
2885         leaf = path->nodes[0];
2886         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2887         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2888         btrfs_mark_buffer_dirty(leaf);
2889         btrfs_release_path(path);
2890 fail:
2891         if (ret) {
2892                 btrfs_abort_transaction(trans, root, ret);
2893                 return ret;
2894         }
2895         return 0;
2896
2897 }
2898
2899 static struct btrfs_block_group_cache *
2900 next_block_group(struct btrfs_root *root,
2901                  struct btrfs_block_group_cache *cache)
2902 {
2903         struct rb_node *node;
2904         spin_lock(&root->fs_info->block_group_cache_lock);
2905         node = rb_next(&cache->cache_node);
2906         btrfs_put_block_group(cache);
2907         if (node) {
2908                 cache = rb_entry(node, struct btrfs_block_group_cache,
2909                                  cache_node);
2910                 btrfs_get_block_group(cache);
2911         } else
2912                 cache = NULL;
2913         spin_unlock(&root->fs_info->block_group_cache_lock);
2914         return cache;
2915 }
2916
2917 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2918                             struct btrfs_trans_handle *trans,
2919                             struct btrfs_path *path)
2920 {
2921         struct btrfs_root *root = block_group->fs_info->tree_root;
2922         struct inode *inode = NULL;
2923         u64 alloc_hint = 0;
2924         int dcs = BTRFS_DC_ERROR;
2925         int num_pages = 0;
2926         int retries = 0;
2927         int ret = 0;
2928
2929         /*
2930          * If this block group is smaller than 100 megs don't bother caching the
2931          * block group.
2932          */
2933         if (block_group->key.offset < (100 * 1024 * 1024)) {
2934                 spin_lock(&block_group->lock);
2935                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2936                 spin_unlock(&block_group->lock);
2937                 return 0;
2938         }
2939
2940 again:
2941         inode = lookup_free_space_inode(root, block_group, path);
2942         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2943                 ret = PTR_ERR(inode);
2944                 btrfs_release_path(path);
2945                 goto out;
2946         }
2947
2948         if (IS_ERR(inode)) {
2949                 BUG_ON(retries);
2950                 retries++;
2951
2952                 if (block_group->ro)
2953                         goto out_free;
2954
2955                 ret = create_free_space_inode(root, trans, block_group, path);
2956                 if (ret)
2957                         goto out_free;
2958                 goto again;
2959         }
2960
2961         /* We've already setup this transaction, go ahead and exit */
2962         if (block_group->cache_generation == trans->transid &&
2963             i_size_read(inode)) {
2964                 dcs = BTRFS_DC_SETUP;
2965                 goto out_put;
2966         }
2967
2968         /*
2969          * We want to set the generation to 0, that way if anything goes wrong
2970          * from here on out we know not to trust this cache when we load up next
2971          * time.
2972          */
2973         BTRFS_I(inode)->generation = 0;
2974         ret = btrfs_update_inode(trans, root, inode);
2975         WARN_ON(ret);
2976
2977         if (i_size_read(inode) > 0) {
2978                 ret = btrfs_truncate_free_space_cache(root, trans, path,
2979                                                       inode);
2980                 if (ret)
2981                         goto out_put;
2982         }
2983
2984         spin_lock(&block_group->lock);
2985         if (block_group->cached != BTRFS_CACHE_FINISHED ||
2986             !btrfs_test_opt(root, SPACE_CACHE)) {
2987                 /*
2988                  * don't bother trying to write stuff out _if_
2989                  * a) we're not cached,
2990                  * b) we're with nospace_cache mount option.
2991                  */
2992                 dcs = BTRFS_DC_WRITTEN;
2993                 spin_unlock(&block_group->lock);
2994                 goto out_put;
2995         }
2996         spin_unlock(&block_group->lock);
2997
2998         /*
2999          * Try to preallocate enough space based on how big the block group is.
3000          * Keep in mind this has to include any pinned space which could end up
3001          * taking up quite a bit since it's not folded into the other space
3002          * cache.
3003          */
3004         num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3005         if (!num_pages)
3006                 num_pages = 1;
3007
3008         num_pages *= 16;
3009         num_pages *= PAGE_CACHE_SIZE;
3010
3011         ret = btrfs_check_data_free_space(inode, num_pages);
3012         if (ret)
3013                 goto out_put;
3014
3015         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3016                                               num_pages, num_pages,
3017                                               &alloc_hint);
3018         if (!ret)
3019                 dcs = BTRFS_DC_SETUP;
3020         btrfs_free_reserved_data_space(inode, num_pages);
3021
3022 out_put:
3023         iput(inode);
3024 out_free:
3025         btrfs_release_path(path);
3026 out:
3027         spin_lock(&block_group->lock);
3028         if (!ret && dcs == BTRFS_DC_SETUP)
3029                 block_group->cache_generation = trans->transid;
3030         block_group->disk_cache_state = dcs;
3031         spin_unlock(&block_group->lock);
3032
3033         return ret;
3034 }
3035
3036 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3037                                    struct btrfs_root *root)
3038 {
3039         struct btrfs_block_group_cache *cache;
3040         int err = 0;
3041         struct btrfs_path *path;
3042         u64 last = 0;
3043
3044         path = btrfs_alloc_path();
3045         if (!path)
3046                 return -ENOMEM;
3047
3048 again:
3049         while (1) {
3050                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3051                 while (cache) {
3052                         if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3053                                 break;
3054                         cache = next_block_group(root, cache);
3055                 }
3056                 if (!cache) {
3057                         if (last == 0)
3058                                 break;
3059                         last = 0;
3060                         continue;
3061                 }
3062                 err = cache_save_setup(cache, trans, path);
3063                 last = cache->key.objectid + cache->key.offset;
3064                 btrfs_put_block_group(cache);
3065         }
3066
3067         while (1) {
3068                 if (last == 0) {
3069                         err = btrfs_run_delayed_refs(trans, root,
3070                                                      (unsigned long)-1);
3071                         if (err) /* File system offline */
3072                                 goto out;
3073                 }
3074
3075                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3076                 while (cache) {
3077                         if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3078                                 btrfs_put_block_group(cache);
3079                                 goto again;
3080                         }
3081
3082                         if (cache->dirty)
3083                                 break;
3084                         cache = next_block_group(root, cache);
3085                 }
3086                 if (!cache) {
3087                         if (last == 0)
3088                                 break;
3089                         last = 0;
3090                         continue;
3091                 }
3092
3093                 if (cache->disk_cache_state == BTRFS_DC_SETUP)
3094                         cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3095                 cache->dirty = 0;
3096                 last = cache->key.objectid + cache->key.offset;
3097
3098                 err = write_one_cache_group(trans, root, path, cache);
3099                 if (err) /* File system offline */
3100                         goto out;
3101
3102                 btrfs_put_block_group(cache);
3103         }
3104
3105         while (1) {
3106                 /*
3107                  * I don't think this is needed since we're just marking our
3108                  * preallocated extent as written, but just in case it can't
3109                  * hurt.
3110                  */
3111                 if (last == 0) {
3112                         err = btrfs_run_delayed_refs(trans, root,
3113                                                      (unsigned long)-1);
3114                         if (err) /* File system offline */
3115                                 goto out;
3116                 }
3117
3118                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3119                 while (cache) {
3120                         /*
3121                          * Really this shouldn't happen, but it could if we
3122                          * couldn't write the entire preallocated extent and
3123                          * splitting the extent resulted in a new block.
3124                          */
3125                         if (cache->dirty) {
3126                                 btrfs_put_block_group(cache);
3127                                 goto again;
3128                         }
3129                         if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3130                                 break;
3131                         cache = next_block_group(root, cache);
3132                 }
3133                 if (!cache) {
3134                         if (last == 0)
3135                                 break;
3136                         last = 0;
3137                         continue;
3138                 }
3139
3140                 err = btrfs_write_out_cache(root, trans, cache, path);
3141
3142                 /*
3143                  * If we didn't have an error then the cache state is still
3144                  * NEED_WRITE, so we can set it to WRITTEN.
3145                  */
3146                 if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3147                         cache->disk_cache_state = BTRFS_DC_WRITTEN;
3148                 last = cache->key.objectid + cache->key.offset;
3149                 btrfs_put_block_group(cache);
3150         }
3151 out:
3152
3153         btrfs_free_path(path);
3154         return err;
3155 }
3156
3157 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3158 {
3159         struct btrfs_block_group_cache *block_group;
3160         int readonly = 0;
3161
3162         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3163         if (!block_group || block_group->ro)
3164                 readonly = 1;
3165         if (block_group)
3166                 btrfs_put_block_group(block_group);
3167         return readonly;
3168 }
3169
3170 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3171                              u64 total_bytes, u64 bytes_used,
3172                              struct btrfs_space_info **space_info)
3173 {
3174         struct btrfs_space_info *found;
3175         int i;
3176         int factor;
3177
3178         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3179                      BTRFS_BLOCK_GROUP_RAID10))
3180                 factor = 2;
3181         else
3182                 factor = 1;
3183
3184         found = __find_space_info(info, flags);
3185         if (found) {
3186                 spin_lock(&found->lock);
3187                 found->total_bytes += total_bytes;
3188                 found->disk_total += total_bytes * factor;
3189                 found->bytes_used += bytes_used;
3190                 found->disk_used += bytes_used * factor;
3191                 found->full = 0;
3192                 spin_unlock(&found->lock);
3193                 *space_info = found;
3194                 return 0;
3195         }
3196         found = kzalloc(sizeof(*found), GFP_NOFS);
3197         if (!found)
3198                 return -ENOMEM;
3199
3200         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3201                 INIT_LIST_HEAD(&found->block_groups[i]);
3202         init_rwsem(&found->groups_sem);
3203         spin_lock_init(&found->lock);
3204         found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3205         found->total_bytes = total_bytes;
3206         found->disk_total = total_bytes * factor;
3207         found->bytes_used = bytes_used;
3208         found->disk_used = bytes_used * factor;
3209         found->bytes_pinned = 0;
3210         found->bytes_reserved = 0;
3211         found->bytes_readonly = 0;
3212         found->bytes_may_use = 0;
3213         found->full = 0;
3214         found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3215         found->chunk_alloc = 0;
3216         found->flush = 0;
3217         init_waitqueue_head(&found->wait);
3218         *space_info = found;
3219         list_add_rcu(&found->list, &info->space_info);
3220         if (flags & BTRFS_BLOCK_GROUP_DATA)
3221                 info->data_sinfo = found;
3222         return 0;
3223 }
3224
3225 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3226 {
3227         u64 extra_flags = chunk_to_extended(flags) &
3228                                 BTRFS_EXTENDED_PROFILE_MASK;
3229
3230         write_seqlock(&fs_info->profiles_lock);
3231         if (flags & BTRFS_BLOCK_GROUP_DATA)
3232                 fs_info->avail_data_alloc_bits |= extra_flags;
3233         if (flags & BTRFS_BLOCK_GROUP_METADATA)
3234                 fs_info->avail_metadata_alloc_bits |= extra_flags;
3235         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3236                 fs_info->avail_system_alloc_bits |= extra_flags;
3237         write_sequnlock(&fs_info->profiles_lock);
3238 }
3239
3240 /*
3241  * returns target flags in extended format or 0 if restripe for this
3242  * chunk_type is not in progress
3243  *
3244  * should be called with either volume_mutex or balance_lock held
3245  */
3246 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3247 {
3248         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3249         u64 target = 0;
3250
3251         if (!bctl)
3252                 return 0;
3253
3254         if (flags & BTRFS_BLOCK_GROUP_DATA &&
3255             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3256                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3257         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3258                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3259                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3260         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3261                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3262                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3263         }
3264
3265         return target;
3266 }
3267
3268 /*
3269  * @flags: available profiles in extended format (see ctree.h)
3270  *
3271  * Returns reduced profile in chunk format.  If profile changing is in
3272  * progress (either running or paused) picks the target profile (if it's
3273  * already available), otherwise falls back to plain reducing.
3274  */
3275 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3276 {
3277         /*
3278          * we add in the count of missing devices because we want
3279          * to make sure that any RAID levels on a degraded FS
3280          * continue to be honored.
3281          */
3282         u64 num_devices = root->fs_info->fs_devices->rw_devices +
3283                 root->fs_info->fs_devices->missing_devices;
3284         u64 target;
3285
3286         /*
3287          * see if restripe for this chunk_type is in progress, if so
3288          * try to reduce to the target profile
3289          */
3290         spin_lock(&root->fs_info->balance_lock);
3291         target = get_restripe_target(root->fs_info, flags);
3292         if (target) {
3293                 /* pick target profile only if it's already available */
3294                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3295                         spin_unlock(&root->fs_info->balance_lock);
3296                         return extended_to_chunk(target);
3297                 }
3298         }
3299         spin_unlock(&root->fs_info->balance_lock);
3300
3301         if (num_devices == 1)
3302                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3303         if (num_devices < 4)
3304                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3305
3306         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3307             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3308                       BTRFS_BLOCK_GROUP_RAID10))) {
3309                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
3310         }
3311
3312         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3313             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3314                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3315         }
3316
3317         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3318             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3319              (flags & BTRFS_BLOCK_GROUP_RAID10) |
3320              (flags & BTRFS_BLOCK_GROUP_DUP))) {
3321                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3322         }
3323
3324         return extended_to_chunk(flags);
3325 }
3326
3327 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3328 {
3329         unsigned seq;
3330
3331         do {
3332                 seq = read_seqbegin(&root->fs_info->profiles_lock);
3333
3334                 if (flags & BTRFS_BLOCK_GROUP_DATA)
3335                         flags |= root->fs_info->avail_data_alloc_bits;
3336                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3337                         flags |= root->fs_info->avail_system_alloc_bits;
3338                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3339                         flags |= root->fs_info->avail_metadata_alloc_bits;
3340         } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3341
3342         return btrfs_reduce_alloc_profile(root, flags);
3343 }
3344
3345 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3346 {
3347         u64 flags;
3348
3349         if (data)
3350                 flags = BTRFS_BLOCK_GROUP_DATA;
3351         else if (root == root->fs_info->chunk_root)
3352                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3353         else
3354                 flags = BTRFS_BLOCK_GROUP_METADATA;
3355
3356         return get_alloc_profile(root, flags);
3357 }
3358
3359 /*
3360  * This will check the space that the inode allocates from to make sure we have
3361  * enough space for bytes.
3362  */
3363 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3364 {
3365         struct btrfs_space_info *data_sinfo;
3366         struct btrfs_root *root = BTRFS_I(inode)->root;
3367         struct btrfs_fs_info *fs_info = root->fs_info;
3368         u64 used;
3369         int ret = 0, committed = 0, alloc_chunk = 1;
3370
3371         /* make sure bytes are sectorsize aligned */
3372         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3373
3374         if (root == root->fs_info->tree_root ||
3375             BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3376                 alloc_chunk = 0;
3377                 committed = 1;
3378         }
3379
3380         data_sinfo = fs_info->data_sinfo;
3381         if (!data_sinfo)
3382                 goto alloc;
3383
3384 again:
3385         /* make sure we have enough space to handle the data first */
3386         spin_lock(&data_sinfo->lock);
3387         used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3388                 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3389                 data_sinfo->bytes_may_use;
3390
3391         if (used + bytes > data_sinfo->total_bytes) {
3392                 struct btrfs_trans_handle *trans;
3393
3394                 /*
3395                  * if we don't have enough free bytes in this space then we need
3396                  * to alloc a new chunk.
3397                  */
3398                 if (!data_sinfo->full && alloc_chunk) {
3399                         u64 alloc_target;
3400
3401                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3402                         spin_unlock(&data_sinfo->lock);
3403 alloc:
3404                         alloc_target = btrfs_get_alloc_profile(root, 1);
3405                         trans = btrfs_join_transaction(root);
3406                         if (IS_ERR(trans))
3407                                 return PTR_ERR(trans);
3408
3409                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3410                                              alloc_target,
3411                                              CHUNK_ALLOC_NO_FORCE);
3412                         btrfs_end_transaction(trans, root);
3413                         if (ret < 0) {
3414                                 if (ret != -ENOSPC)
3415                                         return ret;
3416                                 else
3417                                         goto commit_trans;
3418                         }
3419
3420                         if (!data_sinfo)
3421                                 data_sinfo = fs_info->data_sinfo;
3422
3423                         goto again;
3424                 }
3425
3426                 /*
3427                  * If we have less pinned bytes than we want to allocate then
3428                  * don't bother committing the transaction, it won't help us.
3429                  */
3430                 if (data_sinfo->bytes_pinned < bytes)
3431                         committed = 1;
3432                 spin_unlock(&data_sinfo->lock);
3433
3434                 /* commit the current transaction and try again */
3435 commit_trans:
3436                 if (!committed &&
3437                     !atomic_read(&root->fs_info->open_ioctl_trans)) {
3438                         committed = 1;
3439                         trans = btrfs_join_transaction(root);
3440                         if (IS_ERR(trans))
3441                                 return PTR_ERR(trans);
3442                         ret = btrfs_commit_transaction(trans, root);
3443                         if (ret)
3444                                 return ret;
3445                         goto again;
3446                 }
3447
3448                 return -ENOSPC;
3449         }
3450         data_sinfo->bytes_may_use += bytes;
3451         trace_btrfs_space_reservation(root->fs_info, "space_info",
3452                                       data_sinfo->flags, bytes, 1);
3453         spin_unlock(&data_sinfo->lock);
3454
3455         return 0;
3456 }
3457
3458 /*
3459  * Called if we need to clear a data reservation for this inode.
3460  */
3461 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3462 {
3463         struct btrfs_root *root = BTRFS_I(inode)->root;
3464         struct btrfs_space_info *data_sinfo;
3465
3466         /* make sure bytes are sectorsize aligned */
3467         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3468
3469         data_sinfo = root->fs_info->data_sinfo;
3470         spin_lock(&data_sinfo->lock);
3471         data_sinfo->bytes_may_use -= bytes;
3472         trace_btrfs_space_reservation(root->fs_info, "space_info",
3473                                       data_sinfo->flags, bytes, 0);
3474         spin_unlock(&data_sinfo->lock);
3475 }
3476
3477 static void force_metadata_allocation(struct btrfs_fs_info *info)
3478 {
3479         struct list_head *head = &info->space_info;
3480         struct btrfs_space_info *found;
3481
3482         rcu_read_lock();
3483         list_for_each_entry_rcu(found, head, list) {
3484                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3485                         found->force_alloc = CHUNK_ALLOC_FORCE;
3486         }
3487         rcu_read_unlock();
3488 }
3489
3490 static int should_alloc_chunk(struct btrfs_root *root,
3491                               struct btrfs_space_info *sinfo, int force)
3492 {
3493         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3494         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3495         u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3496         u64 thresh;
3497
3498         if (force == CHUNK_ALLOC_FORCE)
3499                 return 1;
3500
3501         /*
3502          * We need to take into account the global rsv because for all intents
3503          * and purposes it's used space.  Don't worry about locking the
3504          * global_rsv, it doesn't change except when the transaction commits.
3505          */
3506         if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3507                 num_allocated += global_rsv->size;
3508
3509         /*
3510          * in limited mode, we want to have some free space up to
3511          * about 1% of the FS size.
3512          */
3513         if (force == CHUNK_ALLOC_LIMITED) {
3514                 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3515                 thresh = max_t(u64, 64 * 1024 * 1024,
3516                                div_factor_fine(thresh, 1));
3517
3518                 if (num_bytes - num_allocated < thresh)
3519                         return 1;
3520         }
3521
3522         if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3523                 return 0;
3524         return 1;
3525 }
3526
3527 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3528 {
3529         u64 num_dev;
3530
3531         if (type & BTRFS_BLOCK_GROUP_RAID10 ||
3532             type & BTRFS_BLOCK_GROUP_RAID0)
3533                 num_dev = root->fs_info->fs_devices->rw_devices;
3534         else if (type & BTRFS_BLOCK_GROUP_RAID1)
3535                 num_dev = 2;
3536         else
3537                 num_dev = 1;    /* DUP or single */
3538
3539         /* metadata for updaing devices and chunk tree */
3540         return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3541 }
3542
3543 static void check_system_chunk(struct btrfs_trans_handle *trans,
3544                                struct btrfs_root *root, u64 type)
3545 {
3546         struct btrfs_space_info *info;
3547         u64 left;
3548         u64 thresh;
3549
3550         info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3551         spin_lock(&info->lock);
3552         left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3553                 info->bytes_reserved - info->bytes_readonly;
3554         spin_unlock(&info->lock);
3555
3556         thresh = get_system_chunk_thresh(root, type);
3557         if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3558                 printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
3559                        left, thresh, type);
3560                 dump_space_info(info, 0, 0);
3561         }
3562
3563         if (left < thresh) {
3564                 u64 flags;
3565
3566                 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3567                 btrfs_alloc_chunk(trans, root, flags);
3568         }
3569 }
3570
3571 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3572                           struct btrfs_root *extent_root, u64 flags, int force)
3573 {
3574         struct btrfs_space_info *space_info;
3575         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3576         int wait_for_alloc = 0;
3577         int ret = 0;
3578
3579         /* Don't re-enter if we're already allocating a chunk */
3580         if (trans->allocating_chunk)
3581                 return -ENOSPC;
3582
3583         space_info = __find_space_info(extent_root->fs_info, flags);
3584         if (!space_info) {
3585                 ret = update_space_info(extent_root->fs_info, flags,
3586                                         0, 0, &space_info);
3587                 BUG_ON(ret); /* -ENOMEM */
3588         }
3589         BUG_ON(!space_info); /* Logic error */
3590
3591 again:
3592         spin_lock(&space_info->lock);
3593         if (force < space_info->force_alloc)
3594                 force = space_info->force_alloc;
3595         if (space_info->full) {
3596                 spin_unlock(&space_info->lock);
3597                 return 0;
3598         }
3599
3600         if (!should_alloc_chunk(extent_root, space_info, force)) {
3601                 spin_unlock(&space_info->lock);
3602                 return 0;
3603         } else if (space_info->chunk_alloc) {
3604                 wait_for_alloc = 1;
3605         } else {
3606                 space_info->chunk_alloc = 1;
3607         }
3608
3609         spin_unlock(&space_info->lock);
3610
3611         mutex_lock(&fs_info->chunk_mutex);
3612
3613         /*
3614          * The chunk_mutex is held throughout the entirety of a chunk
3615          * allocation, so once we've acquired the chunk_mutex we know that the
3616          * other guy is done and we need to recheck and see if we should
3617          * allocate.
3618          */
3619         if (wait_for_alloc) {
3620                 mutex_unlock(&fs_info->chunk_mutex);
3621                 wait_for_alloc = 0;
3622                 goto again;
3623         }
3624
3625         trans->allocating_chunk = true;
3626
3627         /*
3628          * If we have mixed data/metadata chunks we want to make sure we keep
3629          * allocating mixed chunks instead of individual chunks.
3630          */
3631         if (btrfs_mixed_space_info(space_info))
3632                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3633
3634         /*
3635          * if we're doing a data chunk, go ahead and make sure that
3636          * we keep a reasonable number of metadata chunks allocated in the
3637          * FS as well.
3638          */
3639         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3640                 fs_info->data_chunk_allocations++;
3641                 if (!(fs_info->data_chunk_allocations %
3642                       fs_info->metadata_ratio))
3643                         force_metadata_allocation(fs_info);
3644         }
3645
3646         /*
3647          * Check if we have enough space in SYSTEM chunk because we may need
3648          * to update devices.
3649          */
3650         check_system_chunk(trans, extent_root, flags);
3651
3652         ret = btrfs_alloc_chunk(trans, extent_root, flags);
3653         trans->allocating_chunk = false;
3654         if (ret < 0 && ret != -ENOSPC)
3655                 goto out;
3656
3657         spin_lock(&space_info->lock);
3658         if (ret)
3659                 space_info->full = 1;
3660         else
3661                 ret = 1;
3662
3663         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3664         space_info->chunk_alloc = 0;
3665         spin_unlock(&space_info->lock);
3666 out:
3667         mutex_unlock(&fs_info->chunk_mutex);
3668         return ret;
3669 }
3670
3671 static int can_overcommit(struct btrfs_root *root,
3672                           struct btrfs_space_info *space_info, u64 bytes,
3673                           enum btrfs_reserve_flush_enum flush)
3674 {
3675         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3676         u64 profile = btrfs_get_alloc_profile(root, 0);
3677         u64 rsv_size = 0;
3678         u64 avail;
3679         u64 used;
3680
3681         used = space_info->bytes_used + space_info->bytes_reserved +
3682                 space_info->bytes_pinned + space_info->bytes_readonly;
3683
3684         spin_lock(&global_rsv->lock);
3685         rsv_size = global_rsv->size;
3686         spin_unlock(&global_rsv->lock);
3687
3688         /*
3689          * We only want to allow over committing if we have lots of actual space
3690          * free, but if we don't have enough space to handle the global reserve
3691          * space then we could end up having a real enospc problem when trying
3692          * to allocate a chunk or some other such important allocation.
3693          */
3694         rsv_size <<= 1;
3695         if (used + rsv_size >= space_info->total_bytes)
3696                 return 0;
3697
3698         used += space_info->bytes_may_use;
3699
3700         spin_lock(&root->fs_info->free_chunk_lock);
3701         avail = root->fs_info->free_chunk_space;
3702         spin_unlock(&root->fs_info->free_chunk_lock);
3703
3704         /*
3705          * If we have dup, raid1 or raid10 then only half of the free
3706          * space is actually useable.
3707          */
3708         if (profile & (BTRFS_BLOCK_GROUP_DUP |
3709                        BTRFS_BLOCK_GROUP_RAID1 |
3710                        BTRFS_BLOCK_GROUP_RAID10))
3711                 avail >>= 1;
3712
3713         /*
3714          * If we aren't flushing all things, let us overcommit up to
3715          * 1/2th of the space. If we can flush, don't let us overcommit
3716          * too much, let it overcommit up to 1/8 of the space.
3717          */
3718         if (flush == BTRFS_RESERVE_FLUSH_ALL)
3719                 avail >>= 3;
3720         else
3721                 avail >>= 1;
3722
3723         if (used + bytes < space_info->total_bytes + avail)
3724                 return 1;
3725         return 0;
3726 }
3727
3728 static inline int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
3729                                                       unsigned long nr_pages,
3730                                                       enum wb_reason reason)
3731 {
3732         /* the flusher is dealing with the dirty inodes now. */
3733         if (writeback_in_progress(sb->s_bdi))
3734                 return 1;
3735
3736         if (down_read_trylock(&sb->s_umount)) {
3737                 writeback_inodes_sb_nr(sb, nr_pages, reason);
3738                 up_read(&sb->s_umount);
3739                 return 1;
3740         }
3741
3742         return 0;
3743 }
3744
3745 void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3746                                   unsigned long nr_pages)
3747 {
3748         struct super_block *sb = root->fs_info->sb;
3749         int started;
3750
3751         /* If we can not start writeback, just sync all the delalloc file. */
3752         started = writeback_inodes_sb_nr_if_idle_safe(sb, nr_pages,
3753                                                       WB_REASON_FS_FREE_SPACE);
3754         if (!started) {
3755                 /*
3756                  * We needn't worry the filesystem going from r/w to r/o though
3757                  * we don't acquire ->s_umount mutex, because the filesystem
3758                  * should guarantee the delalloc inodes list be empty after
3759                  * the filesystem is readonly(all dirty pages are written to
3760                  * the disk).
3761                  */
3762                 btrfs_start_delalloc_inodes(root, 0);
3763                 btrfs_wait_ordered_extents(root, 0);
3764         }
3765 }
3766
3767 /*
3768  * shrink metadata reservation for delalloc
3769  */
3770 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3771                             bool wait_ordered)
3772 {
3773         struct btrfs_block_rsv *block_rsv;
3774         struct btrfs_space_info *space_info;
3775         struct btrfs_trans_handle *trans;
3776         u64 delalloc_bytes;
3777         u64 max_reclaim;
3778         long time_left;
3779         unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3780         int loops = 0;
3781         enum btrfs_reserve_flush_enum flush;
3782
3783         trans = (struct btrfs_trans_handle *)current->journal_info;
3784         block_rsv = &root->fs_info->delalloc_block_rsv;
3785         space_info = block_rsv->space_info;
3786
3787         smp_mb();
3788         delalloc_bytes = percpu_counter_sum_positive(
3789                                                 &root->fs_info->delalloc_bytes);
3790         if (delalloc_bytes == 0) {
3791                 if (trans)
3792                         return;
3793                 btrfs_wait_ordered_extents(root, 0);
3794                 return;
3795         }
3796
3797         while (delalloc_bytes && loops < 3) {
3798                 max_reclaim = min(delalloc_bytes, to_reclaim);
3799                 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3800                 btrfs_writeback_inodes_sb_nr(root, nr_pages);
3801                 /*
3802                  * We need to wait for the async pages to actually start before
3803                  * we do anything.
3804                  */
3805                 wait_event(root->fs_info->async_submit_wait,
3806                            !atomic_read(&root->fs_info->async_delalloc_pages));
3807
3808                 if (!trans)
3809                         flush = BTRFS_RESERVE_FLUSH_ALL;
3810                 else
3811                         flush = BTRFS_RESERVE_NO_FLUSH;
3812                 spin_lock(&space_info->lock);
3813                 if (can_overcommit(root, space_info, orig, flush)) {
3814                         spin_unlock(&space_info->lock);
3815                         break;
3816                 }
3817                 spin_unlock(&space_info->lock);
3818
3819                 loops++;
3820                 if (wait_ordered && !trans) {
3821                         btrfs_wait_ordered_extents(root, 0);
3822                 } else {
3823                         time_left = schedule_timeout_killable(1);
3824                         if (time_left)
3825                                 break;
3826                 }
3827                 smp_mb();
3828                 delalloc_bytes = percpu_counter_sum_positive(
3829                                                 &root->fs_info->delalloc_bytes);
3830         }
3831 }
3832
3833 /**
3834  * maybe_commit_transaction - possibly commit the transaction if its ok to
3835  * @root - the root we're allocating for
3836  * @bytes - the number of bytes we want to reserve
3837  * @force - force the commit
3838  *
3839  * This will check to make sure that committing the transaction will actually
3840  * get us somewhere and then commit the transaction if it does.  Otherwise it
3841  * will return -ENOSPC.
3842  */
3843 static int may_commit_transaction(struct btrfs_root *root,
3844                                   struct btrfs_space_info *space_info,
3845                                   u64 bytes, int force)
3846 {
3847         struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3848         struct btrfs_trans_handle *trans;
3849
3850         trans = (struct btrfs_trans_handle *)current->journal_info;
3851         if (trans)
3852                 return -EAGAIN;
3853
3854         if (force)
3855                 goto commit;
3856
3857         /* See if there is enough pinned space to make this reservation */
3858         spin_lock(&space_info->lock);
3859         if (space_info->bytes_pinned >= bytes) {
3860                 spin_unlock(&space_info->lock);
3861                 goto commit;
3862         }
3863         spin_unlock(&space_info->lock);
3864
3865         /*
3866          * See if there is some space in the delayed insertion reservation for
3867          * this reservation.
3868          */
3869         if (space_info != delayed_rsv->space_info)
3870                 return -ENOSPC;
3871
3872         spin_lock(&space_info->lock);
3873         spin_lock(&delayed_rsv->lock);
3874         if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
3875                 spin_unlock(&delayed_rsv->lock);
3876                 spin_unlock(&space_info->lock);
3877                 return -ENOSPC;
3878         }
3879         spin_unlock(&delayed_rsv->lock);
3880         spin_unlock(&space_info->lock);
3881
3882 commit:
3883         trans = btrfs_join_transaction(root);
3884         if (IS_ERR(trans))
3885                 return -ENOSPC;
3886
3887         return btrfs_commit_transaction(trans, root);
3888 }
3889
3890 enum flush_state {
3891         FLUSH_DELAYED_ITEMS_NR  =       1,
3892         FLUSH_DELAYED_ITEMS     =       2,
3893         FLUSH_DELALLOC          =       3,
3894         FLUSH_DELALLOC_WAIT     =       4,
3895         ALLOC_CHUNK             =       5,
3896         COMMIT_TRANS            =       6,
3897 };
3898
3899 static int flush_space(struct btrfs_root *root,
3900                        struct btrfs_space_info *space_info, u64 num_bytes,
3901                        u64 orig_bytes, int state)
3902 {
3903         struct btrfs_trans_handle *trans;
3904         int nr;
3905         int ret = 0;
3906
3907         switch (state) {
3908         case FLUSH_DELAYED_ITEMS_NR:
3909         case FLUSH_DELAYED_ITEMS:
3910                 if (state == FLUSH_DELAYED_ITEMS_NR) {
3911                         u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
3912
3913                         nr = (int)div64_u64(num_bytes, bytes);
3914                         if (!nr)
3915                                 nr = 1;
3916                         nr *= 2;
3917                 } else {
3918                         nr = -1;
3919                 }
3920                 trans = btrfs_join_transaction(root);
3921                 if (IS_ERR(trans)) {
3922                         ret = PTR_ERR(trans);
3923                         break;
3924                 }
3925                 ret = btrfs_run_delayed_items_nr(trans, root, nr);
3926                 btrfs_end_transaction(trans, root);
3927                 break;
3928         case FLUSH_DELALLOC:
3929         case FLUSH_DELALLOC_WAIT:
3930                 shrink_delalloc(root, num_bytes, orig_bytes,
3931                                 state == FLUSH_DELALLOC_WAIT);
3932                 break;
3933         case ALLOC_CHUNK:
3934                 trans = btrfs_join_transaction(root);
3935                 if (IS_ERR(trans)) {
3936                         ret = PTR_ERR(trans);
3937                         break;
3938                 }
3939                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3940                                      btrfs_get_alloc_profile(root, 0),
3941                                      CHUNK_ALLOC_NO_FORCE);
3942                 btrfs_end_transaction(trans, root);
3943                 if (ret == -ENOSPC)
3944                         ret = 0;
3945                 break;
3946         case COMMIT_TRANS:
3947                 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3948                 break;
3949         default:
3950                 ret = -ENOSPC;
3951                 break;
3952         }
3953
3954         return ret;
3955 }
3956 /**
3957  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3958  * @root - the root we're allocating for
3959  * @block_rsv - the block_rsv we're allocating for
3960  * @orig_bytes - the number of bytes we want
3961  * @flush - wether or not we can flush to make our reservation
3962  *
3963  * This will reserve orgi_bytes number of bytes from the space info associated
3964  * with the block_rsv.  If there is not enough space it will make an attempt to
3965  * flush out space to make room.  It will do this by flushing delalloc if
3966  * possible or committing the transaction.  If flush is 0 then no attempts to
3967  * regain reservations will be made and this will fail if there is not enough
3968  * space already.
3969  */
3970 static int reserve_metadata_bytes(struct btrfs_root *root,
3971                                   struct btrfs_block_rsv *block_rsv,
3972                                   u64 orig_bytes,
3973                                   enum btrfs_reserve_flush_enum flush)
3974 {
3975         struct btrfs_space_info *space_info = block_rsv->space_info;
3976         u64 used;
3977         u64 num_bytes = orig_bytes;
3978         int flush_state = FLUSH_DELAYED_ITEMS_NR;
3979         int ret = 0;
3980         bool flushing = false;
3981
3982 again:
3983         ret = 0;
3984         spin_lock(&space_info->lock);
3985         /*
3986          * We only want to wait if somebody other than us is flushing and we
3987          * are actually allowed to flush all things.
3988          */
3989         while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
3990                space_info->flush) {
3991                 spin_unlock(&space_info->lock);
3992                 /*
3993                  * If we have a trans handle we can't wait because the flusher
3994                  * may have to commit the transaction, which would mean we would
3995                  * deadlock since we are waiting for the flusher to finish, but
3996                  * hold the current transaction open.
3997                  */
3998                 if (current->journal_info)
3999                         return -EAGAIN;
4000                 ret = wait_event_killable(space_info->wait, !space_info->flush);
4001                 /* Must have been killed, return */
4002                 if (ret)
4003                         return -EINTR;
4004
4005                 spin_lock(&space_info->lock);
4006         }
4007
4008         ret = -ENOSPC;
4009         used = space_info->bytes_used + space_info->bytes_reserved +
4010                 space_info->bytes_pinned + space_info->bytes_readonly +
4011                 space_info->bytes_may_use;
4012
4013         /*
4014          * The idea here is that we've not already over-reserved the block group
4015          * then we can go ahead and save our reservation first and then start
4016          * flushing if we need to.  Otherwise if we've already overcommitted
4017          * lets start flushing stuff first and then come back and try to make
4018          * our reservation.
4019          */
4020         if (used <= space_info->total_bytes) {
4021                 if (used + orig_bytes <= space_info->total_bytes) {
4022                         space_info->bytes_may_use += orig_bytes;
4023                         trace_btrfs_space_reservation(root->fs_info,
4024                                 "space_info", space_info->flags, orig_bytes, 1);
4025                         ret = 0;
4026                 } else {
4027                         /*
4028                          * Ok set num_bytes to orig_bytes since we aren't
4029                          * overocmmitted, this way we only try and reclaim what
4030                          * we need.
4031                          */
4032                         num_bytes = orig_bytes;
4033                 }
4034         } else {
4035                 /*
4036                  * Ok we're over committed, set num_bytes to the overcommitted
4037                  * amount plus the amount of bytes that we need for this
4038                  * reservation.
4039                  */
4040                 num_bytes = used - space_info->total_bytes +
4041                         (orig_bytes * 2);
4042         }
4043
4044         if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
4045                 space_info->bytes_may_use += orig_bytes;
4046                 trace_btrfs_space_reservation(root->fs_info, "space_info",
4047                                               space_info->flags, orig_bytes,
4048                                               1);
4049                 ret = 0;
4050         }
4051
4052         /*
4053          * Couldn't make our reservation, save our place so while we're trying
4054          * to reclaim space we can actually use it instead of somebody else
4055          * stealing it from us.
4056          *
4057          * We make the other tasks wait for the flush only when we can flush
4058          * all things.
4059          */
4060         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4061                 flushing = true;
4062                 space_info->flush = 1;
4063         }
4064
4065         spin_unlock(&space_info->lock);
4066
4067         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4068                 goto out;
4069
4070         ret = flush_space(root, space_info, num_bytes, orig_bytes,
4071                           flush_state);
4072         flush_state++;
4073
4074         /*
4075          * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
4076          * would happen. So skip delalloc flush.
4077          */
4078         if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4079             (flush_state == FLUSH_DELALLOC ||
4080              flush_state == FLUSH_DELALLOC_WAIT))
4081                 flush_state = ALLOC_CHUNK;
4082
4083         if (!ret)
4084                 goto again;
4085         else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4086                  flush_state < COMMIT_TRANS)
4087                 goto again;
4088         else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4089                  flush_state <= COMMIT_TRANS)
4090                 goto again;
4091
4092 out:
4093         if (flushing) {
4094                 spin_lock(&space_info->lock);
4095                 space_info->flush = 0;
4096                 wake_up_all(&space_info->wait);
4097                 spin_unlock(&space_info->lock);
4098         }
4099         return ret;
4100 }
4101
4102 static struct btrfs_block_rsv *get_block_rsv(
4103                                         const struct btrfs_trans_handle *trans,
4104                                         const struct btrfs_root *root)
4105 {
4106         struct btrfs_block_rsv *block_rsv = NULL;
4107
4108         if (root->ref_cows)
4109                 block_rsv = trans->block_rsv;
4110
4111         if (root == root->fs_info->csum_root && trans->adding_csums)
4112                 block_rsv = trans->block_rsv;
4113
4114         if (!block_rsv)
4115                 block_rsv = root->block_rsv;
4116
4117         if (!block_rsv)
4118                 block_rsv = &root->fs_info->empty_block_rsv;
4119
4120         return block_rsv;
4121 }
4122
4123 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4124                                u64 num_bytes)
4125 {
4126         int ret = -ENOSPC;
4127         spin_lock(&block_rsv->lock);
4128         if (block_rsv->reserved >= num_bytes) {
4129                 block_rsv->reserved -= num_bytes;
4130                 if (block_rsv->reserved < block_rsv->size)
4131                         block_rsv->full = 0;
4132                 ret = 0;
4133         }
4134         spin_unlock(&block_rsv->lock);
4135         return ret;
4136 }
4137
4138 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4139                                 u64 num_bytes, int update_size)
4140 {
4141         spin_lock(&block_rsv->lock);
4142         block_rsv->reserved += num_bytes;
4143         if (update_size)
4144                 block_rsv->size += num_bytes;
4145         else if (block_rsv->reserved >= block_rsv->size)
4146                 block_rsv->full = 1;
4147         spin_unlock(&block_rsv->lock);
4148 }
4149
4150 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4151                                     struct btrfs_block_rsv *block_rsv,
4152                                     struct btrfs_block_rsv *dest, u64 num_bytes)
4153 {
4154         struct btrfs_space_info *space_info = block_rsv->space_info;
4155
4156         spin_lock(&block_rsv->lock);
4157         if (num_bytes == (u64)-1)
4158                 num_bytes = block_rsv->size;
4159         block_rsv->size -= num_bytes;
4160         if (block_rsv->reserved >= block_rsv->size) {
4161                 num_bytes = block_rsv->reserved - block_rsv->size;
4162                 block_rsv->reserved = block_rsv->size;
4163                 block_rsv->full = 1;
4164         } else {
4165                 num_bytes = 0;
4166         }
4167         spin_unlock(&block_rsv->lock);
4168
4169         if (num_bytes > 0) {
4170                 if (dest) {
4171                         spin_lock(&dest->lock);
4172                         if (!dest->full) {
4173                                 u64 bytes_to_add;
4174
4175                                 bytes_to_add = dest->size - dest->reserved;
4176                                 bytes_to_add = min(num_bytes, bytes_to_add);
4177                                 dest->reserved += bytes_to_add;
4178                                 if (dest->reserved >= dest->size)
4179                                         dest->full = 1;
4180                                 num_bytes -= bytes_to_add;
4181                         }
4182                         spin_unlock(&dest->lock);
4183                 }
4184                 if (num_bytes) {
4185                         spin_lock(&space_info->lock);
4186                         space_info->bytes_may_use -= num_bytes;
4187                         trace_btrfs_space_reservation(fs_info, "space_info",
4188                                         space_info->flags, num_bytes, 0);
4189                         space_info->reservation_progress++;
4190                         spin_unlock(&space_info->lock);
4191                 }
4192         }
4193 }
4194
4195 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4196                                    struct btrfs_block_rsv *dst, u64 num_bytes)
4197 {
4198         int ret;
4199
4200         ret = block_rsv_use_bytes(src, num_bytes);
4201         if (ret)
4202                 return ret;
4203
4204         block_rsv_add_bytes(dst, num_bytes, 1);
4205         return 0;
4206 }
4207
4208 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4209 {
4210         memset(rsv, 0, sizeof(*rsv));
4211         spin_lock_init(&rsv->lock);
4212         rsv->type = type;
4213 }
4214
4215 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4216                                               unsigned short type)
4217 {
4218         struct btrfs_block_rsv *block_rsv;
4219         struct btrfs_fs_info *fs_info = root->fs_info;
4220
4221         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4222         if (!block_rsv)
4223                 return NULL;
4224
4225         btrfs_init_block_rsv(block_rsv, type);
4226         block_rsv->space_info = __find_space_info(fs_info,
4227                                                   BTRFS_BLOCK_GROUP_METADATA);
4228         return block_rsv;
4229 }
4230
4231 void btrfs_free_block_rsv(struct btrfs_root *root,
4232                           struct btrfs_block_rsv *rsv)
4233 {
4234         if (!rsv)
4235                 return;
4236         btrfs_block_rsv_release(root, rsv, (u64)-1);
4237         kfree(rsv);
4238 }
4239
4240 int btrfs_block_rsv_add(struct btrfs_root *root,
4241                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4242                         enum btrfs_reserve_flush_enum flush)
4243 {
4244         int ret;
4245
4246         if (num_bytes == 0)
4247                 return 0;
4248
4249         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4250         if (!ret) {
4251                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
4252                 return 0;
4253         }
4254
4255         return ret;
4256 }
4257
4258 int btrfs_block_rsv_check(struct btrfs_root *root,
4259                           struct btrfs_block_rsv *block_rsv, int min_factor)
4260 {
4261         u64 num_bytes = 0;
4262         int ret = -ENOSPC;
4263
4264         if (!block_rsv)
4265                 return 0;
4266
4267         spin_lock(&block_rsv->lock);
4268         num_bytes = div_factor(block_rsv->size, min_factor);
4269         if (block_rsv->reserved >= num_bytes)
4270                 ret = 0;
4271         spin_unlock(&block_rsv->lock);
4272
4273         return ret;
4274 }
4275
4276 int btrfs_block_rsv_refill(struct btrfs_root *root,
4277                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4278                            enum btrfs_reserve_flush_enum flush)
4279 {
4280         u64 num_bytes = 0;
4281         int ret = -ENOSPC;
4282
4283         if (!block_rsv)
4284                 return 0;
4285
4286         spin_lock(&block_rsv->lock);
4287         num_bytes = min_reserved;
4288         if (block_rsv->reserved >= num_bytes)
4289                 ret = 0;
4290         else
4291                 num_bytes -= block_rsv->reserved;
4292         spin_unlock(&block_rsv->lock);
4293
4294         if (!ret)
4295                 return 0;
4296
4297         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4298         if (!ret) {
4299                 block_rsv_add_bytes(block_rsv, num_bytes, 0);
4300                 return 0;
4301         }
4302
4303         return ret;
4304 }
4305
4306 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4307                             struct btrfs_block_rsv *dst_rsv,
4308                             u64 num_bytes)
4309 {
4310         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4311 }
4312
4313 void btrfs_block_rsv_release(struct btrfs_root *root,
4314                              struct btrfs_block_rsv *block_rsv,
4315                              u64 num_bytes)
4316 {
4317         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4318         if (global_rsv->full || global_rsv == block_rsv ||
4319             block_rsv->space_info != global_rsv->space_info)
4320                 global_rsv = NULL;
4321         block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4322                                 num_bytes);
4323 }
4324
4325 /*
4326  * helper to calculate size of global block reservation.
4327  * the desired value is sum of space used by extent tree,
4328  * checksum tree and root tree
4329  */
4330 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4331 {
4332         struct btrfs_space_info *sinfo;
4333         u64 num_bytes;
4334         u64 meta_used;
4335         u64 data_used;
4336         int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4337
4338         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4339         spin_lock(&sinfo->lock);
4340         data_used = sinfo->bytes_used;
4341         spin_unlock(&sinfo->lock);
4342
4343         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4344         spin_lock(&sinfo->lock);
4345         if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4346                 data_used = 0;
4347         meta_used = sinfo->bytes_used;
4348         spin_unlock(&sinfo->lock);
4349
4350         num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4351                     csum_size * 2;
4352         num_bytes += div64_u64(data_used + meta_used, 50);
4353
4354         if (num_bytes * 3 > meta_used)
4355                 num_bytes = div64_u64(meta_used, 3);
4356
4357         return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
4358 }
4359
4360 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4361 {
4362         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4363         struct btrfs_space_info *sinfo = block_rsv->space_info;
4364         u64 num_bytes;
4365
4366         num_bytes = calc_global_metadata_size(fs_info);
4367
4368         spin_lock(&sinfo->lock);
4369         spin_lock(&block_rsv->lock);
4370
4371         block_rsv->size = num_bytes;
4372
4373         num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4374                     sinfo->bytes_reserved + sinfo->bytes_readonly +
4375                     sinfo->bytes_may_use;
4376
4377         if (sinfo->total_bytes > num_bytes) {
4378                 num_bytes = sinfo->total_bytes - num_bytes;
4379                 block_rsv->reserved += num_bytes;
4380                 sinfo->bytes_may_use += num_bytes;
4381                 trace_btrfs_space_reservation(fs_info, "space_info",
4382                                       sinfo->flags, num_bytes, 1);
4383         }
4384
4385         if (block_rsv->reserved >= block_rsv->size) {
4386                 num_bytes = block_rsv->reserved - block_rsv->size;
4387                 sinfo->bytes_may_use -= num_bytes;
4388                 trace_btrfs_space_reservation(fs_info, "space_info",
4389                                       sinfo->flags, num_bytes, 0);
4390                 sinfo->reservation_progress++;
4391                 block_rsv->reserved = block_rsv->size;
4392                 block_rsv->full = 1;
4393         }
4394
4395         spin_unlock(&block_rsv->lock);
4396         spin_unlock(&sinfo->lock);
4397 }
4398
4399 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4400 {
4401         struct btrfs_space_info *space_info;
4402
4403         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4404         fs_info->chunk_block_rsv.space_info = space_info;
4405
4406         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4407         fs_info->global_block_rsv.space_info = space_info;
4408         fs_info->delalloc_block_rsv.space_info = space_info;
4409         fs_info->trans_block_rsv.space_info = space_info;
4410         fs_info->empty_block_rsv.space_info = space_info;
4411         fs_info->delayed_block_rsv.space_info = space_info;
4412
4413         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4414         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4415         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4416         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4417         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4418
4419         update_global_block_rsv(fs_info);
4420 }
4421
4422 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4423 {
4424         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4425                                 (u64)-1);
4426         WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4427         WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4428         WARN_ON(fs_info->trans_block_rsv.size > 0);
4429         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4430         WARN_ON(fs_info->chunk_block_rsv.size > 0);
4431         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4432         WARN_ON(fs_info->delayed_block_rsv.size > 0);
4433         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4434 }
4435
4436 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4437                                   struct btrfs_root *root)
4438 {
4439         if (!trans->block_rsv)
4440                 return;
4441
4442         if (!trans->bytes_reserved)
4443                 return;
4444
4445         trace_btrfs_space_reservation(root->fs_info, "transaction",
4446                                       trans->transid, trans->bytes_reserved, 0);
4447         btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4448         trans->bytes_reserved = 0;
4449 }
4450
4451 /* Can only return 0 or -ENOSPC */
4452 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4453                                   struct inode *inode)
4454 {
4455         struct btrfs_root *root = BTRFS_I(inode)->root;
4456         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4457         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4458
4459         /*
4460          * We need to hold space in order to delete our orphan item once we've
4461          * added it, so this takes the reservation so we can release it later
4462          * when we are truly done with the orphan item.
4463          */
4464         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4465         trace_btrfs_space_reservation(root->fs_info, "orphan",
4466                                       btrfs_ino(inode), num_bytes, 1);
4467         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4468 }
4469
4470 void btrfs_orphan_release_metadata(struct inode *inode)
4471 {
4472         struct btrfs_root *root = BTRFS_I(inode)->root;
4473         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4474         trace_btrfs_space_reservation(root->fs_info, "orphan",
4475                                       btrfs_ino(inode), num_bytes, 0);
4476         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4477 }
4478
4479 int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4480                                 struct btrfs_pending_snapshot *pending)
4481 {
4482         struct btrfs_root *root = pending->root;
4483         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4484         struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4485         /*
4486          * two for root back/forward refs, two for directory entries,
4487          * one for root of the snapshot and one for parent inode.
4488          */
4489         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
4490         dst_rsv->space_info = src_rsv->space_info;
4491         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4492 }
4493
4494 /**
4495  * drop_outstanding_extent - drop an outstanding extent
4496  * @inode: the inode we're dropping the extent for
4497  *
4498  * This is called when we are freeing up an outstanding extent, either called
4499  * after an error or after an extent is written.  This will return the number of
4500  * reserved extents that need to be freed.  This must be called with
4501  * BTRFS_I(inode)->lock held.
4502  */
4503 static unsigned drop_outstanding_extent(struct inode *inode)
4504 {
4505         unsigned drop_inode_space = 0;
4506         unsigned dropped_extents = 0;
4507
4508         BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4509         BTRFS_I(inode)->outstanding_extents--;
4510
4511         if (BTRFS_I(inode)->outstanding_extents == 0 &&
4512             test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4513                                &BTRFS_I(inode)->runtime_flags))
4514                 drop_inode_space = 1;
4515
4516         /*
4517          * If we have more or the same amount of outsanding extents than we have
4518          * reserved then we need to leave the reserved extents count alone.
4519          */
4520         if (BTRFS_I(inode)->outstanding_extents >=
4521             BTRFS_I(inode)->reserved_extents)
4522                 return drop_inode_space;
4523
4524         dropped_extents = BTRFS_I(inode)->reserved_extents -
4525                 BTRFS_I(inode)->outstanding_extents;
4526         BTRFS_I(inode)->reserved_extents -= dropped_extents;
4527         return dropped_extents + drop_inode_space;
4528 }
4529
4530 /**
4531  * calc_csum_metadata_size - return the amount of metada space that must be
4532  *      reserved/free'd for the given bytes.
4533  * @inode: the inode we're manipulating
4534  * @num_bytes: the number of bytes in question
4535  * @reserve: 1 if we are reserving space, 0 if we are freeing space
4536  *
4537  * This adjusts the number of csum_bytes in the inode and then returns the
4538  * correct amount of metadata that must either be reserved or freed.  We
4539  * calculate how many checksums we can fit into one leaf and then divide the
4540  * number of bytes that will need to be checksumed by this value to figure out
4541  * how many checksums will be required.  If we are adding bytes then the number
4542  * may go up and we will return the number of additional bytes that must be
4543  * reserved.  If it is going down we will return the number of bytes that must
4544  * be freed.
4545  *
4546  * This must be called with BTRFS_I(inode)->lock held.
4547  */
4548 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4549                                    int reserve)
4550 {
4551         struct btrfs_root *root = BTRFS_I(inode)->root;
4552         u64 csum_size;
4553         int num_csums_per_leaf;
4554         int num_csums;
4555         int old_csums;
4556
4557         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4558             BTRFS_I(inode)->csum_bytes == 0)
4559                 return 0;
4560
4561         old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4562         if (reserve)
4563                 BTRFS_I(inode)->csum_bytes += num_bytes;
4564         else
4565                 BTRFS_I(inode)->csum_bytes -= num_bytes;
4566         csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4567         num_csums_per_leaf = (int)div64_u64(csum_size,
4568                                             sizeof(struct btrfs_csum_item) +
4569                                             sizeof(struct btrfs_disk_key));
4570         num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4571         num_csums = num_csums + num_csums_per_leaf - 1;
4572         num_csums = num_csums / num_csums_per_leaf;
4573
4574         old_csums = old_csums + num_csums_per_leaf - 1;
4575         old_csums = old_csums / num_csums_per_leaf;
4576
4577         /* No change, no need to reserve more */
4578         if (old_csums == num_csums)
4579                 return 0;
4580
4581         if (reserve)
4582                 return btrfs_calc_trans_metadata_size(root,
4583                                                       num_csums - old_csums);
4584
4585         return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4586 }
4587
4588 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4589 {
4590         struct btrfs_root *root = BTRFS_I(inode)->root;
4591         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4592         u64 to_reserve = 0;
4593         u64 csum_bytes;
4594         unsigned nr_extents = 0;
4595         int extra_reserve = 0;
4596         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4597         int ret = 0;
4598         bool delalloc_lock = true;
4599
4600         /* If we are a free space inode we need to not flush since we will be in
4601          * the middle of a transaction commit.  We also don't need the delalloc
4602          * mutex since we won't race with anybody.  We need this mostly to make
4603          * lockdep shut its filthy mouth.
4604          */
4605         if (btrfs_is_free_space_inode(inode)) {
4606                 flush = BTRFS_RESERVE_NO_FLUSH;
4607                 delalloc_lock = false;
4608         }
4609
4610         if (flush != BTRFS_RESERVE_NO_FLUSH &&
4611             btrfs_transaction_in_commit(root->fs_info))
4612                 schedule_timeout(1);
4613
4614         if (delalloc_lock)
4615                 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4616
4617         num_bytes = ALIGN(num_bytes, root->sectorsize);
4618
4619         spin_lock(&BTRFS_I(inode)->lock);
4620         BTRFS_I(inode)->outstanding_extents++;
4621
4622         if (BTRFS_I(inode)->outstanding_extents >
4623             BTRFS_I(inode)->reserved_extents)
4624                 nr_extents = BTRFS_I(inode)->outstanding_extents -
4625                         BTRFS_I(inode)->reserved_extents;
4626
4627         /*
4628          * Add an item to reserve for updating the inode when we complete the
4629          * delalloc io.
4630          */
4631         if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4632                       &BTRFS_I(inode)->runtime_flags)) {
4633                 nr_extents++;
4634                 extra_reserve = 1;
4635         }
4636
4637         to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4638         to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4639         csum_bytes = BTRFS_I(inode)->csum_bytes;
4640         spin_unlock(&BTRFS_I(inode)->lock);
4641
4642         if (root->fs_info->quota_enabled)
4643                 ret = btrfs_qgroup_reserve(root, num_bytes +
4644                                            nr_extents * root->leafsize);
4645
4646         /*
4647          * ret != 0 here means the qgroup reservation failed, we go straight to
4648          * the shared error handling then.
4649          */
4650         if (ret == 0)
4651                 ret = reserve_metadata_bytes(root, block_rsv,
4652                                              to_reserve, flush);
4653
4654         if (ret) {
4655                 u64 to_free = 0;
4656                 unsigned dropped;
4657
4658                 spin_lock(&BTRFS_I(inode)->lock);
4659                 dropped = drop_outstanding_extent(inode);
4660                 /*
4661                  * If the inodes csum_bytes is the same as the original
4662                  * csum_bytes then we know we haven't raced with any free()ers
4663                  * so we can just reduce our inodes csum bytes and carry on.
4664                  * Otherwise we have to do the normal free thing to account for
4665                  * the case that the free side didn't free up its reserve
4666                  * because of this outstanding reservation.
4667                  */
4668                 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4669                         calc_csum_metadata_size(inode, num_bytes, 0);
4670                 else
4671                         to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4672                 spin_unlock(&BTRFS_I(inode)->lock);
4673                 if (dropped)
4674                         to_free += btrfs_calc_trans_metadata_size(root, dropped);
4675
4676                 if (to_free) {
4677                         btrfs_block_rsv_release(root, block_rsv, to_free);
4678                         trace_btrfs_space_reservation(root->fs_info,
4679                                                       "delalloc",
4680                                                       btrfs_ino(inode),
4681                                                       to_free, 0);
4682                 }
4683                 if (root->fs_info->quota_enabled) {
4684                         btrfs_qgroup_free(root, num_bytes +
4685                                                 nr_extents * root->leafsize);
4686                 }
4687                 if (delalloc_lock)
4688                         mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4689                 return ret;
4690         }
4691
4692         spin_lock(&BTRFS_I(inode)->lock);
4693         if (extra_reserve) {
4694                 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4695                         &BTRFS_I(inode)->runtime_flags);
4696                 nr_extents--;
4697         }
4698         BTRFS_I(inode)->reserved_extents += nr_extents;
4699         spin_unlock(&BTRFS_I(inode)->lock);
4700
4701         if (delalloc_lock)
4702                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4703
4704         if (to_reserve)
4705                 trace_btrfs_space_reservation(root->fs_info,"delalloc",
4706                                               btrfs_ino(inode), to_reserve, 1);
4707         block_rsv_add_bytes(block_rsv, to_reserve, 1);
4708
4709         return 0;
4710 }
4711
4712 /**
4713  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4714  * @inode: the inode to release the reservation for
4715  * @num_bytes: the number of bytes we're releasing
4716  *
4717  * This will release the metadata reservation for an inode.  This can be called
4718  * once we complete IO for a given set of bytes to release their metadata
4719  * reservations.
4720  */
4721 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4722 {
4723         struct btrfs_root *root = BTRFS_I(inode)->root;
4724         u64 to_free = 0;
4725         unsigned dropped;
4726
4727         num_bytes = ALIGN(num_bytes, root->sectorsize);
4728         spin_lock(&BTRFS_I(inode)->lock);
4729         dropped = drop_outstanding_extent(inode);
4730
4731         to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4732         spin_unlock(&BTRFS_I(inode)->lock);
4733         if (dropped > 0)
4734                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4735
4736         trace_btrfs_space_reservation(root->fs_info, "delalloc",
4737                                       btrfs_ino(inode), to_free, 0);
4738         if (root->fs_info->quota_enabled) {
4739                 btrfs_qgroup_free(root, num_bytes +
4740                                         dropped * root->leafsize);
4741         }
4742
4743         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4744                                 to_free);
4745 }
4746
4747 /**
4748  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4749  * @inode: inode we're writing to
4750  * @num_bytes: the number of bytes we want to allocate
4751  *
4752  * This will do the following things
4753  *
4754  * o reserve space in the data space info for num_bytes
4755  * o reserve space in the metadata space info based on number of outstanding
4756  *   extents and how much csums will be needed
4757  * o add to the inodes ->delalloc_bytes
4758  * o add it to the fs_info's delalloc inodes list.
4759  *
4760  * This will return 0 for success and -ENOSPC if there is no space left.
4761  */
4762 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4763 {
4764         int ret;
4765
4766         ret = btrfs_check_data_free_space(inode, num_bytes);
4767         if (ret)
4768                 return ret;
4769
4770         ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4771         if (ret) {
4772                 btrfs_free_reserved_data_space(inode, num_bytes);
4773                 return ret;
4774         }
4775
4776         return 0;
4777 }
4778
4779 /**
4780  * btrfs_delalloc_release_space - release data and metadata space for delalloc
4781  * @inode: inode we're releasing space for
4782  * @num_bytes: the number of bytes we want to free up
4783  *
4784  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
4785  * called in the case that we don't need the metadata AND data reservations
4786  * anymore.  So if there is an error or we insert an inline extent.
4787  *
4788  * This function will release the metadata space that was not used and will
4789  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4790  * list if there are no delalloc bytes left.
4791  */
4792 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4793 {
4794         btrfs_delalloc_release_metadata(inode, num_bytes);
4795         btrfs_free_reserved_data_space(inode, num_bytes);
4796 }
4797
4798 static int update_block_group(struct btrfs_root *root,
4799                               u64 bytenr, u64 num_bytes, int alloc)
4800 {
4801         struct btrfs_block_group_cache *cache = NULL;
4802         struct btrfs_fs_info *info = root->fs_info;
4803         u64 total = num_bytes;
4804         u64 old_val;
4805         u64 byte_in_group;
4806         int factor;
4807
4808         /* block accounting for super block */
4809         spin_lock(&info->delalloc_lock);
4810         old_val = btrfs_super_bytes_used(info->super_copy);
4811         if (alloc)
4812                 old_val += num_bytes;
4813         else
4814                 old_val -= num_bytes;
4815         btrfs_set_super_bytes_used(info->super_copy, old_val);
4816         spin_unlock(&info->delalloc_lock);
4817
4818         while (total) {
4819                 cache = btrfs_lookup_block_group(info, bytenr);
4820                 if (!cache)
4821                         return -ENOENT;
4822                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4823                                     BTRFS_BLOCK_GROUP_RAID1 |
4824                                     BTRFS_BLOCK_GROUP_RAID10))
4825                         factor = 2;
4826                 else
4827                         factor = 1;
4828                 /*
4829                  * If this block group has free space cache written out, we
4830                  * need to make sure to load it if we are removing space.  This
4831                  * is because we need the unpinning stage to actually add the
4832                  * space back to the block group, otherwise we will leak space.
4833                  */
4834                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4835                         cache_block_group(cache, 1);
4836
4837                 byte_in_group = bytenr - cache->key.objectid;
4838                 WARN_ON(byte_in_group > cache->key.offset);
4839
4840                 spin_lock(&cache->space_info->lock);
4841                 spin_lock(&cache->lock);
4842
4843                 if (btrfs_test_opt(root, SPACE_CACHE) &&
4844                     cache->disk_cache_state < BTRFS_DC_CLEAR)
4845                         cache->disk_cache_state = BTRFS_DC_CLEAR;
4846
4847                 cache->dirty = 1;
4848                 old_val = btrfs_block_group_used(&cache->item);
4849                 num_bytes = min(total, cache->key.offset - byte_in_group);
4850                 if (alloc) {
4851                         old_val += num_bytes;
4852                         btrfs_set_block_group_used(&cache->item, old_val);
4853                         cache->reserved -= num_bytes;
4854                         cache->space_info->bytes_reserved -= num_bytes;
4855                         cache->space_info->bytes_used += num_bytes;
4856                         cache->space_info->disk_used += num_bytes * factor;
4857                         spin_unlock(&cache->lock);
4858                         spin_unlock(&cache->space_info->lock);
4859                 } else {
4860                         old_val -= num_bytes;
4861                         btrfs_set_block_group_used(&cache->item, old_val);
4862                         cache->pinned += num_bytes;
4863                         cache->space_info->bytes_pinned += num_bytes;
4864                         cache->space_info->bytes_used -= num_bytes;
4865                         cache->space_info->disk_used -= num_bytes * factor;
4866                         spin_unlock(&cache->lock);
4867                         spin_unlock(&cache->space_info->lock);
4868
4869                         set_extent_dirty(info->pinned_extents,
4870                                          bytenr, bytenr + num_bytes - 1,
4871                                          GFP_NOFS | __GFP_NOFAIL);
4872                 }
4873                 btrfs_put_block_group(cache);
4874                 total -= num_bytes;
4875                 bytenr += num_bytes;
4876         }
4877         return 0;
4878 }
4879
4880 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4881 {
4882         struct btrfs_block_group_cache *cache;
4883         u64 bytenr;
4884
4885         spin_lock(&root->fs_info->block_group_cache_lock);
4886         bytenr = root->fs_info->first_logical_byte;
4887         spin_unlock(&root->fs_info->block_group_cache_lock);
4888
4889         if (bytenr < (u64)-1)
4890                 return bytenr;
4891
4892         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4893         if (!cache)
4894                 return 0;
4895
4896         bytenr = cache->key.objectid;
4897         btrfs_put_block_group(cache);
4898
4899         return bytenr;
4900 }
4901
4902 static int pin_down_extent(struct btrfs_root *root,
4903                            struct btrfs_block_group_cache *cache,
4904                            u64 bytenr, u64 num_bytes, int reserved)
4905 {
4906         spin_lock(&cache->space_info->lock);
4907         spin_lock(&cache->lock);
4908         cache->pinned += num_bytes;
4909         cache->space_info->bytes_pinned += num_bytes;
4910         if (reserved) {
4911                 cache->reserved -= num_bytes;
4912                 cache->space_info->bytes_reserved -= num_bytes;
4913         }
4914         spin_unlock(&cache->lock);
4915         spin_unlock(&cache->space_info->lock);
4916
4917         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4918                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4919         return 0;
4920 }
4921
4922 /*
4923  * this function must be called within transaction
4924  */
4925 int btrfs_pin_extent(struct btrfs_root *root,
4926                      u64 bytenr, u64 num_bytes, int reserved)
4927 {
4928         struct btrfs_block_group_cache *cache;
4929
4930         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4931         BUG_ON(!cache); /* Logic error */
4932
4933         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4934
4935         btrfs_put_block_group(cache);
4936         return 0;
4937 }
4938
4939 /*
4940  * this function must be called within transaction
4941  */
4942 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
4943                                     u64 bytenr, u64 num_bytes)
4944 {
4945         struct btrfs_block_group_cache *cache;
4946
4947         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4948         BUG_ON(!cache); /* Logic error */
4949
4950         /*
4951          * pull in the free space cache (if any) so that our pin
4952          * removes the free space from the cache.  We have load_only set
4953          * to one because the slow code to read in the free extents does check
4954          * the pinned extents.
4955          */
4956         cache_block_group(cache, 1);
4957
4958         pin_down_extent(root, cache, bytenr, num_bytes, 0);
4959
4960         /* remove us from the free space cache (if we're there at all) */
4961         btrfs_remove_free_space(cache, bytenr, num_bytes);
4962         btrfs_put_block_group(cache);
4963         return 0;
4964 }
4965
4966 /**
4967  * btrfs_update_reserved_bytes - update the block_group and space info counters
4968  * @cache:      The cache we are manipulating
4969  * @num_bytes:  The number of bytes in question
4970  * @reserve:    One of the reservation enums
4971  *
4972  * This is called by the allocator when it reserves space, or by somebody who is
4973  * freeing space that was never actually used on disk.  For example if you
4974  * reserve some space for a new leaf in transaction A and before transaction A
4975  * commits you free that leaf, you call this with reserve set to 0 in order to
4976  * clear the reservation.
4977  *
4978  * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4979  * ENOSPC accounting.  For data we handle the reservation through clearing the
4980  * delalloc bits in the io_tree.  We have to do this since we could end up
4981  * allocating less disk space for the amount of data we have reserved in the
4982  * case of compression.
4983  *
4984  * If this is a reservation and the block group has become read only we cannot
4985  * make the reservation and return -EAGAIN, otherwise this function always
4986  * succeeds.
4987  */
4988 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4989                                        u64 num_bytes, int reserve)
4990 {
4991         struct btrfs_space_info *space_info = cache->space_info;
4992         int ret = 0;
4993
4994         spin_lock(&space_info->lock);
4995         spin_lock(&cache->lock);
4996         if (reserve != RESERVE_FREE) {
4997                 if (cache->ro) {
4998                         ret = -EAGAIN;
4999                 } else {
5000                         cache->reserved += num_bytes;
5001                         space_info->bytes_reserved += num_bytes;
5002                         if (reserve == RESERVE_ALLOC) {
5003                                 trace_btrfs_space_reservation(cache->fs_info,
5004                                                 "space_info", space_info->flags,
5005                                                 num_bytes, 0);
5006                                 space_info->bytes_may_use -= num_bytes;
5007                         }
5008                 }
5009         } else {
5010                 if (cache->ro)
5011                         space_info->bytes_readonly += num_bytes;
5012                 cache->reserved -= num_bytes;
5013                 space_info->bytes_reserved -= num_bytes;
5014                 space_info->reservation_progress++;
5015         }
5016         spin_unlock(&cache->lock);
5017         spin_unlock(&space_info->lock);
5018         return ret;
5019 }
5020
5021 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5022                                 struct btrfs_root *root)
5023 {
5024         struct btrfs_fs_info *fs_info = root->fs_info;
5025         struct btrfs_caching_control *next;
5026         struct btrfs_caching_control *caching_ctl;
5027         struct btrfs_block_group_cache *cache;
5028
5029         down_write(&fs_info->extent_commit_sem);
5030
5031         list_for_each_entry_safe(caching_ctl, next,
5032                                  &fs_info->caching_block_groups, list) {
5033                 cache = caching_ctl->block_group;
5034                 if (block_group_cache_done(cache)) {
5035                         cache->last_byte_to_unpin = (u64)-1;
5036                         list_del_init(&caching_ctl->list);
5037                         put_caching_control(caching_ctl);
5038                 } else {
5039                         cache->last_byte_to_unpin = caching_ctl->progress;
5040                 }
5041         }
5042
5043         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5044                 fs_info->pinned_extents = &fs_info->freed_extents[1];
5045         else
5046                 fs_info->pinned_extents = &fs_info->freed_extents[0];
5047
5048         up_write(&fs_info->extent_commit_sem);
5049
5050         update_global_block_rsv(fs_info);
5051 }
5052
5053 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
5054 {
5055         struct btrfs_fs_info *fs_info = root->fs_info;
5056         struct btrfs_block_group_cache *cache = NULL;
5057         struct btrfs_space_info *space_info;
5058         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5059         u64 len;
5060         bool readonly;
5061
5062         while (start <= end) {
5063                 readonly = false;
5064                 if (!cache ||
5065                     start >= cache->key.objectid + cache->key.offset) {
5066                         if (cache)
5067                                 btrfs_put_block_group(cache);
5068                         cache = btrfs_lookup_block_group(fs_info, start);
5069                         BUG_ON(!cache); /* Logic error */
5070                 }
5071
5072                 len = cache->key.objectid + cache->key.offset - start;
5073                 len = min(len, end + 1 - start);
5074
5075                 if (start < cache->last_byte_to_unpin) {
5076                         len = min(len, cache->last_byte_to_unpin - start);
5077                         btrfs_add_free_space(cache, start, len);
5078                 }
5079
5080                 start += len;
5081                 space_info = cache->space_info;
5082
5083                 spin_lock(&space_info->lock);
5084                 spin_lock(&cache->lock);
5085                 cache->pinned -= len;
5086                 space_info->bytes_pinned -= len;
5087                 if (cache->ro) {
5088                         space_info->bytes_readonly += len;
5089                         readonly = true;
5090                 }
5091                 spin_unlock(&cache->lock);
5092                 if (!readonly && global_rsv->space_info == space_info) {
5093                         spin_lock(&global_rsv->lock);
5094                         if (!global_rsv->full) {
5095                                 len = min(len, global_rsv->size -
5096                                           global_rsv->reserved);
5097                                 global_rsv->reserved += len;
5098                                 space_info->bytes_may_use += len;
5099                                 if (global_rsv->reserved >= global_rsv->size)
5100                                         global_rsv->full = 1;
5101                         }
5102                         spin_unlock(&global_rsv->lock);
5103                 }
5104                 spin_unlock(&space_info->lock);
5105         }
5106
5107         if (cache)
5108                 btrfs_put_block_group(cache);
5109         return 0;
5110 }
5111
5112 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5113                                struct btrfs_root *root)
5114 {
5115         struct btrfs_fs_info *fs_info = root->fs_info;
5116         struct extent_io_tree *unpin;
5117         u64 start;
5118         u64 end;
5119         int ret;
5120
5121         if (trans->aborted)
5122                 return 0;
5123
5124         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5125                 unpin = &fs_info->freed_extents[1];
5126         else
5127                 unpin = &fs_info->freed_extents[0];
5128
5129         while (1) {
5130                 ret = find_first_extent_bit(unpin, 0, &start, &end,
5131                                             EXTENT_DIRTY, NULL);
5132                 if (ret)
5133                         break;
5134
5135                 if (btrfs_test_opt(root, DISCARD))
5136                         ret = btrfs_discard_extent(root, start,
5137                                                    end + 1 - start, NULL);
5138
5139                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
5140                 unpin_extent_range(root, start, end);
5141                 cond_resched();
5142         }
5143
5144         return 0;
5145 }
5146
5147 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5148                                 struct btrfs_root *root,
5149                                 u64 bytenr, u64 num_bytes, u64 parent,
5150                                 u64 root_objectid, u64 owner_objectid,
5151                                 u64 owner_offset, int refs_to_drop,
5152                                 struct btrfs_delayed_extent_op *extent_op)
5153 {
5154         struct btrfs_key key;
5155         struct btrfs_path *path;
5156         struct btrfs_fs_info *info = root->fs_info;
5157         struct btrfs_root *extent_root = info->extent_root;
5158         struct extent_buffer *leaf;
5159         struct btrfs_extent_item *ei;
5160         struct btrfs_extent_inline_ref *iref;
5161         int ret;
5162         int is_data;
5163         int extent_slot = 0;
5164         int found_extent = 0;
5165         int num_to_del = 1;
5166         u32 item_size;
5167         u64 refs;
5168
5169         path = btrfs_alloc_path();
5170         if (!path)
5171                 return -ENOMEM;
5172
5173         path->reada = 1;
5174         path->leave_spinning = 1;
5175
5176         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5177         BUG_ON(!is_data && refs_to_drop != 1);
5178
5179         ret = lookup_extent_backref(trans, extent_root, path, &iref,
5180                                     bytenr, num_bytes, parent,
5181                                     root_objectid, owner_objectid,
5182                                     owner_offset);
5183         if (ret == 0) {
5184                 extent_slot = path->slots[0];
5185                 while (extent_slot >= 0) {
5186                         btrfs_item_key_to_cpu(path->nodes[0], &key,
5187                                               extent_slot);
5188                         if (key.objectid != bytenr)
5189                                 break;
5190                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5191                             key.offset == num_bytes) {
5192                                 found_extent = 1;
5193                                 break;
5194                         }
5195                         if (path->slots[0] - extent_slot > 5)
5196                                 break;
5197                         extent_slot--;
5198                 }
5199 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5200                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
5201                 if (found_extent && item_size < sizeof(*ei))
5202                         found_extent = 0;
5203 #endif
5204                 if (!found_extent) {
5205                         BUG_ON(iref);
5206                         ret = remove_extent_backref(trans, extent_root, path,
5207                                                     NULL, refs_to_drop,
5208                                                     is_data);
5209                         if (ret) {
5210                                 btrfs_abort_transaction(trans, extent_root, ret);
5211                                 goto out;
5212                         }
5213                         btrfs_release_path(path);
5214                         path->leave_spinning = 1;
5215
5216                         key.objectid = bytenr;
5217                         key.type = BTRFS_EXTENT_ITEM_KEY;
5218                         key.offset = num_bytes;
5219
5220                         ret = btrfs_search_slot(trans, extent_root,
5221                                                 &key, path, -1, 1);
5222                         if (ret) {
5223                                 printk(KERN_ERR "umm, got %d back from search"
5224                                        ", was looking for %llu\n", ret,
5225                                        (unsigned long long)bytenr);
5226                                 if (ret > 0)
5227                                         btrfs_print_leaf(extent_root,
5228                                                          path->nodes[0]);
5229                         }
5230                         if (ret < 0) {
5231                                 btrfs_abort_transaction(trans, extent_root, ret);
5232                                 goto out;
5233                         }
5234                         extent_slot = path->slots[0];
5235                 }
5236         } else if (ret == -ENOENT) {
5237                 btrfs_print_leaf(extent_root, path->nodes[0]);
5238                 WARN_ON(1);
5239                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
5240                        "parent %llu root %llu  owner %llu offset %llu\n",
5241                        (unsigned long long)bytenr,
5242                        (unsigned long long)parent,
5243                        (unsigned long long)root_objectid,
5244                        (unsigned long long)owner_objectid,
5245                        (unsigned long long)owner_offset);
5246         } else {
5247                 btrfs_abort_transaction(trans, extent_root, ret);
5248                 goto out;
5249         }
5250
5251         leaf = path->nodes[0];
5252         item_size = btrfs_item_size_nr(leaf, extent_slot);
5253 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5254         if (item_size < sizeof(*ei)) {
5255                 BUG_ON(found_extent || extent_slot != path->slots[0]);
5256                 ret = convert_extent_item_v0(trans, extent_root, path,
5257                                              owner_objectid, 0);
5258                 if (ret < 0) {
5259                         btrfs_abort_transaction(trans, extent_root, ret);
5260                         goto out;
5261                 }
5262
5263                 btrfs_release_path(path);
5264                 path->leave_spinning = 1;
5265
5266                 key.objectid = bytenr;
5267                 key.type = BTRFS_EXTENT_ITEM_KEY;
5268                 key.offset = num_bytes;
5269
5270                 ret = btrfs_search_slot(trans, extent_root, &key, path,
5271                                         -1, 1);
5272                 if (ret) {
5273                         printk(KERN_ERR "umm, got %d back from search"
5274                                ", was looking for %llu\n", ret,
5275                                (unsigned long long)bytenr);
5276                         btrfs_print_leaf(extent_root, path->nodes[0]);
5277                 }
5278                 if (ret < 0) {
5279                         btrfs_abort_transaction(trans, extent_root, ret);
5280                         goto out;
5281                 }
5282
5283                 extent_slot = path->slots[0];
5284                 leaf = path->nodes[0];
5285                 item_size = btrfs_item_size_nr(leaf, extent_slot);
5286         }
5287 #endif
5288         BUG_ON(item_size < sizeof(*ei));
5289         ei = btrfs_item_ptr(leaf, extent_slot,
5290                             struct btrfs_extent_item);
5291         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
5292                 struct btrfs_tree_block_info *bi;
5293                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5294                 bi = (struct btrfs_tree_block_info *)(ei + 1);
5295                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5296         }
5297
5298         refs = btrfs_extent_refs(leaf, ei);
5299         BUG_ON(refs < refs_to_drop);
5300         refs -= refs_to_drop;
5301
5302         if (refs > 0) {
5303                 if (extent_op)
5304                         __run_delayed_extent_op(extent_op, leaf, ei);
5305                 /*
5306                  * In the case of inline back ref, reference count will
5307                  * be updated by remove_extent_backref
5308                  */
5309                 if (iref) {
5310                         BUG_ON(!found_extent);
5311                 } else {
5312                         btrfs_set_extent_refs(leaf, ei, refs);
5313                         btrfs_mark_buffer_dirty(leaf);
5314                 }
5315                 if (found_extent) {
5316                         ret = remove_extent_backref(trans, extent_root, path,
5317                                                     iref, refs_to_drop,
5318                                                     is_data);
5319                         if (ret) {
5320                                 btrfs_abort_transaction(trans, extent_root, ret);
5321                                 goto out;
5322                         }
5323                 }
5324         } else {
5325                 if (found_extent) {
5326                         BUG_ON(is_data && refs_to_drop !=
5327                                extent_data_ref_count(root, path, iref));
5328                         if (iref) {
5329                                 BUG_ON(path->slots[0] != extent_slot);
5330                         } else {
5331                                 BUG_ON(path->slots[0] != extent_slot + 1);
5332                                 path->slots[0] = extent_slot;
5333                                 num_to_del = 2;
5334                         }
5335                 }
5336
5337                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5338                                       num_to_del);
5339                 if (ret) {
5340                         btrfs_abort_transaction(trans, extent_root, ret);
5341                         goto out;
5342                 }
5343                 btrfs_release_path(path);
5344
5345                 if (is_data) {
5346                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5347                         if (ret) {
5348                                 btrfs_abort_transaction(trans, extent_root, ret);
5349                                 goto out;
5350                         }
5351                 }
5352
5353                 ret = update_block_group(root, bytenr, num_bytes, 0);
5354                 if (ret) {
5355                         btrfs_abort_transaction(trans, extent_root, ret);
5356                         goto out;
5357                 }
5358         }
5359 out:
5360         btrfs_free_path(path);
5361         return ret;
5362 }
5363
5364 /*
5365  * when we free an block, it is possible (and likely) that we free the last
5366  * delayed ref for that extent as well.  This searches the delayed ref tree for
5367  * a given extent, and if there are no other delayed refs to be processed, it
5368  * removes it from the tree.
5369  */
5370 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5371                                       struct btrfs_root *root, u64 bytenr)
5372 {
5373         struct btrfs_delayed_ref_head *head;
5374         struct btrfs_delayed_ref_root *delayed_refs;
5375         struct btrfs_delayed_ref_node *ref;
5376         struct rb_node *node;
5377         int ret = 0;
5378
5379         delayed_refs = &trans->transaction->delayed_refs;
5380         spin_lock(&delayed_refs->lock);
5381         head = btrfs_find_delayed_ref_head(trans, bytenr);
5382         if (!head)
5383                 goto out;
5384
5385         node = rb_prev(&head->node.rb_node);
5386         if (!node)
5387                 goto out;
5388
5389         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
5390
5391         /* there are still entries for this ref, we can't drop it */
5392         if (ref->bytenr == bytenr)
5393                 goto out;
5394
5395         if (head->extent_op) {
5396                 if (!head->must_insert_reserved)
5397                         goto out;
5398                 btrfs_free_delayed_extent_op(head->extent_op);
5399                 head->extent_op = NULL;
5400         }
5401
5402         /*
5403          * waiting for the lock here would deadlock.  If someone else has it
5404          * locked they are already in the process of dropping it anyway
5405          */
5406         if (!mutex_trylock(&head->mutex))
5407                 goto out;
5408
5409         /*
5410          * at this point we have a head with no other entries.  Go
5411          * ahead and process it.
5412          */
5413         head->node.in_tree = 0;
5414         rb_erase(&head->node.rb_node, &delayed_refs->root);
5415
5416         delayed_refs->num_entries--;
5417
5418         /*
5419          * we don't take a ref on the node because we're removing it from the
5420          * tree, so we just steal the ref the tree was holding.
5421          */
5422         delayed_refs->num_heads--;
5423         if (list_empty(&head->cluster))
5424                 delayed_refs->num_heads_ready--;
5425
5426         list_del_init(&head->cluster);
5427         spin_unlock(&delayed_refs->lock);
5428
5429         BUG_ON(head->extent_op);
5430         if (head->must_insert_reserved)
5431                 ret = 1;
5432
5433         mutex_unlock(&head->mutex);
5434         btrfs_put_delayed_ref(&head->node);
5435         return ret;
5436 out:
5437         spin_unlock(&delayed_refs->lock);
5438         return 0;
5439 }
5440
5441 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5442                            struct btrfs_root *root,
5443                            struct extent_buffer *buf,
5444                            u64 parent, int last_ref)
5445 {
5446         struct btrfs_block_group_cache *cache = NULL;
5447         int ret;
5448
5449         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5450                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
5451                                         buf->start, buf->len,
5452                                         parent, root->root_key.objectid,
5453                                         btrfs_header_level(buf),
5454                                         BTRFS_DROP_DELAYED_REF, NULL, 0);
5455                 BUG_ON(ret); /* -ENOMEM */
5456         }
5457
5458         if (!last_ref)
5459                 return;
5460
5461         cache = btrfs_lookup_block_group(root->fs_info, buf->start);
5462
5463         if (btrfs_header_generation(buf) == trans->transid) {
5464                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5465                         ret = check_ref_cleanup(trans, root, buf->start);
5466                         if (!ret)
5467                                 goto out;
5468                 }
5469
5470                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
5471                         pin_down_extent(root, cache, buf->start, buf->len, 1);
5472                         goto out;
5473                 }
5474
5475                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5476
5477                 btrfs_add_free_space(cache, buf->start, buf->len);
5478                 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5479         }
5480 out:
5481         /*
5482          * Deleting the buffer, clear the corrupt flag since it doesn't matter
5483          * anymore.
5484          */
5485         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
5486         btrfs_put_block_group(cache);
5487 }
5488
5489 /* Can return -ENOMEM */
5490 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5491                       u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5492                       u64 owner, u64 offset, int for_cow)
5493 {
5494         int ret;
5495         struct btrfs_fs_info *fs_info = root->fs_info;
5496
5497         /*
5498          * tree log blocks never actually go into the extent allocation
5499          * tree, just update pinning info and exit early.
5500          */
5501         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
5502                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
5503                 /* unlocks the pinned mutex */
5504                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
5505                 ret = 0;
5506         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5507                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5508                                         num_bytes,
5509                                         parent, root_objectid, (int)owner,
5510                                         BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5511         } else {
5512                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5513                                                 num_bytes,
5514                                                 parent, root_objectid, owner,
5515                                                 offset, BTRFS_DROP_DELAYED_REF,
5516                                                 NULL, for_cow);
5517         }
5518         return ret;
5519 }
5520
5521 static u64 stripe_align(struct btrfs_root *root, u64 val)
5522 {
5523         u64 mask = ((u64)root->stripesize - 1);
5524         u64 ret = (val + mask) & ~mask;
5525         return ret;
5526 }
5527
5528 /*
5529  * when we wait for progress in the block group caching, its because
5530  * our allocation attempt failed at least once.  So, we must sleep
5531  * and let some progress happen before we try again.
5532  *
5533  * This function will sleep at least once waiting for new free space to
5534  * show up, and then it will check the block group free space numbers
5535  * for our min num_bytes.  Another option is to have it go ahead
5536  * and look in the rbtree for a free extent of a given size, but this
5537  * is a good start.
5538  */
5539 static noinline int
5540 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5541                                 u64 num_bytes)
5542 {
5543         struct btrfs_caching_control *caching_ctl;
5544         DEFINE_WAIT(wait);
5545
5546         caching_ctl = get_caching_control(cache);
5547         if (!caching_ctl)
5548                 return 0;
5549
5550         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
5551                    (cache->free_space_ctl->free_space >= num_bytes));
5552
5553         put_caching_control(caching_ctl);
5554         return 0;
5555 }
5556
5557 static noinline int
5558 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5559 {
5560         struct btrfs_caching_control *caching_ctl;
5561         DEFINE_WAIT(wait);
5562
5563         caching_ctl = get_caching_control(cache);
5564         if (!caching_ctl)
5565                 return 0;
5566
5567         wait_event(caching_ctl->wait, block_group_cache_done(cache));
5568
5569         put_caching_control(caching_ctl);
5570         return 0;
5571 }
5572
5573 int __get_raid_index(u64 flags)
5574 {
5575         if (flags & BTRFS_BLOCK_GROUP_RAID10)
5576                 return BTRFS_RAID_RAID10;
5577         else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5578                 return BTRFS_RAID_RAID1;
5579         else if (flags & BTRFS_BLOCK_GROUP_DUP)
5580                 return BTRFS_RAID_DUP;
5581         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5582                 return BTRFS_RAID_RAID0;
5583         else
5584                 return BTRFS_RAID_SINGLE;
5585 }
5586
5587 static int get_block_group_index(struct btrfs_block_group_cache *cache)
5588 {
5589         return __get_raid_index(cache->flags);
5590 }
5591
5592 enum btrfs_loop_type {
5593         LOOP_CACHING_NOWAIT = 0,
5594         LOOP_CACHING_WAIT = 1,
5595         LOOP_ALLOC_CHUNK = 2,
5596         LOOP_NO_EMPTY_SIZE = 3,
5597 };
5598
5599 /*
5600  * walks the btree of allocated extents and find a hole of a given size.
5601  * The key ins is changed to record the hole:
5602  * ins->objectid == block start
5603  * ins->flags = BTRFS_EXTENT_ITEM_KEY
5604  * ins->offset == number of blocks
5605  * Any available blocks before search_start are skipped.
5606  */
5607 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5608                                      struct btrfs_root *orig_root,
5609                                      u64 num_bytes, u64 empty_size,
5610                                      u64 hint_byte, struct btrfs_key *ins,
5611                                      u64 data)
5612 {
5613         int ret = 0;
5614         struct btrfs_root *root = orig_root->fs_info->extent_root;
5615         struct btrfs_free_cluster *last_ptr = NULL;
5616         struct btrfs_block_group_cache *block_group = NULL;
5617         struct btrfs_block_group_cache *used_block_group;
5618         u64 search_start = 0;
5619         int empty_cluster = 2 * 1024 * 1024;
5620         struct btrfs_space_info *space_info;
5621         int loop = 0;
5622         int index = __get_raid_index(data);
5623         int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5624                 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5625         bool found_uncached_bg = false;
5626         bool failed_cluster_refill = false;
5627         bool failed_alloc = false;
5628         bool use_cluster = true;
5629         bool have_caching_bg = false;
5630
5631         WARN_ON(num_bytes < root->sectorsize);
5632         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
5633         ins->objectid = 0;
5634         ins->offset = 0;
5635
5636         trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5637
5638         space_info = __find_space_info(root->fs_info, data);
5639         if (!space_info) {
5640                 printk(KERN_ERR "No space info for %llu\n", data);
5641                 return -ENOSPC;
5642         }
5643
5644         /*
5645          * If the space info is for both data and metadata it means we have a
5646          * small filesystem and we can't use the clustering stuff.
5647          */
5648         if (btrfs_mixed_space_info(space_info))
5649                 use_cluster = false;
5650
5651         if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5652                 last_ptr = &root->fs_info->meta_alloc_cluster;
5653                 if (!btrfs_test_opt(root, SSD))
5654                         empty_cluster = 64 * 1024;
5655         }
5656
5657         if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
5658             btrfs_test_opt(root, SSD)) {
5659                 last_ptr = &root->fs_info->data_alloc_cluster;
5660         }
5661
5662         if (last_ptr) {
5663                 spin_lock(&last_ptr->lock);
5664                 if (last_ptr->block_group)
5665                         hint_byte = last_ptr->window_start;
5666                 spin_unlock(&last_ptr->lock);
5667         }
5668
5669         search_start = max(search_start, first_logical_byte(root, 0));
5670         search_start = max(search_start, hint_byte);
5671
5672         if (!last_ptr)
5673                 empty_cluster = 0;
5674
5675         if (search_start == hint_byte) {
5676                 block_group = btrfs_lookup_block_group(root->fs_info,
5677                                                        search_start);
5678                 used_block_group = block_group;
5679                 /*
5680                  * we don't want to use the block group if it doesn't match our
5681                  * allocation bits, or if its not cached.
5682                  *
5683                  * However if we are re-searching with an ideal block group
5684                  * picked out then we don't care that the block group is cached.
5685                  */
5686                 if (block_group && block_group_bits(block_group, data) &&
5687                     block_group->cached != BTRFS_CACHE_NO) {
5688                         down_read(&space_info->groups_sem);
5689                         if (list_empty(&block_group->list) ||
5690                             block_group->ro) {
5691                                 /*
5692                                  * someone is removing this block group,
5693                                  * we can't jump into the have_block_group
5694                                  * target because our list pointers are not
5695                                  * valid
5696                                  */
5697                                 btrfs_put_block_group(block_group);
5698                                 up_read(&space_info->groups_sem);
5699                         } else {
5700                                 index = get_block_group_index(block_group);
5701                                 goto have_block_group;
5702                         }
5703                 } else if (block_group) {
5704                         btrfs_put_block_group(block_group);
5705                 }
5706         }
5707 search:
5708         have_caching_bg = false;
5709         down_read(&space_info->groups_sem);
5710         list_for_each_entry(block_group, &space_info->block_groups[index],
5711                             list) {
5712                 u64 offset;
5713                 int cached;
5714
5715                 used_block_group = block_group;
5716                 btrfs_get_block_group(block_group);
5717                 search_start = block_group->key.objectid;
5718
5719                 /*
5720                  * this can happen if we end up cycling through all the
5721                  * raid types, but we want to make sure we only allocate
5722                  * for the proper type.
5723                  */
5724                 if (!block_group_bits(block_group, data)) {
5725                     u64 extra = BTRFS_BLOCK_GROUP_DUP |
5726                                 BTRFS_BLOCK_GROUP_RAID1 |
5727                                 BTRFS_BLOCK_GROUP_RAID10;
5728
5729                         /*
5730                          * if they asked for extra copies and this block group
5731                          * doesn't provide them, bail.  This does allow us to
5732                          * fill raid0 from raid1.
5733                          */
5734                         if ((data & extra) && !(block_group->flags & extra))
5735                                 goto loop;
5736                 }
5737
5738 have_block_group:
5739                 cached = block_group_cache_done(block_group);
5740                 if (unlikely(!cached)) {
5741                         found_uncached_bg = true;
5742                         ret = cache_block_group(block_group, 0);
5743                         BUG_ON(ret < 0);
5744                         ret = 0;
5745                 }
5746
5747                 if (unlikely(block_group->ro))
5748                         goto loop;
5749
5750                 /*
5751                  * Ok we want to try and use the cluster allocator, so
5752                  * lets look there
5753                  */
5754                 if (last_ptr) {
5755                         /*
5756                          * the refill lock keeps out other
5757                          * people trying to start a new cluster
5758                          */
5759                         spin_lock(&last_ptr->refill_lock);
5760                         used_block_group = last_ptr->block_group;
5761                         if (used_block_group != block_group &&
5762                             (!used_block_group ||
5763                              used_block_group->ro ||
5764                              !block_group_bits(used_block_group, data))) {
5765                                 used_block_group = block_group;
5766                                 goto refill_cluster;
5767                         }
5768
5769                         if (used_block_group != block_group)
5770                                 btrfs_get_block_group(used_block_group);
5771
5772                         offset = btrfs_alloc_from_cluster(used_block_group,
5773                           last_ptr, num_bytes, used_block_group->key.objectid);
5774                         if (offset) {
5775                                 /* we have a block, we're done */
5776                                 spin_unlock(&last_ptr->refill_lock);
5777                                 trace_btrfs_reserve_extent_cluster(root,
5778                                         block_group, search_start, num_bytes);
5779                                 goto checks;
5780                         }
5781
5782                         WARN_ON(last_ptr->block_group != used_block_group);
5783                         if (used_block_group != block_group) {
5784                                 btrfs_put_block_group(used_block_group);
5785                                 used_block_group = block_group;
5786                         }
5787 refill_cluster:
5788                         BUG_ON(used_block_group != block_group);
5789                         /* If we are on LOOP_NO_EMPTY_SIZE, we can't
5790                          * set up a new clusters, so lets just skip it
5791                          * and let the allocator find whatever block
5792                          * it can find.  If we reach this point, we
5793                          * will have tried the cluster allocator
5794                          * plenty of times and not have found
5795                          * anything, so we are likely way too
5796                          * fragmented for the clustering stuff to find
5797                          * anything.
5798                          *
5799                          * However, if the cluster is taken from the
5800                          * current block group, release the cluster
5801                          * first, so that we stand a better chance of
5802                          * succeeding in the unclustered
5803                          * allocation.  */
5804                         if (loop >= LOOP_NO_EMPTY_SIZE &&
5805                             last_ptr->block_group != block_group) {
5806                                 spin_unlock(&last_ptr->refill_lock);
5807                                 goto unclustered_alloc;
5808                         }
5809
5810                         /*
5811                          * this cluster didn't work out, free it and
5812                          * start over
5813                          */
5814                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
5815
5816                         if (loop >= LOOP_NO_EMPTY_SIZE) {
5817                                 spin_unlock(&last_ptr->refill_lock);
5818                                 goto unclustered_alloc;
5819                         }
5820
5821                         /* allocate a cluster in this block group */
5822                         ret = btrfs_find_space_cluster(trans, root,
5823                                                block_group, last_ptr,
5824                                                search_start, num_bytes,
5825                                                empty_cluster + empty_size);
5826                         if (ret == 0) {
5827                                 /*
5828                                  * now pull our allocation out of this
5829                                  * cluster
5830                                  */
5831                                 offset = btrfs_alloc_from_cluster(block_group,
5832                                                   last_ptr, num_bytes,
5833                                                   search_start);
5834                                 if (offset) {
5835                                         /* we found one, proceed */
5836                                         spin_unlock(&last_ptr->refill_lock);
5837                                         trace_btrfs_reserve_extent_cluster(root,
5838                                                 block_group, search_start,
5839                                                 num_bytes);
5840                                         goto checks;
5841                                 }
5842                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
5843                                    && !failed_cluster_refill) {
5844                                 spin_unlock(&last_ptr->refill_lock);
5845
5846                                 failed_cluster_refill = true;
5847                                 wait_block_group_cache_progress(block_group,
5848                                        num_bytes + empty_cluster + empty_size);
5849                                 goto have_block_group;
5850                         }
5851
5852                         /*
5853                          * at this point we either didn't find a cluster
5854                          * or we weren't able to allocate a block from our
5855                          * cluster.  Free the cluster we've been trying
5856                          * to use, and go to the next block group
5857                          */
5858                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
5859                         spin_unlock(&last_ptr->refill_lock);
5860                         goto loop;
5861                 }
5862
5863 unclustered_alloc:
5864                 spin_lock(&block_group->free_space_ctl->tree_lock);
5865                 if (cached &&
5866                     block_group->free_space_ctl->free_space <
5867                     num_bytes + empty_cluster + empty_size) {
5868                         spin_unlock(&block_group->free_space_ctl->tree_lock);
5869                         goto loop;
5870                 }
5871                 spin_unlock(&block_group->free_space_ctl->tree_lock);
5872
5873                 offset = btrfs_find_space_for_alloc(block_group, search_start,
5874                                                     num_bytes, empty_size);
5875                 /*
5876                  * If we didn't find a chunk, and we haven't failed on this
5877                  * block group before, and this block group is in the middle of
5878                  * caching and we are ok with waiting, then go ahead and wait
5879                  * for progress to be made, and set failed_alloc to true.
5880                  *
5881                  * If failed_alloc is true then we've already waited on this
5882                  * block group once and should move on to the next block group.
5883                  */
5884                 if (!offset && !failed_alloc && !cached &&
5885                     loop > LOOP_CACHING_NOWAIT) {
5886                         wait_block_group_cache_progress(block_group,
5887                                                 num_bytes + empty_size);
5888                         failed_alloc = true;
5889                         goto have_block_group;
5890                 } else if (!offset) {
5891                         if (!cached)
5892                                 have_caching_bg = true;
5893                         goto loop;
5894                 }
5895 checks:
5896                 search_start = stripe_align(root, offset);
5897
5898                 /* move on to the next group */
5899                 if (search_start + num_bytes >
5900                     used_block_group->key.objectid + used_block_group->key.offset) {
5901                         btrfs_add_free_space(used_block_group, offset, num_bytes);
5902                         goto loop;
5903                 }
5904
5905                 if (offset < search_start)
5906                         btrfs_add_free_space(used_block_group, offset,
5907                                              search_start - offset);
5908                 BUG_ON(offset > search_start);
5909
5910                 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
5911                                                   alloc_type);
5912                 if (ret == -EAGAIN) {
5913                         btrfs_add_free_space(used_block_group, offset, num_bytes);
5914                         goto loop;
5915                 }
5916
5917                 /* we are all good, lets return */
5918                 ins->objectid = search_start;
5919                 ins->offset = num_bytes;
5920
5921                 trace_btrfs_reserve_extent(orig_root, block_group,
5922                                            search_start, num_bytes);
5923                 if (used_block_group != block_group)
5924                         btrfs_put_block_group(used_block_group);
5925                 btrfs_put_block_group(block_group);
5926                 break;
5927 loop:
5928                 failed_cluster_refill = false;
5929                 failed_alloc = false;
5930                 BUG_ON(index != get_block_group_index(block_group));
5931                 if (used_block_group != block_group)
5932                         btrfs_put_block_group(used_block_group);
5933                 btrfs_put_block_group(block_group);
5934         }
5935         up_read(&space_info->groups_sem);
5936
5937         if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5938                 goto search;
5939
5940         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5941                 goto search;
5942
5943         /*
5944          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
5945          *                      caching kthreads as we move along
5946          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
5947          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
5948          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
5949          *                      again
5950          */
5951         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
5952                 index = 0;
5953                 loop++;
5954                 if (loop == LOOP_ALLOC_CHUNK) {
5955                         ret = do_chunk_alloc(trans, root, data,
5956                                              CHUNK_ALLOC_FORCE);
5957                         /*
5958                          * Do not bail out on ENOSPC since we
5959                          * can do more things.
5960                          */
5961                         if (ret < 0 && ret != -ENOSPC) {
5962                                 btrfs_abort_transaction(trans,
5963                                                         root, ret);
5964                                 goto out;
5965                         }
5966                 }
5967
5968                 if (loop == LOOP_NO_EMPTY_SIZE) {
5969                         empty_size = 0;
5970                         empty_cluster = 0;
5971                 }
5972
5973                 goto search;
5974         } else if (!ins->objectid) {
5975                 ret = -ENOSPC;
5976         } else if (ins->objectid) {
5977                 ret = 0;
5978         }
5979 out:
5980
5981         return ret;
5982 }
5983
5984 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5985                             int dump_block_groups)
5986 {
5987         struct btrfs_block_group_cache *cache;
5988         int index = 0;
5989
5990         spin_lock(&info->lock);
5991         printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5992                (unsigned long long)info->flags,
5993                (unsigned long long)(info->total_bytes - info->bytes_used -
5994                                     info->bytes_pinned - info->bytes_reserved -
5995                                     info->bytes_readonly),
5996                (info->full) ? "" : "not ");
5997         printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
5998                "reserved=%llu, may_use=%llu, readonly=%llu\n",
5999                (unsigned long long)info->total_bytes,
6000                (unsigned long long)info->bytes_used,
6001                (unsigned long long)info->bytes_pinned,
6002                (unsigned long long)info->bytes_reserved,
6003                (unsigned long long)info->bytes_may_use,
6004                (unsigned long long)info->bytes_readonly);
6005         spin_unlock(&info->lock);
6006
6007         if (!dump_block_groups)
6008                 return;
6009
6010         down_read(&info->groups_sem);
6011 again:
6012         list_for_each_entry(cache, &info->block_groups[index], list) {
6013                 spin_lock(&cache->lock);
6014                 printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
6015                        (unsigned long long)cache->key.objectid,
6016                        (unsigned long long)cache->key.offset,
6017                        (unsigned long long)btrfs_block_group_used(&cache->item),
6018                        (unsigned long long)cache->pinned,
6019                        (unsigned long long)cache->reserved,
6020                        cache->ro ? "[readonly]" : "");
6021                 btrfs_dump_free_space(cache, bytes);
6022                 spin_unlock(&cache->lock);
6023         }
6024         if (++index < BTRFS_NR_RAID_TYPES)
6025                 goto again;
6026         up_read(&info->groups_sem);
6027 }
6028
6029 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
6030                          struct btrfs_root *root,
6031                          u64 num_bytes, u64 min_alloc_size,
6032                          u64 empty_size, u64 hint_byte,
6033                          struct btrfs_key *ins, u64 data)
6034 {
6035         bool final_tried = false;
6036         int ret;
6037
6038         data = btrfs_get_alloc_profile(root, data);
6039 again:
6040         WARN_ON(num_bytes < root->sectorsize);
6041         ret = find_free_extent(trans, root, num_bytes, empty_size,
6042                                hint_byte, ins, data);
6043
6044         if (ret == -ENOSPC) {
6045                 if (!final_tried) {
6046                         num_bytes = num_bytes >> 1;
6047                         num_bytes = num_bytes & ~(root->sectorsize - 1);
6048                         num_bytes = max(num_bytes, min_alloc_size);
6049                         if (num_bytes == min_alloc_size)
6050                                 final_tried = true;
6051                         goto again;
6052                 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6053                         struct btrfs_space_info *sinfo;
6054
6055                         sinfo = __find_space_info(root->fs_info, data);
6056                         printk(KERN_ERR "btrfs allocation failed flags %llu, "
6057                                "wanted %llu\n", (unsigned long long)data,
6058                                (unsigned long long)num_bytes);
6059                         if (sinfo)
6060                                 dump_space_info(sinfo, num_bytes, 1);
6061                 }
6062         }
6063
6064         trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
6065
6066         return ret;
6067 }
6068
6069 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6070                                         u64 start, u64 len, int pin)
6071 {
6072         struct btrfs_block_group_cache *cache;
6073         int ret = 0;
6074
6075         cache = btrfs_lookup_block_group(root->fs_info, start);
6076         if (!cache) {
6077                 printk(KERN_ERR "Unable to find block group for %llu\n",
6078                        (unsigned long long)start);
6079                 return -ENOSPC;
6080         }
6081
6082         if (btrfs_test_opt(root, DISCARD))
6083                 ret = btrfs_discard_extent(root, start, len, NULL);
6084
6085         if (pin)
6086                 pin_down_extent(root, cache, start, len, 1);
6087         else {
6088                 btrfs_add_free_space(cache, start, len);
6089                 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
6090         }
6091         btrfs_put_block_group(cache);
6092
6093         trace_btrfs_reserved_extent_free(root, start, len);
6094
6095         return ret;
6096 }
6097
6098 int btrfs_free_reserved_extent(struct btrfs_root *root,
6099                                         u64 start, u64 len)
6100 {
6101         return __btrfs_free_reserved_extent(root, start, len, 0);
6102 }
6103
6104 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
6105                                        u64 start, u64 len)
6106 {
6107         return __btrfs_free_reserved_extent(root, start, len, 1);
6108 }
6109
6110 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6111                                       struct btrfs_root *root,
6112                                       u64 parent, u64 root_objectid,
6113                                       u64 flags, u64 owner, u64 offset,
6114                                       struct btrfs_key *ins, int ref_mod)
6115 {
6116         int ret;
6117         struct btrfs_fs_info *fs_info = root->fs_info;
6118         struct btrfs_extent_item *extent_item;
6119         struct btrfs_extent_inline_ref *iref;
6120         struct btrfs_path *path;
6121         struct extent_buffer *leaf;
6122         int type;
6123         u32 size;
6124
6125         if (parent > 0)
6126                 type = BTRFS_SHARED_DATA_REF_KEY;
6127         else
6128                 type = BTRFS_EXTENT_DATA_REF_KEY;
6129
6130         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
6131
6132         path = btrfs_alloc_path();
6133         if (!path)
6134                 return -ENOMEM;
6135
6136         path->leave_spinning = 1;
6137         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6138                                       ins, size);
6139         if (ret) {
6140                 btrfs_free_path(path);
6141                 return ret;
6142         }
6143
6144         leaf = path->nodes[0];
6145         extent_item = btrfs_item_ptr(leaf, path->slots[0],
6146                                      struct btrfs_extent_item);
6147         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
6148         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6149         btrfs_set_extent_flags(leaf, extent_item,
6150                                flags | BTRFS_EXTENT_FLAG_DATA);
6151
6152         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6153         btrfs_set_extent_inline_ref_type(leaf, iref, type);
6154         if (parent > 0) {
6155                 struct btrfs_shared_data_ref *ref;
6156                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
6157                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6158                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
6159         } else {
6160                 struct btrfs_extent_data_ref *ref;
6161                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
6162                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
6163                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
6164                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
6165                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
6166         }
6167
6168         btrfs_mark_buffer_dirty(path->nodes[0]);
6169         btrfs_free_path(path);
6170
6171         ret = update_block_group(root, ins->objectid, ins->offset, 1);
6172         if (ret) { /* -ENOENT, logic error */
6173                 printk(KERN_ERR "btrfs update block group failed for %llu "
6174                        "%llu\n", (unsigned long long)ins->objectid,
6175                        (unsigned long long)ins->offset);
6176                 BUG();
6177         }
6178         return ret;
6179 }
6180
6181 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6182                                      struct btrfs_root *root,
6183                                      u64 parent, u64 root_objectid,
6184                                      u64 flags, struct btrfs_disk_key *key,
6185                                      int level, struct btrfs_key *ins)
6186 {
6187         int ret;
6188         struct btrfs_fs_info *fs_info = root->fs_info;
6189         struct btrfs_extent_item *extent_item;
6190         struct btrfs_tree_block_info *block_info;
6191         struct btrfs_extent_inline_ref *iref;
6192         struct btrfs_path *path;
6193         struct extent_buffer *leaf;
6194         u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
6195
6196         path = btrfs_alloc_path();
6197         if (!path)
6198                 return -ENOMEM;
6199
6200         path->leave_spinning = 1;
6201         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6202                                       ins, size);
6203         if (ret) {
6204                 btrfs_free_path(path);
6205                 return ret;
6206         }
6207
6208         leaf = path->nodes[0];
6209         extent_item = btrfs_item_ptr(leaf, path->slots[0],
6210                                      struct btrfs_extent_item);
6211         btrfs_set_extent_refs(leaf, extent_item, 1);
6212         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6213         btrfs_set_extent_flags(leaf, extent_item,
6214                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
6215         block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6216
6217         btrfs_set_tree_block_key(leaf, block_info, key);
6218         btrfs_set_tree_block_level(leaf, block_info, level);
6219
6220         iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6221         if (parent > 0) {
6222                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6223                 btrfs_set_extent_inline_ref_type(leaf, iref,
6224                                                  BTRFS_SHARED_BLOCK_REF_KEY);
6225                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6226         } else {
6227                 btrfs_set_extent_inline_ref_type(leaf, iref,
6228                                                  BTRFS_TREE_BLOCK_REF_KEY);
6229                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
6230         }
6231
6232         btrfs_mark_buffer_dirty(leaf);
6233         btrfs_free_path(path);
6234
6235         ret = update_block_group(root, ins->objectid, ins->offset, 1);
6236         if (ret) { /* -ENOENT, logic error */
6237                 printk(KERN_ERR "btrfs update block group failed for %llu "
6238                        "%llu\n", (unsigned long long)ins->objectid,
6239                        (unsigned long long)ins->offset);
6240                 BUG();
6241         }
6242         return ret;
6243 }
6244
6245 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6246                                      struct btrfs_root *root,
6247                                      u64 root_objectid, u64 owner,
6248                                      u64 offset, struct btrfs_key *ins)
6249 {
6250         int ret;
6251
6252         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
6253
6254         ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
6255                                          ins->offset, 0,
6256                                          root_objectid, owner, offset,
6257                                          BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
6258         return ret;
6259 }
6260
6261 /*
6262  * this is used by the tree logging recovery code.  It records that
6263  * an extent has been allocated and makes sure to clear the free
6264  * space cache bits as well
6265  */
6266 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6267                                    struct btrfs_root *root,
6268                                    u64 root_objectid, u64 owner, u64 offset,
6269                                    struct btrfs_key *ins)
6270 {
6271         int ret;
6272         struct btrfs_block_group_cache *block_group;
6273         struct btrfs_caching_control *caching_ctl;
6274         u64 start = ins->objectid;
6275         u64 num_bytes = ins->offset;
6276
6277         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6278         cache_block_group(block_group, 0);
6279         caching_ctl = get_caching_control(block_group);
6280
6281         if (!caching_ctl) {
6282                 BUG_ON(!block_group_cache_done(block_group));
6283                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6284                 BUG_ON(ret); /* -ENOMEM */
6285         } else {
6286                 mutex_lock(&caching_ctl->mutex);
6287
6288                 if (start >= caching_ctl->progress) {
6289                         ret = add_excluded_extent(root, start, num_bytes);
6290                         BUG_ON(ret); /* -ENOMEM */
6291                 } else if (start + num_bytes <= caching_ctl->progress) {
6292                         ret = btrfs_remove_free_space(block_group,
6293                                                       start, num_bytes);
6294                         BUG_ON(ret); /* -ENOMEM */
6295                 } else {
6296                         num_bytes = caching_ctl->progress - start;
6297                         ret = btrfs_remove_free_space(block_group,
6298                                                       start, num_bytes);
6299                         BUG_ON(ret); /* -ENOMEM */
6300
6301                         start = caching_ctl->progress;
6302                         num_bytes = ins->objectid + ins->offset -
6303                                     caching_ctl->progress;
6304                         ret = add_excluded_extent(root, start, num_bytes);
6305                         BUG_ON(ret); /* -ENOMEM */
6306                 }
6307
6308                 mutex_unlock(&caching_ctl->mutex);
6309                 put_caching_control(caching_ctl);
6310         }
6311
6312         ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6313                                           RESERVE_ALLOC_NO_ACCOUNT);
6314         BUG_ON(ret); /* logic error */
6315         btrfs_put_block_group(block_group);
6316         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6317                                          0, owner, offset, ins, 1);
6318         return ret;
6319 }
6320
6321 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
6322                                             struct btrfs_root *root,
6323                                             u64 bytenr, u32 blocksize,
6324                                             int level)
6325 {
6326         struct extent_buffer *buf;
6327
6328         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
6329         if (!buf)
6330                 return ERR_PTR(-ENOMEM);
6331         btrfs_set_header_generation(buf, trans->transid);
6332         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
6333         btrfs_tree_lock(buf);
6334         clean_tree_block(trans, root, buf);
6335         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
6336
6337         btrfs_set_lock_blocking(buf);
6338         btrfs_set_buffer_uptodate(buf);
6339
6340         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
6341                 /*
6342                  * we allow two log transactions at a time, use different
6343                  * EXENT bit to differentiate dirty pages.
6344                  */
6345                 if (root->log_transid % 2 == 0)
6346                         set_extent_dirty(&root->dirty_log_pages, buf->start,
6347                                         buf->start + buf->len - 1, GFP_NOFS);
6348                 else
6349                         set_extent_new(&root->dirty_log_pages, buf->start,
6350                                         buf->start + buf->len - 1, GFP_NOFS);
6351         } else {
6352                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
6353                          buf->start + buf->len - 1, GFP_NOFS);
6354         }
6355         trans->blocks_used++;
6356         /* this returns a buffer locked for blocking */
6357         return buf;
6358 }
6359
6360 static struct btrfs_block_rsv *
6361 use_block_rsv(struct btrfs_trans_handle *trans,
6362               struct btrfs_root *root, u32 blocksize)
6363 {
6364         struct btrfs_block_rsv *block_rsv;
6365         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
6366         int ret;
6367
6368         block_rsv = get_block_rsv(trans, root);
6369
6370         if (block_rsv->size == 0) {
6371                 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6372                                              BTRFS_RESERVE_NO_FLUSH);
6373                 /*
6374                  * If we couldn't reserve metadata bytes try and use some from
6375                  * the global reserve.
6376                  */
6377                 if (ret && block_rsv != global_rsv) {
6378                         ret = block_rsv_use_bytes(global_rsv, blocksize);
6379                         if (!ret)
6380                                 return global_rsv;
6381                         return ERR_PTR(ret);
6382                 } else if (ret) {
6383                         return ERR_PTR(ret);
6384                 }
6385                 return block_rsv;
6386         }
6387
6388         ret = block_rsv_use_bytes(block_rsv, blocksize);
6389         if (!ret)
6390                 return block_rsv;
6391         if (ret && !block_rsv->failfast) {
6392                 static DEFINE_RATELIMIT_STATE(_rs,
6393                                 DEFAULT_RATELIMIT_INTERVAL,
6394                                 /*DEFAULT_RATELIMIT_BURST*/ 2);
6395                 if (__ratelimit(&_rs))
6396                         WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
6397                              ret);
6398                 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6399                                              BTRFS_RESERVE_NO_FLUSH);
6400                 if (!ret) {
6401                         return block_rsv;
6402                 } else if (ret && block_rsv != global_rsv) {
6403                         ret = block_rsv_use_bytes(global_rsv, blocksize);
6404                         if (!ret)
6405                                 return global_rsv;
6406                 }
6407         }
6408
6409         return ERR_PTR(-ENOSPC);
6410 }
6411
6412 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6413                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
6414 {
6415         block_rsv_add_bytes(block_rsv, blocksize, 0);
6416         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6417 }
6418
6419 /*
6420  * finds a free extent and does all the dirty work required for allocation
6421  * returns the key for the extent through ins, and a tree buffer for
6422  * the first block of the extent through buf.
6423  *
6424  * returns the tree buffer or NULL.
6425  */
6426 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6427                                         struct btrfs_root *root, u32 blocksize,
6428                                         u64 parent, u64 root_objectid,
6429                                         struct btrfs_disk_key *key, int level,
6430                                         u64 hint, u64 empty_size)
6431 {
6432         struct btrfs_key ins;
6433         struct btrfs_block_rsv *block_rsv;
6434         struct extent_buffer *buf;
6435         u64 flags = 0;
6436         int ret;
6437
6438
6439         block_rsv = use_block_rsv(trans, root, blocksize);
6440         if (IS_ERR(block_rsv))
6441                 return ERR_CAST(block_rsv);
6442
6443         ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6444                                    empty_size, hint, &ins, 0);
6445         if (ret) {
6446                 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6447                 return ERR_PTR(ret);
6448         }
6449
6450         buf = btrfs_init_new_buffer(trans, root, ins.objectid,
6451                                     blocksize, level);
6452         BUG_ON(IS_ERR(buf)); /* -ENOMEM */
6453
6454         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
6455                 if (parent == 0)
6456                         parent = ins.objectid;
6457                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6458         } else
6459                 BUG_ON(parent > 0);
6460
6461         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6462                 struct btrfs_delayed_extent_op *extent_op;
6463                 extent_op = btrfs_alloc_delayed_extent_op();
6464                 BUG_ON(!extent_op); /* -ENOMEM */
6465                 if (key)
6466                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
6467                 else
6468                         memset(&extent_op->key, 0, sizeof(extent_op->key));
6469                 extent_op->flags_to_set = flags;
6470                 extent_op->update_key = 1;
6471                 extent_op->update_flags = 1;
6472                 extent_op->is_data = 0;
6473
6474                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6475                                         ins.objectid,
6476                                         ins.offset, parent, root_objectid,
6477                                         level, BTRFS_ADD_DELAYED_EXTENT,
6478                                         extent_op, 0);
6479                 BUG_ON(ret); /* -ENOMEM */
6480         }
6481         return buf;
6482 }
6483
6484 struct walk_control {
6485         u64 refs[BTRFS_MAX_LEVEL];
6486         u64 flags[BTRFS_MAX_LEVEL];
6487         struct btrfs_key update_progress;
6488         int stage;
6489         int level;
6490         int shared_level;
6491         int update_ref;
6492         int keep_locks;
6493         int reada_slot;
6494         int reada_count;
6495         int for_reloc;
6496 };
6497
6498 #define DROP_REFERENCE  1
6499 #define UPDATE_BACKREF  2
6500
6501 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6502                                      struct btrfs_root *root,
6503                                      struct walk_control *wc,
6504                                      struct btrfs_path *path)
6505 {
6506         u64 bytenr;
6507         u64 generation;
6508         u64 refs;
6509         u64 flags;
6510         u32 nritems;
6511         u32 blocksize;
6512         struct btrfs_key key;
6513         struct extent_buffer *eb;
6514         int ret;
6515         int slot;
6516         int nread = 0;
6517
6518         if (path->slots[wc->level] < wc->reada_slot) {
6519                 wc->reada_count = wc->reada_count * 2 / 3;
6520                 wc->reada_count = max(wc->reada_count, 2);
6521         } else {
6522                 wc->reada_count = wc->reada_count * 3 / 2;
6523                 wc->reada_count = min_t(int, wc->reada_count,
6524                                         BTRFS_NODEPTRS_PER_BLOCK(root));
6525         }
6526
6527         eb = path->nodes[wc->level];
6528         nritems = btrfs_header_nritems(eb);
6529         blocksize = btrfs_level_size(root, wc->level - 1);
6530
6531         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6532                 if (nread >= wc->reada_count)
6533                         break;
6534
6535                 cond_resched();
6536                 bytenr = btrfs_node_blockptr(eb, slot);
6537                 generation = btrfs_node_ptr_generation(eb, slot);
6538
6539                 if (slot == path->slots[wc->level])
6540                         goto reada;
6541
6542                 if (wc->stage == UPDATE_BACKREF &&
6543                     generation <= root->root_key.offset)
6544                         continue;
6545
6546                 /* We don't lock the tree block, it's OK to be racy here */
6547                 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6548                                                &refs, &flags);
6549                 /* We don't care about errors in readahead. */
6550                 if (ret < 0)
6551                         continue;
6552                 BUG_ON(refs == 0);
6553
6554                 if (wc->stage == DROP_REFERENCE) {
6555                         if (refs == 1)
6556                                 goto reada;
6557
6558                         if (wc->level == 1 &&
6559                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6560                                 continue;
6561                         if (!wc->update_ref ||
6562                             generation <= root->root_key.offset)
6563                                 continue;
6564                         btrfs_node_key_to_cpu(eb, &key, slot);
6565                         ret = btrfs_comp_cpu_keys(&key,
6566                                                   &wc->update_progress);
6567                         if (ret < 0)
6568                                 continue;
6569                 } else {
6570                         if (wc->level == 1 &&
6571                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6572                                 continue;
6573                 }
6574 reada:
6575                 ret = readahead_tree_block(root, bytenr, blocksize,
6576                                            generation);
6577                 if (ret)
6578                         break;
6579                 nread++;
6580         }
6581         wc->reada_slot = slot;
6582 }
6583
6584 /*
6585  * hepler to process tree block while walking down the tree.
6586  *
6587  * when wc->stage == UPDATE_BACKREF, this function updates
6588  * back refs for pointers in the block.
6589  *
6590  * NOTE: return value 1 means we should stop walking down.
6591  */
6592 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6593                                    struct btrfs_root *root,
6594                                    struct btrfs_path *path,
6595                                    struct walk_control *wc, int lookup_info)
6596 {
6597         int level = wc->level;
6598         struct extent_buffer *eb = path->nodes[level];
6599         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6600         int ret;
6601
6602         if (wc->stage == UPDATE_BACKREF &&
6603             btrfs_header_owner(eb) != root->root_key.objectid)
6604                 return 1;
6605
6606         /*
6607          * when reference count of tree block is 1, it won't increase
6608          * again. once full backref flag is set, we never clear it.
6609          */
6610         if (lookup_info &&
6611             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
6612              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
6613                 BUG_ON(!path->locks[level]);
6614                 ret = btrfs_lookup_extent_info(trans, root,
6615                                                eb->start, eb->len,
6616                                                &wc->refs[level],
6617                                                &wc->flags[level]);
6618                 BUG_ON(ret == -ENOMEM);
6619                 if (ret)
6620                         return ret;
6621                 BUG_ON(wc->refs[level] == 0);
6622         }
6623
6624         if (wc->stage == DROP_REFERENCE) {
6625                 if (wc->refs[level] > 1)
6626                         return 1;
6627
6628                 if (path->locks[level] && !wc->keep_locks) {
6629                         btrfs_tree_unlock_rw(eb, path->locks[level]);
6630                         path->locks[level] = 0;
6631                 }
6632                 return 0;
6633         }
6634
6635         /* wc->stage == UPDATE_BACKREF */
6636         if (!(wc->flags[level] & flag)) {
6637                 BUG_ON(!path->locks[level]);
6638                 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
6639                 BUG_ON(ret); /* -ENOMEM */
6640                 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6641                 BUG_ON(ret); /* -ENOMEM */
6642                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6643                                                   eb->len, flag, 0);
6644                 BUG_ON(ret); /* -ENOMEM */
6645                 wc->flags[level] |= flag;
6646         }
6647
6648         /*
6649          * the block is shared by multiple trees, so it's not good to
6650          * keep the tree lock
6651          */
6652         if (path->locks[level] && level > 0) {
6653                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6654                 path->locks[level] = 0;
6655         }
6656         return 0;
6657 }
6658
6659 /*
6660  * hepler to process tree block pointer.
6661  *
6662  * when wc->stage == DROP_REFERENCE, this function checks
6663  * reference count of the block pointed to. if the block
6664  * is shared and we need update back refs for the subtree
6665  * rooted at the block, this function changes wc->stage to
6666  * UPDATE_BACKREF. if the block is shared and there is no
6667  * need to update back, this function drops the reference
6668  * to the block.
6669  *
6670  * NOTE: return value 1 means we should stop walking down.
6671  */
6672 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6673                                  struct btrfs_root *root,
6674                                  struct btrfs_path *path,
6675                                  struct walk_control *wc, int *lookup_info)
6676 {
6677         u64 bytenr;
6678         u64 generation;
6679         u64 parent;
6680         u32 blocksize;
6681         struct btrfs_key key;
6682         struct extent_buffer *next;
6683         int level = wc->level;
6684         int reada = 0;
6685         int ret = 0;
6686
6687         generation = btrfs_node_ptr_generation(path->nodes[level],
6688                                                path->slots[level]);
6689         /*
6690          * if the lower level block was created before the snapshot
6691          * was created, we know there is no need to update back refs
6692          * for the subtree
6693          */
6694         if (wc->stage == UPDATE_BACKREF &&
6695             generation <= root->root_key.offset) {
6696                 *lookup_info = 1;
6697                 return 1;
6698         }
6699
6700         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
6701         blocksize = btrfs_level_size(root, level - 1);
6702
6703         next = btrfs_find_tree_block(root, bytenr, blocksize);
6704         if (!next) {
6705                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
6706                 if (!next)
6707                         return -ENOMEM;
6708                 reada = 1;
6709         }
6710         btrfs_tree_lock(next);
6711         btrfs_set_lock_blocking(next);
6712
6713         ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6714                                        &wc->refs[level - 1],
6715                                        &wc->flags[level - 1]);
6716         if (ret < 0) {
6717                 btrfs_tree_unlock(next);
6718                 return ret;
6719         }
6720
6721         BUG_ON(wc->refs[level - 1] == 0);
6722         *lookup_info = 0;
6723
6724         if (wc->stage == DROP_REFERENCE) {
6725                 if (wc->refs[level - 1] > 1) {
6726                         if (level == 1 &&
6727                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6728                                 goto skip;
6729
6730                         if (!wc->update_ref ||
6731                             generation <= root->root_key.offset)
6732                                 goto skip;
6733
6734                         btrfs_node_key_to_cpu(path->nodes[level], &key,
6735                                               path->slots[level]);
6736                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
6737                         if (ret < 0)
6738                                 goto skip;
6739
6740                         wc->stage = UPDATE_BACKREF;
6741                         wc->shared_level = level - 1;
6742                 }
6743         } else {
6744                 if (level == 1 &&
6745                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6746                         goto skip;
6747         }
6748
6749         if (!btrfs_buffer_uptodate(next, generation, 0)) {
6750                 btrfs_tree_unlock(next);
6751                 free_extent_buffer(next);
6752                 next = NULL;
6753                 *lookup_info = 1;
6754         }
6755
6756         if (!next) {
6757                 if (reada && level == 1)
6758                         reada_walk_down(trans, root, wc, path);
6759                 next = read_tree_block(root, bytenr, blocksize, generation);
6760                 if (!next)
6761                         return -EIO;
6762                 btrfs_tree_lock(next);
6763                 btrfs_set_lock_blocking(next);
6764         }
6765
6766         level--;
6767         BUG_ON(level != btrfs_header_level(next));
6768         path->nodes[level] = next;
6769         path->slots[level] = 0;
6770         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6771         wc->level = level;
6772         if (wc->level == 1)
6773                 wc->reada_slot = 0;
6774         return 0;
6775 skip:
6776         wc->refs[level - 1] = 0;
6777         wc->flags[level - 1] = 0;
6778         if (wc->stage == DROP_REFERENCE) {
6779                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6780                         parent = path->nodes[level]->start;
6781                 } else {
6782                         BUG_ON(root->root_key.objectid !=
6783                                btrfs_header_owner(path->nodes[level]));
6784                         parent = 0;
6785                 }
6786
6787                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6788                                 root->root_key.objectid, level - 1, 0, 0);
6789                 BUG_ON(ret); /* -ENOMEM */
6790         }
6791         btrfs_tree_unlock(next);
6792         free_extent_buffer(next);
6793         *lookup_info = 1;
6794         return 1;
6795 }
6796
6797 /*
6798  * hepler to process tree block while walking up the tree.
6799  *
6800  * when wc->stage == DROP_REFERENCE, this function drops
6801  * reference count on the block.
6802  *
6803  * when wc->stage == UPDATE_BACKREF, this function changes
6804  * wc->stage back to DROP_REFERENCE if we changed wc->stage
6805  * to UPDATE_BACKREF previously while processing the block.
6806  *
6807  * NOTE: return value 1 means we should stop walking up.
6808  */
6809 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6810                                  struct btrfs_root *root,
6811                                  struct btrfs_path *path,
6812                                  struct walk_control *wc)
6813 {
6814         int ret;
6815         int level = wc->level;
6816         struct extent_buffer *eb = path->nodes[level];
6817         u64 parent = 0;
6818
6819         if (wc->stage == UPDATE_BACKREF) {
6820                 BUG_ON(wc->shared_level < level);
6821                 if (level < wc->shared_level)
6822                         goto out;
6823
6824                 ret = find_next_key(path, level + 1, &wc->update_progress);
6825                 if (ret > 0)
6826                         wc->update_ref = 0;
6827
6828                 wc->stage = DROP_REFERENCE;
6829                 wc->shared_level = -1;
6830                 path->slots[level] = 0;
6831
6832                 /*
6833                  * check reference count again if the block isn't locked.
6834                  * we should start walking down the tree again if reference
6835                  * count is one.
6836                  */
6837                 if (!path->locks[level]) {
6838                         BUG_ON(level == 0);
6839                         btrfs_tree_lock(eb);
6840                         btrfs_set_lock_blocking(eb);
6841                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6842
6843                         ret = btrfs_lookup_extent_info(trans, root,
6844                                                        eb->start, eb->len,
6845                                                        &wc->refs[level],
6846                                                        &wc->flags[level]);
6847                         if (ret < 0) {
6848                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6849                                 path->locks[level] = 0;
6850                                 return ret;
6851                         }
6852                         BUG_ON(wc->refs[level] == 0);
6853                         if (wc->refs[level] == 1) {
6854                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6855                                 path->locks[level] = 0;
6856                                 return 1;
6857                         }
6858                 }
6859         }
6860
6861         /* wc->stage == DROP_REFERENCE */
6862         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
6863
6864         if (wc->refs[level] == 1) {
6865                 if (level == 0) {
6866                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6867                                 ret = btrfs_dec_ref(trans, root, eb, 1,
6868                                                     wc->for_reloc);
6869                         else
6870                                 ret = btrfs_dec_ref(trans, root, eb, 0,
6871                                                     wc->for_reloc);
6872                         BUG_ON(ret); /* -ENOMEM */
6873                 }
6874                 /* make block locked assertion in clean_tree_block happy */
6875                 if (!path->locks[level] &&
6876                     btrfs_header_generation(eb) == trans->transid) {
6877                         btrfs_tree_lock(eb);
6878                         btrfs_set_lock_blocking(eb);
6879                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6880                 }
6881                 clean_tree_block(trans, root, eb);
6882         }
6883
6884         if (eb == root->node) {
6885                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6886                         parent = eb->start;
6887                 else
6888                         BUG_ON(root->root_key.objectid !=
6889                                btrfs_header_owner(eb));
6890         } else {
6891                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6892                         parent = path->nodes[level + 1]->start;
6893                 else
6894                         BUG_ON(root->root_key.objectid !=
6895                                btrfs_header_owner(path->nodes[level + 1]));
6896         }
6897
6898         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6899 out:
6900         wc->refs[level] = 0;
6901         wc->flags[level] = 0;
6902         return 0;
6903 }
6904
6905 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
6906                                    struct btrfs_root *root,
6907                                    struct btrfs_path *path,
6908                                    struct walk_control *wc)
6909 {
6910         int level = wc->level;
6911         int lookup_info = 1;
6912         int ret;
6913
6914         while (level >= 0) {
6915                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
6916                 if (ret > 0)
6917                         break;
6918
6919                 if (level == 0)
6920                         break;
6921
6922                 if (path->slots[level] >=
6923                     btrfs_header_nritems(path->nodes[level]))
6924                         break;
6925
6926                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
6927                 if (ret > 0) {
6928                         path->slots[level]++;
6929                         continue;
6930                 } else if (ret < 0)
6931                         return ret;
6932                 level = wc->level;
6933         }
6934         return 0;
6935 }
6936
6937 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6938                                  struct btrfs_root *root,
6939                                  struct btrfs_path *path,
6940                                  struct walk_control *wc, int max_level)
6941 {
6942         int level = wc->level;
6943         int ret;
6944
6945         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6946         while (level < max_level && path->nodes[level]) {
6947                 wc->level = level;
6948                 if (path->slots[level] + 1 <
6949                     btrfs_header_nritems(path->nodes[level])) {
6950                         path->slots[level]++;
6951                         return 0;
6952                 } else {
6953                         ret = walk_up_proc(trans, root, path, wc);
6954                         if (ret > 0)
6955                                 return 0;
6956
6957                         if (path->locks[level]) {
6958                                 btrfs_tree_unlock_rw(path->nodes[level],
6959                                                      path->locks[level]);
6960                                 path->locks[level] = 0;
6961                         }
6962                         free_extent_buffer(path->nodes[level]);
6963                         path->nodes[level] = NULL;
6964                         level++;
6965                 }
6966         }
6967         return 1;
6968 }
6969
6970 /*
6971  * drop a subvolume tree.
6972  *
6973  * this function traverses the tree freeing any blocks that only
6974  * referenced by the tree.
6975  *
6976  * when a shared tree block is found. this function decreases its
6977  * reference count by one. if update_ref is true, this function
6978  * also make sure backrefs for the shared block and all lower level
6979  * blocks are properly updated.
6980  */
6981 int btrfs_drop_snapshot(struct btrfs_root *root,
6982                          struct btrfs_block_rsv *block_rsv, int update_ref,
6983                          int for_reloc)
6984 {
6985         struct btrfs_path *path;
6986         struct btrfs_trans_handle *trans;
6987         struct btrfs_root *tree_root = root->fs_info->tree_root;
6988         struct btrfs_root_item *root_item = &root->root_item;
6989         struct walk_control *wc;
6990         struct btrfs_key key;
6991         int err = 0;
6992         int ret;
6993         int level;
6994
6995         path = btrfs_alloc_path();
6996         if (!path) {
6997                 err = -ENOMEM;
6998                 goto out;
6999         }
7000
7001         wc = kzalloc(sizeof(*wc), GFP_NOFS);
7002         if (!wc) {
7003                 btrfs_free_path(path);
7004                 err = -ENOMEM;
7005                 goto out;
7006         }
7007
7008         trans = btrfs_start_transaction(tree_root, 0);
7009         if (IS_ERR(trans)) {
7010                 err = PTR_ERR(trans);
7011                 goto out_free;
7012         }
7013
7014         if (block_rsv)
7015                 trans->block_rsv = block_rsv;
7016
7017         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
7018                 level = btrfs_header_level(root->node);
7019                 path->nodes[level] = btrfs_lock_root_node(root);
7020                 btrfs_set_lock_blocking(path->nodes[level]);
7021                 path->slots[level] = 0;
7022                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7023                 memset(&wc->update_progress, 0,
7024                        sizeof(wc->update_progress));
7025         } else {
7026                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
7027                 memcpy(&wc->update_progress, &key,
7028                        sizeof(wc->update_progress));
7029
7030                 level = root_item->drop_level;
7031                 BUG_ON(level == 0);
7032                 path->lowest_level = level;
7033                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7034                 path->lowest_level = 0;
7035                 if (ret < 0) {
7036                         err = ret;
7037                         goto out_end_trans;
7038                 }
7039                 WARN_ON(ret > 0);
7040
7041                 /*
7042                  * unlock our path, this is safe because only this
7043                  * function is allowed to delete this snapshot
7044                  */
7045                 btrfs_unlock_up_safe(path, 0);
7046
7047                 level = btrfs_header_level(root->node);
7048                 while (1) {
7049                         btrfs_tree_lock(path->nodes[level]);
7050                         btrfs_set_lock_blocking(path->nodes[level]);
7051
7052                         ret = btrfs_lookup_extent_info(trans, root,
7053                                                 path->nodes[level]->start,
7054                                                 path->nodes[level]->len,
7055                                                 &wc->refs[level],
7056                                                 &wc->flags[level]);
7057                         if (ret < 0) {
7058                                 err = ret;
7059                                 goto out_end_trans;
7060                         }
7061                         BUG_ON(wc->refs[level] == 0);
7062
7063                         if (level == root_item->drop_level)
7064                                 break;
7065
7066                         btrfs_tree_unlock(path->nodes[level]);
7067                         WARN_ON(wc->refs[level] != 1);
7068                         level--;
7069                 }
7070         }
7071
7072         wc->level = level;
7073         wc->shared_level = -1;
7074         wc->stage = DROP_REFERENCE;
7075         wc->update_ref = update_ref;
7076         wc->keep_locks = 0;
7077         wc->for_reloc = for_reloc;
7078         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7079
7080         while (1) {
7081                 ret = walk_down_tree(trans, root, path, wc);
7082                 if (ret < 0) {
7083                         err = ret;
7084                         break;
7085                 }
7086
7087                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
7088                 if (ret < 0) {
7089                         err = ret;
7090                         break;
7091                 }
7092
7093                 if (ret > 0) {
7094                         BUG_ON(wc->stage != DROP_REFERENCE);
7095                         break;
7096                 }
7097
7098                 if (wc->stage == DROP_REFERENCE) {
7099                         level = wc->level;
7100                         btrfs_node_key(path->nodes[level],
7101                                        &root_item->drop_progress,
7102                                        path->slots[level]);
7103                         root_item->drop_level = level;
7104                 }
7105
7106                 BUG_ON(wc->level == 0);
7107                 if (btrfs_should_end_transaction(trans, tree_root)) {
7108                         ret = btrfs_update_root(trans, tree_root,
7109                                                 &root->root_key,
7110                                                 root_item);
7111                         if (ret) {
7112                                 btrfs_abort_transaction(trans, tree_root, ret);
7113                                 err = ret;
7114                                 goto out_end_trans;
7115                         }
7116
7117                         btrfs_end_transaction_throttle(trans, tree_root);
7118                         trans = btrfs_start_transaction(tree_root, 0);
7119                         if (IS_ERR(trans)) {
7120                                 err = PTR_ERR(trans);
7121                                 goto out_free;
7122                         }
7123                         if (block_rsv)
7124                                 trans->block_rsv = block_rsv;
7125                 }
7126         }
7127         btrfs_release_path(path);
7128         if (err)
7129                 goto out_end_trans;
7130
7131         ret = btrfs_del_root(trans, tree_root, &root->root_key);
7132         if (ret) {
7133                 btrfs_abort_transaction(trans, tree_root, ret);
7134                 goto out_end_trans;
7135         }
7136
7137         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7138                 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
7139                                            NULL, NULL);
7140                 if (ret < 0) {
7141                         btrfs_abort_transaction(trans, tree_root, ret);
7142                         err = ret;
7143                         goto out_end_trans;
7144                 } else if (ret > 0) {
7145                         /* if we fail to delete the orphan item this time
7146                          * around, it'll get picked up the next time.
7147                          *
7148                          * The most common failure here is just -ENOENT.
7149                          */
7150                         btrfs_del_orphan_item(trans, tree_root,
7151                                               root->root_key.objectid);
7152                 }
7153         }
7154
7155         if (root->in_radix) {
7156                 btrfs_free_fs_root(tree_root->fs_info, root);
7157         } else {
7158                 free_extent_buffer(root->node);
7159                 free_extent_buffer(root->commit_root);
7160                 kfree(root);
7161         }
7162 out_end_trans:
7163         btrfs_end_transaction_throttle(trans, tree_root);
7164 out_free:
7165         kfree(wc);
7166         btrfs_free_path(path);
7167 out:
7168         if (err)
7169                 btrfs_std_error(root->fs_info, err);
7170         return err;
7171 }
7172
7173 /*
7174  * drop subtree rooted at tree block 'node'.
7175  *
7176  * NOTE: this function will unlock and release tree block 'node'
7177  * only used by relocation code
7178  */
7179 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
7180                         struct btrfs_root *root,
7181                         struct extent_buffer *node,
7182                         struct extent_buffer *parent)
7183 {
7184         struct btrfs_path *path;
7185         struct walk_control *wc;
7186         int level;
7187         int parent_level;
7188         int ret = 0;
7189         int wret;
7190
7191         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7192
7193         path = btrfs_alloc_path();
7194         if (!path)
7195                 return -ENOMEM;
7196
7197         wc = kzalloc(sizeof(*wc), GFP_NOFS);
7198         if (!wc) {
7199                 btrfs_free_path(path);
7200                 return -ENOMEM;
7201         }
7202
7203         btrfs_assert_tree_locked(parent);
7204         parent_level = btrfs_header_level(parent);
7205         extent_buffer_get(parent);
7206         path->nodes[parent_level] = parent;
7207         path->slots[parent_level] = btrfs_header_nritems(parent);
7208
7209         btrfs_assert_tree_locked(node);
7210         level = btrfs_header_level(node);
7211         path->nodes[level] = node;
7212         path->slots[level] = 0;
7213         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7214
7215         wc->refs[parent_level] = 1;
7216         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7217         wc->level = level;
7218         wc->shared_level = -1;
7219         wc->stage = DROP_REFERENCE;
7220         wc->update_ref = 0;
7221         wc->keep_locks = 1;
7222         wc->for_reloc = 1;
7223         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7224
7225         while (1) {
7226                 wret = walk_down_tree(trans, root, path, wc);
7227                 if (wret < 0) {
7228                         ret = wret;
7229                         break;
7230                 }
7231
7232                 wret = walk_up_tree(trans, root, path, wc, parent_level);
7233                 if (wret < 0)
7234                         ret = wret;
7235                 if (wret != 0)
7236                         break;
7237         }
7238
7239         kfree(wc);
7240         btrfs_free_path(path);
7241         return ret;
7242 }
7243
7244 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7245 {
7246         u64 num_devices;
7247         u64 stripped;
7248
7249         /*
7250          * if restripe for this chunk_type is on pick target profile and
7251          * return, otherwise do the usual balance
7252          */
7253         stripped = get_restripe_target(root->fs_info, flags);
7254         if (stripped)
7255                 return extended_to_chunk(stripped);
7256
7257         /*
7258          * we add in the count of missing devices because we want
7259          * to make sure that any RAID levels on a degraded FS
7260          * continue to be honored.
7261          */
7262         num_devices = root->fs_info->fs_devices->rw_devices +
7263                 root->fs_info->fs_devices->missing_devices;
7264
7265         stripped = BTRFS_BLOCK_GROUP_RAID0 |
7266                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7267
7268         if (num_devices == 1) {
7269                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7270                 stripped = flags & ~stripped;
7271
7272                 /* turn raid0 into single device chunks */
7273                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
7274                         return stripped;
7275
7276                 /* turn mirroring into duplication */
7277                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7278                              BTRFS_BLOCK_GROUP_RAID10))
7279                         return stripped | BTRFS_BLOCK_GROUP_DUP;
7280         } else {
7281                 /* they already had raid on here, just return */
7282                 if (flags & stripped)
7283                         return flags;
7284
7285                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7286                 stripped = flags & ~stripped;
7287
7288                 /* switch duplicated blocks with raid1 */
7289                 if (flags & BTRFS_BLOCK_GROUP_DUP)
7290                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
7291
7292                 /* this is drive concat, leave it alone */
7293         }
7294
7295         return flags;
7296 }
7297
7298 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
7299 {
7300         struct btrfs_space_info *sinfo = cache->space_info;
7301         u64 num_bytes;
7302         u64 min_allocable_bytes;
7303         int ret = -ENOSPC;
7304
7305
7306         /*
7307          * We need some metadata space and system metadata space for
7308          * allocating chunks in some corner cases until we force to set
7309          * it to be readonly.
7310          */
7311         if ((sinfo->flags &
7312              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
7313             !force)
7314                 min_allocable_bytes = 1 * 1024 * 1024;
7315         else
7316                 min_allocable_bytes = 0;
7317
7318         spin_lock(&sinfo->lock);
7319         spin_lock(&cache->lock);
7320
7321         if (cache->ro) {
7322                 ret = 0;
7323                 goto out;
7324         }
7325
7326         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7327                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7328
7329         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7330             sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
7331             min_allocable_bytes <= sinfo->total_bytes) {
7332                 sinfo->bytes_readonly += num_bytes;
7333                 cache->ro = 1;
7334                 ret = 0;
7335         }
7336 out:
7337         spin_unlock(&cache->lock);
7338         spin_unlock(&sinfo->lock);
7339         return ret;
7340 }
7341
7342 int btrfs_set_block_group_ro(struct btrfs_root *root,
7343                              struct btrfs_block_group_cache *cache)
7344
7345 {
7346         struct btrfs_trans_handle *trans;
7347         u64 alloc_flags;
7348         int ret;
7349
7350         BUG_ON(cache->ro);
7351
7352         trans = btrfs_join_transaction(root);
7353         if (IS_ERR(trans))
7354                 return PTR_ERR(trans);
7355
7356         alloc_flags = update_block_group_flags(root, cache->flags);
7357         if (alloc_flags != cache->flags) {
7358                 ret = do_chunk_alloc(trans, root, alloc_flags,
7359                                      CHUNK_ALLOC_FORCE);
7360                 if (ret < 0)
7361                         goto out;
7362         }
7363
7364         ret = set_block_group_ro(cache, 0);
7365         if (!ret)
7366                 goto out;
7367         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7368         ret = do_chunk_alloc(trans, root, alloc_flags,
7369                              CHUNK_ALLOC_FORCE);
7370         if (ret < 0)
7371                 goto out;
7372         ret = set_block_group_ro(cache, 0);
7373 out:
7374         btrfs_end_transaction(trans, root);
7375         return ret;
7376 }
7377
7378 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7379                             struct btrfs_root *root, u64 type)
7380 {
7381         u64 alloc_flags = get_alloc_profile(root, type);
7382         return do_chunk_alloc(trans, root, alloc_flags,
7383                               CHUNK_ALLOC_FORCE);
7384 }
7385
7386 /*
7387  * helper to account the unused space of all the readonly block group in the
7388  * list. takes mirrors into account.
7389  */
7390 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
7391 {
7392         struct btrfs_block_group_cache *block_group;
7393         u64 free_bytes = 0;
7394         int factor;
7395
7396         list_for_each_entry(block_group, groups_list, list) {
7397                 spin_lock(&block_group->lock);
7398
7399                 if (!block_group->ro) {
7400                         spin_unlock(&block_group->lock);
7401                         continue;
7402                 }
7403
7404                 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
7405                                           BTRFS_BLOCK_GROUP_RAID10 |
7406                                           BTRFS_BLOCK_GROUP_DUP))
7407                         factor = 2;
7408                 else
7409                         factor = 1;
7410
7411                 free_bytes += (block_group->key.offset -
7412                                btrfs_block_group_used(&block_group->item)) *
7413                                factor;
7414
7415                 spin_unlock(&block_group->lock);
7416         }
7417
7418         return free_bytes;
7419 }
7420
7421 /*
7422  * helper to account the unused space of all the readonly block group in the
7423  * space_info. takes mirrors into account.
7424  */
7425 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
7426 {
7427         int i;
7428         u64 free_bytes = 0;
7429
7430         spin_lock(&sinfo->lock);
7431
7432         for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
7433                 if (!list_empty(&sinfo->block_groups[i]))
7434                         free_bytes += __btrfs_get_ro_block_group_free_space(
7435                                                 &sinfo->block_groups[i]);
7436
7437         spin_unlock(&sinfo->lock);
7438
7439         return free_bytes;
7440 }
7441
7442 void btrfs_set_block_group_rw(struct btrfs_root *root,
7443                               struct btrfs_block_group_cache *cache)
7444 {
7445         struct btrfs_space_info *sinfo = cache->space_info;
7446         u64 num_bytes;
7447
7448         BUG_ON(!cache->ro);
7449
7450         spin_lock(&sinfo->lock);
7451         spin_lock(&cache->lock);
7452         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7453                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7454         sinfo->bytes_readonly -= num_bytes;
7455         cache->ro = 0;
7456         spin_unlock(&cache->lock);
7457         spin_unlock(&sinfo->lock);
7458 }
7459
7460 /*
7461  * checks to see if its even possible to relocate this block group.
7462  *
7463  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
7464  * ok to go ahead and try.
7465  */
7466 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7467 {
7468         struct btrfs_block_group_cache *block_group;
7469         struct btrfs_space_info *space_info;
7470         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7471         struct btrfs_device *device;
7472         u64 min_free;
7473         u64 dev_min = 1;
7474         u64 dev_nr = 0;
7475         u64 target;
7476         int index;
7477         int full = 0;
7478         int ret = 0;
7479
7480         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7481
7482         /* odd, couldn't find the block group, leave it alone */
7483         if (!block_group)
7484                 return -1;
7485
7486         min_free = btrfs_block_group_used(&block_group->item);
7487
7488         /* no bytes used, we're good */
7489         if (!min_free)
7490                 goto out;
7491
7492         space_info = block_group->space_info;
7493         spin_lock(&space_info->lock);
7494
7495         full = space_info->full;
7496
7497         /*
7498          * if this is the last block group we have in this space, we can't
7499          * relocate it unless we're able to allocate a new chunk below.
7500          *
7501          * Otherwise, we need to make sure we have room in the space to handle
7502          * all of the extents from this block group.  If we can, we're good
7503          */
7504         if ((space_info->total_bytes != block_group->key.offset) &&
7505             (space_info->bytes_used + space_info->bytes_reserved +
7506              space_info->bytes_pinned + space_info->bytes_readonly +
7507              min_free < space_info->total_bytes)) {
7508                 spin_unlock(&space_info->lock);
7509                 goto out;
7510         }
7511         spin_unlock(&space_info->lock);
7512
7513         /*
7514          * ok we don't have enough space, but maybe we have free space on our
7515          * devices to allocate new chunks for relocation, so loop through our
7516          * alloc devices and guess if we have enough space.  if this block
7517          * group is going to be restriped, run checks against the target
7518          * profile instead of the current one.
7519          */
7520         ret = -1;
7521
7522         /*
7523          * index:
7524          *      0: raid10
7525          *      1: raid1
7526          *      2: dup
7527          *      3: raid0
7528          *      4: single
7529          */
7530         target = get_restripe_target(root->fs_info, block_group->flags);
7531         if (target) {
7532                 index = __get_raid_index(extended_to_chunk(target));
7533         } else {
7534                 /*
7535                  * this is just a balance, so if we were marked as full
7536                  * we know there is no space for a new chunk
7537                  */
7538                 if (full)
7539                         goto out;
7540
7541                 index = get_block_group_index(block_group);
7542         }
7543
7544         if (index == BTRFS_RAID_RAID10) {
7545                 dev_min = 4;
7546                 /* Divide by 2 */
7547                 min_free >>= 1;
7548         } else if (index == BTRFS_RAID_RAID1) {
7549                 dev_min = 2;
7550         } else if (index == BTRFS_RAID_DUP) {
7551                 /* Multiply by 2 */
7552                 min_free <<= 1;
7553         } else if (index == BTRFS_RAID_RAID0) {
7554                 dev_min = fs_devices->rw_devices;
7555                 do_div(min_free, dev_min);
7556         }
7557
7558         mutex_lock(&root->fs_info->chunk_mutex);
7559         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7560                 u64 dev_offset;
7561
7562                 /*
7563                  * check to make sure we can actually find a chunk with enough
7564                  * space to fit our block group in.
7565                  */
7566                 if (device->total_bytes > device->bytes_used + min_free &&
7567                     !device->is_tgtdev_for_dev_replace) {
7568                         ret = find_free_dev_extent(device, min_free,
7569                                                    &dev_offset, NULL);
7570                         if (!ret)
7571                                 dev_nr++;
7572
7573                         if (dev_nr >= dev_min)
7574                                 break;
7575
7576                         ret = -1;
7577                 }
7578         }
7579         mutex_unlock(&root->fs_info->chunk_mutex);
7580 out:
7581         btrfs_put_block_group(block_group);
7582         return ret;
7583 }
7584
7585 static int find_first_block_group(struct btrfs_root *root,
7586                 struct btrfs_path *path, struct btrfs_key *key)
7587 {
7588         int ret = 0;
7589         struct btrfs_key found_key;
7590         struct extent_buffer *leaf;
7591         int slot;
7592
7593         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7594         if (ret < 0)
7595                 goto out;
7596
7597         while (1) {
7598                 slot = path->slots[0];
7599                 leaf = path->nodes[0];
7600                 if (slot >= btrfs_header_nritems(leaf)) {
7601                         ret = btrfs_next_leaf(root, path);
7602                         if (ret == 0)
7603                                 continue;
7604                         if (ret < 0)
7605                                 goto out;
7606                         break;
7607                 }
7608                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7609
7610                 if (found_key.objectid >= key->objectid &&
7611                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7612                         ret = 0;
7613                         goto out;
7614                 }
7615                 path->slots[0]++;
7616         }
7617 out:
7618         return ret;
7619 }
7620
7621 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
7622 {
7623         struct btrfs_block_group_cache *block_group;
7624         u64 last = 0;
7625
7626         while (1) {
7627                 struct inode *inode;
7628
7629                 block_group = btrfs_lookup_first_block_group(info, last);
7630                 while (block_group) {
7631                         spin_lock(&block_group->lock);
7632                         if (block_group->iref)
7633                                 break;
7634                         spin_unlock(&block_group->lock);
7635                         block_group = next_block_group(info->tree_root,
7636                                                        block_group);
7637                 }
7638                 if (!block_group) {
7639                         if (last == 0)
7640                                 break;
7641                         last = 0;
7642                         continue;
7643                 }
7644
7645                 inode = block_group->inode;
7646                 block_group->iref = 0;
7647                 block_group->inode = NULL;
7648                 spin_unlock(&block_group->lock);
7649                 iput(inode);
7650                 last = block_group->key.objectid + block_group->key.offset;
7651                 btrfs_put_block_group(block_group);
7652         }
7653 }
7654
7655 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7656 {
7657         struct btrfs_block_group_cache *block_group;
7658         struct btrfs_space_info *space_info;
7659         struct btrfs_caching_control *caching_ctl;
7660         struct rb_node *n;
7661
7662         down_write(&info->extent_commit_sem);
7663         while (!list_empty(&info->caching_block_groups)) {
7664                 caching_ctl = list_entry(info->caching_block_groups.next,
7665                                          struct btrfs_caching_control, list);
7666                 list_del(&caching_ctl->list);
7667                 put_caching_control(caching_ctl);
7668         }
7669         up_write(&info->extent_commit_sem);
7670
7671         spin_lock(&info->block_group_cache_lock);
7672         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7673                 block_group = rb_entry(n, struct btrfs_block_group_cache,
7674                                        cache_node);
7675                 rb_erase(&block_group->cache_node,
7676                          &info->block_group_cache_tree);
7677                 spin_unlock(&info->block_group_cache_lock);
7678
7679                 down_write(&block_group->space_info->groups_sem);
7680                 list_del(&block_group->list);
7681                 up_write(&block_group->space_info->groups_sem);
7682
7683                 if (block_group->cached == BTRFS_CACHE_STARTED)
7684                         wait_block_group_cache_done(block_group);
7685
7686                 /*
7687                  * We haven't cached this block group, which means we could
7688                  * possibly have excluded extents on this block group.
7689                  */
7690                 if (block_group->cached == BTRFS_CACHE_NO)
7691                         free_excluded_extents(info->extent_root, block_group);
7692
7693                 btrfs_remove_free_space_cache(block_group);
7694                 btrfs_put_block_group(block_group);
7695
7696                 spin_lock(&info->block_group_cache_lock);
7697         }
7698         spin_unlock(&info->block_group_cache_lock);
7699
7700         /* now that all the block groups are freed, go through and
7701          * free all the space_info structs.  This is only called during
7702          * the final stages of unmount, and so we know nobody is
7703          * using them.  We call synchronize_rcu() once before we start,
7704          * just to be on the safe side.
7705          */
7706         synchronize_rcu();
7707
7708         release_global_block_rsv(info);
7709
7710         while(!list_empty(&info->space_info)) {
7711                 space_info = list_entry(info->space_info.next,
7712                                         struct btrfs_space_info,
7713                                         list);
7714                 if (space_info->bytes_pinned > 0 ||
7715                     space_info->bytes_reserved > 0 ||
7716                     space_info->bytes_may_use > 0) {
7717                         WARN_ON(1);
7718                         dump_space_info(space_info, 0, 0);
7719                 }
7720                 list_del(&space_info->list);
7721                 kfree(space_info);
7722         }
7723         return 0;
7724 }
7725
7726 static void __link_block_group(struct btrfs_space_info *space_info,
7727                                struct btrfs_block_group_cache *cache)
7728 {
7729         int index = get_block_group_index(cache);
7730
7731         down_write(&space_info->groups_sem);
7732         list_add_tail(&cache->list, &space_info->block_groups[index]);
7733         up_write(&space_info->groups_sem);
7734 }
7735
7736 int btrfs_read_block_groups(struct btrfs_root *root)
7737 {
7738         struct btrfs_path *path;
7739         int ret;
7740         struct btrfs_block_group_cache *cache;
7741         struct btrfs_fs_info *info = root->fs_info;
7742         struct btrfs_space_info *space_info;
7743         struct btrfs_key key;
7744         struct btrfs_key found_key;
7745         struct extent_buffer *leaf;
7746         int need_clear = 0;
7747         u64 cache_gen;
7748
7749         root = info->extent_root;
7750         key.objectid = 0;
7751         key.offset = 0;
7752         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7753         path = btrfs_alloc_path();
7754         if (!path)
7755                 return -ENOMEM;
7756         path->reada = 1;
7757
7758         cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7759         if (btrfs_test_opt(root, SPACE_CACHE) &&
7760             btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7761                 need_clear = 1;
7762         if (btrfs_test_opt(root, CLEAR_CACHE))
7763                 need_clear = 1;
7764
7765         while (1) {
7766                 ret = find_first_block_group(root, path, &key);
7767                 if (ret > 0)
7768                         break;
7769                 if (ret != 0)
7770                         goto error;
7771                 leaf = path->nodes[0];
7772                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7773                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7774                 if (!cache) {
7775                         ret = -ENOMEM;
7776                         goto error;
7777                 }
7778                 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7779                                                 GFP_NOFS);
7780                 if (!cache->free_space_ctl) {
7781                         kfree(cache);
7782                         ret = -ENOMEM;
7783                         goto error;
7784                 }
7785
7786                 atomic_set(&cache->count, 1);
7787                 spin_lock_init(&cache->lock);
7788                 cache->fs_info = info;
7789                 INIT_LIST_HEAD(&cache->list);
7790                 INIT_LIST_HEAD(&cache->cluster_list);
7791
7792                 if (need_clear) {
7793                         /*
7794                          * When we mount with old space cache, we need to
7795                          * set BTRFS_DC_CLEAR and set dirty flag.
7796                          *
7797                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
7798                          *    truncate the old free space cache inode and
7799                          *    setup a new one.
7800                          * b) Setting 'dirty flag' makes sure that we flush
7801                          *    the new space cache info onto disk.
7802                          */
7803                         cache->disk_cache_state = BTRFS_DC_CLEAR;
7804                         if (btrfs_test_opt(root, SPACE_CACHE))
7805                                 cache->dirty = 1;
7806                 }
7807
7808                 read_extent_buffer(leaf, &cache->item,
7809                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
7810                                    sizeof(cache->item));
7811                 memcpy(&cache->key, &found_key, sizeof(found_key));
7812
7813                 key.objectid = found_key.objectid + found_key.offset;
7814                 btrfs_release_path(path);
7815                 cache->flags = btrfs_block_group_flags(&cache->item);
7816                 cache->sectorsize = root->sectorsize;
7817
7818                 btrfs_init_free_space_ctl(cache);
7819
7820                 /*
7821                  * We need to exclude the super stripes now so that the space
7822                  * info has super bytes accounted for, otherwise we'll think
7823                  * we have more space than we actually do.
7824                  */
7825                 exclude_super_stripes(root, cache);
7826
7827                 /*
7828                  * check for two cases, either we are full, and therefore
7829                  * don't need to bother with the caching work since we won't
7830                  * find any space, or we are empty, and we can just add all
7831                  * the space in and be done with it.  This saves us _alot_ of
7832                  * time, particularly in the full case.
7833                  */
7834                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7835                         cache->last_byte_to_unpin = (u64)-1;
7836                         cache->cached = BTRFS_CACHE_FINISHED;
7837                         free_excluded_extents(root, cache);
7838                 } else if (btrfs_block_group_used(&cache->item) == 0) {
7839                         cache->last_byte_to_unpin = (u64)-1;
7840                         cache->cached = BTRFS_CACHE_FINISHED;
7841                         add_new_free_space(cache, root->fs_info,
7842                                            found_key.objectid,
7843                                            found_key.objectid +
7844                                            found_key.offset);
7845                         free_excluded_extents(root, cache);
7846                 }
7847
7848                 ret = update_space_info(info, cache->flags, found_key.offset,
7849                                         btrfs_block_group_used(&cache->item),
7850                                         &space_info);
7851                 BUG_ON(ret); /* -ENOMEM */
7852                 cache->space_info = space_info;
7853                 spin_lock(&cache->space_info->lock);
7854                 cache->space_info->bytes_readonly += cache->bytes_super;
7855                 spin_unlock(&cache->space_info->lock);
7856
7857                 __link_block_group(space_info, cache);
7858
7859                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7860                 BUG_ON(ret); /* Logic error */
7861
7862                 set_avail_alloc_bits(root->fs_info, cache->flags);
7863                 if (btrfs_chunk_readonly(root, cache->key.objectid))
7864                         set_block_group_ro(cache, 1);
7865         }
7866
7867         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7868                 if (!(get_alloc_profile(root, space_info->flags) &
7869                       (BTRFS_BLOCK_GROUP_RAID10 |
7870                        BTRFS_BLOCK_GROUP_RAID1 |
7871                        BTRFS_BLOCK_GROUP_DUP)))
7872                         continue;
7873                 /*
7874                  * avoid allocating from un-mirrored block group if there are
7875                  * mirrored block groups.
7876                  */
7877                 list_for_each_entry(cache, &space_info->block_groups[3], list)
7878                         set_block_group_ro(cache, 1);
7879                 list_for_each_entry(cache, &space_info->block_groups[4], list)
7880                         set_block_group_ro(cache, 1);
7881         }
7882
7883         init_global_block_rsv(info);
7884         ret = 0;
7885 error:
7886         btrfs_free_path(path);
7887         return ret;
7888 }
7889
7890 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
7891                                        struct btrfs_root *root)
7892 {
7893         struct btrfs_block_group_cache *block_group, *tmp;
7894         struct btrfs_root *extent_root = root->fs_info->extent_root;
7895         struct btrfs_block_group_item item;
7896         struct btrfs_key key;
7897         int ret = 0;
7898
7899         list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
7900                                  new_bg_list) {
7901                 list_del_init(&block_group->new_bg_list);
7902
7903                 if (ret)
7904                         continue;
7905
7906                 spin_lock(&block_group->lock);
7907                 memcpy(&item, &block_group->item, sizeof(item));
7908                 memcpy(&key, &block_group->key, sizeof(key));
7909                 spin_unlock(&block_group->lock);
7910
7911                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
7912                                         sizeof(item));
7913                 if (ret)
7914                         btrfs_abort_transaction(trans, extent_root, ret);
7915         }
7916 }
7917
7918 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7919                            struct btrfs_root *root, u64 bytes_used,
7920                            u64 type, u64 chunk_objectid, u64 chunk_offset,
7921                            u64 size)
7922 {
7923         int ret;
7924         struct btrfs_root *extent_root;
7925         struct btrfs_block_group_cache *cache;
7926
7927         extent_root = root->fs_info->extent_root;
7928
7929         root->fs_info->last_trans_log_full_commit = trans->transid;
7930
7931         cache = kzalloc(sizeof(*cache), GFP_NOFS);
7932         if (!cache)
7933                 return -ENOMEM;
7934         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7935                                         GFP_NOFS);
7936         if (!cache->free_space_ctl) {
7937                 kfree(cache);
7938                 return -ENOMEM;
7939         }
7940
7941         cache->key.objectid = chunk_offset;
7942         cache->key.offset = size;
7943         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7944         cache->sectorsize = root->sectorsize;
7945         cache->fs_info = root->fs_info;
7946
7947         atomic_set(&cache->count, 1);
7948         spin_lock_init(&cache->lock);
7949         INIT_LIST_HEAD(&cache->list);
7950         INIT_LIST_HEAD(&cache->cluster_list);
7951         INIT_LIST_HEAD(&cache->new_bg_list);
7952
7953         btrfs_init_free_space_ctl(cache);
7954
7955         btrfs_set_block_group_used(&cache->item, bytes_used);
7956         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7957         cache->flags = type;
7958         btrfs_set_block_group_flags(&cache->item, type);
7959
7960         cache->last_byte_to_unpin = (u64)-1;
7961         cache->cached = BTRFS_CACHE_FINISHED;
7962         exclude_super_stripes(root, cache);
7963
7964         add_new_free_space(cache, root->fs_info, chunk_offset,
7965                            chunk_offset + size);
7966
7967         free_excluded_extents(root, cache);
7968
7969         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7970                                 &cache->space_info);
7971         BUG_ON(ret); /* -ENOMEM */
7972         update_global_block_rsv(root->fs_info);
7973
7974         spin_lock(&cache->space_info->lock);
7975         cache->space_info->bytes_readonly += cache->bytes_super;
7976         spin_unlock(&cache->space_info->lock);
7977
7978         __link_block_group(cache->space_info, cache);
7979
7980         ret = btrfs_add_block_group_cache(root->fs_info, cache);
7981         BUG_ON(ret); /* Logic error */
7982
7983         list_add_tail(&cache->new_bg_list, &trans->new_bgs);
7984
7985         set_avail_alloc_bits(extent_root->fs_info, type);
7986
7987         return 0;
7988 }
7989
7990 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7991 {
7992         u64 extra_flags = chunk_to_extended(flags) &
7993                                 BTRFS_EXTENDED_PROFILE_MASK;
7994
7995         write_seqlock(&fs_info->profiles_lock);
7996         if (flags & BTRFS_BLOCK_GROUP_DATA)
7997                 fs_info->avail_data_alloc_bits &= ~extra_flags;
7998         if (flags & BTRFS_BLOCK_GROUP_METADATA)
7999                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
8000         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
8001                 fs_info->avail_system_alloc_bits &= ~extra_flags;
8002         write_sequnlock(&fs_info->profiles_lock);
8003 }
8004
8005 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8006                              struct btrfs_root *root, u64 group_start)
8007 {
8008         struct btrfs_path *path;
8009         struct btrfs_block_group_cache *block_group;
8010         struct btrfs_free_cluster *cluster;
8011         struct btrfs_root *tree_root = root->fs_info->tree_root;
8012         struct btrfs_key key;
8013         struct inode *inode;
8014         int ret;
8015         int index;
8016         int factor;
8017
8018         root = root->fs_info->extent_root;
8019
8020         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
8021         BUG_ON(!block_group);
8022         BUG_ON(!block_group->ro);
8023
8024         /*
8025          * Free the reserved super bytes from this block group before
8026          * remove it.
8027          */
8028         free_excluded_extents(root, block_group);
8029
8030         memcpy(&key, &block_group->key, sizeof(key));
8031         index = get_block_group_index(block_group);
8032         if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8033                                   BTRFS_BLOCK_GROUP_RAID1 |
8034                                   BTRFS_BLOCK_GROUP_RAID10))
8035                 factor = 2;
8036         else
8037                 factor = 1;
8038
8039         /* make sure this block group isn't part of an allocation cluster */
8040         cluster = &root->fs_info->data_alloc_cluster;
8041         spin_lock(&cluster->refill_lock);
8042         btrfs_return_cluster_to_free_space(block_group, cluster);
8043         spin_unlock(&cluster->refill_lock);
8044
8045         /*
8046          * make sure this block group isn't part of a metadata
8047          * allocation cluster
8048          */
8049         cluster = &root->fs_info->meta_alloc_cluster;
8050         spin_lock(&cluster->refill_lock);
8051         btrfs_return_cluster_to_free_space(block_group, cluster);
8052         spin_unlock(&cluster->refill_lock);
8053
8054         path = btrfs_alloc_path();
8055         if (!path) {
8056                 ret = -ENOMEM;
8057                 goto out;
8058         }
8059
8060         inode = lookup_free_space_inode(tree_root, block_group, path);
8061         if (!IS_ERR(inode)) {
8062                 ret = btrfs_orphan_add(trans, inode);
8063                 if (ret) {
8064                         btrfs_add_delayed_iput(inode);
8065                         goto out;
8066                 }
8067                 clear_nlink(inode);
8068                 /* One for the block groups ref */
8069                 spin_lock(&block_group->lock);
8070                 if (block_group->iref) {
8071                         block_group->iref = 0;
8072                         block_group->inode = NULL;
8073                         spin_unlock(&block_group->lock);
8074                         iput(inode);
8075                 } else {
8076                         spin_unlock(&block_group->lock);
8077                 }
8078                 /* One for our lookup ref */
8079                 btrfs_add_delayed_iput(inode);
8080         }
8081
8082         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
8083         key.offset = block_group->key.objectid;
8084         key.type = 0;
8085
8086         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
8087         if (ret < 0)
8088                 goto out;
8089         if (ret > 0)
8090                 btrfs_release_path(path);
8091         if (ret == 0) {
8092                 ret = btrfs_del_item(trans, tree_root, path);
8093                 if (ret)
8094                         goto out;
8095                 btrfs_release_path(path);
8096         }
8097
8098         spin_lock(&root->fs_info->block_group_cache_lock);
8099         rb_erase(&block_group->cache_node,
8100                  &root->fs_info->block_group_cache_tree);
8101
8102         if (root->fs_info->first_logical_byte == block_group->key.objectid)
8103                 root->fs_info->first_logical_byte = (u64)-1;
8104         spin_unlock(&root->fs_info->block_group_cache_lock);
8105
8106         down_write(&block_group->space_info->groups_sem);
8107         /*
8108          * we must use list_del_init so people can check to see if they
8109          * are still on the list after taking the semaphore
8110          */
8111         list_del_init(&block_group->list);
8112         if (list_empty(&block_group->space_info->block_groups[index]))
8113                 clear_avail_alloc_bits(root->fs_info, block_group->flags);
8114         up_write(&block_group->space_info->groups_sem);
8115
8116         if (block_group->cached == BTRFS_CACHE_STARTED)
8117                 wait_block_group_cache_done(block_group);
8118
8119         btrfs_remove_free_space_cache(block_group);
8120
8121         spin_lock(&block_group->space_info->lock);
8122         block_group->space_info->total_bytes -= block_group->key.offset;
8123         block_group->space_info->bytes_readonly -= block_group->key.offset;
8124         block_group->space_info->disk_total -= block_group->key.offset * factor;
8125         spin_unlock(&block_group->space_info->lock);
8126
8127         memcpy(&key, &block_group->key, sizeof(key));
8128
8129         btrfs_clear_space_info_full(root->fs_info);
8130
8131         btrfs_put_block_group(block_group);
8132         btrfs_put_block_group(block_group);
8133
8134         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8135         if (ret > 0)
8136                 ret = -EIO;
8137         if (ret < 0)
8138                 goto out;
8139
8140         ret = btrfs_del_item(trans, root, path);
8141 out:
8142         btrfs_free_path(path);
8143         return ret;
8144 }
8145
8146 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8147 {
8148         struct btrfs_space_info *space_info;
8149         struct btrfs_super_block *disk_super;
8150         u64 features;
8151         u64 flags;
8152         int mixed = 0;
8153         int ret;
8154
8155         disk_super = fs_info->super_copy;
8156         if (!btrfs_super_root(disk_super))
8157                 return 1;
8158
8159         features = btrfs_super_incompat_flags(disk_super);
8160         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
8161                 mixed = 1;
8162
8163         flags = BTRFS_BLOCK_GROUP_SYSTEM;
8164         ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8165         if (ret)
8166                 goto out;
8167
8168         if (mixed) {
8169                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
8170                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8171         } else {
8172                 flags = BTRFS_BLOCK_GROUP_METADATA;
8173                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8174                 if (ret)
8175                         goto out;
8176
8177                 flags = BTRFS_BLOCK_GROUP_DATA;
8178                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8179         }
8180 out:
8181         return ret;
8182 }
8183
8184 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8185 {
8186         return unpin_extent_range(root, start, end);
8187 }
8188
8189 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8190                                u64 num_bytes, u64 *actual_bytes)
8191 {
8192         return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8193 }
8194
8195 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8196 {
8197         struct btrfs_fs_info *fs_info = root->fs_info;
8198         struct btrfs_block_group_cache *cache = NULL;
8199         u64 group_trimmed;
8200         u64 start;
8201         u64 end;
8202         u64 trimmed = 0;
8203         u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
8204         int ret = 0;
8205
8206         /*
8207          * try to trim all FS space, our block group may start from non-zero.
8208          */
8209         if (range->len == total_bytes)
8210                 cache = btrfs_lookup_first_block_group(fs_info, range->start);
8211         else
8212                 cache = btrfs_lookup_block_group(fs_info, range->start);
8213
8214         while (cache) {
8215                 if (cache->key.objectid >= (range->start + range->len)) {
8216                         btrfs_put_block_group(cache);
8217                         break;
8218                 }
8219
8220                 start = max(range->start, cache->key.objectid);
8221                 end = min(range->start + range->len,
8222                                 cache->key.objectid + cache->key.offset);
8223
8224                 if (end - start >= range->minlen) {
8225                         if (!block_group_cache_done(cache)) {
8226                                 ret = cache_block_group(cache, 0);
8227                                 if (!ret)
8228                                         wait_block_group_cache_done(cache);
8229                         }
8230                         ret = btrfs_trim_block_group(cache,
8231                                                      &group_trimmed,
8232                                                      start,
8233                                                      end,
8234                                                      range->minlen);
8235
8236                         trimmed += group_trimmed;
8237                         if (ret) {
8238                                 btrfs_put_block_group(cache);
8239                                 break;
8240                         }
8241                 }
8242
8243                 cache = next_block_group(fs_info->tree_root, cache);
8244         }
8245
8246         range->len = trimmed;
8247         return ret;
8248 }