Btrfs: wire up the free space tree to the extent tree
fs/btrfs/extent-tree.c
1 /*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include <linux/percpu_counter.h>
28 #include "hash.h"
29 #include "tree-log.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "volumes.h"
33 #include "raid56.h"
34 #include "locking.h"
35 #include "free-space-cache.h"
36 #include "free-space-tree.h"
37 #include "math.h"
38 #include "sysfs.h"
39 #include "qgroup.h"
40
41 #undef SCRAMBLE_DELAYED_REFS
42
43 /*
44 * control flags for do_chunk_alloc's force field
45 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
46 * if we really need one.
47 *
48 * CHUNK_ALLOC_LIMITED means to only try and allocate one
49 * if we have very few chunks already allocated. This is
50 * used as part of the clustering code to help make sure
51 * we have a good pool of storage to cluster in, without
52 * filling the FS with empty chunks
53 *
54 * CHUNK_ALLOC_FORCE means it must try to allocate one
55 *
56 */
57 enum {
58 CHUNK_ALLOC_NO_FORCE = 0,
59 CHUNK_ALLOC_LIMITED = 1,
60 CHUNK_ALLOC_FORCE = 2,
61 };
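/*
 * Illustrative sketch (added; the call site below is hypothetical): a path
 * that knows an allocation is about to happen can pre-allocate a chunk with
 *
 *	ret = do_chunk_alloc(trans, extent_root, alloc_target,
 *			     CHUNK_ALLOC_FORCE);
 *
 * while normal allocation paths pass CHUNK_ALLOC_NO_FORCE (or
 * CHUNK_ALLOC_LIMITED from the clustering code) so that a new chunk is only
 * created when it is really needed. The arguments match the forward
 * declaration of do_chunk_alloc() later in this file; alloc_target stands in
 * for the block group type flags being allocated.
 */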
62
63 /*
64 * Control how reservations are dealt with.
65 *
66 * RESERVE_FREE - freeing a reservation.
67 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
68 * ENOSPC accounting
69 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
70 * bytes_may_use as the ENOSPC accounting is done elsewhere
71 */
72 enum {
73 RESERVE_FREE = 0,
74 RESERVE_ALLOC = 1,
75 RESERVE_ALLOC_NO_ACCOUNT = 2,
76 };
77
78 static int update_block_group(struct btrfs_trans_handle *trans,
79 struct btrfs_root *root, u64 bytenr,
80 u64 num_bytes, int alloc);
81 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
82 struct btrfs_root *root,
83 struct btrfs_delayed_ref_node *node, u64 parent,
84 u64 root_objectid, u64 owner_objectid,
85 u64 owner_offset, int refs_to_drop,
86 struct btrfs_delayed_extent_op *extra_op);
87 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
88 struct extent_buffer *leaf,
89 struct btrfs_extent_item *ei);
90 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
91 struct btrfs_root *root,
92 u64 parent, u64 root_objectid,
93 u64 flags, u64 owner, u64 offset,
94 struct btrfs_key *ins, int ref_mod);
95 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
96 struct btrfs_root *root,
97 u64 parent, u64 root_objectid,
98 u64 flags, struct btrfs_disk_key *key,
99 int level, struct btrfs_key *ins,
100 int no_quota);
101 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
102 struct btrfs_root *extent_root, u64 flags,
103 int force);
104 static int find_next_key(struct btrfs_path *path, int level,
105 struct btrfs_key *key);
106 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
107 int dump_block_groups);
108 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
109 u64 num_bytes, int reserve,
110 int delalloc);
111 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
112 u64 num_bytes);
113 int btrfs_pin_extent(struct btrfs_root *root,
114 u64 bytenr, u64 num_bytes, int reserved);
115
116 static noinline int
117 block_group_cache_done(struct btrfs_block_group_cache *cache)
118 {
119 smp_mb();
120 return cache->cached == BTRFS_CACHE_FINISHED ||
121 cache->cached == BTRFS_CACHE_ERROR;
122 }
123
124 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
125 {
126 return (cache->flags & bits) == bits;
127 }
128
129 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
130 {
131 atomic_inc(&cache->count);
132 }
133
134 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
135 {
136 if (atomic_dec_and_test(&cache->count)) {
137 WARN_ON(cache->pinned > 0);
138 WARN_ON(cache->reserved > 0);
139 kfree(cache->free_space_ctl);
140 kfree(cache);
141 }
142 }
143
144 /*
145 * this adds the block group to the fs_info rb tree for the block group
146 * cache
147 */
148 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
149 struct btrfs_block_group_cache *block_group)
150 {
151 struct rb_node **p;
152 struct rb_node *parent = NULL;
153 struct btrfs_block_group_cache *cache;
154
155 spin_lock(&info->block_group_cache_lock);
156 p = &info->block_group_cache_tree.rb_node;
157
158 while (*p) {
159 parent = *p;
160 cache = rb_entry(parent, struct btrfs_block_group_cache,
161 cache_node);
162 if (block_group->key.objectid < cache->key.objectid) {
163 p = &(*p)->rb_left;
164 } else if (block_group->key.objectid > cache->key.objectid) {
165 p = &(*p)->rb_right;
166 } else {
167 spin_unlock(&info->block_group_cache_lock);
168 return -EEXIST;
169 }
170 }
171
172 rb_link_node(&block_group->cache_node, parent, p);
173 rb_insert_color(&block_group->cache_node,
174 &info->block_group_cache_tree);
175
176 if (info->first_logical_byte > block_group->key.objectid)
177 info->first_logical_byte = block_group->key.objectid;
178
179 spin_unlock(&info->block_group_cache_lock);
180
181 return 0;
182 }
183
184 /*
185 * This will return the block group at or after bytenr if contains is 0, else
186 * it will return the block group that contains the bytenr
187 */
188 static struct btrfs_block_group_cache *
189 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
190 int contains)
191 {
192 struct btrfs_block_group_cache *cache, *ret = NULL;
193 struct rb_node *n;
194 u64 end, start;
195
196 spin_lock(&info->block_group_cache_lock);
197 n = info->block_group_cache_tree.rb_node;
198
199 while (n) {
200 cache = rb_entry(n, struct btrfs_block_group_cache,
201 cache_node);
202 end = cache->key.objectid + cache->key.offset - 1;
203 start = cache->key.objectid;
204
205 if (bytenr < start) {
206 if (!contains && (!ret || start < ret->key.objectid))
207 ret = cache;
208 n = n->rb_left;
209 } else if (bytenr > start) {
210 if (contains && bytenr <= end) {
211 ret = cache;
212 break;
213 }
214 n = n->rb_right;
215 } else {
216 ret = cache;
217 break;
218 }
219 }
220 if (ret) {
221 btrfs_get_block_group(ret);
222 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
223 info->first_logical_byte = ret->key.objectid;
224 }
225 spin_unlock(&info->block_group_cache_lock);
226
227 return ret;
228 }
229
230 static int add_excluded_extent(struct btrfs_root *root,
231 u64 start, u64 num_bytes)
232 {
233 u64 end = start + num_bytes - 1;
234 set_extent_bits(&root->fs_info->freed_extents[0],
235 start, end, EXTENT_UPTODATE, GFP_NOFS);
236 set_extent_bits(&root->fs_info->freed_extents[1],
237 start, end, EXTENT_UPTODATE, GFP_NOFS);
238 return 0;
239 }
240
241 static void free_excluded_extents(struct btrfs_root *root,
242 struct btrfs_block_group_cache *cache)
243 {
244 u64 start, end;
245
246 start = cache->key.objectid;
247 end = start + cache->key.offset - 1;
248
249 clear_extent_bits(&root->fs_info->freed_extents[0],
250 start, end, EXTENT_UPTODATE, GFP_NOFS);
251 clear_extent_bits(&root->fs_info->freed_extents[1],
252 start, end, EXTENT_UPTODATE, GFP_NOFS);
253 }
254
255 static int exclude_super_stripes(struct btrfs_root *root,
256 struct btrfs_block_group_cache *cache)
257 {
258 u64 bytenr;
259 u64 *logical;
260 int stripe_len;
261 int i, nr, ret;
262
263 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
264 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
265 cache->bytes_super += stripe_len;
266 ret = add_excluded_extent(root, cache->key.objectid,
267 stripe_len);
268 if (ret)
269 return ret;
270 }
271
272 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
273 bytenr = btrfs_sb_offset(i);
274 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
275 cache->key.objectid, bytenr,
276 0, &logical, &nr, &stripe_len);
277 if (ret)
278 return ret;
279
280 while (nr--) {
281 u64 start, len;
282
283 if (logical[nr] > cache->key.objectid +
284 cache->key.offset)
285 continue;
286
287 if (logical[nr] + stripe_len <= cache->key.objectid)
288 continue;
289
290 start = logical[nr];
291 if (start < cache->key.objectid) {
292 start = cache->key.objectid;
293 len = (logical[nr] + stripe_len) - start;
294 } else {
295 len = min_t(u64, stripe_len,
296 cache->key.objectid +
297 cache->key.offset - start);
298 }
299
300 cache->bytes_super += len;
301 ret = add_excluded_extent(root, start, len);
302 if (ret) {
303 kfree(logical);
304 return ret;
305 }
306 }
307
308 kfree(logical);
309 }
310 return 0;
311 }
312
313 static struct btrfs_caching_control *
314 get_caching_control(struct btrfs_block_group_cache *cache)
315 {
316 struct btrfs_caching_control *ctl;
317
318 spin_lock(&cache->lock);
319 if (!cache->caching_ctl) {
320 spin_unlock(&cache->lock);
321 return NULL;
322 }
323
324 ctl = cache->caching_ctl;
325 atomic_inc(&ctl->count);
326 spin_unlock(&cache->lock);
327 return ctl;
328 }
329
330 static void put_caching_control(struct btrfs_caching_control *ctl)
331 {
332 if (atomic_dec_and_test(&ctl->count))
333 kfree(ctl);
334 }
335
336 /*
337 * this is called while caching a block group; since we could have freed extents,
338 * we need to check the pinned_extents for any extents that can't be used yet,
339 * since their free space will be released as soon as the transaction commits.
340 */
341 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
342 struct btrfs_fs_info *info, u64 start, u64 end)
343 {
344 u64 extent_start, extent_end, size, total_added = 0;
345 int ret;
346
347 while (start < end) {
348 ret = find_first_extent_bit(info->pinned_extents, start,
349 &extent_start, &extent_end,
350 EXTENT_DIRTY | EXTENT_UPTODATE,
351 NULL);
352 if (ret)
353 break;
354
355 if (extent_start <= start) {
356 start = extent_end + 1;
357 } else if (extent_start > start && extent_start < end) {
358 size = extent_start - start;
359 total_added += size;
360 ret = btrfs_add_free_space(block_group, start,
361 size);
362 BUG_ON(ret); /* -ENOMEM or logic error */
363 start = extent_end + 1;
364 } else {
365 break;
366 }
367 }
368
369 if (start < end) {
370 size = end - start;
371 total_added += size;
372 ret = btrfs_add_free_space(block_group, start, size);
373 BUG_ON(ret); /* -ENOMEM or logic error */
374 }
375
376 return total_added;
377 }
378
379 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
380 {
381 struct btrfs_block_group_cache *block_group;
382 struct btrfs_fs_info *fs_info;
383 struct btrfs_root *extent_root;
384 struct btrfs_path *path;
385 struct extent_buffer *leaf;
386 struct btrfs_key key;
387 u64 total_found = 0;
388 u64 last = 0;
389 u32 nritems;
390 int ret;
391
392 block_group = caching_ctl->block_group;
393 fs_info = block_group->fs_info;
394 extent_root = fs_info->extent_root;
395
396 path = btrfs_alloc_path();
397 if (!path)
398 return -ENOMEM;
399
400 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
401
402 /*
403 * We don't want to deadlock with somebody trying to allocate a new
404 * extent for the extent root while also trying to search the extent
405 * root to add free space. So we skip locking and search the commit
406 * root, since its read-only
407 */
408 path->skip_locking = 1;
409 path->search_commit_root = 1;
410 path->reada = 1;
411
412 key.objectid = last;
413 key.offset = 0;
414 key.type = BTRFS_EXTENT_ITEM_KEY;
415
416 next:
417 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
418 if (ret < 0)
419 goto out;
420
421 leaf = path->nodes[0];
422 nritems = btrfs_header_nritems(leaf);
423
424 while (1) {
425 if (btrfs_fs_closing(fs_info) > 1) {
426 last = (u64)-1;
427 break;
428 }
429
430 if (path->slots[0] < nritems) {
431 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
432 } else {
433 ret = find_next_key(path, 0, &key);
434 if (ret)
435 break;
436
437 if (need_resched() ||
438 rwsem_is_contended(&fs_info->commit_root_sem)) {
439 caching_ctl->progress = last;
440 btrfs_release_path(path);
441 up_read(&fs_info->commit_root_sem);
442 mutex_unlock(&caching_ctl->mutex);
443 cond_resched();
444 mutex_lock(&caching_ctl->mutex);
445 down_read(&fs_info->commit_root_sem);
446 goto next;
447 }
448
449 ret = btrfs_next_leaf(extent_root, path);
450 if (ret < 0)
451 goto out;
452 if (ret)
453 break;
454 leaf = path->nodes[0];
455 nritems = btrfs_header_nritems(leaf);
456 continue;
457 }
458
459 if (key.objectid < last) {
460 key.objectid = last;
461 key.offset = 0;
462 key.type = BTRFS_EXTENT_ITEM_KEY;
463
464 caching_ctl->progress = last;
465 btrfs_release_path(path);
466 goto next;
467 }
468
469 if (key.objectid < block_group->key.objectid) {
470 path->slots[0]++;
471 continue;
472 }
473
474 if (key.objectid >= block_group->key.objectid +
475 block_group->key.offset)
476 break;
477
478 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
479 key.type == BTRFS_METADATA_ITEM_KEY) {
480 total_found += add_new_free_space(block_group,
481 fs_info, last,
482 key.objectid);
483 if (key.type == BTRFS_METADATA_ITEM_KEY)
484 last = key.objectid +
485 fs_info->tree_root->nodesize;
486 else
487 last = key.objectid + key.offset;
488
489 if (total_found > CACHING_CTL_WAKE_UP) {
490 total_found = 0;
491 wake_up(&caching_ctl->wait);
492 }
493 }
494 path->slots[0]++;
495 }
496 ret = 0;
497
498 total_found += add_new_free_space(block_group, fs_info, last,
499 block_group->key.objectid +
500 block_group->key.offset);
501 caching_ctl->progress = (u64)-1;
502
503 out:
504 btrfs_free_path(path);
505 return ret;
506 }
507
508 static noinline void caching_thread(struct btrfs_work *work)
509 {
510 struct btrfs_block_group_cache *block_group;
511 struct btrfs_fs_info *fs_info;
512 struct btrfs_caching_control *caching_ctl;
513 int ret;
514
515 caching_ctl = container_of(work, struct btrfs_caching_control, work);
516 block_group = caching_ctl->block_group;
517 fs_info = block_group->fs_info;
518
519 mutex_lock(&caching_ctl->mutex);
520 down_read(&fs_info->commit_root_sem);
521
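/*
 * Added note: this is where the free space tree gets wired into block
 * group caching. If the FREE_SPACE_TREE compat_ro bit is set, the
 * in-memory free space cache is rebuilt from the free space tree;
 * otherwise we fall back to scanning the extent tree via
 * load_extent_tree_free() above.
 */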
522 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
523 ret = load_free_space_tree(caching_ctl);
524 else
525 ret = load_extent_tree_free(caching_ctl);
526
527 spin_lock(&block_group->lock);
528 block_group->caching_ctl = NULL;
529 block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
530 spin_unlock(&block_group->lock);
531
532 up_read(&fs_info->commit_root_sem);
533 free_excluded_extents(fs_info->extent_root, block_group);
534 mutex_unlock(&caching_ctl->mutex);
535
536 wake_up(&caching_ctl->wait);
537
538 put_caching_control(caching_ctl);
539 btrfs_put_block_group(block_group);
540 }
541
542 static int cache_block_group(struct btrfs_block_group_cache *cache,
543 int load_cache_only)
544 {
545 DEFINE_WAIT(wait);
546 struct btrfs_fs_info *fs_info = cache->fs_info;
547 struct btrfs_caching_control *caching_ctl;
548 int ret = 0;
549
550 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
551 if (!caching_ctl)
552 return -ENOMEM;
553
554 INIT_LIST_HEAD(&caching_ctl->list);
555 mutex_init(&caching_ctl->mutex);
556 init_waitqueue_head(&caching_ctl->wait);
557 caching_ctl->block_group = cache;
558 caching_ctl->progress = cache->key.objectid;
559 atomic_set(&caching_ctl->count, 1);
560 btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
561 caching_thread, NULL, NULL);
562
563 spin_lock(&cache->lock);
564 /*
565 * This should be a rare occasion, but this could happen I think in the
566 * case where one thread starts to load the space cache info, and then
567 * some other thread starts a transaction commit which tries to do an
568 * allocation while the other thread is still loading the space cache
569 * info. The previous loop should have kept us from choosing this block
570 * group, but if we've moved to the state where we will wait on caching
571 * block groups we need to first check if we're doing a fast load here,
572 * so we can wait for it to finish, otherwise we could end up allocating
573 * from a block group whose cache gets evicted for one reason or
574 * another.
575 */
576 while (cache->cached == BTRFS_CACHE_FAST) {
577 struct btrfs_caching_control *ctl;
578
579 ctl = cache->caching_ctl;
580 atomic_inc(&ctl->count);
581 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
582 spin_unlock(&cache->lock);
583
584 schedule();
585
586 finish_wait(&ctl->wait, &wait);
587 put_caching_control(ctl);
588 spin_lock(&cache->lock);
589 }
590
591 if (cache->cached != BTRFS_CACHE_NO) {
592 spin_unlock(&cache->lock);
593 kfree(caching_ctl);
594 return 0;
595 }
596 WARN_ON(cache->caching_ctl);
597 cache->caching_ctl = caching_ctl;
598 cache->cached = BTRFS_CACHE_FAST;
599 spin_unlock(&cache->lock);
600
601 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
602 mutex_lock(&caching_ctl->mutex);
603 ret = load_free_space_cache(fs_info, cache);
604
605 spin_lock(&cache->lock);
606 if (ret == 1) {
607 cache->caching_ctl = NULL;
608 cache->cached = BTRFS_CACHE_FINISHED;
609 cache->last_byte_to_unpin = (u64)-1;
610 caching_ctl->progress = (u64)-1;
611 } else {
612 if (load_cache_only) {
613 cache->caching_ctl = NULL;
614 cache->cached = BTRFS_CACHE_NO;
615 } else {
616 cache->cached = BTRFS_CACHE_STARTED;
617 cache->has_caching_ctl = 1;
618 }
619 }
620 spin_unlock(&cache->lock);
621 mutex_unlock(&caching_ctl->mutex);
622
623 wake_up(&caching_ctl->wait);
624 if (ret == 1) {
625 put_caching_control(caching_ctl);
626 free_excluded_extents(fs_info->extent_root, cache);
627 return 0;
628 }
629 } else {
630 /*
631 * We're either using the free space tree or no caching at all.
632 * Set cached to the appropriate value and wake up any waiters.
633 */
634 spin_lock(&cache->lock);
635 if (load_cache_only) {
636 cache->caching_ctl = NULL;
637 cache->cached = BTRFS_CACHE_NO;
638 } else {
639 cache->cached = BTRFS_CACHE_STARTED;
640 cache->has_caching_ctl = 1;
641 }
642 spin_unlock(&cache->lock);
643 wake_up(&caching_ctl->wait);
644 }
645
646 if (load_cache_only) {
647 put_caching_control(caching_ctl);
648 return 0;
649 }
650
651 down_write(&fs_info->commit_root_sem);
652 atomic_inc(&caching_ctl->count);
653 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
654 up_write(&fs_info->commit_root_sem);
655
656 btrfs_get_block_group(cache);
657
658 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
659
660 return ret;
661 }
662
663 /*
664 * return the block group that starts at or after bytenr
665 */
666 static struct btrfs_block_group_cache *
667 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
668 {
669 struct btrfs_block_group_cache *cache;
670
671 cache = block_group_cache_tree_search(info, bytenr, 0);
672
673 return cache;
674 }
675
676 /*
677 * return the block group that contains the given bytenr
678 */
679 struct btrfs_block_group_cache *btrfs_lookup_block_group(
680 struct btrfs_fs_info *info,
681 u64 bytenr)
682 {
683 struct btrfs_block_group_cache *cache;
684
685 cache = block_group_cache_tree_search(info, bytenr, 1);
686
687 return cache;
688 }
689
690 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
691 u64 flags)
692 {
693 struct list_head *head = &info->space_info;
694 struct btrfs_space_info *found;
695
696 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
697
698 rcu_read_lock();
699 list_for_each_entry_rcu(found, head, list) {
700 if (found->flags & flags) {
701 rcu_read_unlock();
702 return found;
703 }
704 }
705 rcu_read_unlock();
706 return NULL;
707 }
708
709 /*
710 * after adding space to the filesystem, we need to clear the full flags
711 * on all the space infos.
712 */
713 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
714 {
715 struct list_head *head = &info->space_info;
716 struct btrfs_space_info *found;
717
718 rcu_read_lock();
719 list_for_each_entry_rcu(found, head, list)
720 found->full = 0;
721 rcu_read_unlock();
722 }
723
724 /* simple helper to search for an existing data extent at a given offset */
725 int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
726 {
727 int ret;
728 struct btrfs_key key;
729 struct btrfs_path *path;
730
731 path = btrfs_alloc_path();
732 if (!path)
733 return -ENOMEM;
734
735 key.objectid = start;
736 key.offset = len;
737 key.type = BTRFS_EXTENT_ITEM_KEY;
738 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
739 0, 0);
740 btrfs_free_path(path);
741 return ret;
742 }
743
744 /*
745 * helper function to lookup reference count and flags of a tree block.
746 *
747 * the head node for a delayed ref is used to store the sum of all the
748 * reference count modifications queued up in the rbtree. The head
749 * node may also store the extent flags to set. This way you can check
750 * to see what the reference count and extent flags would be once all of
751 * the delayed refs have been processed.
752 */
753 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
754 struct btrfs_root *root, u64 bytenr,
755 u64 offset, int metadata, u64 *refs, u64 *flags)
756 {
757 struct btrfs_delayed_ref_head *head;
758 struct btrfs_delayed_ref_root *delayed_refs;
759 struct btrfs_path *path;
760 struct btrfs_extent_item *ei;
761 struct extent_buffer *leaf;
762 struct btrfs_key key;
763 u32 item_size;
764 u64 num_refs;
765 u64 extent_flags;
766 int ret;
767
768 /*
769 * If we don't have skinny metadata, don't bother doing anything
770 * different
771 */
772 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
773 offset = root->nodesize;
774 metadata = 0;
775 }
776
777 path = btrfs_alloc_path();
778 if (!path)
779 return -ENOMEM;
780
781 if (!trans) {
782 path->skip_locking = 1;
783 path->search_commit_root = 1;
784 }
785
786 search_again:
787 key.objectid = bytenr;
788 key.offset = offset;
789 if (metadata)
790 key.type = BTRFS_METADATA_ITEM_KEY;
791 else
792 key.type = BTRFS_EXTENT_ITEM_KEY;
793
794 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
795 &key, path, 0, 0);
796 if (ret < 0)
797 goto out_free;
798
799 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
800 if (path->slots[0]) {
801 path->slots[0]--;
802 btrfs_item_key_to_cpu(path->nodes[0], &key,
803 path->slots[0]);
804 if (key.objectid == bytenr &&
805 key.type == BTRFS_EXTENT_ITEM_KEY &&
806 key.offset == root->nodesize)
807 ret = 0;
808 }
809 }
810
811 if (ret == 0) {
812 leaf = path->nodes[0];
813 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
814 if (item_size >= sizeof(*ei)) {
815 ei = btrfs_item_ptr(leaf, path->slots[0],
816 struct btrfs_extent_item);
817 num_refs = btrfs_extent_refs(leaf, ei);
818 extent_flags = btrfs_extent_flags(leaf, ei);
819 } else {
820 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
821 struct btrfs_extent_item_v0 *ei0;
822 BUG_ON(item_size != sizeof(*ei0));
823 ei0 = btrfs_item_ptr(leaf, path->slots[0],
824 struct btrfs_extent_item_v0);
825 num_refs = btrfs_extent_refs_v0(leaf, ei0);
826 /* FIXME: this isn't correct for data */
827 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
828 #else
829 BUG();
830 #endif
831 }
832 BUG_ON(num_refs == 0);
833 } else {
834 num_refs = 0;
835 extent_flags = 0;
836 ret = 0;
837 }
838
839 if (!trans)
840 goto out;
841
842 delayed_refs = &trans->transaction->delayed_refs;
843 spin_lock(&delayed_refs->lock);
844 head = btrfs_find_delayed_ref_head(trans, bytenr);
845 if (head) {
846 if (!mutex_trylock(&head->mutex)) {
847 atomic_inc(&head->node.refs);
848 spin_unlock(&delayed_refs->lock);
849
850 btrfs_release_path(path);
851
852 /*
853 * Mutex was contended, block until it's released and try
854 * again
855 */
856 mutex_lock(&head->mutex);
857 mutex_unlock(&head->mutex);
858 btrfs_put_delayed_ref(&head->node);
859 goto search_again;
860 }
861 spin_lock(&head->lock);
862 if (head->extent_op && head->extent_op->update_flags)
863 extent_flags |= head->extent_op->flags_to_set;
864 else
865 BUG_ON(num_refs == 0);
866
867 num_refs += head->node.ref_mod;
868 spin_unlock(&head->lock);
869 mutex_unlock(&head->mutex);
870 }
871 spin_unlock(&delayed_refs->lock);
872 out:
873 WARN_ON(num_refs == 0);
874 if (refs)
875 *refs = num_refs;
876 if (flags)
877 *flags = extent_flags;
878 out_free:
879 btrfs_free_path(path);
880 return ret;
881 }
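/*
 * Hedged usage sketch (added for illustration, not taken from this file):
 * a caller that needs the live reference count of a tree block, including
 * any pending delayed ref updates, might call
 *
 *	ret = btrfs_lookup_extent_info(trans, root, eb->start,
 *				       btrfs_header_level(eb), 1,
 *				       &refs, &flags);
 *
 * where eb is an extent_buffer the caller already holds and refs/flags are
 * u64 out parameters. With skinny metadata the offset argument is the
 * block's level; without it, the function overrides the offset with
 * nodesize and searches for a regular EXTENT_ITEM instead.
 */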
882
883 /*
884 * Back reference rules. Back refs have three main goals:
885 *
886 * 1) differentiate between all holders of references to an extent so that
887 * when a reference is dropped we can make sure it was a valid reference
888 * before freeing the extent.
889 *
890 * 2) Provide enough information to quickly find the holders of an extent
891 * if we notice a given block is corrupted or bad.
892 *
893 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
894 * maintenance. This is actually the same as #2, but with a slightly
895 * different use case.
896 *
897 * There are two kinds of back refs. Implicit back refs are optimized
898 * for pointers in non-shared tree blocks. For a given pointer in a block,
899 * back refs of this kind provide information about the block's owner tree
900 * and the pointer's key. This information allows us to find the block by
901 * b-tree searching. Full back refs are for pointers in tree blocks not
902 * referenced by their owner trees. The location of the tree block is recorded
903 * in the back ref. Full back refs are actually generic and can be
904 * used in all cases where implicit back refs are used. The major shortcoming
905 * of full back refs is their overhead: every time a tree block gets
906 * COWed, we have to update the back ref entries for all pointers in it.
907 *
908 * For a newly allocated tree block, we use implicit back refs for
909 * pointers in it. This means most tree-related operations only involve
910 * implicit back refs. For a tree block created in an old transaction, the
911 * only way to drop a reference to it is to COW it. So we can detect the
912 * event that a tree block loses its owner tree's reference and do the
913 * back refs conversion.
914 *
915 * When a tree block is COW'd through a tree, there are four cases:
916 *
917 * The reference count of the block is one and the tree is the block's
918 * owner tree. Nothing to do in this case.
919 *
920 * The reference count of the block is one and the tree is not the
921 * block's owner tree. In this case, full back refs are used for pointers
922 * in the block. Remove these full back refs, add implicit back refs for
923 * every pointer in the new block.
924 *
925 * The reference count of the block is greater than one and the tree is
926 * the block's owner tree. In this case, implicit back refs are used for
927 * pointers in the block. Add full back refs for every pointer in the
928 * block, increase lower level extents' reference counts. The original
929 * implicit back refs are carried over to the new block.
930 *
931 * The reference count of the block is greater than one and the tree is
932 * not the block's owner tree. Add implicit back refs for every pointer in
933 * the new block, increase lower level extents' reference count.
934 *
935 * Back Reference Key composing:
936 *
937 * The key objectid corresponds to the first byte in the extent.
938 * The key type is used to differentiate between types of back refs.
939 * There are different meanings of the key offset for different types
940 * of back refs.
941 *
942 * File extents can be referenced by:
943 *
944 * - multiple snapshots, subvolumes, or different generations in one subvol
945 * - different files inside a single subvolume
946 * - different offsets inside a file (bookend extents in file.c)
947 *
948 * The extent ref structure for the implicit back refs has fields for:
949 *
950 * - Objectid of the subvolume root
951 * - objectid of the file holding the reference
952 * - original offset in the file
953 * - how many bookend extents
954 *
955 * The key offset for the implicit back refs is a hash of the first
956 * three fields.
957 *
958 * The extent ref structure for the full back refs has a field for:
959 *
960 * - number of pointers in the tree leaf
961 *
962 * The key offset for the full back refs is the first byte of
963 * the tree leaf.
964 *
965 * When a file extent is allocated, the implicit back refs are used
966 * and the fields are filled in:
967 *
968 * (root_key.objectid, inode objectid, offset in file, 1)
969 *
970 * When a file extent is removed by file truncation, we find the
971 * corresponding implicit back refs and check the following fields:
972 *
973 * (btrfs_header_owner(leaf), inode objectid, offset in file)
974 *
975 * Btree extents can be referenced by:
976 *
977 * - Different subvolumes
978 *
979 * Both the implicit back refs and the full back refs for tree blocks
980 * only consist of a key. The key offset for the implicit back refs is the
981 * objectid of the block's owner tree. The key offset for the full back refs
982 * is the first byte of the parent block.
983 *
984 * When implicit back refs are used, information about the lowest key and
985 * level of the tree block is required. This information is stored in the
986 * tree block info structure.
987 */
988
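/*
 * Worked example (added for illustration; the numbers are hypothetical):
 * a data extent at bytenr X referenced by inode 257 at file offset 0 in
 * subvolume 5 carries an implicit back ref keyed as
 *
 *	(X, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *
 * while the same extent referenced through a shared (COWed) leaf at
 * bytenr P instead carries a full back ref keyed as
 *
 *	(X, BTRFS_SHARED_DATA_REF_KEY, P)
 *
 * Tree blocks follow the same pattern with BTRFS_TREE_BLOCK_REF_KEY
 * (offset = owner tree objectid) and BTRFS_SHARED_BLOCK_REF_KEY
 * (offset = parent block bytenr).
 */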
989 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
990 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
991 struct btrfs_root *root,
992 struct btrfs_path *path,
993 u64 owner, u32 extra_size)
994 {
995 struct btrfs_extent_item *item;
996 struct btrfs_extent_item_v0 *ei0;
997 struct btrfs_extent_ref_v0 *ref0;
998 struct btrfs_tree_block_info *bi;
999 struct extent_buffer *leaf;
1000 struct btrfs_key key;
1001 struct btrfs_key found_key;
1002 u32 new_size = sizeof(*item);
1003 u64 refs;
1004 int ret;
1005
1006 leaf = path->nodes[0];
1007 BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
1008
1009 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1010 ei0 = btrfs_item_ptr(leaf, path->slots[0],
1011 struct btrfs_extent_item_v0);
1012 refs = btrfs_extent_refs_v0(leaf, ei0);
1013
1014 if (owner == (u64)-1) {
1015 while (1) {
1016 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1017 ret = btrfs_next_leaf(root, path);
1018 if (ret < 0)
1019 return ret;
1020 BUG_ON(ret > 0); /* Corruption */
1021 leaf = path->nodes[0];
1022 }
1023 btrfs_item_key_to_cpu(leaf, &found_key,
1024 path->slots[0]);
1025 BUG_ON(key.objectid != found_key.objectid);
1026 if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1027 path->slots[0]++;
1028 continue;
1029 }
1030 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1031 struct btrfs_extent_ref_v0);
1032 owner = btrfs_ref_objectid_v0(leaf, ref0);
1033 break;
1034 }
1035 }
1036 btrfs_release_path(path);
1037
1038 if (owner < BTRFS_FIRST_FREE_OBJECTID)
1039 new_size += sizeof(*bi);
1040
1041 new_size -= sizeof(*ei0);
1042 ret = btrfs_search_slot(trans, root, &key, path,
1043 new_size + extra_size, 1);
1044 if (ret < 0)
1045 return ret;
1046 BUG_ON(ret); /* Corruption */
1047
1048 btrfs_extend_item(root, path, new_size);
1049
1050 leaf = path->nodes[0];
1051 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1052 btrfs_set_extent_refs(leaf, item, refs);
1053 /* FIXME: get real generation */
1054 btrfs_set_extent_generation(leaf, item, 0);
1055 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1056 btrfs_set_extent_flags(leaf, item,
1057 BTRFS_EXTENT_FLAG_TREE_BLOCK |
1058 BTRFS_BLOCK_FLAG_FULL_BACKREF);
1059 bi = (struct btrfs_tree_block_info *)(item + 1);
1060 /* FIXME: get first key of the block */
1061 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1062 btrfs_set_tree_block_level(leaf, bi, (int)owner);
1063 } else {
1064 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1065 }
1066 btrfs_mark_buffer_dirty(leaf);
1067 return 0;
1068 }
1069 #endif
1070
1071 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1072 {
1073 u32 high_crc = ~(u32)0;
1074 u32 low_crc = ~(u32)0;
1075 __le64 lenum;
1076
1077 lenum = cpu_to_le64(root_objectid);
1078 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1079 lenum = cpu_to_le64(owner);
1080 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1081 lenum = cpu_to_le64(offset);
1082 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1083
1084 return ((u64)high_crc << 31) ^ (u64)low_crc;
1085 }
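/*
 * Note (added for clarity): the hash above chains two crc32c values, one
 * over the root objectid and one over the owner followed by the offset,
 * and folds them into a single 64-bit value. Since that value becomes the
 * key offset of on-disk EXTENT_DATA_REF items, its exact construction is
 * effectively part of the disk format.
 */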
1086
1087 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1088 struct btrfs_extent_data_ref *ref)
1089 {
1090 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1091 btrfs_extent_data_ref_objectid(leaf, ref),
1092 btrfs_extent_data_ref_offset(leaf, ref));
1093 }
1094
1095 static int match_extent_data_ref(struct extent_buffer *leaf,
1096 struct btrfs_extent_data_ref *ref,
1097 u64 root_objectid, u64 owner, u64 offset)
1098 {
1099 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1100 btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1101 btrfs_extent_data_ref_offset(leaf, ref) != offset)
1102 return 0;
1103 return 1;
1104 }
1105
1106 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1107 struct btrfs_root *root,
1108 struct btrfs_path *path,
1109 u64 bytenr, u64 parent,
1110 u64 root_objectid,
1111 u64 owner, u64 offset)
1112 {
1113 struct btrfs_key key;
1114 struct btrfs_extent_data_ref *ref;
1115 struct extent_buffer *leaf;
1116 u32 nritems;
1117 int ret;
1118 int recow;
1119 int err = -ENOENT;
1120
1121 key.objectid = bytenr;
1122 if (parent) {
1123 key.type = BTRFS_SHARED_DATA_REF_KEY;
1124 key.offset = parent;
1125 } else {
1126 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1127 key.offset = hash_extent_data_ref(root_objectid,
1128 owner, offset);
1129 }
1130 again:
1131 recow = 0;
1132 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1133 if (ret < 0) {
1134 err = ret;
1135 goto fail;
1136 }
1137
1138 if (parent) {
1139 if (!ret)
1140 return 0;
1141 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1142 key.type = BTRFS_EXTENT_REF_V0_KEY;
1143 btrfs_release_path(path);
1144 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1145 if (ret < 0) {
1146 err = ret;
1147 goto fail;
1148 }
1149 if (!ret)
1150 return 0;
1151 #endif
1152 goto fail;
1153 }
1154
1155 leaf = path->nodes[0];
1156 nritems = btrfs_header_nritems(leaf);
1157 while (1) {
1158 if (path->slots[0] >= nritems) {
1159 ret = btrfs_next_leaf(root, path);
1160 if (ret < 0)
1161 err = ret;
1162 if (ret)
1163 goto fail;
1164
1165 leaf = path->nodes[0];
1166 nritems = btrfs_header_nritems(leaf);
1167 recow = 1;
1168 }
1169
1170 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1171 if (key.objectid != bytenr ||
1172 key.type != BTRFS_EXTENT_DATA_REF_KEY)
1173 goto fail;
1174
1175 ref = btrfs_item_ptr(leaf, path->slots[0],
1176 struct btrfs_extent_data_ref);
1177
1178 if (match_extent_data_ref(leaf, ref, root_objectid,
1179 owner, offset)) {
1180 if (recow) {
1181 btrfs_release_path(path);
1182 goto again;
1183 }
1184 err = 0;
1185 break;
1186 }
1187 path->slots[0]++;
1188 }
1189 fail:
1190 return err;
1191 }
1192
1193 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1194 struct btrfs_root *root,
1195 struct btrfs_path *path,
1196 u64 bytenr, u64 parent,
1197 u64 root_objectid, u64 owner,
1198 u64 offset, int refs_to_add)
1199 {
1200 struct btrfs_key key;
1201 struct extent_buffer *leaf;
1202 u32 size;
1203 u32 num_refs;
1204 int ret;
1205
1206 key.objectid = bytenr;
1207 if (parent) {
1208 key.type = BTRFS_SHARED_DATA_REF_KEY;
1209 key.offset = parent;
1210 size = sizeof(struct btrfs_shared_data_ref);
1211 } else {
1212 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1213 key.offset = hash_extent_data_ref(root_objectid,
1214 owner, offset);
1215 size = sizeof(struct btrfs_extent_data_ref);
1216 }
1217
1218 ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1219 if (ret && ret != -EEXIST)
1220 goto fail;
1221
1222 leaf = path->nodes[0];
1223 if (parent) {
1224 struct btrfs_shared_data_ref *ref;
1225 ref = btrfs_item_ptr(leaf, path->slots[0],
1226 struct btrfs_shared_data_ref);
1227 if (ret == 0) {
1228 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1229 } else {
1230 num_refs = btrfs_shared_data_ref_count(leaf, ref);
1231 num_refs += refs_to_add;
1232 btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1233 }
1234 } else {
1235 struct btrfs_extent_data_ref *ref;
1236 while (ret == -EEXIST) {
1237 ref = btrfs_item_ptr(leaf, path->slots[0],
1238 struct btrfs_extent_data_ref);
1239 if (match_extent_data_ref(leaf, ref, root_objectid,
1240 owner, offset))
1241 break;
1242 btrfs_release_path(path);
1243 key.offset++;
1244 ret = btrfs_insert_empty_item(trans, root, path, &key,
1245 size);
1246 if (ret && ret != -EEXIST)
1247 goto fail;
1248
1249 leaf = path->nodes[0];
1250 }
1251 ref = btrfs_item_ptr(leaf, path->slots[0],
1252 struct btrfs_extent_data_ref);
1253 if (ret == 0) {
1254 btrfs_set_extent_data_ref_root(leaf, ref,
1255 root_objectid);
1256 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1257 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1258 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1259 } else {
1260 num_refs = btrfs_extent_data_ref_count(leaf, ref);
1261 num_refs += refs_to_add;
1262 btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1263 }
1264 }
1265 btrfs_mark_buffer_dirty(leaf);
1266 ret = 0;
1267 fail:
1268 btrfs_release_path(path);
1269 return ret;
1270 }
1271
1272 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1273 struct btrfs_root *root,
1274 struct btrfs_path *path,
1275 int refs_to_drop, int *last_ref)
1276 {
1277 struct btrfs_key key;
1278 struct btrfs_extent_data_ref *ref1 = NULL;
1279 struct btrfs_shared_data_ref *ref2 = NULL;
1280 struct extent_buffer *leaf;
1281 u32 num_refs = 0;
1282 int ret = 0;
1283
1284 leaf = path->nodes[0];
1285 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1286
1287 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1288 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1289 struct btrfs_extent_data_ref);
1290 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1291 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1292 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1293 struct btrfs_shared_data_ref);
1294 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1295 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1296 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1297 struct btrfs_extent_ref_v0 *ref0;
1298 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1299 struct btrfs_extent_ref_v0);
1300 num_refs = btrfs_ref_count_v0(leaf, ref0);
1301 #endif
1302 } else {
1303 BUG();
1304 }
1305
1306 BUG_ON(num_refs < refs_to_drop);
1307 num_refs -= refs_to_drop;
1308
1309 if (num_refs == 0) {
1310 ret = btrfs_del_item(trans, root, path);
1311 *last_ref = 1;
1312 } else {
1313 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1314 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1315 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1316 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1317 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1318 else {
1319 struct btrfs_extent_ref_v0 *ref0;
1320 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1321 struct btrfs_extent_ref_v0);
1322 btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1323 }
1324 #endif
1325 btrfs_mark_buffer_dirty(leaf);
1326 }
1327 return ret;
1328 }
1329
1330 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1331 struct btrfs_extent_inline_ref *iref)
1332 {
1333 struct btrfs_key key;
1334 struct extent_buffer *leaf;
1335 struct btrfs_extent_data_ref *ref1;
1336 struct btrfs_shared_data_ref *ref2;
1337 u32 num_refs = 0;
1338
1339 leaf = path->nodes[0];
1340 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1341 if (iref) {
1342 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1343 BTRFS_EXTENT_DATA_REF_KEY) {
1344 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1345 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1346 } else {
1347 ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1348 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1349 }
1350 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1351 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1352 struct btrfs_extent_data_ref);
1353 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1354 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1355 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1356 struct btrfs_shared_data_ref);
1357 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1358 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1359 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1360 struct btrfs_extent_ref_v0 *ref0;
1361 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1362 struct btrfs_extent_ref_v0);
1363 num_refs = btrfs_ref_count_v0(leaf, ref0);
1364 #endif
1365 } else {
1366 WARN_ON(1);
1367 }
1368 return num_refs;
1369 }
1370
1371 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1372 struct btrfs_root *root,
1373 struct btrfs_path *path,
1374 u64 bytenr, u64 parent,
1375 u64 root_objectid)
1376 {
1377 struct btrfs_key key;
1378 int ret;
1379
1380 key.objectid = bytenr;
1381 if (parent) {
1382 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1383 key.offset = parent;
1384 } else {
1385 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1386 key.offset = root_objectid;
1387 }
1388
1389 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1390 if (ret > 0)
1391 ret = -ENOENT;
1392 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1393 if (ret == -ENOENT && parent) {
1394 btrfs_release_path(path);
1395 key.type = BTRFS_EXTENT_REF_V0_KEY;
1396 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1397 if (ret > 0)
1398 ret = -ENOENT;
1399 }
1400 #endif
1401 return ret;
1402 }
1403
1404 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1405 struct btrfs_root *root,
1406 struct btrfs_path *path,
1407 u64 bytenr, u64 parent,
1408 u64 root_objectid)
1409 {
1410 struct btrfs_key key;
1411 int ret;
1412
1413 key.objectid = bytenr;
1414 if (parent) {
1415 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1416 key.offset = parent;
1417 } else {
1418 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1419 key.offset = root_objectid;
1420 }
1421
1422 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1423 btrfs_release_path(path);
1424 return ret;
1425 }
1426
1427 static inline int extent_ref_type(u64 parent, u64 owner)
1428 {
1429 int type;
1430 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1431 if (parent > 0)
1432 type = BTRFS_SHARED_BLOCK_REF_KEY;
1433 else
1434 type = BTRFS_TREE_BLOCK_REF_KEY;
1435 } else {
1436 if (parent > 0)
1437 type = BTRFS_SHARED_DATA_REF_KEY;
1438 else
1439 type = BTRFS_EXTENT_DATA_REF_KEY;
1440 }
1441 return type;
1442 }
1443
1444 static int find_next_key(struct btrfs_path *path, int level,
1445 struct btrfs_key *key)
1446
1447 {
1448 for (; level < BTRFS_MAX_LEVEL; level++) {
1449 if (!path->nodes[level])
1450 break;
1451 if (path->slots[level] + 1 >=
1452 btrfs_header_nritems(path->nodes[level]))
1453 continue;
1454 if (level == 0)
1455 btrfs_item_key_to_cpu(path->nodes[level], key,
1456 path->slots[level] + 1);
1457 else
1458 btrfs_node_key_to_cpu(path->nodes[level], key,
1459 path->slots[level] + 1);
1460 return 0;
1461 }
1462 return 1;
1463 }
1464
1465 /*
1466 * look for inline back ref. if back ref is found, *ref_ret is set
1467 * to the address of inline back ref, and 0 is returned.
1468 *
1469 * if back ref isn't found, *ref_ret is set to the address where it
1470 * should be inserted, and -ENOENT is returned.
1471 *
1472 * if insert is true and there are too many inline back refs, the path
1473 * points to the extent item, and -EAGAIN is returned.
1474 *
1475 * NOTE: inline back refs are ordered in the same way that back ref
1476 * items in the tree are ordered.
1477 */
1478 static noinline_for_stack
1479 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1480 struct btrfs_root *root,
1481 struct btrfs_path *path,
1482 struct btrfs_extent_inline_ref **ref_ret,
1483 u64 bytenr, u64 num_bytes,
1484 u64 parent, u64 root_objectid,
1485 u64 owner, u64 offset, int insert)
1486 {
1487 struct btrfs_key key;
1488 struct extent_buffer *leaf;
1489 struct btrfs_extent_item *ei;
1490 struct btrfs_extent_inline_ref *iref;
1491 u64 flags;
1492 u64 item_size;
1493 unsigned long ptr;
1494 unsigned long end;
1495 int extra_size;
1496 int type;
1497 int want;
1498 int ret;
1499 int err = 0;
1500 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
1501 SKINNY_METADATA);
1502
1503 key.objectid = bytenr;
1504 key.type = BTRFS_EXTENT_ITEM_KEY;
1505 key.offset = num_bytes;
1506
1507 want = extent_ref_type(parent, owner);
1508 if (insert) {
1509 extra_size = btrfs_extent_inline_ref_size(want);
1510 path->keep_locks = 1;
1511 } else
1512 extra_size = -1;
1513
1514 /*
1515 * Owner is our parent level, so we can just add one to get the level
1516 * for the block we are interested in.
1517 */
1518 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1519 key.type = BTRFS_METADATA_ITEM_KEY;
1520 key.offset = owner;
1521 }
1522
1523 again:
1524 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1525 if (ret < 0) {
1526 err = ret;
1527 goto out;
1528 }
1529
1530 /*
1531 * We may be a newly converted file system which still has the old fat
1532 * extent entries for metadata, so try and see if we have one of those.
1533 */
1534 if (ret > 0 && skinny_metadata) {
1535 skinny_metadata = false;
1536 if (path->slots[0]) {
1537 path->slots[0]--;
1538 btrfs_item_key_to_cpu(path->nodes[0], &key,
1539 path->slots[0]);
1540 if (key.objectid == bytenr &&
1541 key.type == BTRFS_EXTENT_ITEM_KEY &&
1542 key.offset == num_bytes)
1543 ret = 0;
1544 }
1545 if (ret) {
1546 key.objectid = bytenr;
1547 key.type = BTRFS_EXTENT_ITEM_KEY;
1548 key.offset = num_bytes;
1549 btrfs_release_path(path);
1550 goto again;
1551 }
1552 }
1553
1554 if (ret && !insert) {
1555 err = -ENOENT;
1556 goto out;
1557 } else if (WARN_ON(ret)) {
1558 err = -EIO;
1559 goto out;
1560 }
1561
1562 leaf = path->nodes[0];
1563 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1564 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1565 if (item_size < sizeof(*ei)) {
1566 if (!insert) {
1567 err = -ENOENT;
1568 goto out;
1569 }
1570 ret = convert_extent_item_v0(trans, root, path, owner,
1571 extra_size);
1572 if (ret < 0) {
1573 err = ret;
1574 goto out;
1575 }
1576 leaf = path->nodes[0];
1577 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1578 }
1579 #endif
1580 BUG_ON(item_size < sizeof(*ei));
1581
1582 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1583 flags = btrfs_extent_flags(leaf, ei);
1584
1585 ptr = (unsigned long)(ei + 1);
1586 end = (unsigned long)ei + item_size;
1587
1588 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1589 ptr += sizeof(struct btrfs_tree_block_info);
1590 BUG_ON(ptr > end);
1591 }
1592
1593 err = -ENOENT;
1594 while (1) {
1595 if (ptr >= end) {
1596 WARN_ON(ptr > end);
1597 break;
1598 }
1599 iref = (struct btrfs_extent_inline_ref *)ptr;
1600 type = btrfs_extent_inline_ref_type(leaf, iref);
1601 if (want < type)
1602 break;
1603 if (want > type) {
1604 ptr += btrfs_extent_inline_ref_size(type);
1605 continue;
1606 }
1607
1608 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1609 struct btrfs_extent_data_ref *dref;
1610 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1611 if (match_extent_data_ref(leaf, dref, root_objectid,
1612 owner, offset)) {
1613 err = 0;
1614 break;
1615 }
1616 if (hash_extent_data_ref_item(leaf, dref) <
1617 hash_extent_data_ref(root_objectid, owner, offset))
1618 break;
1619 } else {
1620 u64 ref_offset;
1621 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1622 if (parent > 0) {
1623 if (parent == ref_offset) {
1624 err = 0;
1625 break;
1626 }
1627 if (ref_offset < parent)
1628 break;
1629 } else {
1630 if (root_objectid == ref_offset) {
1631 err = 0;
1632 break;
1633 }
1634 if (ref_offset < root_objectid)
1635 break;
1636 }
1637 }
1638 ptr += btrfs_extent_inline_ref_size(type);
1639 }
1640 if (err == -ENOENT && insert) {
1641 if (item_size + extra_size >=
1642 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1643 err = -EAGAIN;
1644 goto out;
1645 }
1646 /*
1647 * To add a new inline back ref, we have to make sure
1648 * there is no corresponding back ref item.
1649 * For simplicity, we just do not add a new inline back
1650 * ref if there is any kind of item for this block.
1651 */
1652 if (find_next_key(path, 0, &key) == 0 &&
1653 key.objectid == bytenr &&
1654 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1655 err = -EAGAIN;
1656 goto out;
1657 }
1658 }
1659 *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1660 out:
1661 if (insert) {
1662 path->keep_locks = 0;
1663 btrfs_unlock_up_safe(path, 1);
1664 }
1665 return err;
1666 }
1667
1668 /*
1669 * helper to add new inline back ref
1670 */
1671 static noinline_for_stack
1672 void setup_inline_extent_backref(struct btrfs_root *root,
1673 struct btrfs_path *path,
1674 struct btrfs_extent_inline_ref *iref,
1675 u64 parent, u64 root_objectid,
1676 u64 owner, u64 offset, int refs_to_add,
1677 struct btrfs_delayed_extent_op *extent_op)
1678 {
1679 struct extent_buffer *leaf;
1680 struct btrfs_extent_item *ei;
1681 unsigned long ptr;
1682 unsigned long end;
1683 unsigned long item_offset;
1684 u64 refs;
1685 int size;
1686 int type;
1687
1688 leaf = path->nodes[0];
1689 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1690 item_offset = (unsigned long)iref - (unsigned long)ei;
1691
1692 type = extent_ref_type(parent, owner);
1693 size = btrfs_extent_inline_ref_size(type);
1694
1695 btrfs_extend_item(root, path, size);
1696
1697 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1698 refs = btrfs_extent_refs(leaf, ei);
1699 refs += refs_to_add;
1700 btrfs_set_extent_refs(leaf, ei, refs);
1701 if (extent_op)
1702 __run_delayed_extent_op(extent_op, leaf, ei);
1703
1704 ptr = (unsigned long)ei + item_offset;
1705 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1706 if (ptr < end - size)
1707 memmove_extent_buffer(leaf, ptr + size, ptr,
1708 end - size - ptr);
1709
1710 iref = (struct btrfs_extent_inline_ref *)ptr;
1711 btrfs_set_extent_inline_ref_type(leaf, iref, type);
1712 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1713 struct btrfs_extent_data_ref *dref;
1714 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1715 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1716 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1717 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1718 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1719 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1720 struct btrfs_shared_data_ref *sref;
1721 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1722 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1723 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1724 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1725 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1726 } else {
1727 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1728 }
1729 btrfs_mark_buffer_dirty(leaf);
1730 }
1731
1732 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1733 struct btrfs_root *root,
1734 struct btrfs_path *path,
1735 struct btrfs_extent_inline_ref **ref_ret,
1736 u64 bytenr, u64 num_bytes, u64 parent,
1737 u64 root_objectid, u64 owner, u64 offset)
1738 {
1739 int ret;
1740
1741 ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1742 bytenr, num_bytes, parent,
1743 root_objectid, owner, offset, 0);
1744 if (ret != -ENOENT)
1745 return ret;
1746
1747 btrfs_release_path(path);
1748 *ref_ret = NULL;
1749
1750 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1751 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1752 root_objectid);
1753 } else {
1754 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1755 root_objectid, owner, offset);
1756 }
1757 return ret;
1758 }
1759
1760 /*
1761 * helper to update/remove inline back ref
1762 */
1763 static noinline_for_stack
1764 void update_inline_extent_backref(struct btrfs_root *root,
1765 struct btrfs_path *path,
1766 struct btrfs_extent_inline_ref *iref,
1767 int refs_to_mod,
1768 struct btrfs_delayed_extent_op *extent_op,
1769 int *last_ref)
1770 {
1771 struct extent_buffer *leaf;
1772 struct btrfs_extent_item *ei;
1773 struct btrfs_extent_data_ref *dref = NULL;
1774 struct btrfs_shared_data_ref *sref = NULL;
1775 unsigned long ptr;
1776 unsigned long end;
1777 u32 item_size;
1778 int size;
1779 int type;
1780 u64 refs;
1781
1782 leaf = path->nodes[0];
1783 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1784 refs = btrfs_extent_refs(leaf, ei);
1785 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1786 refs += refs_to_mod;
1787 btrfs_set_extent_refs(leaf, ei, refs);
1788 if (extent_op)
1789 __run_delayed_extent_op(extent_op, leaf, ei);
1790
1791 type = btrfs_extent_inline_ref_type(leaf, iref);
1792
1793 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1794 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1795 refs = btrfs_extent_data_ref_count(leaf, dref);
1796 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1797 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1798 refs = btrfs_shared_data_ref_count(leaf, sref);
1799 } else {
1800 refs = 1;
1801 BUG_ON(refs_to_mod != -1);
1802 }
1803
1804 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1805 refs += refs_to_mod;
1806
1807 if (refs > 0) {
1808 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1809 btrfs_set_extent_data_ref_count(leaf, dref, refs);
1810 else
1811 btrfs_set_shared_data_ref_count(leaf, sref, refs);
1812 } else {
1813 *last_ref = 1;
1814 size = btrfs_extent_inline_ref_size(type);
1815 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1816 ptr = (unsigned long)iref;
1817 end = (unsigned long)ei + item_size;
1818 if (ptr + size < end)
1819 memmove_extent_buffer(leaf, ptr, ptr + size,
1820 end - ptr - size);
1821 item_size -= size;
1822 btrfs_truncate_item(root, path, item_size, 1);
1823 }
1824 btrfs_mark_buffer_dirty(leaf);
1825 }
1826
1827 static noinline_for_stack
1828 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1829 struct btrfs_root *root,
1830 struct btrfs_path *path,
1831 u64 bytenr, u64 num_bytes, u64 parent,
1832 u64 root_objectid, u64 owner,
1833 u64 offset, int refs_to_add,
1834 struct btrfs_delayed_extent_op *extent_op)
1835 {
1836 struct btrfs_extent_inline_ref *iref;
1837 int ret;
1838
1839 ret = lookup_inline_extent_backref(trans, root, path, &iref,
1840 bytenr, num_bytes, parent,
1841 root_objectid, owner, offset, 1);
1842 if (ret == 0) {
1843 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1844 update_inline_extent_backref(root, path, iref,
1845 refs_to_add, extent_op, NULL);
1846 } else if (ret == -ENOENT) {
1847 setup_inline_extent_backref(root, path, iref, parent,
1848 root_objectid, owner, offset,
1849 refs_to_add, extent_op);
1850 ret = 0;
1851 }
1852 return ret;
1853 }
1854
1855 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1856 struct btrfs_root *root,
1857 struct btrfs_path *path,
1858 u64 bytenr, u64 parent, u64 root_objectid,
1859 u64 owner, u64 offset, int refs_to_add)
1860 {
1861 int ret;
1862 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1863 BUG_ON(refs_to_add != 1);
1864 ret = insert_tree_block_ref(trans, root, path, bytenr,
1865 parent, root_objectid);
1866 } else {
1867 ret = insert_extent_data_ref(trans, root, path, bytenr,
1868 parent, root_objectid,
1869 owner, offset, refs_to_add);
1870 }
1871 return ret;
1872 }
1873
1874 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1875 struct btrfs_root *root,
1876 struct btrfs_path *path,
1877 struct btrfs_extent_inline_ref *iref,
1878 int refs_to_drop, int is_data, int *last_ref)
1879 {
1880 int ret = 0;
1881
1882 BUG_ON(!is_data && refs_to_drop != 1);
1883 if (iref) {
1884 update_inline_extent_backref(root, path, iref,
1885 -refs_to_drop, NULL, last_ref);
1886 } else if (is_data) {
1887 ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
1888 last_ref);
1889 } else {
1890 *last_ref = 1;
1891 ret = btrfs_del_item(trans, root, path);
1892 }
1893 return ret;
1894 }
1895
1896 #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
1897 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1898 u64 *discarded_bytes)
1899 {
1900 int j, ret = 0;
1901 u64 bytes_left, end;
1902 u64 aligned_start = ALIGN(start, 1 << 9);
1903
1904 if (WARN_ON(start != aligned_start)) {
1905 len -= aligned_start - start;
1906 len = round_down(len, 1 << 9);
1907 start = aligned_start;
1908 }
1909
1910 *discarded_bytes = 0;
1911
1912 if (!len)
1913 return 0;
1914
1915 end = start + len;
1916 bytes_left = len;
1917
1918 /* Skip any superblocks on this device. */
1919 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1920 u64 sb_start = btrfs_sb_offset(j);
1921 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1922 u64 size = sb_start - start;
1923
1924 if (!in_range(sb_start, start, bytes_left) &&
1925 !in_range(sb_end, start, bytes_left) &&
1926 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1927 continue;
1928
1929 /*
1930 * Superblock spans beginning of range. Adjust start and
1931 * try again.
1932 */
1933 if (sb_start <= start) {
1934 start += sb_end - start;
1935 if (start > end) {
1936 bytes_left = 0;
1937 break;
1938 }
1939 bytes_left = end - start;
1940 continue;
1941 }
1942
1943 if (size) {
1944 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1945 GFP_NOFS, 0);
1946 if (!ret)
1947 *discarded_bytes += size;
1948 else if (ret != -EOPNOTSUPP)
1949 return ret;
1950 }
1951
1952 start = sb_end;
1953 if (start > end) {
1954 bytes_left = 0;
1955 break;
1956 }
1957 bytes_left = end - start;
1958 }
1959
1960 if (bytes_left) {
1961 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1962 GFP_NOFS, 0);
1963 if (!ret)
1964 *discarded_bytes += bytes_left;
1965 }
1966 return ret;
1967 }
1968
1969 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1970 u64 num_bytes, u64 *actual_bytes)
1971 {
1972 int ret;
1973 u64 discarded_bytes = 0;
1974 struct btrfs_bio *bbio = NULL;
1975
1976
1977 /* Tell the block device(s) that the sectors can be discarded */
1978 ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1979 bytenr, &num_bytes, &bbio, 0);
1980 /* Error condition is -ENOMEM */
1981 if (!ret) {
1982 struct btrfs_bio_stripe *stripe = bbio->stripes;
1983 int i;
1984
1985
1986 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1987 u64 bytes;
1988 if (!stripe->dev->can_discard)
1989 continue;
1990
1991 ret = btrfs_issue_discard(stripe->dev->bdev,
1992 stripe->physical,
1993 stripe->length,
1994 &bytes);
1995 if (!ret)
1996 discarded_bytes += bytes;
1997 else if (ret != -EOPNOTSUPP)
1998 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1999
2000 /*
2001 * Just in case we get back EOPNOTSUPP for some reason,
2002 * ignore the return value so we don't screw up
2003 * people calling discard_extent.
2004 */
2005 ret = 0;
2006 }
2007 btrfs_put_bbio(bbio);
2008 }
2009
2010 if (actual_bytes)
2011 *actual_bytes = discarded_bytes;
2012
2013
2014 if (ret == -EOPNOTSUPP)
2015 ret = 0;
2016 return ret;
2017 }
2018
2019 /* Can return -ENOMEM */
2020 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2021 struct btrfs_root *root,
2022 u64 bytenr, u64 num_bytes, u64 parent,
2023 u64 root_objectid, u64 owner, u64 offset,
2024 int no_quota)
2025 {
2026 int ret;
2027 struct btrfs_fs_info *fs_info = root->fs_info;
2028
2029 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2030 root_objectid == BTRFS_TREE_LOG_OBJECTID);
2031
2032 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2033 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2034 num_bytes,
2035 parent, root_objectid, (int)owner,
2036 BTRFS_ADD_DELAYED_REF, NULL, no_quota);
2037 } else {
2038 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2039 num_bytes,
2040 parent, root_objectid, owner, offset,
2041 BTRFS_ADD_DELAYED_REF, NULL, no_quota);
2042 }
2043 return ret;
2044 }
2045
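/*
 * Add refs_to_add references to an existing extent item.  An inline backref
 * is preferred; if there is no room for one, the refcount on the extent item
 * is bumped and a keyed backref is inserted instead.
 */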
2046 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2047 struct btrfs_root *root,
2048 struct btrfs_delayed_ref_node *node,
2049 u64 parent, u64 root_objectid,
2050 u64 owner, u64 offset, int refs_to_add,
2051 struct btrfs_delayed_extent_op *extent_op)
2052 {
2053 struct btrfs_fs_info *fs_info = root->fs_info;
2054 struct btrfs_path *path;
2055 struct extent_buffer *leaf;
2056 struct btrfs_extent_item *item;
2057 struct btrfs_key key;
2058 u64 bytenr = node->bytenr;
2059 u64 num_bytes = node->num_bytes;
2060 u64 refs;
2061 int ret;
2062 int no_quota = node->no_quota;
2063
2064 path = btrfs_alloc_path();
2065 if (!path)
2066 return -ENOMEM;
2067
2068 if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
2069 no_quota = 1;
2070
2071 path->reada = 1;
2072 path->leave_spinning = 1;
2073 /* this will set up the path even if it fails to insert the back ref */
2074 ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
2075 bytenr, num_bytes, parent,
2076 root_objectid, owner, offset,
2077 refs_to_add, extent_op);
2078 if ((ret < 0 && ret != -EAGAIN) || !ret)
2079 goto out;
2080
2081 /*
2082 * Ok we had -EAGAIN which means we didn't have space to insert an
2083 * inline extent ref, so just update the reference count and add a
2084 * normal backref.
2085 */
2086 leaf = path->nodes[0];
2087 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2088 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2089 refs = btrfs_extent_refs(leaf, item);
2090 btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2091 if (extent_op)
2092 __run_delayed_extent_op(extent_op, leaf, item);
2093
2094 btrfs_mark_buffer_dirty(leaf);
2095 btrfs_release_path(path);
2096
2097 path->reada = 1;
2098 path->leave_spinning = 1;
2099 /* now insert the actual backref */
2100 ret = insert_extent_backref(trans, root->fs_info->extent_root,
2101 path, bytenr, parent, root_objectid,
2102 owner, offset, refs_to_add);
2103 if (ret)
2104 btrfs_abort_transaction(trans, root, ret);
2105 out:
2106 btrfs_free_path(path);
2107 return ret;
2108 }
2109
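/*
 * Process one delayed data ref: insert the extent item for a newly reserved
 * file extent, add a reference to an existing extent or drop one, depending
 * on the action recorded in the delayed ref node.
 */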
2110 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2111 struct btrfs_root *root,
2112 struct btrfs_delayed_ref_node *node,
2113 struct btrfs_delayed_extent_op *extent_op,
2114 int insert_reserved)
2115 {
2116 int ret = 0;
2117 struct btrfs_delayed_data_ref *ref;
2118 struct btrfs_key ins;
2119 u64 parent = 0;
2120 u64 ref_root = 0;
2121 u64 flags = 0;
2122
2123 ins.objectid = node->bytenr;
2124 ins.offset = node->num_bytes;
2125 ins.type = BTRFS_EXTENT_ITEM_KEY;
2126
2127 ref = btrfs_delayed_node_to_data_ref(node);
2128 trace_run_delayed_data_ref(node, ref, node->action);
2129
2130 if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2131 parent = ref->parent;
2132 ref_root = ref->root;
2133
2134 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2135 if (extent_op)
2136 flags |= extent_op->flags_to_set;
2137 ret = alloc_reserved_file_extent(trans, root,
2138 parent, ref_root, flags,
2139 ref->objectid, ref->offset,
2140 &ins, node->ref_mod);
2141 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2142 ret = __btrfs_inc_extent_ref(trans, root, node, parent,
2143 ref_root, ref->objectid,
2144 ref->offset, node->ref_mod,
2145 extent_op);
2146 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2147 ret = __btrfs_free_extent(trans, root, node, parent,
2148 ref_root, ref->objectid,
2149 ref->offset, node->ref_mod,
2150 extent_op);
2151 } else {
2152 BUG();
2153 }
2154 return ret;
2155 }
2156
2157 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2158 struct extent_buffer *leaf,
2159 struct btrfs_extent_item *ei)
2160 {
2161 u64 flags = btrfs_extent_flags(leaf, ei);
2162 if (extent_op->update_flags) {
2163 flags |= extent_op->flags_to_set;
2164 btrfs_set_extent_flags(leaf, ei, flags);
2165 }
2166
2167 if (extent_op->update_key) {
2168 struct btrfs_tree_block_info *bi;
2169 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2170 bi = (struct btrfs_tree_block_info *)(ei + 1);
2171 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2172 }
2173 }
2174
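/*
 * Apply a delayed extent op (flags and/or tree block key update) directly to
 * the extent item in the extent tree.
 */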
2175 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2176 struct btrfs_root *root,
2177 struct btrfs_delayed_ref_node *node,
2178 struct btrfs_delayed_extent_op *extent_op)
2179 {
2180 struct btrfs_key key;
2181 struct btrfs_path *path;
2182 struct btrfs_extent_item *ei;
2183 struct extent_buffer *leaf;
2184 u32 item_size;
2185 int ret;
2186 int err = 0;
2187 int metadata = !extent_op->is_data;
2188
2189 if (trans->aborted)
2190 return 0;
2191
2192 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2193 metadata = 0;
2194
2195 path = btrfs_alloc_path();
2196 if (!path)
2197 return -ENOMEM;
2198
2199 key.objectid = node->bytenr;
2200
2201 if (metadata) {
2202 key.type = BTRFS_METADATA_ITEM_KEY;
2203 key.offset = extent_op->level;
2204 } else {
2205 key.type = BTRFS_EXTENT_ITEM_KEY;
2206 key.offset = node->num_bytes;
2207 }
2208
2209 again:
2210 path->reada = 1;
2211 path->leave_spinning = 1;
2212 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2213 path, 0, 1);
2214 if (ret < 0) {
2215 err = ret;
2216 goto out;
2217 }
2218 if (ret > 0) {
2219 if (metadata) {
2220 if (path->slots[0] > 0) {
2221 path->slots[0]--;
2222 btrfs_item_key_to_cpu(path->nodes[0], &key,
2223 path->slots[0]);
2224 if (key.objectid == node->bytenr &&
2225 key.type == BTRFS_EXTENT_ITEM_KEY &&
2226 key.offset == node->num_bytes)
2227 ret = 0;
2228 }
2229 if (ret > 0) {
2230 btrfs_release_path(path);
2231 metadata = 0;
2232
2233 key.objectid = node->bytenr;
2234 key.offset = node->num_bytes;
2235 key.type = BTRFS_EXTENT_ITEM_KEY;
2236 goto again;
2237 }
2238 } else {
2239 err = -EIO;
2240 goto out;
2241 }
2242 }
2243
2244 leaf = path->nodes[0];
2245 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2246 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2247 if (item_size < sizeof(*ei)) {
2248 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2249 path, (u64)-1, 0);
2250 if (ret < 0) {
2251 err = ret;
2252 goto out;
2253 }
2254 leaf = path->nodes[0];
2255 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2256 }
2257 #endif
2258 BUG_ON(item_size < sizeof(*ei));
2259 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2260 __run_delayed_extent_op(extent_op, leaf, ei);
2261
2262 btrfs_mark_buffer_dirty(leaf);
2263 out:
2264 btrfs_free_path(path);
2265 return err;
2266 }
2267
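/*
 * Process one delayed tree block ref: insert the extent item for a newly
 * allocated tree block, add a reference to it or drop one, depending on the
 * action recorded in the delayed ref node.
 */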
2268 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2269 struct btrfs_root *root,
2270 struct btrfs_delayed_ref_node *node,
2271 struct btrfs_delayed_extent_op *extent_op,
2272 int insert_reserved)
2273 {
2274 int ret = 0;
2275 struct btrfs_delayed_tree_ref *ref;
2276 struct btrfs_key ins;
2277 u64 parent = 0;
2278 u64 ref_root = 0;
2279 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2280 SKINNY_METADATA);
2281
2282 ref = btrfs_delayed_node_to_tree_ref(node);
2283 trace_run_delayed_tree_ref(node, ref, node->action);
2284
2285 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2286 parent = ref->parent;
2287 ref_root = ref->root;
2288
2289 ins.objectid = node->bytenr;
2290 if (skinny_metadata) {
2291 ins.offset = ref->level;
2292 ins.type = BTRFS_METADATA_ITEM_KEY;
2293 } else {
2294 ins.offset = node->num_bytes;
2295 ins.type = BTRFS_EXTENT_ITEM_KEY;
2296 }
2297
2298 BUG_ON(node->ref_mod != 1);
2299 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2300 BUG_ON(!extent_op || !extent_op->update_flags);
2301 ret = alloc_reserved_tree_block(trans, root,
2302 parent, ref_root,
2303 extent_op->flags_to_set,
2304 &extent_op->key,
2305 ref->level, &ins,
2306 node->no_quota);
2307 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2308 ret = __btrfs_inc_extent_ref(trans, root, node,
2309 parent, ref_root,
2310 ref->level, 0, 1,
2311 extent_op);
2312 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2313 ret = __btrfs_free_extent(trans, root, node,
2314 parent, ref_root,
2315 ref->level, 0, 1, extent_op);
2316 } else {
2317 BUG();
2318 }
2319 return ret;
2320 }
2321
2322 /* helper function to actually process a single delayed ref entry */
2323 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2324 struct btrfs_root *root,
2325 struct btrfs_delayed_ref_node *node,
2326 struct btrfs_delayed_extent_op *extent_op,
2327 int insert_reserved)
2328 {
2329 int ret = 0;
2330
2331 if (trans->aborted) {
2332 if (insert_reserved)
2333 btrfs_pin_extent(root, node->bytenr,
2334 node->num_bytes, 1);
2335 return 0;
2336 }
2337
2338 if (btrfs_delayed_ref_is_head(node)) {
2339 struct btrfs_delayed_ref_head *head;
2340 /*
2341 * we've hit the end of the chain and we were supposed
2342 * to insert this extent into the tree. But, it got
2343 * deleted before we ever needed to insert it, so all
2344 * we have to do is clean up the accounting
2345 */
2346 BUG_ON(extent_op);
2347 head = btrfs_delayed_node_to_head(node);
2348 trace_run_delayed_ref_head(node, head, node->action);
2349
2350 if (insert_reserved) {
2351 btrfs_pin_extent(root, node->bytenr,
2352 node->num_bytes, 1);
2353 if (head->is_data) {
2354 ret = btrfs_del_csums(trans, root,
2355 node->bytenr,
2356 node->num_bytes);
2357 }
2358 }
2359 return ret;
2360 }
2361
2362 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2363 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2364 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2365 insert_reserved);
2366 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2367 node->type == BTRFS_SHARED_DATA_REF_KEY)
2368 ret = run_delayed_data_ref(trans, root, node, extent_op,
2369 insert_reserved);
2370 else
2371 BUG();
2372 return ret;
2373 }
2374
2375 static inline struct btrfs_delayed_ref_node *
2376 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2377 {
2378 struct btrfs_delayed_ref_node *ref;
2379
2380 if (list_empty(&head->ref_list))
2381 return NULL;
2382
2383 /*
2384 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2385 * This is to prevent a ref count from going down to zero, which deletes
2386 * the extent item from the extent tree, when there still are references
2387 * to add, which would fail because they would not find the extent item.
2388 */
2389 list_for_each_entry(ref, &head->ref_list, list) {
2390 if (ref->action == BTRFS_ADD_DELAYED_REF)
2391 return ref;
2392 }
2393
2394 return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
2395 list);
2396 }
2397
2398 /*
2399 * Returns 0 on success or if called with an already aborted transaction.
2400 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2401 */
2402 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2403 struct btrfs_root *root,
2404 unsigned long nr)
2405 {
2406 struct btrfs_delayed_ref_root *delayed_refs;
2407 struct btrfs_delayed_ref_node *ref;
2408 struct btrfs_delayed_ref_head *locked_ref = NULL;
2409 struct btrfs_delayed_extent_op *extent_op;
2410 struct btrfs_fs_info *fs_info = root->fs_info;
2411 ktime_t start = ktime_get();
2412 int ret;
2413 unsigned long count = 0;
2414 unsigned long actual_count = 0;
2415 int must_insert_reserved = 0;
2416
2417 delayed_refs = &trans->transaction->delayed_refs;
2418 while (1) {
2419 if (!locked_ref) {
2420 if (count >= nr)
2421 break;
2422
2423 spin_lock(&delayed_refs->lock);
2424 locked_ref = btrfs_select_ref_head(trans);
2425 if (!locked_ref) {
2426 spin_unlock(&delayed_refs->lock);
2427 break;
2428 }
2429
2430 /* grab the lock that says we are going to process
2431 * all the refs for this head */
2432 ret = btrfs_delayed_ref_lock(trans, locked_ref);
2433 spin_unlock(&delayed_refs->lock);
2434 /*
2435 * we may have dropped the spin lock to get the head
2436 * mutex lock, and that might have given someone else
2437 * time to free the head. If that's true, it has been
2438 * removed from our list and we can move on.
2439 */
2440 if (ret == -EAGAIN) {
2441 locked_ref = NULL;
2442 count++;
2443 continue;
2444 }
2445 }
2446
2447 spin_lock(&locked_ref->lock);
2448
2449 /*
2450 * locked_ref is the head node, so we have to go one
2451 * node back for any delayed ref updates
2452 */
2453 ref = select_delayed_ref(locked_ref);
2454
2455 if (ref && ref->seq &&
2456 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2457 spin_unlock(&locked_ref->lock);
2458 btrfs_delayed_ref_unlock(locked_ref);
2459 spin_lock(&delayed_refs->lock);
2460 locked_ref->processing = 0;
2461 delayed_refs->num_heads_ready++;
2462 spin_unlock(&delayed_refs->lock);
2463 locked_ref = NULL;
2464 cond_resched();
2465 count++;
2466 continue;
2467 }
2468
2469 /*
2470 * record the must insert reserved flag before we
2471 * drop the spin lock.
2472 */
2473 must_insert_reserved = locked_ref->must_insert_reserved;
2474 locked_ref->must_insert_reserved = 0;
2475
2476 extent_op = locked_ref->extent_op;
2477 locked_ref->extent_op = NULL;
2478
2479 if (!ref) {
2480
2481
2482 /* All delayed refs have been processed. Go ahead
2483 * and send the head node to run_one_delayed_ref,
2484 * so that any accounting fixes can happen
2485 */
2486 ref = &locked_ref->node;
2487
2488 if (extent_op && must_insert_reserved) {
2489 btrfs_free_delayed_extent_op(extent_op);
2490 extent_op = NULL;
2491 }
2492
2493 if (extent_op) {
2494 spin_unlock(&locked_ref->lock);
2495 ret = run_delayed_extent_op(trans, root,
2496 ref, extent_op);
2497 btrfs_free_delayed_extent_op(extent_op);
2498
2499 if (ret) {
2500 /*
2501 * Need to reset must_insert_reserved if
2502 * there was an error so the abort code
2503 * can clean up the reserved space
2504 * properly.
2505 */
2506 if (must_insert_reserved)
2507 locked_ref->must_insert_reserved = 1;
2508 locked_ref->processing = 0;
2509 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2510 btrfs_delayed_ref_unlock(locked_ref);
2511 return ret;
2512 }
2513 continue;
2514 }
2515
2516 /*
2517 * Need to drop our head ref lock and re-acquire the
2518 * delayed ref lock and then re-check to make sure
2519 * nobody got added.
2520 */
2521 spin_unlock(&locked_ref->lock);
2522 spin_lock(&delayed_refs->lock);
2523 spin_lock(&locked_ref->lock);
2524 if (!list_empty(&locked_ref->ref_list) ||
2525 locked_ref->extent_op) {
2526 spin_unlock(&locked_ref->lock);
2527 spin_unlock(&delayed_refs->lock);
2528 continue;
2529 }
2530 ref->in_tree = 0;
2531 delayed_refs->num_heads--;
2532 rb_erase(&locked_ref->href_node,
2533 &delayed_refs->href_root);
2534 spin_unlock(&delayed_refs->lock);
2535 } else {
2536 actual_count++;
2537 ref->in_tree = 0;
2538 list_del(&ref->list);
2539 }
2540 atomic_dec(&delayed_refs->num_entries);
2541
2542 if (!btrfs_delayed_ref_is_head(ref)) {
2543 /*
2544 * when we play the delayed ref, also correct the
2545 * ref_mod on head
2546 */
2547 switch (ref->action) {
2548 case BTRFS_ADD_DELAYED_REF:
2549 case BTRFS_ADD_DELAYED_EXTENT:
2550 locked_ref->node.ref_mod -= ref->ref_mod;
2551 break;
2552 case BTRFS_DROP_DELAYED_REF:
2553 locked_ref->node.ref_mod += ref->ref_mod;
2554 break;
2555 default:
2556 WARN_ON(1);
2557 }
2558 }
2559 spin_unlock(&locked_ref->lock);
2560
2561 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2562 must_insert_reserved);
2563
2564 btrfs_free_delayed_extent_op(extent_op);
2565 if (ret) {
2566 locked_ref->processing = 0;
2567 btrfs_delayed_ref_unlock(locked_ref);
2568 btrfs_put_delayed_ref(ref);
2569 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2570 return ret;
2571 }
2572
2573 /*
2574 * If this node is a head, that means all the refs in this head
2575 * have been dealt with, and we will pick the next head to deal
2576 * with, so we must unlock the head and drop it from the cluster
2577 * list before we release it.
2578 */
2579 if (btrfs_delayed_ref_is_head(ref)) {
2580 if (locked_ref->is_data &&
2581 locked_ref->total_ref_mod < 0) {
2582 spin_lock(&delayed_refs->lock);
2583 delayed_refs->pending_csums -= ref->num_bytes;
2584 spin_unlock(&delayed_refs->lock);
2585 }
2586 btrfs_delayed_ref_unlock(locked_ref);
2587 locked_ref = NULL;
2588 }
2589 btrfs_put_delayed_ref(ref);
2590 count++;
2591 cond_resched();
2592 }
2593
2594 /*
2595 * We don't want to include ref heads since we can have empty ref heads
2596 * and those will drastically skew our runtime down since we just do
2597 * accounting, no actual extent tree updates.
2598 */
2599 if (actual_count > 0) {
2600 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2601 u64 avg;
2602
2603 /*
2604 * We weigh the current average higher than our current runtime
2605 * to avoid large swings in the average.
2606 */
2607 spin_lock(&delayed_refs->lock);
2608 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2609 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
2610 spin_unlock(&delayed_refs->lock);
2611 }
2612 return 0;
2613 }
2614
2615 #ifdef SCRAMBLE_DELAYED_REFS
2616 /*
2617 * Normally delayed refs get processed in ascending bytenr order. This
2618 * correlates in most cases to the order added. To expose dependencies on this
2619 * order, we start to process the tree in the middle instead of the beginning
2620 */
2621 static u64 find_middle(struct rb_root *root)
2622 {
2623 struct rb_node *n = root->rb_node;
2624 struct btrfs_delayed_ref_node *entry;
2625 int alt = 1;
2626 u64 middle;
2627 u64 first = 0, last = 0;
2628
2629 n = rb_first(root);
2630 if (n) {
2631 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2632 first = entry->bytenr;
2633 }
2634 n = rb_last(root);
2635 if (n) {
2636 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2637 last = entry->bytenr;
2638 }
2639 n = root->rb_node;
2640
2641 while (n) {
2642 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2643 WARN_ON(!entry->in_tree);
2644
2645 middle = entry->bytenr;
2646
2647 if (alt)
2648 n = n->rb_left;
2649 else
2650 n = n->rb_right;
2651
2652 alt = 1 - alt;
2653 }
2654 return middle;
2655 }
2656 #endif
2657
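/*
 * Estimate how many extent tree leaves updating the given number of delayed
 * ref heads could touch.
 */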
2658 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2659 {
2660 u64 num_bytes;
2661
2662 num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2663 sizeof(struct btrfs_extent_inline_ref));
2664 if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2665 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2666
2667 /*
2668 * We don't ever fill up leaves all the way so multiply by 2 just to be
2669 * closer to what we're really going to want to use.
2670 */
2671 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2672 }
2673
2674 /*
2675 * Takes the number of bytes to be checksummed and figures out how many leaves it
2676 * would require to store the csums for that many bytes.
2677 */
2678 u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
2679 {
2680 u64 csum_size;
2681 u64 num_csums_per_leaf;
2682 u64 num_csums;
2683
2684 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
2685 num_csums_per_leaf = div64_u64(csum_size,
2686 (u64)btrfs_super_csum_size(root->fs_info->super_copy));
2687 num_csums = div64_u64(csum_bytes, root->sectorsize);
2688 num_csums += num_csums_per_leaf - 1;
2689 num_csums = div64_u64(num_csums, num_csums_per_leaf);
2690 return num_csums;
2691 }
2692
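/*
 * Returns 1 if the global block reserve does not look big enough to cover
 * the metadata updates the currently queued delayed refs may generate,
 * 0 otherwise.
 */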
2693 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2694 struct btrfs_root *root)
2695 {
2696 struct btrfs_block_rsv *global_rsv;
2697 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2698 u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2699 u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
2700 u64 num_bytes, num_dirty_bgs_bytes;
2701 int ret = 0;
2702
2703 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2704 num_heads = heads_to_leaves(root, num_heads);
2705 if (num_heads > 1)
2706 num_bytes += (num_heads - 1) * root->nodesize;
2707 num_bytes <<= 1;
2708 num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
2709 num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
2710 num_dirty_bgs);
2711 global_rsv = &root->fs_info->global_block_rsv;
2712
2713 /*
2714 * If we can't allocate any more chunks, let's make sure we have _lots_ of
2715 * wiggle room since running delayed refs can create more delayed refs.
2716 */
2717 if (global_rsv->space_info->full) {
2718 num_dirty_bgs_bytes <<= 1;
2719 num_bytes <<= 1;
2720 }
2721
2722 spin_lock(&global_rsv->lock);
2723 if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2724 ret = 1;
2725 spin_unlock(&global_rsv->lock);
2726 return ret;
2727 }
2728
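/*
 * Decide whether the caller should throttle and run delayed refs itself:
 * returns 1 or 2 when the queued refs represent roughly a second or half a
 * second of estimated work, otherwise falls back to the free space check.
 */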
2729 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2730 struct btrfs_root *root)
2731 {
2732 struct btrfs_fs_info *fs_info = root->fs_info;
2733 u64 num_entries =
2734 atomic_read(&trans->transaction->delayed_refs.num_entries);
2735 u64 avg_runtime;
2736 u64 val;
2737
2738 smp_mb();
2739 avg_runtime = fs_info->avg_delayed_ref_runtime;
2740 val = num_entries * avg_runtime;
2741 if (num_entries * avg_runtime >= NSEC_PER_SEC)
2742 return 1;
2743 if (val >= NSEC_PER_SEC / 2)
2744 return 2;
2745
2746 return btrfs_check_space_for_delayed_refs(trans, root);
2747 }
2748
2749 struct async_delayed_refs {
2750 struct btrfs_root *root;
2751 int count;
2752 int error;
2753 int sync;
2754 struct completion wait;
2755 struct btrfs_work work;
2756 };
2757
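/*
 * Worker callback for btrfs_async_run_delayed_refs(): join a transaction,
 * run the requested number of delayed refs and end the transaction again.
 */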
2758 static void delayed_ref_async_start(struct btrfs_work *work)
2759 {
2760 struct async_delayed_refs *async;
2761 struct btrfs_trans_handle *trans;
2762 int ret;
2763
2764 async = container_of(work, struct async_delayed_refs, work);
2765
2766 trans = btrfs_join_transaction(async->root);
2767 if (IS_ERR(trans)) {
2768 async->error = PTR_ERR(trans);
2769 goto done;
2770 }
2771
2772 /*
2773 * trans->sync means that when we call end_transaction, we won't
2774 * wait on delayed refs
2775 */
2776 trans->sync = true;
2777 ret = btrfs_run_delayed_refs(trans, async->root, async->count);
2778 if (ret)
2779 async->error = ret;
2780
2781 ret = btrfs_end_transaction(trans, async->root);
2782 if (ret && !async->error)
2783 async->error = ret;
2784 done:
2785 if (async->sync)
2786 complete(&async->wait);
2787 else
2788 kfree(async);
2789 }
2790
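/*
 * Kick off an async run of up to @count delayed refs.  If @wait is set,
 * block until the worker finishes and return its error code; otherwise the
 * worker frees the async struct itself when it is done.
 */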
2791 int btrfs_async_run_delayed_refs(struct btrfs_root *root,
2792 unsigned long count, int wait)
2793 {
2794 struct async_delayed_refs *async;
2795 int ret;
2796
2797 async = kmalloc(sizeof(*async), GFP_NOFS);
2798 if (!async)
2799 return -ENOMEM;
2800
2801 async->root = root->fs_info->tree_root;
2802 async->count = count;
2803 async->error = 0;
2804 if (wait)
2805 async->sync = 1;
2806 else
2807 async->sync = 0;
2808 init_completion(&async->wait);
2809
2810 btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2811 delayed_ref_async_start, NULL, NULL);
2812
2813 btrfs_queue_work(root->fs_info->extent_workers, &async->work);
2814
2815 if (wait) {
2816 wait_for_completion(&async->wait);
2817 ret = async->error;
2818 kfree(async);
2819 return ret;
2820 }
2821 return 0;
2822 }
2823
2824 /*
2825 * this starts processing the delayed reference count updates and
2826 * extent insertions we have queued up so far. count can be
2827 * 0, which means to process everything in the tree at the start
2828 * of the run (but not newly added entries), or it can be some target
2829 * number you'd like to process.
2830 *
2831 * Returns 0 on success or if called with an aborted transaction
2832 * Returns <0 on error and aborts the transaction
2833 */
2834 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2835 struct btrfs_root *root, unsigned long count)
2836 {
2837 struct rb_node *node;
2838 struct btrfs_delayed_ref_root *delayed_refs;
2839 struct btrfs_delayed_ref_head *head;
2840 int ret;
2841 int run_all = count == (unsigned long)-1;
2842 bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
2843
2844 /* We'll clean this up in btrfs_cleanup_transaction */
2845 if (trans->aborted)
2846 return 0;
2847
2848 if (root == root->fs_info->extent_root)
2849 root = root->fs_info->tree_root;
2850
2851 delayed_refs = &trans->transaction->delayed_refs;
2852 if (count == 0)
2853 count = atomic_read(&delayed_refs->num_entries) * 2;
2854
2855 again:
2856 #ifdef SCRAMBLE_DELAYED_REFS
2857 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2858 #endif
2859 trans->can_flush_pending_bgs = false;
2860 ret = __btrfs_run_delayed_refs(trans, root, count);
2861 if (ret < 0) {
2862 btrfs_abort_transaction(trans, root, ret);
2863 return ret;
2864 }
2865
2866 if (run_all) {
2867 if (!list_empty(&trans->new_bgs))
2868 btrfs_create_pending_block_groups(trans, root);
2869
2870 spin_lock(&delayed_refs->lock);
2871 node = rb_first(&delayed_refs->href_root);
2872 if (!node) {
2873 spin_unlock(&delayed_refs->lock);
2874 goto out;
2875 }
2876 count = (unsigned long)-1;
2877
2878 while (node) {
2879 head = rb_entry(node, struct btrfs_delayed_ref_head,
2880 href_node);
2881 if (btrfs_delayed_ref_is_head(&head->node)) {
2882 struct btrfs_delayed_ref_node *ref;
2883
2884 ref = &head->node;
2885 atomic_inc(&ref->refs);
2886
2887 spin_unlock(&delayed_refs->lock);
2888 /*
2889 * Mutex was contended, block until it's
2890 * released and try again
2891 */
2892 mutex_lock(&head->mutex);
2893 mutex_unlock(&head->mutex);
2894
2895 btrfs_put_delayed_ref(ref);
2896 cond_resched();
2897 goto again;
2898 } else {
2899 WARN_ON(1);
2900 }
2901 node = rb_next(node);
2902 }
2903 spin_unlock(&delayed_refs->lock);
2904 cond_resched();
2905 goto again;
2906 }
2907 out:
2908 assert_qgroups_uptodate(trans);
2909 trans->can_flush_pending_bgs = can_flush_pending_bgs;
2910 return 0;
2911 }
2912
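/*
 * Queue a delayed extent op that sets the given flags on the extent item
 * the next time the delayed refs for this extent are run.
 */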
2913 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2914 struct btrfs_root *root,
2915 u64 bytenr, u64 num_bytes, u64 flags,
2916 int level, int is_data)
2917 {
2918 struct btrfs_delayed_extent_op *extent_op;
2919 int ret;
2920
2921 extent_op = btrfs_alloc_delayed_extent_op();
2922 if (!extent_op)
2923 return -ENOMEM;
2924
2925 extent_op->flags_to_set = flags;
2926 extent_op->update_flags = 1;
2927 extent_op->update_key = 0;
2928 extent_op->is_data = is_data ? 1 : 0;
2929 extent_op->level = level;
2930
2931 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2932 num_bytes, extent_op);
2933 if (ret)
2934 btrfs_free_delayed_extent_op(extent_op);
2935 return ret;
2936 }
2937
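/*
 * Check the queued delayed refs for references to this extent from a
 * different root, inode or offset.  Returns 1 if a cross reference exists,
 * 0 if it does not and -EAGAIN if the head mutex was contended and the
 * caller should retry.
 */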
2938 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2939 struct btrfs_root *root,
2940 struct btrfs_path *path,
2941 u64 objectid, u64 offset, u64 bytenr)
2942 {
2943 struct btrfs_delayed_ref_head *head;
2944 struct btrfs_delayed_ref_node *ref;
2945 struct btrfs_delayed_data_ref *data_ref;
2946 struct btrfs_delayed_ref_root *delayed_refs;
2947 int ret = 0;
2948
2949 delayed_refs = &trans->transaction->delayed_refs;
2950 spin_lock(&delayed_refs->lock);
2951 head = btrfs_find_delayed_ref_head(trans, bytenr);
2952 if (!head) {
2953 spin_unlock(&delayed_refs->lock);
2954 return 0;
2955 }
2956
2957 if (!mutex_trylock(&head->mutex)) {
2958 atomic_inc(&head->node.refs);
2959 spin_unlock(&delayed_refs->lock);
2960
2961 btrfs_release_path(path);
2962
2963 /*
2964 * Mutex was contended, block until it's released and let
2965 * caller try again
2966 */
2967 mutex_lock(&head->mutex);
2968 mutex_unlock(&head->mutex);
2969 btrfs_put_delayed_ref(&head->node);
2970 return -EAGAIN;
2971 }
2972 spin_unlock(&delayed_refs->lock);
2973
2974 spin_lock(&head->lock);
2975 list_for_each_entry(ref, &head->ref_list, list) {
2976 /* If it's a shared ref we know a cross reference exists */
2977 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2978 ret = 1;
2979 break;
2980 }
2981
2982 data_ref = btrfs_delayed_node_to_data_ref(ref);
2983
2984 /*
2985 * If our ref doesn't match the one we're currently looking at
2986 * then we have a cross reference.
2987 */
2988 if (data_ref->root != root->root_key.objectid ||
2989 data_ref->objectid != objectid ||
2990 data_ref->offset != offset) {
2991 ret = 1;
2992 break;
2993 }
2994 }
2995 spin_unlock(&head->lock);
2996 mutex_unlock(&head->mutex);
2997 return ret;
2998 }
2999
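/*
 * Check the committed extent tree for references to this extent other than
 * a single inline data ref owned by the given root/inode/offset.  Returns 0
 * when no cross reference can exist, 1 when one may, and -ENOENT if the
 * extent item was not found in the expected form.
 */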
3000 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
3001 struct btrfs_root *root,
3002 struct btrfs_path *path,
3003 u64 objectid, u64 offset, u64 bytenr)
3004 {
3005 struct btrfs_root *extent_root = root->fs_info->extent_root;
3006 struct extent_buffer *leaf;
3007 struct btrfs_extent_data_ref *ref;
3008 struct btrfs_extent_inline_ref *iref;
3009 struct btrfs_extent_item *ei;
3010 struct btrfs_key key;
3011 u32 item_size;
3012 int ret;
3013
3014 key.objectid = bytenr;
3015 key.offset = (u64)-1;
3016 key.type = BTRFS_EXTENT_ITEM_KEY;
3017
3018 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3019 if (ret < 0)
3020 goto out;
3021 BUG_ON(ret == 0); /* Corruption */
3022
3023 ret = -ENOENT;
3024 if (path->slots[0] == 0)
3025 goto out;
3026
3027 path->slots[0]--;
3028 leaf = path->nodes[0];
3029 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3030
3031 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3032 goto out;
3033
3034 ret = 1;
3035 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3036 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3037 if (item_size < sizeof(*ei)) {
3038 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3039 goto out;
3040 }
3041 #endif
3042 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3043
3044 if (item_size != sizeof(*ei) +
3045 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3046 goto out;
3047
3048 if (btrfs_extent_generation(leaf, ei) <=
3049 btrfs_root_last_snapshot(&root->root_item))
3050 goto out;
3051
3052 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3053 if (btrfs_extent_inline_ref_type(leaf, iref) !=
3054 BTRFS_EXTENT_DATA_REF_KEY)
3055 goto out;
3056
3057 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3058 if (btrfs_extent_refs(leaf, ei) !=
3059 btrfs_extent_data_ref_count(leaf, ref) ||
3060 btrfs_extent_data_ref_root(leaf, ref) !=
3061 root->root_key.objectid ||
3062 btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3063 btrfs_extent_data_ref_offset(leaf, ref) != offset)
3064 goto out;
3065
3066 ret = 0;
3067 out:
3068 return ret;
3069 }
3070
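/*
 * Returns 0 when the data extent is referenced only by the given
 * root/inode/offset, and non-zero (1 or a negative error) when a cross
 * reference exists or the checks could not be completed.
 */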
3071 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
3072 struct btrfs_root *root,
3073 u64 objectid, u64 offset, u64 bytenr)
3074 {
3075 struct btrfs_path *path;
3076 int ret;
3077 int ret2;
3078
3079 path = btrfs_alloc_path();
3080 if (!path)
3081 return -ENOENT;
3082
3083 do {
3084 ret = check_committed_ref(trans, root, path, objectid,
3085 offset, bytenr);
3086 if (ret && ret != -ENOENT)
3087 goto out;
3088
3089 ret2 = check_delayed_ref(trans, root, path, objectid,
3090 offset, bytenr);
3091 } while (ret2 == -EAGAIN);
3092
3093 if (ret2 && ret2 != -ENOENT) {
3094 ret = ret2;
3095 goto out;
3096 }
3097
3098 if (ret != -ENOENT || ret2 != -ENOENT)
3099 ret = 0;
3100 out:
3101 btrfs_free_path(path);
3102 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3103 WARN_ON(ret > 0);
3104 return ret;
3105 }
3106
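/*
 * Walk every pointer in @buf and add (inc == 1) or drop (inc == 0) one
 * reference for each extent it points to: file extents when @buf is a leaf,
 * child tree blocks when it is a node.
 */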
3107 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3108 struct btrfs_root *root,
3109 struct extent_buffer *buf,
3110 int full_backref, int inc)
3111 {
3112 u64 bytenr;
3113 u64 num_bytes;
3114 u64 parent;
3115 u64 ref_root;
3116 u32 nritems;
3117 struct btrfs_key key;
3118 struct btrfs_file_extent_item *fi;
3119 int i;
3120 int level;
3121 int ret = 0;
3122 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
3123 u64, u64, u64, u64, u64, u64, int);
3124
3125
3126 if (btrfs_test_is_dummy_root(root))
3127 return 0;
3128
3129 ref_root = btrfs_header_owner(buf);
3130 nritems = btrfs_header_nritems(buf);
3131 level = btrfs_header_level(buf);
3132
3133 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3134 return 0;
3135
3136 if (inc)
3137 process_func = btrfs_inc_extent_ref;
3138 else
3139 process_func = btrfs_free_extent;
3140
3141 if (full_backref)
3142 parent = buf->start;
3143 else
3144 parent = 0;
3145
3146 for (i = 0; i < nritems; i++) {
3147 if (level == 0) {
3148 btrfs_item_key_to_cpu(buf, &key, i);
3149 if (key.type != BTRFS_EXTENT_DATA_KEY)
3150 continue;
3151 fi = btrfs_item_ptr(buf, i,
3152 struct btrfs_file_extent_item);
3153 if (btrfs_file_extent_type(buf, fi) ==
3154 BTRFS_FILE_EXTENT_INLINE)
3155 continue;
3156 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3157 if (bytenr == 0)
3158 continue;
3159
3160 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3161 key.offset -= btrfs_file_extent_offset(buf, fi);
3162 ret = process_func(trans, root, bytenr, num_bytes,
3163 parent, ref_root, key.objectid,
3164 key.offset, 1);
3165 if (ret)
3166 goto fail;
3167 } else {
3168 bytenr = btrfs_node_blockptr(buf, i);
3169 num_bytes = root->nodesize;
3170 ret = process_func(trans, root, bytenr, num_bytes,
3171 parent, ref_root, level - 1, 0,
3172 1);
3173 if (ret)
3174 goto fail;
3175 }
3176 }
3177 return 0;
3178 fail:
3179 return ret;
3180 }
3181
3182 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3183 struct extent_buffer *buf, int full_backref)
3184 {
3185 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3186 }
3187
3188 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3189 struct extent_buffer *buf, int full_backref)
3190 {
3191 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3192 }
3193
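/*
 * Write the in-memory block group item back to its slot in the extent tree.
 * Returns -ENOENT if the item is not there (yet).
 */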
3194 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3195 struct btrfs_root *root,
3196 struct btrfs_path *path,
3197 struct btrfs_block_group_cache *cache)
3198 {
3199 int ret;
3200 struct btrfs_root *extent_root = root->fs_info->extent_root;
3201 unsigned long bi;
3202 struct extent_buffer *leaf;
3203
3204 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3205 if (ret) {
3206 if (ret > 0)
3207 ret = -ENOENT;
3208 goto fail;
3209 }
3210
3211 leaf = path->nodes[0];
3212 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3213 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3214 btrfs_mark_buffer_dirty(leaf);
3215 fail:
3216 btrfs_release_path(path);
3217 return ret;
3218
3219 }
3220
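/*
 * Return the block group that follows @cache in bytenr order, dropping the
 * reference on @cache.  Falls back to a fresh lookup if @cache was removed
 * from the rbtree in the meantime.
 */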
3221 static struct btrfs_block_group_cache *
3222 next_block_group(struct btrfs_root *root,
3223 struct btrfs_block_group_cache *cache)
3224 {
3225 struct rb_node *node;
3226
3227 spin_lock(&root->fs_info->block_group_cache_lock);
3228
3229 /* If our block group was removed, we need a full search. */
3230 if (RB_EMPTY_NODE(&cache->cache_node)) {
3231 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3232
3233 spin_unlock(&root->fs_info->block_group_cache_lock);
3234 btrfs_put_block_group(cache);
3235 cache = btrfs_lookup_first_block_group(root->fs_info,
3236 next_bytenr);
3237 return cache;
3238 }
3239 node = rb_next(&cache->cache_node);
3240 btrfs_put_block_group(cache);
3241 if (node) {
3242 cache = rb_entry(node, struct btrfs_block_group_cache,
3243 cache_node);
3244 btrfs_get_block_group(cache);
3245 } else
3246 cache = NULL;
3247 spin_unlock(&root->fs_info->block_group_cache_lock);
3248 return cache;
3249 }
3250
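/*
 * Get the free space cache inode of a block group ready for this
 * transaction: create or truncate it as needed and preallocate space for
 * the cache, recording the outcome in block_group->disk_cache_state.
 */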
3251 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3252 struct btrfs_trans_handle *trans,
3253 struct btrfs_path *path)
3254 {
3255 struct btrfs_root *root = block_group->fs_info->tree_root;
3256 struct inode *inode = NULL;
3257 u64 alloc_hint = 0;
3258 int dcs = BTRFS_DC_ERROR;
3259 u64 num_pages = 0;
3260 int retries = 0;
3261 int ret = 0;
3262
3263 /*
3264 * If this block group is smaller than 100 megs don't bother caching the
3265 * block group.
3266 */
3267 if (block_group->key.offset < (100 * 1024 * 1024)) {
3268 spin_lock(&block_group->lock);
3269 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3270 spin_unlock(&block_group->lock);
3271 return 0;
3272 }
3273
3274 if (trans->aborted)
3275 return 0;
3276 again:
3277 inode = lookup_free_space_inode(root, block_group, path);
3278 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3279 ret = PTR_ERR(inode);
3280 btrfs_release_path(path);
3281 goto out;
3282 }
3283
3284 if (IS_ERR(inode)) {
3285 BUG_ON(retries);
3286 retries++;
3287
3288 if (block_group->ro)
3289 goto out_free;
3290
3291 ret = create_free_space_inode(root, trans, block_group, path);
3292 if (ret)
3293 goto out_free;
3294 goto again;
3295 }
3296
3297 /* We've already set up this transaction, go ahead and exit */
3298 if (block_group->cache_generation == trans->transid &&
3299 i_size_read(inode)) {
3300 dcs = BTRFS_DC_SETUP;
3301 goto out_put;
3302 }
3303
3304 /*
3305 * We want to set the generation to 0, so that if anything goes wrong
3306 * from here on out we know not to trust this cache when we load it up next
3307 * time.
3308 */
3309 BTRFS_I(inode)->generation = 0;
3310 ret = btrfs_update_inode(trans, root, inode);
3311 if (ret) {
3312 /*
3313 * So theoretically we could recover from this, simply set the
3314 * super cache generation to 0 so we know to invalidate the
3315 * cache, but then we'd have to keep track of the block groups
3316 * that fail this way so we know we _have_ to reset this cache
3317 * before the next commit or risk reading stale cache. So to
3318 * limit our exposure to horrible edge cases, let's just abort the
3319 * transaction; this only happens in really bad situations
3320 * anyway.
3321 */
3322 btrfs_abort_transaction(trans, root, ret);
3323 goto out_put;
3324 }
3325 WARN_ON(ret);
3326
3327 if (i_size_read(inode) > 0) {
3328 ret = btrfs_check_trunc_cache_free_space(root,
3329 &root->fs_info->global_block_rsv);
3330 if (ret)
3331 goto out_put;
3332
3333 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
3334 if (ret)
3335 goto out_put;
3336 }
3337
3338 spin_lock(&block_group->lock);
3339 if (block_group->cached != BTRFS_CACHE_FINISHED ||
3340 !btrfs_test_opt(root, SPACE_CACHE)) {
3341 /*
3342 * don't bother trying to write stuff out _if_
3343 * a) we're not cached,
3344 * b) we were mounted with the nospace_cache option.
3345 */
3346 dcs = BTRFS_DC_WRITTEN;
3347 spin_unlock(&block_group->lock);
3348 goto out_put;
3349 }
3350 spin_unlock(&block_group->lock);
3351
3352 /*
3353 * Try to preallocate enough space based on how big the block group is.
3354 * Keep in mind this has to include any pinned space which could end up
3355 * taking up quite a bit since it's not folded into the other space
3356 * cache.
3357 */
3358 num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
3359 if (!num_pages)
3360 num_pages = 1;
3361
3362 num_pages *= 16;
3363 num_pages *= PAGE_CACHE_SIZE;
3364
3365 ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
3366 if (ret)
3367 goto out_put;
3368
3369 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3370 num_pages, num_pages,
3371 &alloc_hint);
3372 if (!ret)
3373 dcs = BTRFS_DC_SETUP;
3374 btrfs_free_reserved_data_space(inode, num_pages);
3375
3376 out_put:
3377 iput(inode);
3378 out_free:
3379 btrfs_release_path(path);
3380 out:
3381 spin_lock(&block_group->lock);
3382 if (!ret && dcs == BTRFS_DC_SETUP)
3383 block_group->cache_generation = trans->transid;
3384 block_group->disk_cache_state = dcs;
3385 spin_unlock(&block_group->lock);
3386
3387 return ret;
3388 }
3389
3390 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3391 struct btrfs_root *root)
3392 {
3393 struct btrfs_block_group_cache *cache, *tmp;
3394 struct btrfs_transaction *cur_trans = trans->transaction;
3395 struct btrfs_path *path;
3396
3397 if (list_empty(&cur_trans->dirty_bgs) ||
3398 !btrfs_test_opt(root, SPACE_CACHE))
3399 return 0;
3400
3401 path = btrfs_alloc_path();
3402 if (!path)
3403 return -ENOMEM;
3404
3405 /* Could add new block groups, use _safe just in case */
3406 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3407 dirty_list) {
3408 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3409 cache_save_setup(cache, trans, path);
3410 }
3411
3412 btrfs_free_path(path);
3413 return 0;
3414 }
3415
3416 /*
3417 * transaction commit does final block group cache writeback during a
3418 * critical section where nothing is allowed to change the FS. This is
3419 * required in order for the cache to actually match the block group,
3420 * but can introduce a lot of latency into the commit.
3421 *
3422 * So, btrfs_start_dirty_block_groups is here to kick off block group
3423 * cache IO. There's a chance we'll have to redo some of it if the
3424 * block group changes again during the commit, but it greatly reduces
3425 * the commit latency by getting rid of the easy block groups while
3426 * we're still allowing others to join the commit.
3427 */
3428 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
3429 struct btrfs_root *root)
3430 {
3431 struct btrfs_block_group_cache *cache;
3432 struct btrfs_transaction *cur_trans = trans->transaction;
3433 int ret = 0;
3434 int should_put;
3435 struct btrfs_path *path = NULL;
3436 LIST_HEAD(dirty);
3437 struct list_head *io = &cur_trans->io_bgs;
3438 int num_started = 0;
3439 int loops = 0;
3440
3441 spin_lock(&cur_trans->dirty_bgs_lock);
3442 if (list_empty(&cur_trans->dirty_bgs)) {
3443 spin_unlock(&cur_trans->dirty_bgs_lock);
3444 return 0;
3445 }
3446 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3447 spin_unlock(&cur_trans->dirty_bgs_lock);
3448
3449 again:
3450 /*
3451 * make sure all the block groups on our dirty list actually
3452 * exist
3453 */
3454 btrfs_create_pending_block_groups(trans, root);
3455
3456 if (!path) {
3457 path = btrfs_alloc_path();
3458 if (!path)
3459 return -ENOMEM;
3460 }
3461
3462 /*
3463 * cache_write_mutex is here only to save us from balance or automatic
3464 * removal of empty block groups deleting this block group while we are
3465 * writing out the cache
3466 */
3467 mutex_lock(&trans->transaction->cache_write_mutex);
3468 while (!list_empty(&dirty)) {
3469 cache = list_first_entry(&dirty,
3470 struct btrfs_block_group_cache,
3471 dirty_list);
3472 /*
3473 * this can happen if something re-dirties a block
3474 * group that is already under IO. Just wait for it to
3475 * finish and then do it all again
3476 */
3477 if (!list_empty(&cache->io_list)) {
3478 list_del_init(&cache->io_list);
3479 btrfs_wait_cache_io(root, trans, cache,
3480 &cache->io_ctl, path,
3481 cache->key.objectid);
3482 btrfs_put_block_group(cache);
3483 }
3484
3485
3486 /*
3487 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3488 * if it should update the cache_state. Don't delete
3489 * until after we wait.
3490 *
3491 * Since we're not running in the commit critical section
3492 * we need the dirty_bgs_lock to protect from update_block_group
3493 */
3494 spin_lock(&cur_trans->dirty_bgs_lock);
3495 list_del_init(&cache->dirty_list);
3496 spin_unlock(&cur_trans->dirty_bgs_lock);
3497
3498 should_put = 1;
3499
3500 cache_save_setup(cache, trans, path);
3501
3502 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3503 cache->io_ctl.inode = NULL;
3504 ret = btrfs_write_out_cache(root, trans, cache, path);
3505 if (ret == 0 && cache->io_ctl.inode) {
3506 num_started++;
3507 should_put = 0;
3508
3509 /*
3510 * the cache_write_mutex is protecting
3511 * the io_list
3512 */
3513 list_add_tail(&cache->io_list, io);
3514 } else {
3515 /*
3516 * if we failed to write the cache, the
3517 * generation will be bad and life goes on
3518 */
3519 ret = 0;
3520 }
3521 }
3522 if (!ret) {
3523 ret = write_one_cache_group(trans, root, path, cache);
3524 /*
3525 * Our block group might still be attached to the list
3526 * of new block groups in the transaction handle of some
3527 * other task (struct btrfs_trans_handle->new_bgs). This
3528 * means its block group item isn't yet in the extent
3529 * tree. If this happens ignore the error, as we will
3530 * try again later in the critical section of the
3531 * transaction commit.
3532 */
3533 if (ret == -ENOENT) {
3534 ret = 0;
3535 spin_lock(&cur_trans->dirty_bgs_lock);
3536 if (list_empty(&cache->dirty_list)) {
3537 list_add_tail(&cache->dirty_list,
3538 &cur_trans->dirty_bgs);
3539 btrfs_get_block_group(cache);
3540 }
3541 spin_unlock(&cur_trans->dirty_bgs_lock);
3542 } else if (ret) {
3543 btrfs_abort_transaction(trans, root, ret);
3544 }
3545 }
3546
3547 /* if it's not on the io list, we need to put the block group */
3548 if (should_put)
3549 btrfs_put_block_group(cache);
3550
3551 if (ret)
3552 break;
3553
3554 /*
3555 * Avoid blocking other tasks for too long. It might even save
3556 * us from writing caches for block groups that are going to be
3557 * removed.
3558 */
3559 mutex_unlock(&trans->transaction->cache_write_mutex);
3560 mutex_lock(&trans->transaction->cache_write_mutex);
3561 }
3562 mutex_unlock(&trans->transaction->cache_write_mutex);
3563
3564 /*
3565 * go through delayed refs for all the stuff we've just kicked off
3566 * and then loop back (just once)
3567 */
3568 ret = btrfs_run_delayed_refs(trans, root, 0);
3569 if (!ret && loops == 0) {
3570 loops++;
3571 spin_lock(&cur_trans->dirty_bgs_lock);
3572 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3573 /*
3574 * dirty_bgs_lock protects us from concurrent block group
3575 * deletes too (not just cache_write_mutex).
3576 */
3577 if (!list_empty(&dirty)) {
3578 spin_unlock(&cur_trans->dirty_bgs_lock);
3579 goto again;
3580 }
3581 spin_unlock(&cur_trans->dirty_bgs_lock);
3582 }
3583
3584 btrfs_free_path(path);
3585 return ret;
3586 }
3587
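/*
 * Write out block group items and free space caches for all remaining dirty
 * block groups during the transaction commit critical section, then wait
 * for the cache IO to finish.
 */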
3588 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3589 struct btrfs_root *root)
3590 {
3591 struct btrfs_block_group_cache *cache;
3592 struct btrfs_transaction *cur_trans = trans->transaction;
3593 int ret = 0;
3594 int should_put;
3595 struct btrfs_path *path;
3596 struct list_head *io = &cur_trans->io_bgs;
3597 int num_started = 0;
3598
3599 path = btrfs_alloc_path();
3600 if (!path)
3601 return -ENOMEM;
3602
3603 /*
3604 * We don't need the lock here since we are protected by the transaction
3605 * commit. We want to do the cache_save_setup first and then run the
3606 * delayed refs to make sure we have the best chance at doing this all
3607 * in one shot.
3608 */
3609 while (!list_empty(&cur_trans->dirty_bgs)) {
3610 cache = list_first_entry(&cur_trans->dirty_bgs,
3611 struct btrfs_block_group_cache,
3612 dirty_list);
3613
3614 /*
3615 * this can happen if cache_save_setup re-dirties a block
3616 * group that is already under IO. Just wait for it to
3617 * finish and then do it all again
3618 */
3619 if (!list_empty(&cache->io_list)) {
3620 list_del_init(&cache->io_list);
3621 btrfs_wait_cache_io(root, trans, cache,
3622 &cache->io_ctl, path,
3623 cache->key.objectid);
3624 btrfs_put_block_group(cache);
3625 }
3626
3627 /*
3628 * don't remove from the dirty list until after we've waited
3629 * on any pending IO
3630 */
3631 list_del_init(&cache->dirty_list);
3632 should_put = 1;
3633
3634 cache_save_setup(cache, trans, path);
3635
3636 if (!ret)
3637 ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
3638
3639 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3640 cache->io_ctl.inode = NULL;
3641 ret = btrfs_write_out_cache(root, trans, cache, path);
3642 if (ret == 0 && cache->io_ctl.inode) {
3643 num_started++;
3644 should_put = 0;
3645 list_add_tail(&cache->io_list, io);
3646 } else {
3647 /*
3648 * if we failed to write the cache, the
3649 * generation will be bad and life goes on
3650 */
3651 ret = 0;
3652 }
3653 }
3654 if (!ret) {
3655 ret = write_one_cache_group(trans, root, path, cache);
3656 if (ret)
3657 btrfs_abort_transaction(trans, root, ret);
3658 }
3659
3660 /* if it's not on the io list, we need to put the block group */
3661 if (should_put)
3662 btrfs_put_block_group(cache);
3663 }
3664
3665 while (!list_empty(io)) {
3666 cache = list_first_entry(io, struct btrfs_block_group_cache,
3667 io_list);
3668 list_del_init(&cache->io_list);
3669 btrfs_wait_cache_io(root, trans, cache,
3670 &cache->io_ctl, path, cache->key.objectid);
3671 btrfs_put_block_group(cache);
3672 }
3673
3674 btrfs_free_path(path);
3675 return ret;
3676 }
3677
3678 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3679 {
3680 struct btrfs_block_group_cache *block_group;
3681 int readonly = 0;
3682
3683 block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3684 if (!block_group || block_group->ro)
3685 readonly = 1;
3686 if (block_group)
3687 btrfs_put_block_group(block_group);
3688 return readonly;
3689 }
3690
3691 static const char *alloc_name(u64 flags)
3692 {
3693 switch (flags) {
3694 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3695 return "mixed";
3696 case BTRFS_BLOCK_GROUP_METADATA:
3697 return "metadata";
3698 case BTRFS_BLOCK_GROUP_DATA:
3699 return "data";
3700 case BTRFS_BLOCK_GROUP_SYSTEM:
3701 return "system";
3702 default:
3703 WARN_ON(1);
3704 return "invalid-combination";
3705 }
3706 }
3707
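/*
 * Account @total_bytes and @bytes_used against the space_info for @flags,
 * creating and registering a new space_info (including its sysfs kobject)
 * if this is the first time we see this combination of flags.
 */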
3708 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3709 u64 total_bytes, u64 bytes_used,
3710 struct btrfs_space_info **space_info)
3711 {
3712 struct btrfs_space_info *found;
3713 int i;
3714 int factor;
3715 int ret;
3716
3717 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3718 BTRFS_BLOCK_GROUP_RAID10))
3719 factor = 2;
3720 else
3721 factor = 1;
3722
3723 found = __find_space_info(info, flags);
3724 if (found) {
3725 spin_lock(&found->lock);
3726 found->total_bytes += total_bytes;
3727 found->disk_total += total_bytes * factor;
3728 found->bytes_used += bytes_used;
3729 found->disk_used += bytes_used * factor;
3730 if (total_bytes > 0)
3731 found->full = 0;
3732 spin_unlock(&found->lock);
3733 *space_info = found;
3734 return 0;
3735 }
3736 found = kzalloc(sizeof(*found), GFP_NOFS);
3737 if (!found)
3738 return -ENOMEM;
3739
3740 ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
3741 if (ret) {
3742 kfree(found);
3743 return ret;
3744 }
3745
3746 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3747 INIT_LIST_HEAD(&found->block_groups[i]);
3748 init_rwsem(&found->groups_sem);
3749 spin_lock_init(&found->lock);
3750 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3751 found->total_bytes = total_bytes;
3752 found->disk_total = total_bytes * factor;
3753 found->bytes_used = bytes_used;
3754 found->disk_used = bytes_used * factor;
3755 found->bytes_pinned = 0;
3756 found->bytes_reserved = 0;
3757 found->bytes_readonly = 0;
3758 found->bytes_may_use = 0;
3759 found->full = 0;
3760 found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3761 found->chunk_alloc = 0;
3762 found->flush = 0;
3763 init_waitqueue_head(&found->wait);
3764 INIT_LIST_HEAD(&found->ro_bgs);
3765
3766 ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3767 info->space_info_kobj, "%s",
3768 alloc_name(found->flags));
3769 if (ret) {
3770 kfree(found);
3771 return ret;
3772 }
3773
3774 *space_info = found;
3775 list_add_rcu(&found->list, &info->space_info);
3776 if (flags & BTRFS_BLOCK_GROUP_DATA)
3777 info->data_sinfo = found;
3778
3779 return ret;
3780 }
3781
3782 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3783 {
3784 u64 extra_flags = chunk_to_extended(flags) &
3785 BTRFS_EXTENDED_PROFILE_MASK;
3786
3787 write_seqlock(&fs_info->profiles_lock);
3788 if (flags & BTRFS_BLOCK_GROUP_DATA)
3789 fs_info->avail_data_alloc_bits |= extra_flags;
3790 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3791 fs_info->avail_metadata_alloc_bits |= extra_flags;
3792 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3793 fs_info->avail_system_alloc_bits |= extra_flags;
3794 write_sequnlock(&fs_info->profiles_lock);
3795 }
3796
3797 /*
3798 * returns target flags in extended format or 0 if restripe for this
3799 * chunk_type is not in progress
3800 *
3801 * should be called with either volume_mutex or balance_lock held
3802 */
3803 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3804 {
3805 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3806 u64 target = 0;
3807
3808 if (!bctl)
3809 return 0;
3810
3811 if (flags & BTRFS_BLOCK_GROUP_DATA &&
3812 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3813 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3814 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3815 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3816 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3817 } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3818 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3819 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3820 }
3821
3822 return target;
3823 }
3824
3825 /*
3826 * @flags: available profiles in extended format (see ctree.h)
3827 *
3828 * Returns reduced profile in chunk format. If profile changing is in
3829 * progress (either running or paused) picks the target profile (if it's
3830 * already available), otherwise falls back to plain reducing.
3831 */
3832 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3833 {
3834 u64 num_devices = root->fs_info->fs_devices->rw_devices;
3835 u64 target;
3836 u64 tmp;
3837
3838 /*
3839 * see if restripe for this chunk_type is in progress; if so,
3840 * try to reduce to the target profile
3841 */
3842 spin_lock(&root->fs_info->balance_lock);
3843 target = get_restripe_target(root->fs_info, flags);
3844 if (target) {
3845 /* pick target profile only if it's already available */
3846 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3847 spin_unlock(&root->fs_info->balance_lock);
3848 return extended_to_chunk(target);
3849 }
3850 }
3851 spin_unlock(&root->fs_info->balance_lock);
3852
3853 /* First, mask out the RAID levels which aren't possible */
3854 if (num_devices == 1)
3855 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3856 BTRFS_BLOCK_GROUP_RAID5);
3857 if (num_devices < 3)
3858 flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3859 if (num_devices < 4)
3860 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3861
3862 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3863 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3864 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3865 flags &= ~tmp;
3866
3867 if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3868 tmp = BTRFS_BLOCK_GROUP_RAID6;
3869 else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3870 tmp = BTRFS_BLOCK_GROUP_RAID5;
3871 else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3872 tmp = BTRFS_BLOCK_GROUP_RAID10;
3873 else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3874 tmp = BTRFS_BLOCK_GROUP_RAID1;
3875 else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3876 tmp = BTRFS_BLOCK_GROUP_RAID0;
3877
3878 return extended_to_chunk(flags | tmp);
3879 }
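
/*
 * Illustrative example (values assumed): on a filesystem with two rw
 * devices, a metadata profile mask of DUP|RAID1|RAID10 is reduced as
 * follows -- RAID10 is masked out because it needs at least four devices,
 * and RAID1 is then preferred over DUP by the if/else chain above:
 *
 *	flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DUP |
 *		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 *	btrfs_reduce_alloc_profile(root, flags)
 *		== extended_to_chunk(BTRFS_BLOCK_GROUP_METADATA |
 *				     BTRFS_BLOCK_GROUP_RAID1);
 */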
3880
3881 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
3882 {
3883 unsigned seq;
3884 u64 flags;
3885
3886 do {
3887 flags = orig_flags;
3888 seq = read_seqbegin(&root->fs_info->profiles_lock);
3889
3890 if (flags & BTRFS_BLOCK_GROUP_DATA)
3891 flags |= root->fs_info->avail_data_alloc_bits;
3892 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3893 flags |= root->fs_info->avail_system_alloc_bits;
3894 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3895 flags |= root->fs_info->avail_metadata_alloc_bits;
3896 } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3897
3898 return btrfs_reduce_alloc_profile(root, flags);
3899 }
3900
3901 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3902 {
3903 u64 flags;
3904 u64 ret;
3905
3906 if (data)
3907 flags = BTRFS_BLOCK_GROUP_DATA;
3908 else if (root == root->fs_info->chunk_root)
3909 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3910 else
3911 flags = BTRFS_BLOCK_GROUP_METADATA;
3912
3913 ret = get_alloc_profile(root, flags);
3914 return ret;
3915 }
3916
3917 /*
3918 * This will check the space that the inode allocates from to make sure we have
3919 * enough space for bytes.
3920 */
3921 int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
3922 {
3923 struct btrfs_space_info *data_sinfo;
3924 struct btrfs_root *root = BTRFS_I(inode)->root;
3925 struct btrfs_fs_info *fs_info = root->fs_info;
3926 u64 used;
3927 int ret = 0;
3928 int need_commit = 2;
3929 int have_pinned_space;
3930
3931 /* make sure bytes are sectorsize aligned */
3932 bytes = ALIGN(bytes, root->sectorsize);
3933
3934 if (btrfs_is_free_space_inode(inode)) {
3935 need_commit = 0;
3936 ASSERT(current->journal_info);
3937 }
3938
3939 data_sinfo = fs_info->data_sinfo;
3940 if (!data_sinfo)
3941 goto alloc;
3942
3943 again:
3944 /* make sure we have enough space to handle the data first */
3945 spin_lock(&data_sinfo->lock);
3946 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3947 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3948 data_sinfo->bytes_may_use;
3949
3950 if (used + bytes > data_sinfo->total_bytes) {
3951 struct btrfs_trans_handle *trans;
3952
3953 /*
3954 * if we don't have enough free bytes in this space then we need
3955 * to alloc a new chunk.
3956 */
3957 if (!data_sinfo->full) {
3958 u64 alloc_target;
3959
3960 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3961 spin_unlock(&data_sinfo->lock);
3962 alloc:
3963 alloc_target = btrfs_get_alloc_profile(root, 1);
3964 /*
3965  * It is ugly that we don't call the nolock join
3966  * transaction variant for the free space inode case
3967  * here, but it is safe: the data space reservation
3968  * for the free space cache is only done in transaction
3969  * context, and the common join transaction merely
3970  * increases the use count of the current transaction
3971  * handle and doesn't try to acquire the trans_lock of
3972  * the fs.
3973 */
3974 trans = btrfs_join_transaction(root);
3975 if (IS_ERR(trans))
3976 return PTR_ERR(trans);
3977
3978 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3979 alloc_target,
3980 CHUNK_ALLOC_NO_FORCE);
3981 btrfs_end_transaction(trans, root);
3982 if (ret < 0) {
3983 if (ret != -ENOSPC)
3984 return ret;
3985 else {
3986 have_pinned_space = 1;
3987 goto commit_trans;
3988 }
3989 }
3990
3991 if (!data_sinfo)
3992 data_sinfo = fs_info->data_sinfo;
3993
3994 goto again;
3995 }
3996
3997 /*
3998 * If we don't have enough pinned space to deal with this
3999  * allocation, and no chunk was removed in the current transaction,
4000 * don't bother committing the transaction.
4001 */
4002 have_pinned_space = percpu_counter_compare(
4003 &data_sinfo->total_bytes_pinned,
4004 used + bytes - data_sinfo->total_bytes);
4005 spin_unlock(&data_sinfo->lock);
4006
4007 /* commit the current transaction and try again */
4008 commit_trans:
4009 if (need_commit &&
4010 !atomic_read(&root->fs_info->open_ioctl_trans)) {
4011 need_commit--;
4012
4013 if (need_commit > 0)
4014 btrfs_wait_ordered_roots(fs_info, -1);
4015
4016 trans = btrfs_join_transaction(root);
4017 if (IS_ERR(trans))
4018 return PTR_ERR(trans);
4019 if (have_pinned_space >= 0 ||
4020 trans->transaction->have_free_bgs ||
4021 need_commit > 0) {
4022 ret = btrfs_commit_transaction(trans, root);
4023 if (ret)
4024 return ret;
4025 /*
4026  * make sure that all running delayed iputs are
4027  * done
4028 */
4029 down_write(&root->fs_info->delayed_iput_sem);
4030 up_write(&root->fs_info->delayed_iput_sem);
4031 goto again;
4032 } else {
4033 btrfs_end_transaction(trans, root);
4034 }
4035 }
4036
4037 trace_btrfs_space_reservation(root->fs_info,
4038 "space_info:enospc",
4039 data_sinfo->flags, bytes, 1);
4040 return -ENOSPC;
4041 }
4042 ret = btrfs_qgroup_reserve(root, write_bytes);
4043 if (ret)
4044 goto out;
4045 data_sinfo->bytes_may_use += bytes;
4046 trace_btrfs_space_reservation(root->fs_info, "space_info",
4047 data_sinfo->flags, bytes, 1);
4048 out:
4049 spin_unlock(&data_sinfo->lock);
4050
4051 return ret;
4052 }
4053
4054 /*
4055 * Called if we need to clear a data reservation for this inode.
4056 */
4057 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
4058 {
4059 struct btrfs_root *root = BTRFS_I(inode)->root;
4060 struct btrfs_space_info *data_sinfo;
4061
4062 /* make sure bytes are sectorsize aligned */
4063 bytes = ALIGN(bytes, root->sectorsize);
4064
4065 data_sinfo = root->fs_info->data_sinfo;
4066 spin_lock(&data_sinfo->lock);
4067 WARN_ON(data_sinfo->bytes_may_use < bytes);
4068 data_sinfo->bytes_may_use -= bytes;
4069 trace_btrfs_space_reservation(root->fs_info, "space_info",
4070 data_sinfo->flags, bytes, 0);
4071 spin_unlock(&data_sinfo->lock);
4072 }
4073
4074 static void force_metadata_allocation(struct btrfs_fs_info *info)
4075 {
4076 struct list_head *head = &info->space_info;
4077 struct btrfs_space_info *found;
4078
4079 rcu_read_lock();
4080 list_for_each_entry_rcu(found, head, list) {
4081 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4082 found->force_alloc = CHUNK_ALLOC_FORCE;
4083 }
4084 rcu_read_unlock();
4085 }
4086
4087 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4088 {
4089 return (global->size << 1);
4090 }
4091
4092 static int should_alloc_chunk(struct btrfs_root *root,
4093 struct btrfs_space_info *sinfo, int force)
4094 {
4095 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4096 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
4097 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
4098 u64 thresh;
4099
4100 if (force == CHUNK_ALLOC_FORCE)
4101 return 1;
4102
4103 /*
4104 * We need to take into account the global rsv because for all intents
4105 * and purposes it's used space. Don't worry about locking the
4106 * global_rsv, it doesn't change except when the transaction commits.
4107 */
4108 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4109 num_allocated += calc_global_rsv_need_space(global_rsv);
4110
4111 /*
4112 * in limited mode, we want to have some free space up to
4113 * about 1% of the FS size.
4114 */
4115 if (force == CHUNK_ALLOC_LIMITED) {
4116 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
4117 thresh = max_t(u64, 64 * 1024 * 1024,
4118 div_factor_fine(thresh, 1));
4119
4120 if (num_bytes - num_allocated < thresh)
4121 return 1;
4122 }
4123
4124 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
4125 return 0;
4126 return 1;
4127 }
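
/*
 * Worked example (illustrative, sizes assumed): on a 1TiB filesystem in
 * CHUNK_ALLOC_LIMITED mode the threshold is
 *
 *	thresh = max(64MB, 1% of the FS) ~= 10GiB
 *
 * so a new chunk is allocated as soon as the free space left inside the
 * already-allocated chunks of this type drops below ~10GiB. Outside of
 * limited mode the final check only allocates once the existing chunks
 * are roughly 80% full (num_allocated + 2MB >= 80% of num_bytes).
 */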
4128
4129 static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
4130 {
4131 u64 num_dev;
4132
4133 if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4134 BTRFS_BLOCK_GROUP_RAID0 |
4135 BTRFS_BLOCK_GROUP_RAID5 |
4136 BTRFS_BLOCK_GROUP_RAID6))
4137 num_dev = root->fs_info->fs_devices->rw_devices;
4138 else if (type & BTRFS_BLOCK_GROUP_RAID1)
4139 num_dev = 2;
4140 else
4141 num_dev = 1; /* DUP or single */
4142
4143 return num_dev;
4144 }
4145
4146 /*
4147  * Reserve space in the system space_info that is needed for allocating or
4148  * removing a chunk: num_devs device items to update plus one chunk item to
4149  * add or remove.
4150 */
4151 void check_system_chunk(struct btrfs_trans_handle *trans,
4152 struct btrfs_root *root,
4153 u64 type)
4154 {
4155 struct btrfs_space_info *info;
4156 u64 left;
4157 u64 thresh;
4158 int ret = 0;
4159 u64 num_devs;
4160
4161 /*
4162  * Needed because we can end up allocating a system chunk and also for an
4163  * atomic and race-free space reservation in the chunk block reserve.
4164 */
4165 ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
4166
4167 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4168 spin_lock(&info->lock);
4169 left = info->total_bytes - info->bytes_used - info->bytes_pinned -
4170 info->bytes_reserved - info->bytes_readonly -
4171 info->bytes_may_use;
4172 spin_unlock(&info->lock);
4173
4174 num_devs = get_profile_num_devs(root, type);
4175
4176 /* num_devs device items to update and 1 chunk item to add or remove */
4177 thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
4178 btrfs_calc_trans_metadata_size(root, 1);
4179
4180 if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
4181 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
4182 left, thresh, type);
4183 dump_space_info(info, 0, 0);
4184 }
4185
4186 if (left < thresh) {
4187 u64 flags;
4188
4189 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
4190 /*
4191 * Ignore failure to create system chunk. We might end up not
4192 * needing it, as we might not need to COW all nodes/leafs from
4193 * the paths we visit in the chunk tree (they were already COWed
4194 * or created in the current transaction for example).
4195 */
4196 ret = btrfs_alloc_chunk(trans, root, flags);
4197 }
4198
4199 if (!ret) {
4200 ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
4201 &root->fs_info->chunk_block_rsv,
4202 thresh, BTRFS_RESERVE_NO_FLUSH);
4203 if (!ret)
4204 trans->chunk_bytes_reserved += thresh;
4205 }
4206 }
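
/*
 * Usage sketch (mirrors the call site in do_chunk_alloc() below): the
 * caller must already hold fs_info->chunk_mutex and reserve the system
 * space before actually allocating the chunk:
 *
 *	mutex_lock(&fs_info->chunk_mutex);
 *	check_system_chunk(trans, extent_root, flags);
 *	ret = btrfs_alloc_chunk(trans, extent_root, flags);
 *	mutex_unlock(&fs_info->chunk_mutex);
 */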
4207
4208 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
4209 struct btrfs_root *extent_root, u64 flags, int force)
4210 {
4211 struct btrfs_space_info *space_info;
4212 struct btrfs_fs_info *fs_info = extent_root->fs_info;
4213 int wait_for_alloc = 0;
4214 int ret = 0;
4215
4216 /* Don't re-enter if we're already allocating a chunk */
4217 if (trans->allocating_chunk)
4218 return -ENOSPC;
4219
4220 space_info = __find_space_info(extent_root->fs_info, flags);
4221 if (!space_info) {
4222 ret = update_space_info(extent_root->fs_info, flags,
4223 0, 0, &space_info);
4224 BUG_ON(ret); /* -ENOMEM */
4225 }
4226 BUG_ON(!space_info); /* Logic error */
4227
4228 again:
4229 spin_lock(&space_info->lock);
4230 if (force < space_info->force_alloc)
4231 force = space_info->force_alloc;
4232 if (space_info->full) {
4233 if (should_alloc_chunk(extent_root, space_info, force))
4234 ret = -ENOSPC;
4235 else
4236 ret = 0;
4237 spin_unlock(&space_info->lock);
4238 return ret;
4239 }
4240
4241 if (!should_alloc_chunk(extent_root, space_info, force)) {
4242 spin_unlock(&space_info->lock);
4243 return 0;
4244 } else if (space_info->chunk_alloc) {
4245 wait_for_alloc = 1;
4246 } else {
4247 space_info->chunk_alloc = 1;
4248 }
4249
4250 spin_unlock(&space_info->lock);
4251
4252 mutex_lock(&fs_info->chunk_mutex);
4253
4254 /*
4255 * The chunk_mutex is held throughout the entirety of a chunk
4256 * allocation, so once we've acquired the chunk_mutex we know that the
4257 * other guy is done and we need to recheck and see if we should
4258 * allocate.
4259 */
4260 if (wait_for_alloc) {
4261 mutex_unlock(&fs_info->chunk_mutex);
4262 wait_for_alloc = 0;
4263 goto again;
4264 }
4265
4266 trans->allocating_chunk = true;
4267
4268 /*
4269 * If we have mixed data/metadata chunks we want to make sure we keep
4270 * allocating mixed chunks instead of individual chunks.
4271 */
4272 if (btrfs_mixed_space_info(space_info))
4273 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4274
4275 /*
4276 * if we're doing a data chunk, go ahead and make sure that
4277 * we keep a reasonable number of metadata chunks allocated in the
4278 * FS as well.
4279 */
4280 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4281 fs_info->data_chunk_allocations++;
4282 if (!(fs_info->data_chunk_allocations %
4283 fs_info->metadata_ratio))
4284 force_metadata_allocation(fs_info);
4285 }
4286
4287 /*
4288 * Check if we have enough space in SYSTEM chunk because we may need
4289 * to update devices.
4290 */
4291 check_system_chunk(trans, extent_root, flags);
4292
4293 ret = btrfs_alloc_chunk(trans, extent_root, flags);
4294 trans->allocating_chunk = false;
4295
4296 spin_lock(&space_info->lock);
4297 if (ret < 0 && ret != -ENOSPC)
4298 goto out;
4299 if (ret)
4300 space_info->full = 1;
4301 else
4302 ret = 1;
4303
4304 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4305 out:
4306 space_info->chunk_alloc = 0;
4307 spin_unlock(&space_info->lock);
4308 mutex_unlock(&fs_info->chunk_mutex);
4309 /*
4310 * When we allocate a new chunk we reserve space in the chunk block
4311 * reserve to make sure we can COW nodes/leafs in the chunk tree or
4312 * add new nodes/leafs to it if we end up needing to do it when
4313 * inserting the chunk item and updating device items as part of the
4314 * second phase of chunk allocation, performed by
4315 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4316 * large number of new block groups to create in our transaction
4317 * handle's new_bgs list to avoid exhausting the chunk block reserve
4318 * in extreme cases - like having a single transaction create many new
4319 * block groups when starting to write out the free space caches of all
4320 * the block groups that were made dirty during the lifetime of the
4321 * transaction.
4322 */
4323 if (trans->can_flush_pending_bgs &&
4324 trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
4325 btrfs_create_pending_block_groups(trans, trans->root);
4326 btrfs_trans_release_chunk_metadata(trans);
4327 }
4328 return ret;
4329 }
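
/*
 * Typical caller pattern (a sketch based on btrfs_check_data_free_space()
 * above and flush_space() below): join a transaction, try a non-forced
 * allocation and treat -ENOSPC as "this space info is simply full" rather
 * than as a hard error:
 *
 *	trans = btrfs_join_transaction(root);
 *	if (IS_ERR(trans))
 *		return PTR_ERR(trans);
 *	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 *			     btrfs_get_alloc_profile(root, 1),
 *			     CHUNK_ALLOC_NO_FORCE);
 *	btrfs_end_transaction(trans, root);
 */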
4330
4331 static int can_overcommit(struct btrfs_root *root,
4332 struct btrfs_space_info *space_info, u64 bytes,
4333 enum btrfs_reserve_flush_enum flush)
4334 {
4335 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4336 u64 profile = btrfs_get_alloc_profile(root, 0);
4337 u64 space_size;
4338 u64 avail;
4339 u64 used;
4340
4341 used = space_info->bytes_used + space_info->bytes_reserved +
4342 space_info->bytes_pinned + space_info->bytes_readonly;
4343
4344 /*
4345 * We only want to allow over committing if we have lots of actual space
4346 * free, but if we don't have enough space to handle the global reserve
4347 * space then we could end up having a real enospc problem when trying
4348 * to allocate a chunk or some other such important allocation.
4349 */
4350 spin_lock(&global_rsv->lock);
4351 space_size = calc_global_rsv_need_space(global_rsv);
4352 spin_unlock(&global_rsv->lock);
4353 if (used + space_size >= space_info->total_bytes)
4354 return 0;
4355
4356 used += space_info->bytes_may_use;
4357
4358 spin_lock(&root->fs_info->free_chunk_lock);
4359 avail = root->fs_info->free_chunk_space;
4360 spin_unlock(&root->fs_info->free_chunk_lock);
4361
4362 /*
4363 * If we have dup, raid1 or raid10 then only half of the free
4364 * space is actually useable. For raid56, the space info used
4365 * doesn't include the parity drive, so we don't have to
4366 * change the math
4367 */
4368 if (profile & (BTRFS_BLOCK_GROUP_DUP |
4369 BTRFS_BLOCK_GROUP_RAID1 |
4370 BTRFS_BLOCK_GROUP_RAID10))
4371 avail >>= 1;
4372
4373 /*
4374  * If we aren't allowed to flush all things, let us overcommit up to
4375  * half of the space. If we can flush, don't let us overcommit
4376  * too much, only let it overcommit up to 1/8 of the space.
4377 */
4378 if (flush == BTRFS_RESERVE_FLUSH_ALL)
4379 avail >>= 3;
4380 else
4381 avail >>= 1;
4382
4383 if (used + bytes < space_info->total_bytes + avail)
4384 return 1;
4385 return 0;
4386 }
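
/*
 * Worked example (illustrative, numbers assumed, and assuming the global
 * reserve check above passed): a metadata space info with total_bytes of
 * 8GB, 7GB accounted as used (including bytes_may_use), a RAID1 profile,
 * 4GB of unallocated device space and a 512MB request under
 * BTRFS_RESERVE_FLUSH_ALL:
 *
 *	avail = 4GB >> 1 = 2GB      (RAID1 halves the usable raw space)
 *	avail = 2GB >> 3 = 256MB    (flushing callers may only use 1/8)
 *	used + bytes = 7.5GB < total_bytes + avail = 8.25GB
 *
 * so the overcommit is allowed and 1 is returned.
 */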
4387
4388 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
4389 unsigned long nr_pages, int nr_items)
4390 {
4391 struct super_block *sb = root->fs_info->sb;
4392
4393 if (down_read_trylock(&sb->s_umount)) {
4394 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4395 up_read(&sb->s_umount);
4396 } else {
4397 /*
4398  * We needn't worry about the filesystem going from r/w to r/o even
4399  * though we don't acquire the ->s_umount mutex, because the filesystem
4400  * must guarantee that the delalloc inode list is empty after the
4401  * filesystem becomes read-only (all dirty pages have been written to
4402  * the disk).
4403 */
4404 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
4405 if (!current->journal_info)
4406 btrfs_wait_ordered_roots(root->fs_info, nr_items);
4407 }
4408 }
4409
4410 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
4411 {
4412 u64 bytes;
4413 int nr;
4414
4415 bytes = btrfs_calc_trans_metadata_size(root, 1);
4416 nr = (int)div64_u64(to_reclaim, bytes);
4417 if (!nr)
4418 nr = 1;
4419 return nr;
4420 }
4421
4422 #define EXTENT_SIZE_PER_ITEM (256 * 1024)
4423
4424 /*
4425 * shrink metadata reservation for delalloc
4426 */
4427 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4428 bool wait_ordered)
4429 {
4430 struct btrfs_block_rsv *block_rsv;
4431 struct btrfs_space_info *space_info;
4432 struct btrfs_trans_handle *trans;
4433 u64 delalloc_bytes;
4434 u64 max_reclaim;
4435 long time_left;
4436 unsigned long nr_pages;
4437 int loops;
4438 int items;
4439 enum btrfs_reserve_flush_enum flush;
4440
4441 /* Calc the number of items we need to flush for the space reservation */
4442 items = calc_reclaim_items_nr(root, to_reclaim);
4443 to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4444
4445 trans = (struct btrfs_trans_handle *)current->journal_info;
4446 block_rsv = &root->fs_info->delalloc_block_rsv;
4447 space_info = block_rsv->space_info;
4448
4449 delalloc_bytes = percpu_counter_sum_positive(
4450 &root->fs_info->delalloc_bytes);
4451 if (delalloc_bytes == 0) {
4452 if (trans)
4453 return;
4454 if (wait_ordered)
4455 btrfs_wait_ordered_roots(root->fs_info, items);
4456 return;
4457 }
4458
4459 loops = 0;
4460 while (delalloc_bytes && loops < 3) {
4461 max_reclaim = min(delalloc_bytes, to_reclaim);
4462 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
4463 btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4464 /*
4465 * We need to wait for the async pages to actually start before
4466 * we do anything.
4467 */
4468 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
4469 if (!max_reclaim)
4470 goto skip_async;
4471
4472 if (max_reclaim <= nr_pages)
4473 max_reclaim = 0;
4474 else
4475 max_reclaim -= nr_pages;
4476
4477 wait_event(root->fs_info->async_submit_wait,
4478 atomic_read(&root->fs_info->async_delalloc_pages) <=
4479 (int)max_reclaim);
4480 skip_async:
4481 if (!trans)
4482 flush = BTRFS_RESERVE_FLUSH_ALL;
4483 else
4484 flush = BTRFS_RESERVE_NO_FLUSH;
4485 spin_lock(&space_info->lock);
4486 if (can_overcommit(root, space_info, orig, flush)) {
4487 spin_unlock(&space_info->lock);
4488 break;
4489 }
4490 spin_unlock(&space_info->lock);
4491
4492 loops++;
4493 if (wait_ordered && !trans) {
4494 btrfs_wait_ordered_roots(root->fs_info, items);
4495 } else {
4496 time_left = schedule_timeout_killable(1);
4497 if (time_left)
4498 break;
4499 }
4500 delalloc_bytes = percpu_counter_sum_positive(
4501 &root->fs_info->delalloc_bytes);
4502 }
4503 }
4504
4505 /**
4506  * may_commit_transaction - possibly commit the transaction if it's ok to
4507 * @root - the root we're allocating for
4508 * @bytes - the number of bytes we want to reserve
4509 * @force - force the commit
4510 *
4511 * This will check to make sure that committing the transaction will actually
4512 * get us somewhere and then commit the transaction if it does. Otherwise it
4513 * will return -ENOSPC.
4514 */
4515 static int may_commit_transaction(struct btrfs_root *root,
4516 struct btrfs_space_info *space_info,
4517 u64 bytes, int force)
4518 {
4519 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
4520 struct btrfs_trans_handle *trans;
4521
4522 trans = (struct btrfs_trans_handle *)current->journal_info;
4523 if (trans)
4524 return -EAGAIN;
4525
4526 if (force)
4527 goto commit;
4528
4529 /* See if there is enough pinned space to make this reservation */
4530 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4531 bytes) >= 0)
4532 goto commit;
4533
4534 /*
4535 * See if there is some space in the delayed insertion reservation for
4536 * this reservation.
4537 */
4538 if (space_info != delayed_rsv->space_info)
4539 return -ENOSPC;
4540
4541 spin_lock(&delayed_rsv->lock);
4542 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4543 bytes - delayed_rsv->size) >= 0) {
4544 spin_unlock(&delayed_rsv->lock);
4545 return -ENOSPC;
4546 }
4547 spin_unlock(&delayed_rsv->lock);
4548
4549 commit:
4550 trans = btrfs_join_transaction(root);
4551 if (IS_ERR(trans))
4552 return -ENOSPC;
4553
4554 return btrfs_commit_transaction(trans, root);
4555 }
4556
4557 enum flush_state {
4558 FLUSH_DELAYED_ITEMS_NR = 1,
4559 FLUSH_DELAYED_ITEMS = 2,
4560 FLUSH_DELALLOC = 3,
4561 FLUSH_DELALLOC_WAIT = 4,
4562 ALLOC_CHUNK = 5,
4563 COMMIT_TRANS = 6,
4564 };
4565
4566 static int flush_space(struct btrfs_root *root,
4567 struct btrfs_space_info *space_info, u64 num_bytes,
4568 u64 orig_bytes, int state)
4569 {
4570 struct btrfs_trans_handle *trans;
4571 int nr;
4572 int ret = 0;
4573
4574 switch (state) {
4575 case FLUSH_DELAYED_ITEMS_NR:
4576 case FLUSH_DELAYED_ITEMS:
4577 if (state == FLUSH_DELAYED_ITEMS_NR)
4578 nr = calc_reclaim_items_nr(root, num_bytes) * 2;
4579 else
4580 nr = -1;
4581
4582 trans = btrfs_join_transaction(root);
4583 if (IS_ERR(trans)) {
4584 ret = PTR_ERR(trans);
4585 break;
4586 }
4587 ret = btrfs_run_delayed_items_nr(trans, root, nr);
4588 btrfs_end_transaction(trans, root);
4589 break;
4590 case FLUSH_DELALLOC:
4591 case FLUSH_DELALLOC_WAIT:
4592 shrink_delalloc(root, num_bytes * 2, orig_bytes,
4593 state == FLUSH_DELALLOC_WAIT);
4594 break;
4595 case ALLOC_CHUNK:
4596 trans = btrfs_join_transaction(root);
4597 if (IS_ERR(trans)) {
4598 ret = PTR_ERR(trans);
4599 break;
4600 }
4601 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4602 btrfs_get_alloc_profile(root, 0),
4603 CHUNK_ALLOC_NO_FORCE);
4604 btrfs_end_transaction(trans, root);
4605 if (ret == -ENOSPC)
4606 ret = 0;
4607 break;
4608 case COMMIT_TRANS:
4609 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4610 break;
4611 default:
4612 ret = -ENOSPC;
4613 break;
4614 }
4615
4616 return ret;
4617 }
4618
4619 static inline u64
4620 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4621 struct btrfs_space_info *space_info)
4622 {
4623 u64 used;
4624 u64 expected;
4625 u64 to_reclaim;
4626
4627 to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
4628 16 * 1024 * 1024);
4629 spin_lock(&space_info->lock);
4630 if (can_overcommit(root, space_info, to_reclaim,
4631 BTRFS_RESERVE_FLUSH_ALL)) {
4632 to_reclaim = 0;
4633 goto out;
4634 }
4635
4636 used = space_info->bytes_used + space_info->bytes_reserved +
4637 space_info->bytes_pinned + space_info->bytes_readonly +
4638 space_info->bytes_may_use;
4639 if (can_overcommit(root, space_info, 1024 * 1024,
4640 BTRFS_RESERVE_FLUSH_ALL))
4641 expected = div_factor_fine(space_info->total_bytes, 95);
4642 else
4643 expected = div_factor_fine(space_info->total_bytes, 90);
4644
4645 if (used > expected)
4646 to_reclaim = used - expected;
4647 else
4648 to_reclaim = 0;
4649 to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4650 space_info->bytes_reserved);
4651 out:
4652 spin_unlock(&space_info->lock);
4653
4654 return to_reclaim;
4655 }
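
/*
 * Worked example (illustrative, numbers assumed): with 4 online CPUs the
 * reclaim target starts at min(4 * 1MB, 16MB) = 4MB. If overcommitting
 * that much is not possible, but a small 1MB overcommit still is, the
 * "expected" fill level is 95% of total_bytes; with total_bytes = 10GB
 * and 9.7GB accounted as used this gives
 *
 *	to_reclaim = 9.7GB - 9.5GB = 200MB
 *
 * capped by bytes_may_use + bytes_reserved.
 */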
4656
4657 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4658 struct btrfs_fs_info *fs_info, u64 used)
4659 {
4660 u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4661
4662 /* If we're just plain full then async reclaim just slows us down. */
4663 if (space_info->bytes_used >= thresh)
4664 return 0;
4665
4666 return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4667 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4668 }
4669
4670 static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
4671 struct btrfs_fs_info *fs_info,
4672 int flush_state)
4673 {
4674 u64 used;
4675
4676 spin_lock(&space_info->lock);
4677 /*
4678  * We have run out of space and flush_space has not gotten us any free
4679  * space back, so don't bother doing async reclaim.
4680 */
4681 if (flush_state > COMMIT_TRANS && space_info->full) {
4682 spin_unlock(&space_info->lock);
4683 return 0;
4684 }
4685
4686 used = space_info->bytes_used + space_info->bytes_reserved +
4687 space_info->bytes_pinned + space_info->bytes_readonly +
4688 space_info->bytes_may_use;
4689 if (need_do_async_reclaim(space_info, fs_info, used)) {
4690 spin_unlock(&space_info->lock);
4691 return 1;
4692 }
4693 spin_unlock(&space_info->lock);
4694
4695 return 0;
4696 }
4697
4698 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4699 {
4700 struct btrfs_fs_info *fs_info;
4701 struct btrfs_space_info *space_info;
4702 u64 to_reclaim;
4703 int flush_state;
4704
4705 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4706 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4707
4708 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
4709 space_info);
4710 if (!to_reclaim)
4711 return;
4712
4713 flush_state = FLUSH_DELAYED_ITEMS_NR;
4714 do {
4715 flush_space(fs_info->fs_root, space_info, to_reclaim,
4716 to_reclaim, flush_state);
4717 flush_state++;
4718 if (!btrfs_need_do_async_reclaim(space_info, fs_info,
4719 flush_state))
4720 return;
4721 } while (flush_state < COMMIT_TRANS);
4722 }
4723
4724 void btrfs_init_async_reclaim_work(struct work_struct *work)
4725 {
4726 INIT_WORK(work, btrfs_async_reclaim_metadata_space);
4727 }
4728
4729 /**
4730 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4731 * @root - the root we're allocating for
4732 * @block_rsv - the block_rsv we're allocating for
4733 * @orig_bytes - the number of bytes we want
4734 * @flush - whether or not we can flush to make our reservation
4735 *
4736  * This will reserve orig_bytes number of bytes from the space info associated
4737 * with the block_rsv. If there is not enough space it will make an attempt to
4738 * flush out space to make room. It will do this by flushing delalloc if
4739 * possible or committing the transaction. If flush is 0 then no attempts to
4740 * regain reservations will be made and this will fail if there is not enough
4741 * space already.
4742 */
4743 static int reserve_metadata_bytes(struct btrfs_root *root,
4744 struct btrfs_block_rsv *block_rsv,
4745 u64 orig_bytes,
4746 enum btrfs_reserve_flush_enum flush)
4747 {
4748 struct btrfs_space_info *space_info = block_rsv->space_info;
4749 u64 used;
4750 u64 num_bytes = orig_bytes;
4751 int flush_state = FLUSH_DELAYED_ITEMS_NR;
4752 int ret = 0;
4753 bool flushing = false;
4754
4755 again:
4756 ret = 0;
4757 spin_lock(&space_info->lock);
4758 /*
4759 * We only want to wait if somebody other than us is flushing and we
4760 * are actually allowed to flush all things.
4761 */
4762 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
4763 space_info->flush) {
4764 spin_unlock(&space_info->lock);
4765 /*
4766 * If we have a trans handle we can't wait because the flusher
4767 * may have to commit the transaction, which would mean we would
4768 * deadlock since we are waiting for the flusher to finish, but
4769 * hold the current transaction open.
4770 */
4771 if (current->journal_info)
4772 return -EAGAIN;
4773 ret = wait_event_killable(space_info->wait, !space_info->flush);
4774 /* Must have been killed, return */
4775 if (ret)
4776 return -EINTR;
4777
4778 spin_lock(&space_info->lock);
4779 }
4780
4781 ret = -ENOSPC;
4782 used = space_info->bytes_used + space_info->bytes_reserved +
4783 space_info->bytes_pinned + space_info->bytes_readonly +
4784 space_info->bytes_may_use;
4785
4786 /*
4787  * The idea here is that if we haven't already over-reserved the space
4788  * info then we can go ahead and record our reservation first and start
4789  * flushing afterwards if we need to. Otherwise, if we've already
4790  * overcommitted, let's start flushing stuff first and then come back
4791  * and try to make our reservation.
4792 */
4793 if (used <= space_info->total_bytes) {
4794 if (used + orig_bytes <= space_info->total_bytes) {
4795 space_info->bytes_may_use += orig_bytes;
4796 trace_btrfs_space_reservation(root->fs_info,
4797 "space_info", space_info->flags, orig_bytes, 1);
4798 ret = 0;
4799 } else {
4800 /*
4801 * Ok set num_bytes to orig_bytes since we aren't
4802  * overcommitted, this way we only try to reclaim what
4803 * we need.
4804 */
4805 num_bytes = orig_bytes;
4806 }
4807 } else {
4808 /*
4809 * Ok we're over committed, set num_bytes to the overcommitted
4810 * amount plus the amount of bytes that we need for this
4811 * reservation.
4812 */
4813 num_bytes = used - space_info->total_bytes +
4814 (orig_bytes * 2);
4815 }
4816
4817 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
4818 space_info->bytes_may_use += orig_bytes;
4819 trace_btrfs_space_reservation(root->fs_info, "space_info",
4820 space_info->flags, orig_bytes,
4821 1);
4822 ret = 0;
4823 }
4824
4825 /*
4826 * Couldn't make our reservation, save our place so while we're trying
4827 * to reclaim space we can actually use it instead of somebody else
4828 * stealing it from us.
4829 *
4830 * We make the other tasks wait for the flush only when we can flush
4831 * all things.
4832 */
4833 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4834 flushing = true;
4835 space_info->flush = 1;
4836 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
4837 used += orig_bytes;
4838 /*
4839 * We will do the space reservation dance during log replay,
4840 * which means we won't have fs_info->fs_root set, so don't do
4841 * the async reclaim as we will panic.
4842 */
4843 if (!root->fs_info->log_root_recovering &&
4844 need_do_async_reclaim(space_info, root->fs_info, used) &&
4845 !work_busy(&root->fs_info->async_reclaim_work))
4846 queue_work(system_unbound_wq,
4847 &root->fs_info->async_reclaim_work);
4848 }
4849 spin_unlock(&space_info->lock);
4850
4851 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4852 goto out;
4853
4854 ret = flush_space(root, space_info, num_bytes, orig_bytes,
4855 flush_state);
4856 flush_state++;
4857
4858 /*
4859  * If we are FLUSH_LIMIT, we cannot flush delalloc or a deadlock
4860  * could happen, so skip the delalloc flush states.
4861 */
4862 if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4863 (flush_state == FLUSH_DELALLOC ||
4864 flush_state == FLUSH_DELALLOC_WAIT))
4865 flush_state = ALLOC_CHUNK;
4866
4867 if (!ret)
4868 goto again;
4869 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4870 flush_state < COMMIT_TRANS)
4871 goto again;
4872 else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4873 flush_state <= COMMIT_TRANS)
4874 goto again;
4875
4876 out:
4877 if (ret == -ENOSPC &&
4878 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
4879 struct btrfs_block_rsv *global_rsv =
4880 &root->fs_info->global_block_rsv;
4881
4882 if (block_rsv != global_rsv &&
4883 !block_rsv_use_bytes(global_rsv, orig_bytes))
4884 ret = 0;
4885 }
4886 if (ret == -ENOSPC)
4887 trace_btrfs_space_reservation(root->fs_info,
4888 "space_info:enospc",
4889 space_info->flags, orig_bytes, 1);
4890 if (flushing) {
4891 spin_lock(&space_info->lock);
4892 space_info->flush = 0;
4893 wake_up_all(&space_info->wait);
4894 spin_unlock(&space_info->lock);
4895 }
4896 return ret;
4897 }
4898
4899 static struct btrfs_block_rsv *get_block_rsv(
4900 const struct btrfs_trans_handle *trans,
4901 const struct btrfs_root *root)
4902 {
4903 struct btrfs_block_rsv *block_rsv = NULL;
4904
4905 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4906 block_rsv = trans->block_rsv;
4907
4908 if (root == root->fs_info->csum_root && trans->adding_csums)
4909 block_rsv = trans->block_rsv;
4910
4911 if (root == root->fs_info->uuid_root)
4912 block_rsv = trans->block_rsv;
4913
4914 if (!block_rsv)
4915 block_rsv = root->block_rsv;
4916
4917 if (!block_rsv)
4918 block_rsv = &root->fs_info->empty_block_rsv;
4919
4920 return block_rsv;
4921 }
4922
4923 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4924 u64 num_bytes)
4925 {
4926 int ret = -ENOSPC;
4927 spin_lock(&block_rsv->lock);
4928 if (block_rsv->reserved >= num_bytes) {
4929 block_rsv->reserved -= num_bytes;
4930 if (block_rsv->reserved < block_rsv->size)
4931 block_rsv->full = 0;
4932 ret = 0;
4933 }
4934 spin_unlock(&block_rsv->lock);
4935 return ret;
4936 }
4937
4938 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4939 u64 num_bytes, int update_size)
4940 {
4941 spin_lock(&block_rsv->lock);
4942 block_rsv->reserved += num_bytes;
4943 if (update_size)
4944 block_rsv->size += num_bytes;
4945 else if (block_rsv->reserved >= block_rsv->size)
4946 block_rsv->full = 1;
4947 spin_unlock(&block_rsv->lock);
4948 }
4949
4950 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
4951 struct btrfs_block_rsv *dest, u64 num_bytes,
4952 int min_factor)
4953 {
4954 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4955 u64 min_bytes;
4956
4957 if (global_rsv->space_info != dest->space_info)
4958 return -ENOSPC;
4959
4960 spin_lock(&global_rsv->lock);
4961 min_bytes = div_factor(global_rsv->size, min_factor);
4962 if (global_rsv->reserved < min_bytes + num_bytes) {
4963 spin_unlock(&global_rsv->lock);
4964 return -ENOSPC;
4965 }
4966 global_rsv->reserved -= num_bytes;
4967 if (global_rsv->reserved < global_rsv->size)
4968 global_rsv->full = 0;
4969 spin_unlock(&global_rsv->lock);
4970
4971 block_rsv_add_bytes(dest, num_bytes, 1);
4972 return 0;
4973 }
4974
4975 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4976 struct btrfs_block_rsv *block_rsv,
4977 struct btrfs_block_rsv *dest, u64 num_bytes)
4978 {
4979 struct btrfs_space_info *space_info = block_rsv->space_info;
4980
4981 spin_lock(&block_rsv->lock);
4982 if (num_bytes == (u64)-1)
4983 num_bytes = block_rsv->size;
4984 block_rsv->size -= num_bytes;
4985 if (block_rsv->reserved >= block_rsv->size) {
4986 num_bytes = block_rsv->reserved - block_rsv->size;
4987 block_rsv->reserved = block_rsv->size;
4988 block_rsv->full = 1;
4989 } else {
4990 num_bytes = 0;
4991 }
4992 spin_unlock(&block_rsv->lock);
4993
4994 if (num_bytes > 0) {
4995 if (dest) {
4996 spin_lock(&dest->lock);
4997 if (!dest->full) {
4998 u64 bytes_to_add;
4999
5000 bytes_to_add = dest->size - dest->reserved;
5001 bytes_to_add = min(num_bytes, bytes_to_add);
5002 dest->reserved += bytes_to_add;
5003 if (dest->reserved >= dest->size)
5004 dest->full = 1;
5005 num_bytes -= bytes_to_add;
5006 }
5007 spin_unlock(&dest->lock);
5008 }
5009 if (num_bytes) {
5010 spin_lock(&space_info->lock);
5011 space_info->bytes_may_use -= num_bytes;
5012 trace_btrfs_space_reservation(fs_info, "space_info",
5013 space_info->flags, num_bytes, 0);
5014 spin_unlock(&space_info->lock);
5015 }
5016 }
5017 }
5018
5019 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
5020 struct btrfs_block_rsv *dst, u64 num_bytes)
5021 {
5022 int ret;
5023
5024 ret = block_rsv_use_bytes(src, num_bytes);
5025 if (ret)
5026 return ret;
5027
5028 block_rsv_add_bytes(dst, num_bytes, 1);
5029 return 0;
5030 }
5031
5032 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5033 {
5034 memset(rsv, 0, sizeof(*rsv));
5035 spin_lock_init(&rsv->lock);
5036 rsv->type = type;
5037 }
5038
5039 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
5040 unsigned short type)
5041 {
5042 struct btrfs_block_rsv *block_rsv;
5043 struct btrfs_fs_info *fs_info = root->fs_info;
5044
5045 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5046 if (!block_rsv)
5047 return NULL;
5048
5049 btrfs_init_block_rsv(block_rsv, type);
5050 block_rsv->space_info = __find_space_info(fs_info,
5051 BTRFS_BLOCK_GROUP_METADATA);
5052 return block_rsv;
5053 }
5054
5055 void btrfs_free_block_rsv(struct btrfs_root *root,
5056 struct btrfs_block_rsv *rsv)
5057 {
5058 if (!rsv)
5059 return;
5060 btrfs_block_rsv_release(root, rsv, (u64)-1);
5061 kfree(rsv);
5062 }
5063
5064 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
5065 {
5066 kfree(rsv);
5067 }
5068
5069 int btrfs_block_rsv_add(struct btrfs_root *root,
5070 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5071 enum btrfs_reserve_flush_enum flush)
5072 {
5073 int ret;
5074
5075 if (num_bytes == 0)
5076 return 0;
5077
5078 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5079 if (!ret) {
5080 block_rsv_add_bytes(block_rsv, num_bytes, 1);
5081 return 0;
5082 }
5083
5084 return ret;
5085 }
5086
5087 int btrfs_block_rsv_check(struct btrfs_root *root,
5088 struct btrfs_block_rsv *block_rsv, int min_factor)
5089 {
5090 u64 num_bytes = 0;
5091 int ret = -ENOSPC;
5092
5093 if (!block_rsv)
5094 return 0;
5095
5096 spin_lock(&block_rsv->lock);
5097 num_bytes = div_factor(block_rsv->size, min_factor);
5098 if (block_rsv->reserved >= num_bytes)
5099 ret = 0;
5100 spin_unlock(&block_rsv->lock);
5101
5102 return ret;
5103 }
5104
5105 int btrfs_block_rsv_refill(struct btrfs_root *root,
5106 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5107 enum btrfs_reserve_flush_enum flush)
5108 {
5109 u64 num_bytes = 0;
5110 int ret = -ENOSPC;
5111
5112 if (!block_rsv)
5113 return 0;
5114
5115 spin_lock(&block_rsv->lock);
5116 num_bytes = min_reserved;
5117 if (block_rsv->reserved >= num_bytes)
5118 ret = 0;
5119 else
5120 num_bytes -= block_rsv->reserved;
5121 spin_unlock(&block_rsv->lock);
5122
5123 if (!ret)
5124 return 0;
5125
5126 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5127 if (!ret) {
5128 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5129 return 0;
5130 }
5131
5132 return ret;
5133 }
5134
5135 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
5136 struct btrfs_block_rsv *dst_rsv,
5137 u64 num_bytes)
5138 {
5139 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
5140 }
5141
5142 void btrfs_block_rsv_release(struct btrfs_root *root,
5143 struct btrfs_block_rsv *block_rsv,
5144 u64 num_bytes)
5145 {
5146 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5147 if (global_rsv == block_rsv ||
5148 block_rsv->space_info != global_rsv->space_info)
5149 global_rsv = NULL;
5150 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
5151 num_bytes);
5152 }
5153
5154 /*
5155 * helper to calculate size of global block reservation.
5156 * the desired value is sum of space used by extent tree,
5157 * checksum tree and root tree
5158 */
5159 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
5160 {
5161 struct btrfs_space_info *sinfo;
5162 u64 num_bytes;
5163 u64 meta_used;
5164 u64 data_used;
5165 int csum_size = btrfs_super_csum_size(fs_info->super_copy);
5166
5167 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
5168 spin_lock(&sinfo->lock);
5169 data_used = sinfo->bytes_used;
5170 spin_unlock(&sinfo->lock);
5171
5172 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5173 spin_lock(&sinfo->lock);
5174 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
5175 data_used = 0;
5176 meta_used = sinfo->bytes_used;
5177 spin_unlock(&sinfo->lock);
5178
5179 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
5180 csum_size * 2;
5181 num_bytes += div_u64(data_used + meta_used, 50);
5182
5183 if (num_bytes * 3 > meta_used)
5184 num_bytes = div_u64(meta_used, 3);
5185
5186 return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
5187 }
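
/*
 * Worked example (illustrative, values assumed): with 4KB blocks, the
 * default 4-byte crc32c checksum, data_used = 100GB and meta_used = 2GB:
 *
 *	csum space = (100GB / 4KB) * 4 * 2  ~= 200MB
 *	extra 2%   = (100GB + 2GB) / 50     ~= 2GB
 *
 * The ~2.2GB sum is larger than a third of meta_used, so the result is
 * clamped to meta_used / 3 ~= 680MB and rounded up to a nodesize-based
 * boundary. update_global_block_rsv() below additionally caps the global
 * reserve at 512MB.
 */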
5188
5189 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5190 {
5191 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5192 struct btrfs_space_info *sinfo = block_rsv->space_info;
5193 u64 num_bytes;
5194
5195 num_bytes = calc_global_metadata_size(fs_info);
5196
5197 spin_lock(&sinfo->lock);
5198 spin_lock(&block_rsv->lock);
5199
5200 block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
5201
5202 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
5203 sinfo->bytes_reserved + sinfo->bytes_readonly +
5204 sinfo->bytes_may_use;
5205
5206 if (sinfo->total_bytes > num_bytes) {
5207 num_bytes = sinfo->total_bytes - num_bytes;
5208 block_rsv->reserved += num_bytes;
5209 sinfo->bytes_may_use += num_bytes;
5210 trace_btrfs_space_reservation(fs_info, "space_info",
5211 sinfo->flags, num_bytes, 1);
5212 }
5213
5214 if (block_rsv->reserved >= block_rsv->size) {
5215 num_bytes = block_rsv->reserved - block_rsv->size;
5216 sinfo->bytes_may_use -= num_bytes;
5217 trace_btrfs_space_reservation(fs_info, "space_info",
5218 sinfo->flags, num_bytes, 0);
5219 block_rsv->reserved = block_rsv->size;
5220 block_rsv->full = 1;
5221 }
5222
5223 spin_unlock(&block_rsv->lock);
5224 spin_unlock(&sinfo->lock);
5225 }
5226
5227 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5228 {
5229 struct btrfs_space_info *space_info;
5230
5231 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5232 fs_info->chunk_block_rsv.space_info = space_info;
5233
5234 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5235 fs_info->global_block_rsv.space_info = space_info;
5236 fs_info->delalloc_block_rsv.space_info = space_info;
5237 fs_info->trans_block_rsv.space_info = space_info;
5238 fs_info->empty_block_rsv.space_info = space_info;
5239 fs_info->delayed_block_rsv.space_info = space_info;
5240
5241 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5242 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5243 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5244 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5245 if (fs_info->quota_root)
5246 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5247 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5248
5249 update_global_block_rsv(fs_info);
5250 }
5251
5252 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5253 {
5254 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5255 (u64)-1);
5256 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
5257 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
5258 WARN_ON(fs_info->trans_block_rsv.size > 0);
5259 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5260 WARN_ON(fs_info->chunk_block_rsv.size > 0);
5261 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5262 WARN_ON(fs_info->delayed_block_rsv.size > 0);
5263 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5264 }
5265
5266 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
5267 struct btrfs_root *root)
5268 {
5269 if (!trans->block_rsv)
5270 return;
5271
5272 if (!trans->bytes_reserved)
5273 return;
5274
5275 trace_btrfs_space_reservation(root->fs_info, "transaction",
5276 trans->transid, trans->bytes_reserved, 0);
5277 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
5278 trans->bytes_reserved = 0;
5279 }
5280
5281 /*
5282 * To be called after all the new block groups attached to the transaction
5283 * handle have been created (btrfs_create_pending_block_groups()).
5284 */
5285 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5286 {
5287 struct btrfs_fs_info *fs_info = trans->root->fs_info;
5288
5289 if (!trans->chunk_bytes_reserved)
5290 return;
5291
5292 WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5293
5294 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5295 trans->chunk_bytes_reserved);
5296 trans->chunk_bytes_reserved = 0;
5297 }
5298
5299 /* Can only return 0 or -ENOSPC */
5300 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
5301 struct inode *inode)
5302 {
5303 struct btrfs_root *root = BTRFS_I(inode)->root;
5304 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
5305 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
5306
5307 /*
5308 * We need to hold space in order to delete our orphan item once we've
5309 * added it, so this takes the reservation so we can release it later
5310 * when we are truly done with the orphan item.
5311 */
5312 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5313 trace_btrfs_space_reservation(root->fs_info, "orphan",
5314 btrfs_ino(inode), num_bytes, 1);
5315 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
5316 }
5317
5318 void btrfs_orphan_release_metadata(struct inode *inode)
5319 {
5320 struct btrfs_root *root = BTRFS_I(inode)->root;
5321 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5322 trace_btrfs_space_reservation(root->fs_info, "orphan",
5323 btrfs_ino(inode), num_bytes, 0);
5324 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
5325 }
5326
5327 /*
5328 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5329 * root: the root of the parent directory
5330 * rsv: block reservation
5331  * items: the number of items that we need to reserve space for
5332 * qgroup_reserved: used to return the reserved size in qgroup
5333 *
5334 * This function is used to reserve the space for snapshot/subvolume
5335  * creation and deletion. Those operations differ from the common
5336  * file/directory operations: they change two fs/file trees
5337  * and the root tree, and the number of items that the qgroup reserves
5338  * differs from the free space reservation. So we can not use
5339  * the space reservation mechanism in start_transaction().
5340 */
5341 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5342 struct btrfs_block_rsv *rsv,
5343 int items,
5344 u64 *qgroup_reserved,
5345 bool use_global_rsv)
5346 {
5347 u64 num_bytes;
5348 int ret;
5349 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5350
5351 if (root->fs_info->quota_enabled) {
5352 /* One for parent inode, two for dir entries */
5353 num_bytes = 3 * root->nodesize;
5354 ret = btrfs_qgroup_reserve(root, num_bytes);
5355 if (ret)
5356 return ret;
5357 } else {
5358 num_bytes = 0;
5359 }
5360
5361 *qgroup_reserved = num_bytes;
5362
5363 num_bytes = btrfs_calc_trans_metadata_size(root, items);
5364 rsv->space_info = __find_space_info(root->fs_info,
5365 BTRFS_BLOCK_GROUP_METADATA);
5366 ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5367 BTRFS_RESERVE_FLUSH_ALL);
5368
5369 if (ret == -ENOSPC && use_global_rsv)
5370 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
5371
5372 if (ret) {
5373 if (*qgroup_reserved)
5374 btrfs_qgroup_free(root, *qgroup_reserved);
5375 }
5376
5377 return ret;
5378 }
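
/*
 * Worked example (illustrative, nodesize assumed): with quotas enabled and
 * a 16KB nodesize, creating a snapshot reserves 3 * 16KB = 48KB of qgroup
 * space (one item for the parent inode, two for the directory entries) and
 * then tries to reserve btrfs_calc_trans_metadata_size(root, items) bytes
 * of metadata, falling back to the global reserve on -ENOSPC when
 * use_global_rsv is set.
 */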
5379
5380 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
5381 struct btrfs_block_rsv *rsv,
5382 u64 qgroup_reserved)
5383 {
5384 btrfs_block_rsv_release(root, rsv, (u64)-1);
5385 }
5386
5387 /**
5388 * drop_outstanding_extent - drop an outstanding extent
5389 * @inode: the inode we're dropping the extent for
5390  * @num_bytes: the number of bytes we're releasing.
5391 *
5392  * This is called when we are freeing up an outstanding extent, either
5393 * after an error or after an extent is written. This will return the number of
5394 * reserved extents that need to be freed. This must be called with
5395 * BTRFS_I(inode)->lock held.
5396 */
5397 static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
5398 {
5399 unsigned drop_inode_space = 0;
5400 unsigned dropped_extents = 0;
5401 unsigned num_extents = 0;
5402
5403 num_extents = (unsigned)div64_u64(num_bytes +
5404 BTRFS_MAX_EXTENT_SIZE - 1,
5405 BTRFS_MAX_EXTENT_SIZE);
5406 ASSERT(num_extents);
5407 ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
5408 BTRFS_I(inode)->outstanding_extents -= num_extents;
5409
5410 if (BTRFS_I(inode)->outstanding_extents == 0 &&
5411 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5412 &BTRFS_I(inode)->runtime_flags))
5413 drop_inode_space = 1;
5414
5415 /*
5416  * If we have at least as many outstanding extents as we have
5417  * reserved then we need to leave the reserved extents count alone.
5418 */
5419 if (BTRFS_I(inode)->outstanding_extents >=
5420 BTRFS_I(inode)->reserved_extents)
5421 return drop_inode_space;
5422
5423 dropped_extents = BTRFS_I(inode)->reserved_extents -
5424 BTRFS_I(inode)->outstanding_extents;
5425 BTRFS_I(inode)->reserved_extents -= dropped_extents;
5426 return dropped_extents + drop_inode_space;
5427 }
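
/*
 * Example (illustrative, assuming the 128MB BTRFS_MAX_EXTENT_SIZE):
 * finishing a 300MB ordered range drops ceil(300MB / 128MB) = 3
 * outstanding extents; if that leaves fewer outstanding extents than
 * reserved extents, the difference (plus possibly the inode update
 * reservation) is returned so the caller can release the matching
 * metadata bytes.
 */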
5428
5429 /**
5430  * calc_csum_metadata_size - return the amount of metadata space that must be
5431 * reserved/free'd for the given bytes.
5432 * @inode: the inode we're manipulating
5433 * @num_bytes: the number of bytes in question
5434 * @reserve: 1 if we are reserving space, 0 if we are freeing space
5435 *
5436 * This adjusts the number of csum_bytes in the inode and then returns the
5437 * correct amount of metadata that must either be reserved or freed. We
5438 * calculate how many checksums we can fit into one leaf and then divide the
5439  * number of bytes that will need to be checksummed by this value to figure out
5440 * how many checksums will be required. If we are adding bytes then the number
5441 * may go up and we will return the number of additional bytes that must be
5442 * reserved. If it is going down we will return the number of bytes that must
5443 * be freed.
5444 *
5445 * This must be called with BTRFS_I(inode)->lock held.
5446 */
5447 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
5448 int reserve)
5449 {
5450 struct btrfs_root *root = BTRFS_I(inode)->root;
5451 u64 old_csums, num_csums;
5452
5453 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
5454 BTRFS_I(inode)->csum_bytes == 0)
5455 return 0;
5456
5457 old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5458 if (reserve)
5459 BTRFS_I(inode)->csum_bytes += num_bytes;
5460 else
5461 BTRFS_I(inode)->csum_bytes -= num_bytes;
5462 num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5463
5464 /* No change, no need to reserve more */
5465 if (old_csums == num_csums)
5466 return 0;
5467
5468 if (reserve)
5469 return btrfs_calc_trans_metadata_size(root,
5470 num_csums - old_csums);
5471
5472 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
5473 }
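
/*
 * Example (illustrative): if an inode's csum_bytes grows from an amount
 * that fits in 3 checksum leaves to one that needs 4, this returns
 * btrfs_calc_trans_metadata_size(root, 1) so the extra leaf is reserved
 * for; shrinking back from 4 leaves to 3 returns the same amount to be
 * freed.
 */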
5474
5475 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5476 {
5477 struct btrfs_root *root = BTRFS_I(inode)->root;
5478 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
5479 u64 to_reserve = 0;
5480 u64 csum_bytes;
5481 unsigned nr_extents = 0;
5482 int extra_reserve = 0;
5483 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5484 int ret = 0;
5485 bool delalloc_lock = true;
5486 u64 to_free = 0;
5487 unsigned dropped;
5488
5489 /* If we are a free space inode we must not flush since we will be in
5490 * the middle of a transaction commit. We also don't need the delalloc
5491 * mutex since we won't race with anybody. We need this mostly to make
5492 * lockdep shut its filthy mouth.
5493 */
5494 if (btrfs_is_free_space_inode(inode)) {
5495 flush = BTRFS_RESERVE_NO_FLUSH;
5496 delalloc_lock = false;
5497 }
5498
5499 if (flush != BTRFS_RESERVE_NO_FLUSH &&
5500 btrfs_transaction_in_commit(root->fs_info))
5501 schedule_timeout(1);
5502
5503 if (delalloc_lock)
5504 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
5505
5506 num_bytes = ALIGN(num_bytes, root->sectorsize);
5507
5508 spin_lock(&BTRFS_I(inode)->lock);
5509 nr_extents = (unsigned)div64_u64(num_bytes +
5510 BTRFS_MAX_EXTENT_SIZE - 1,
5511 BTRFS_MAX_EXTENT_SIZE);
5512 BTRFS_I(inode)->outstanding_extents += nr_extents;
5513 nr_extents = 0;
5514
5515 if (BTRFS_I(inode)->outstanding_extents >
5516 BTRFS_I(inode)->reserved_extents)
5517 nr_extents = BTRFS_I(inode)->outstanding_extents -
5518 BTRFS_I(inode)->reserved_extents;
5519
5520 /*
5521 * Add an item to reserve for updating the inode when we complete the
5522 * delalloc io.
5523 */
5524 if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5525 &BTRFS_I(inode)->runtime_flags)) {
5526 nr_extents++;
5527 extra_reserve = 1;
5528 }
5529
5530 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
5531 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
5532 csum_bytes = BTRFS_I(inode)->csum_bytes;
5533 spin_unlock(&BTRFS_I(inode)->lock);
5534
5535 if (root->fs_info->quota_enabled) {
5536 ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
5537 if (ret)
5538 goto out_fail;
5539 }
5540
5541 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
5542 if (unlikely(ret)) {
5543 if (root->fs_info->quota_enabled)
5544 btrfs_qgroup_free(root, nr_extents * root->nodesize);
5545 goto out_fail;
5546 }
5547
5548 spin_lock(&BTRFS_I(inode)->lock);
5549 if (extra_reserve) {
5550 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5551 &BTRFS_I(inode)->runtime_flags);
5552 nr_extents--;
5553 }
5554 BTRFS_I(inode)->reserved_extents += nr_extents;
5555 spin_unlock(&BTRFS_I(inode)->lock);
5556
5557 if (delalloc_lock)
5558 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5559
5560 if (to_reserve)
5561 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5562 btrfs_ino(inode), to_reserve, 1);
5563 block_rsv_add_bytes(block_rsv, to_reserve, 1);
5564
5565 return 0;
5566
5567 out_fail:
5568 spin_lock(&BTRFS_I(inode)->lock);
5569 dropped = drop_outstanding_extent(inode, num_bytes);
5570 /*
5571  * If the inode's csum_bytes is the same as the original
5572  * csum_bytes then we know we haven't raced with any free()ers
5573  * so we can just reduce our inode's csum bytes and carry on.
5574 */
5575 if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
5576 calc_csum_metadata_size(inode, num_bytes, 0);
5577 } else {
5578 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
5579 u64 bytes;
5580
5581 /*
5582 * This is tricky, but first we need to figure out how much we
5583  * freed from any free-ers that occurred during this
5584 * reservation, so we reset ->csum_bytes to the csum_bytes
5585 * before we dropped our lock, and then call the free for the
5586 * number of bytes that were freed while we were trying our
5587 * reservation.
5588 */
5589 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
5590 BTRFS_I(inode)->csum_bytes = csum_bytes;
5591 to_free = calc_csum_metadata_size(inode, bytes, 0);
5592
5593
5594 /*
5595 * Now we need to see how much we would have freed had we not
5596 * been making this reservation and our ->csum_bytes were not
5597 * artificially inflated.
5598 */
5599 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
5600 bytes = csum_bytes - orig_csum_bytes;
5601 bytes = calc_csum_metadata_size(inode, bytes, 0);
5602
5603 /*
5604 * Now reset ->csum_bytes to what it should be. If bytes is
5605 * more than to_free then we would have free'd more space had we
5606 * not had an artificially high ->csum_bytes, so we need to free
5607 * the remainder. If bytes is the same or less then we don't
5608 * need to do anything, the other free-ers did the correct
5609 * thing.
5610 */
5611 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
5612 if (bytes > to_free)
5613 to_free = bytes - to_free;
5614 else
5615 to_free = 0;
5616 }
5617 spin_unlock(&BTRFS_I(inode)->lock);
5618 if (dropped)
5619 to_free += btrfs_calc_trans_metadata_size(root, dropped);
5620
5621 if (to_free) {
5622 btrfs_block_rsv_release(root, block_rsv, to_free);
5623 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5624 btrfs_ino(inode), to_free, 0);
5625 }
5626 if (delalloc_lock)
5627 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5628 return ret;
5629 }
5630
5631 /**
5632 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5633 * @inode: the inode to release the reservation for
5634 * @num_bytes: the number of bytes we're releasing
5635 *
5636 * This will release the metadata reservation for an inode. This can be called
5637 * once we complete IO for a given set of bytes to release their metadata
5638 * reservations.
5639 */
5640 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5641 {
5642 struct btrfs_root *root = BTRFS_I(inode)->root;
5643 u64 to_free = 0;
5644 unsigned dropped;
5645
5646 num_bytes = ALIGN(num_bytes, root->sectorsize);
5647 spin_lock(&BTRFS_I(inode)->lock);
5648 dropped = drop_outstanding_extent(inode, num_bytes);
5649
5650 if (num_bytes)
5651 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
5652 spin_unlock(&BTRFS_I(inode)->lock);
5653 if (dropped > 0)
5654 to_free += btrfs_calc_trans_metadata_size(root, dropped);
5655
5656 if (btrfs_test_is_dummy_root(root))
5657 return;
5658
5659 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5660 btrfs_ino(inode), to_free, 0);
5661
5662 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
5663 to_free);
5664 }
5665
5666 /**
5667 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
5668 * @inode: inode we're writing to
5669 * @num_bytes: the number of bytes we want to allocate
5670 *
5671 * This will do the following things
5672 *
5673 * o reserve space in the data space info for num_bytes
5674  * o reserve space in the metadata space info based on the number of
5675  *   outstanding extents and how many csums will be needed
5676  * o add to the inode's ->delalloc_bytes
5677 * o add it to the fs_info's delalloc inodes list.
5678 *
5679 * This will return 0 for success and -ENOSPC if there is no space left.
5680 */
5681 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
5682 {
5683 int ret;
5684
5685 ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
5686 if (ret)
5687 return ret;
5688
5689 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
5690 if (ret) {
5691 btrfs_free_reserved_data_space(inode, num_bytes);
5692 return ret;
5693 }
5694
5695 return 0;
5696 }
5697
5698 /**
5699 * btrfs_delalloc_release_space - release data and metadata space for delalloc
5700 * @inode: inode we're releasing space for
5701 * @num_bytes: the number of bytes we want to free up
5702 *
5703 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
5704 * called in the case that we don't need the metadata AND data reservations
5705  * anymore, for example when there is an error or we insert an inline extent.
5706 *
5707 * This function will release the metadata space that was not used and will
5708 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5709 * list if there are no delalloc bytes left.
5710 */
5711 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
5712 {
5713 btrfs_delalloc_release_metadata(inode, num_bytes);
5714 btrfs_free_reserved_data_space(inode, num_bytes);
5715 }
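
/*
 * Illustrative sketch, not part of the original file: a buffered write path
 * would typically pair the two helpers above, releasing the space again only
 * when the write cannot proceed. do_write_pages() is a hypothetical
 * placeholder for copying in the user data:
 *
 *	ret = btrfs_delalloc_reserve_space(inode, count);
 *	if (ret)
 *		return ret;
 *	ret = do_write_pages(inode, pos, count);
 *	if (ret)
 *		btrfs_delalloc_release_space(inode, count);
 *	return ret;
 *
 * On success the reservation is handed off to the delalloc machinery and is
 * released later, e.g. via btrfs_delalloc_release_metadata() once the IO for
 * those bytes completes.
 */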
5716
5717 static int update_block_group(struct btrfs_trans_handle *trans,
5718 struct btrfs_root *root, u64 bytenr,
5719 u64 num_bytes, int alloc)
5720 {
5721 struct btrfs_block_group_cache *cache = NULL;
5722 struct btrfs_fs_info *info = root->fs_info;
5723 u64 total = num_bytes;
5724 u64 old_val;
5725 u64 byte_in_group;
5726 int factor;
5727
5728 /* block accounting for super block */
5729 spin_lock(&info->delalloc_root_lock);
5730 old_val = btrfs_super_bytes_used(info->super_copy);
5731 if (alloc)
5732 old_val += num_bytes;
5733 else
5734 old_val -= num_bytes;
5735 btrfs_set_super_bytes_used(info->super_copy, old_val);
5736 spin_unlock(&info->delalloc_root_lock);
5737
5738 while (total) {
5739 cache = btrfs_lookup_block_group(info, bytenr);
5740 if (!cache)
5741 return -ENOENT;
5742 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
5743 BTRFS_BLOCK_GROUP_RAID1 |
5744 BTRFS_BLOCK_GROUP_RAID10))
5745 factor = 2;
5746 else
5747 factor = 1;
5748 /*
5749 * If this block group has free space cache written out, we
5750 * need to make sure to load it if we are removing space. This
5751 * is because we need the unpinning stage to actually add the
5752 * space back to the block group, otherwise we will leak space.
5753 */
5754 if (!alloc && cache->cached == BTRFS_CACHE_NO)
5755 cache_block_group(cache, 1);
5756
5757 byte_in_group = bytenr - cache->key.objectid;
5758 WARN_ON(byte_in_group > cache->key.offset);
5759
5760 spin_lock(&cache->space_info->lock);
5761 spin_lock(&cache->lock);
5762
5763 if (btrfs_test_opt(root, SPACE_CACHE) &&
5764 cache->disk_cache_state < BTRFS_DC_CLEAR)
5765 cache->disk_cache_state = BTRFS_DC_CLEAR;
5766
5767 old_val = btrfs_block_group_used(&cache->item);
5768 num_bytes = min(total, cache->key.offset - byte_in_group);
5769 if (alloc) {
5770 old_val += num_bytes;
5771 btrfs_set_block_group_used(&cache->item, old_val);
5772 cache->reserved -= num_bytes;
5773 cache->space_info->bytes_reserved -= num_bytes;
5774 cache->space_info->bytes_used += num_bytes;
5775 cache->space_info->disk_used += num_bytes * factor;
5776 spin_unlock(&cache->lock);
5777 spin_unlock(&cache->space_info->lock);
5778 } else {
5779 old_val -= num_bytes;
5780 btrfs_set_block_group_used(&cache->item, old_val);
5781 cache->pinned += num_bytes;
5782 cache->space_info->bytes_pinned += num_bytes;
5783 cache->space_info->bytes_used -= num_bytes;
5784 cache->space_info->disk_used -= num_bytes * factor;
5785 spin_unlock(&cache->lock);
5786 spin_unlock(&cache->space_info->lock);
5787
5788 set_extent_dirty(info->pinned_extents,
5789 bytenr, bytenr + num_bytes - 1,
5790 GFP_NOFS | __GFP_NOFAIL);
5791 /*
5792 * No longer have used bytes in this block group, queue
5793 * it for deletion.
5794 */
5795 if (old_val == 0) {
5796 spin_lock(&info->unused_bgs_lock);
5797 if (list_empty(&cache->bg_list)) {
5798 btrfs_get_block_group(cache);
5799 list_add_tail(&cache->bg_list,
5800 &info->unused_bgs);
5801 }
5802 spin_unlock(&info->unused_bgs_lock);
5803 }
5804 }
5805
5806 spin_lock(&trans->transaction->dirty_bgs_lock);
5807 if (list_empty(&cache->dirty_list)) {
5808 list_add_tail(&cache->dirty_list,
5809 &trans->transaction->dirty_bgs);
5810 trans->transaction->num_dirty_bgs++;
5811 btrfs_get_block_group(cache);
5812 }
5813 spin_unlock(&trans->transaction->dirty_bgs_lock);
5814
5815 btrfs_put_block_group(cache);
5816 total -= num_bytes;
5817 bytenr += num_bytes;
5818 }
5819 return 0;
5820 }
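
/*
 * Worked example, illustrative and not from the original file: freeing 1 MiB
 * from a RAID1 block group takes the !alloc branch above with factor == 2,
 * so space_info->bytes_used drops by 1 MiB while space_info->disk_used drops
 * by 2 MiB, matching the two on-disk copies; the freed range is also marked
 * dirty in pinned_extents so it is only returned to the free space
 * accounting at unpin time.
 */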
5821
5822 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
5823 {
5824 struct btrfs_block_group_cache *cache;
5825 u64 bytenr;
5826
5827 spin_lock(&root->fs_info->block_group_cache_lock);
5828 bytenr = root->fs_info->first_logical_byte;
5829 spin_unlock(&root->fs_info->block_group_cache_lock);
5830
5831 if (bytenr < (u64)-1)
5832 return bytenr;
5833
5834 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
5835 if (!cache)
5836 return 0;
5837
5838 bytenr = cache->key.objectid;
5839 btrfs_put_block_group(cache);
5840
5841 return bytenr;
5842 }
5843
5844 static int pin_down_extent(struct btrfs_root *root,
5845 struct btrfs_block_group_cache *cache,
5846 u64 bytenr, u64 num_bytes, int reserved)
5847 {
5848 spin_lock(&cache->space_info->lock);
5849 spin_lock(&cache->lock);
5850 cache->pinned += num_bytes;
5851 cache->space_info->bytes_pinned += num_bytes;
5852 if (reserved) {
5853 cache->reserved -= num_bytes;
5854 cache->space_info->bytes_reserved -= num_bytes;
5855 }
5856 spin_unlock(&cache->lock);
5857 spin_unlock(&cache->space_info->lock);
5858
5859 set_extent_dirty(root->fs_info->pinned_extents, bytenr,
5860 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
5861 if (reserved)
5862 trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
5863 return 0;
5864 }
5865
5866 /*
5867  * this function must be called within a transaction
5868 */
5869 int btrfs_pin_extent(struct btrfs_root *root,
5870 u64 bytenr, u64 num_bytes, int reserved)
5871 {
5872 struct btrfs_block_group_cache *cache;
5873
5874 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5875 BUG_ON(!cache); /* Logic error */
5876
5877 pin_down_extent(root, cache, bytenr, num_bytes, reserved);
5878
5879 btrfs_put_block_group(cache);
5880 return 0;
5881 }
5882
5883 /*
5884  * this function must be called within a transaction
5885 */
5886 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5887 u64 bytenr, u64 num_bytes)
5888 {
5889 struct btrfs_block_group_cache *cache;
5890 int ret;
5891
5892 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5893 if (!cache)
5894 return -EINVAL;
5895
5896 /*
5897 * pull in the free space cache (if any) so that our pin
5898 * removes the free space from the cache. We have load_only set
5899 * to one because the slow code to read in the free extents does check
5900 * the pinned extents.
5901 */
5902 cache_block_group(cache, 1);
5903
5904 pin_down_extent(root, cache, bytenr, num_bytes, 0);
5905
5906 /* remove us from the free space cache (if we're there at all) */
5907 ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
5908 btrfs_put_block_group(cache);
5909 return ret;
5910 }
5911
5912 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
5913 {
5914 int ret;
5915 struct btrfs_block_group_cache *block_group;
5916 struct btrfs_caching_control *caching_ctl;
5917
5918 block_group = btrfs_lookup_block_group(root->fs_info, start);
5919 if (!block_group)
5920 return -EINVAL;
5921
5922 cache_block_group(block_group, 0);
5923 caching_ctl = get_caching_control(block_group);
5924
5925 if (!caching_ctl) {
5926 /* Logic error */
5927 BUG_ON(!block_group_cache_done(block_group));
5928 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5929 } else {
5930 mutex_lock(&caching_ctl->mutex);
5931
5932 if (start >= caching_ctl->progress) {
5933 ret = add_excluded_extent(root, start, num_bytes);
5934 } else if (start + num_bytes <= caching_ctl->progress) {
5935 ret = btrfs_remove_free_space(block_group,
5936 start, num_bytes);
5937 } else {
5938 num_bytes = caching_ctl->progress - start;
5939 ret = btrfs_remove_free_space(block_group,
5940 start, num_bytes);
5941 if (ret)
5942 goto out_lock;
5943
5944 num_bytes = (start + num_bytes) -
5945 caching_ctl->progress;
5946 start = caching_ctl->progress;
5947 ret = add_excluded_extent(root, start, num_bytes);
5948 }
5949 out_lock:
5950 mutex_unlock(&caching_ctl->mutex);
5951 put_caching_control(caching_ctl);
5952 }
5953 btrfs_put_block_group(block_group);
5954 return ret;
5955 }
5956
5957 int btrfs_exclude_logged_extents(struct btrfs_root *log,
5958 struct extent_buffer *eb)
5959 {
5960 struct btrfs_file_extent_item *item;
5961 struct btrfs_key key;
5962 int found_type;
5963 int i;
5964
5965 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
5966 return 0;
5967
5968 for (i = 0; i < btrfs_header_nritems(eb); i++) {
5969 btrfs_item_key_to_cpu(eb, &key, i);
5970 if (key.type != BTRFS_EXTENT_DATA_KEY)
5971 continue;
5972 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5973 found_type = btrfs_file_extent_type(eb, item);
5974 if (found_type == BTRFS_FILE_EXTENT_INLINE)
5975 continue;
5976 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5977 continue;
5978 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5979 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
5980 __exclude_logged_extent(log, key.objectid, key.offset);
5981 }
5982
5983 return 0;
5984 }
5985
5986 /**
5987 * btrfs_update_reserved_bytes - update the block_group and space info counters
5988 * @cache: The cache we are manipulating
5989 * @num_bytes: The number of bytes in question
5990 * @reserve: One of the reservation enums
5991 * @delalloc: The blocks are allocated for the delalloc write
5992 *
5993 * This is called by the allocator when it reserves space, or by somebody who is
5994 * freeing space that was never actually used on disk. For example if you
5995 * reserve some space for a new leaf in transaction A and before transaction A
5996  * commits you free that leaf, you call this with reserve set to RESERVE_FREE
5997  * in order to clear the reservation.
5998 *
5999 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
6000 * ENOSPC accounting. For data we handle the reservation through clearing the
6001 * delalloc bits in the io_tree. We have to do this since we could end up
6002 * allocating less disk space for the amount of data we have reserved in the
6003 * case of compression.
6004 *
6005 * If this is a reservation and the block group has become read only we cannot
6006 * make the reservation and return -EAGAIN, otherwise this function always
6007 * succeeds.
6008 */
6009 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
6010 u64 num_bytes, int reserve, int delalloc)
6011 {
6012 struct btrfs_space_info *space_info = cache->space_info;
6013 int ret = 0;
6014
6015 spin_lock(&space_info->lock);
6016 spin_lock(&cache->lock);
6017 if (reserve != RESERVE_FREE) {
6018 if (cache->ro) {
6019 ret = -EAGAIN;
6020 } else {
6021 cache->reserved += num_bytes;
6022 space_info->bytes_reserved += num_bytes;
6023 if (reserve == RESERVE_ALLOC) {
6024 trace_btrfs_space_reservation(cache->fs_info,
6025 "space_info", space_info->flags,
6026 num_bytes, 0);
6027 space_info->bytes_may_use -= num_bytes;
6028 }
6029
6030 if (delalloc)
6031 cache->delalloc_bytes += num_bytes;
6032 }
6033 } else {
6034 if (cache->ro)
6035 space_info->bytes_readonly += num_bytes;
6036 cache->reserved -= num_bytes;
6037 space_info->bytes_reserved -= num_bytes;
6038
6039 if (delalloc)
6040 cache->delalloc_bytes -= num_bytes;
6041 }
6042 spin_unlock(&cache->lock);
6043 spin_unlock(&space_info->lock);
6044 return ret;
6045 }
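
/*
 * Illustrative sketch, not part of the original file, of the scenario
 * described in the comment above: space reserved for a new leaf that is
 * freed again before the transaction commits.
 *
 *	ret = btrfs_update_reserved_bytes(cache, len, RESERVE_ALLOC, 0);
 *	if (ret == -EAGAIN)
 *		return ret;	(the block group went read only)
 *	...
 *	(the leaf is dropped before it was ever written out)
 *	btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, 0);
 */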
6046
6047 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
6048 struct btrfs_root *root)
6049 {
6050 struct btrfs_fs_info *fs_info = root->fs_info;
6051 struct btrfs_caching_control *next;
6052 struct btrfs_caching_control *caching_ctl;
6053 struct btrfs_block_group_cache *cache;
6054
6055 down_write(&fs_info->commit_root_sem);
6056
6057 list_for_each_entry_safe(caching_ctl, next,
6058 &fs_info->caching_block_groups, list) {
6059 cache = caching_ctl->block_group;
6060 if (block_group_cache_done(cache)) {
6061 cache->last_byte_to_unpin = (u64)-1;
6062 list_del_init(&caching_ctl->list);
6063 put_caching_control(caching_ctl);
6064 } else {
6065 cache->last_byte_to_unpin = caching_ctl->progress;
6066 }
6067 }
6068
6069 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6070 fs_info->pinned_extents = &fs_info->freed_extents[1];
6071 else
6072 fs_info->pinned_extents = &fs_info->freed_extents[0];
6073
6074 up_write(&fs_info->commit_root_sem);
6075
6076 update_global_block_rsv(fs_info);
6077 }
6078
6079 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
6080 const bool return_free_space)
6081 {
6082 struct btrfs_fs_info *fs_info = root->fs_info;
6083 struct btrfs_block_group_cache *cache = NULL;
6084 struct btrfs_space_info *space_info;
6085 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6086 u64 len;
6087 bool readonly;
6088
6089 while (start <= end) {
6090 readonly = false;
6091 if (!cache ||
6092 start >= cache->key.objectid + cache->key.offset) {
6093 if (cache)
6094 btrfs_put_block_group(cache);
6095 cache = btrfs_lookup_block_group(fs_info, start);
6096 BUG_ON(!cache); /* Logic error */
6097 }
6098
6099 len = cache->key.objectid + cache->key.offset - start;
6100 len = min(len, end + 1 - start);
6101
6102 if (start < cache->last_byte_to_unpin) {
6103 len = min(len, cache->last_byte_to_unpin - start);
6104 if (return_free_space)
6105 btrfs_add_free_space(cache, start, len);
6106 }
6107
6108 start += len;
6109 space_info = cache->space_info;
6110
6111 spin_lock(&space_info->lock);
6112 spin_lock(&cache->lock);
6113 cache->pinned -= len;
6114 space_info->bytes_pinned -= len;
6115 percpu_counter_add(&space_info->total_bytes_pinned, -len);
6116 if (cache->ro) {
6117 space_info->bytes_readonly += len;
6118 readonly = true;
6119 }
6120 spin_unlock(&cache->lock);
6121 if (!readonly && global_rsv->space_info == space_info) {
6122 spin_lock(&global_rsv->lock);
6123 if (!global_rsv->full) {
6124 len = min(len, global_rsv->size -
6125 global_rsv->reserved);
6126 global_rsv->reserved += len;
6127 space_info->bytes_may_use += len;
6128 if (global_rsv->reserved >= global_rsv->size)
6129 global_rsv->full = 1;
6130 }
6131 spin_unlock(&global_rsv->lock);
6132 }
6133 spin_unlock(&space_info->lock);
6134 }
6135
6136 if (cache)
6137 btrfs_put_block_group(cache);
6138 return 0;
6139 }
6140
6141 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
6142 struct btrfs_root *root)
6143 {
6144 struct btrfs_fs_info *fs_info = root->fs_info;
6145 struct btrfs_block_group_cache *block_group, *tmp;
6146 struct list_head *deleted_bgs;
6147 struct extent_io_tree *unpin;
6148 u64 start;
6149 u64 end;
6150 int ret;
6151
6152 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6153 unpin = &fs_info->freed_extents[1];
6154 else
6155 unpin = &fs_info->freed_extents[0];
6156
6157 while (!trans->aborted) {
6158 mutex_lock(&fs_info->unused_bg_unpin_mutex);
6159 ret = find_first_extent_bit(unpin, 0, &start, &end,
6160 EXTENT_DIRTY, NULL);
6161 if (ret) {
6162 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6163 break;
6164 }
6165
6166 if (btrfs_test_opt(root, DISCARD))
6167 ret = btrfs_discard_extent(root, start,
6168 end + 1 - start, NULL);
6169
6170 clear_extent_dirty(unpin, start, end, GFP_NOFS);
6171 unpin_extent_range(root, start, end, true);
6172 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6173 cond_resched();
6174 }
6175
6176 /*
6177 * Transaction is finished. We don't need the lock anymore. We
6178 * do need to clean up the block groups in case of a transaction
6179 * abort.
6180 */
6181 deleted_bgs = &trans->transaction->deleted_bgs;
6182 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6183 u64 trimmed = 0;
6184
6185 ret = -EROFS;
6186 if (!trans->aborted)
6187 ret = btrfs_discard_extent(root,
6188 block_group->key.objectid,
6189 block_group->key.offset,
6190 &trimmed);
6191
6192 list_del_init(&block_group->bg_list);
6193 btrfs_put_block_group_trimming(block_group);
6194 btrfs_put_block_group(block_group);
6195
6196 if (ret) {
6197 const char *errstr = btrfs_decode_error(ret);
6198 btrfs_warn(fs_info,
6199 "Discard failed while removing blockgroup: errno=%d %s\n",
6200 ret, errstr);
6201 }
6202 }
6203
6204 return 0;
6205 }
6206
6207 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
6208 u64 owner, u64 root_objectid)
6209 {
6210 struct btrfs_space_info *space_info;
6211 u64 flags;
6212
6213 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6214 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
6215 flags = BTRFS_BLOCK_GROUP_SYSTEM;
6216 else
6217 flags = BTRFS_BLOCK_GROUP_METADATA;
6218 } else {
6219 flags = BTRFS_BLOCK_GROUP_DATA;
6220 }
6221
6222 space_info = __find_space_info(fs_info, flags);
6223 BUG_ON(!space_info); /* Logic bug */
6224 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
6225 }
6226
6227
6228 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6229 struct btrfs_root *root,
6230 struct btrfs_delayed_ref_node *node, u64 parent,
6231 u64 root_objectid, u64 owner_objectid,
6232 u64 owner_offset, int refs_to_drop,
6233 struct btrfs_delayed_extent_op *extent_op)
6234 {
6235 struct btrfs_key key;
6236 struct btrfs_path *path;
6237 struct btrfs_fs_info *info = root->fs_info;
6238 struct btrfs_root *extent_root = info->extent_root;
6239 struct extent_buffer *leaf;
6240 struct btrfs_extent_item *ei;
6241 struct btrfs_extent_inline_ref *iref;
6242 int ret;
6243 int is_data;
6244 int extent_slot = 0;
6245 int found_extent = 0;
6246 int num_to_del = 1;
6247 int no_quota = node->no_quota;
6248 u32 item_size;
6249 u64 refs;
6250 u64 bytenr = node->bytenr;
6251 u64 num_bytes = node->num_bytes;
6252 int last_ref = 0;
6253 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6254 SKINNY_METADATA);
6255
6256 if (!info->quota_enabled || !is_fstree(root_objectid))
6257 no_quota = 1;
6258
6259 path = btrfs_alloc_path();
6260 if (!path)
6261 return -ENOMEM;
6262
6263 path->reada = 1;
6264 path->leave_spinning = 1;
6265
6266 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6267 BUG_ON(!is_data && refs_to_drop != 1);
6268
6269 if (is_data)
6270 skinny_metadata = 0;
6271
6272 ret = lookup_extent_backref(trans, extent_root, path, &iref,
6273 bytenr, num_bytes, parent,
6274 root_objectid, owner_objectid,
6275 owner_offset);
6276 if (ret == 0) {
6277 extent_slot = path->slots[0];
6278 while (extent_slot >= 0) {
6279 btrfs_item_key_to_cpu(path->nodes[0], &key,
6280 extent_slot);
6281 if (key.objectid != bytenr)
6282 break;
6283 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6284 key.offset == num_bytes) {
6285 found_extent = 1;
6286 break;
6287 }
6288 if (key.type == BTRFS_METADATA_ITEM_KEY &&
6289 key.offset == owner_objectid) {
6290 found_extent = 1;
6291 break;
6292 }
6293 if (path->slots[0] - extent_slot > 5)
6294 break;
6295 extent_slot--;
6296 }
6297 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6298 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
6299 if (found_extent && item_size < sizeof(*ei))
6300 found_extent = 0;
6301 #endif
6302 if (!found_extent) {
6303 BUG_ON(iref);
6304 ret = remove_extent_backref(trans, extent_root, path,
6305 NULL, refs_to_drop,
6306 is_data, &last_ref);
6307 if (ret) {
6308 btrfs_abort_transaction(trans, extent_root, ret);
6309 goto out;
6310 }
6311 btrfs_release_path(path);
6312 path->leave_spinning = 1;
6313
6314 key.objectid = bytenr;
6315 key.type = BTRFS_EXTENT_ITEM_KEY;
6316 key.offset = num_bytes;
6317
6318 if (!is_data && skinny_metadata) {
6319 key.type = BTRFS_METADATA_ITEM_KEY;
6320 key.offset = owner_objectid;
6321 }
6322
6323 ret = btrfs_search_slot(trans, extent_root,
6324 &key, path, -1, 1);
6325 if (ret > 0 && skinny_metadata && path->slots[0]) {
6326 /*
6327 * Couldn't find our skinny metadata item,
6328 * see if we have ye olde extent item.
6329 */
6330 path->slots[0]--;
6331 btrfs_item_key_to_cpu(path->nodes[0], &key,
6332 path->slots[0]);
6333 if (key.objectid == bytenr &&
6334 key.type == BTRFS_EXTENT_ITEM_KEY &&
6335 key.offset == num_bytes)
6336 ret = 0;
6337 }
6338
6339 if (ret > 0 && skinny_metadata) {
6340 skinny_metadata = false;
6341 key.objectid = bytenr;
6342 key.type = BTRFS_EXTENT_ITEM_KEY;
6343 key.offset = num_bytes;
6344 btrfs_release_path(path);
6345 ret = btrfs_search_slot(trans, extent_root,
6346 &key, path, -1, 1);
6347 }
6348
6349 if (ret) {
6350 btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6351 ret, bytenr);
6352 if (ret > 0)
6353 btrfs_print_leaf(extent_root,
6354 path->nodes[0]);
6355 }
6356 if (ret < 0) {
6357 btrfs_abort_transaction(trans, extent_root, ret);
6358 goto out;
6359 }
6360 extent_slot = path->slots[0];
6361 }
6362 } else if (WARN_ON(ret == -ENOENT)) {
6363 btrfs_print_leaf(extent_root, path->nodes[0]);
6364 btrfs_err(info,
6365 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
6366 bytenr, parent, root_objectid, owner_objectid,
6367 owner_offset);
6368 btrfs_abort_transaction(trans, extent_root, ret);
6369 goto out;
6370 } else {
6371 btrfs_abort_transaction(trans, extent_root, ret);
6372 goto out;
6373 }
6374
6375 leaf = path->nodes[0];
6376 item_size = btrfs_item_size_nr(leaf, extent_slot);
6377 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6378 if (item_size < sizeof(*ei)) {
6379 BUG_ON(found_extent || extent_slot != path->slots[0]);
6380 ret = convert_extent_item_v0(trans, extent_root, path,
6381 owner_objectid, 0);
6382 if (ret < 0) {
6383 btrfs_abort_transaction(trans, extent_root, ret);
6384 goto out;
6385 }
6386
6387 btrfs_release_path(path);
6388 path->leave_spinning = 1;
6389
6390 key.objectid = bytenr;
6391 key.type = BTRFS_EXTENT_ITEM_KEY;
6392 key.offset = num_bytes;
6393
6394 ret = btrfs_search_slot(trans, extent_root, &key, path,
6395 -1, 1);
6396 if (ret) {
6397 btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6398 ret, bytenr);
6399 btrfs_print_leaf(extent_root, path->nodes[0]);
6400 }
6401 if (ret < 0) {
6402 btrfs_abort_transaction(trans, extent_root, ret);
6403 goto out;
6404 }
6405
6406 extent_slot = path->slots[0];
6407 leaf = path->nodes[0];
6408 item_size = btrfs_item_size_nr(leaf, extent_slot);
6409 }
6410 #endif
6411 BUG_ON(item_size < sizeof(*ei));
6412 ei = btrfs_item_ptr(leaf, extent_slot,
6413 struct btrfs_extent_item);
6414 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
6415 key.type == BTRFS_EXTENT_ITEM_KEY) {
6416 struct btrfs_tree_block_info *bi;
6417 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
6418 bi = (struct btrfs_tree_block_info *)(ei + 1);
6419 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
6420 }
6421
6422 refs = btrfs_extent_refs(leaf, ei);
6423 if (refs < refs_to_drop) {
6424 btrfs_err(info, "trying to drop %d refs but we only have %Lu "
6425 "for bytenr %Lu", refs_to_drop, refs, bytenr);
6426 ret = -EINVAL;
6427 btrfs_abort_transaction(trans, extent_root, ret);
6428 goto out;
6429 }
6430 refs -= refs_to_drop;
6431
6432 if (refs > 0) {
6433 if (extent_op)
6434 __run_delayed_extent_op(extent_op, leaf, ei);
6435 /*
6436 * In the case of inline back ref, reference count will
6437 * be updated by remove_extent_backref
6438 */
6439 if (iref) {
6440 BUG_ON(!found_extent);
6441 } else {
6442 btrfs_set_extent_refs(leaf, ei, refs);
6443 btrfs_mark_buffer_dirty(leaf);
6444 }
6445 if (found_extent) {
6446 ret = remove_extent_backref(trans, extent_root, path,
6447 iref, refs_to_drop,
6448 is_data, &last_ref);
6449 if (ret) {
6450 btrfs_abort_transaction(trans, extent_root, ret);
6451 goto out;
6452 }
6453 }
6454 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
6455 root_objectid);
6456 } else {
6457 if (found_extent) {
6458 BUG_ON(is_data && refs_to_drop !=
6459 extent_data_ref_count(path, iref));
6460 if (iref) {
6461 BUG_ON(path->slots[0] != extent_slot);
6462 } else {
6463 BUG_ON(path->slots[0] != extent_slot + 1);
6464 path->slots[0] = extent_slot;
6465 num_to_del = 2;
6466 }
6467 }
6468
6469 last_ref = 1;
6470 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
6471 num_to_del);
6472 if (ret) {
6473 btrfs_abort_transaction(trans, extent_root, ret);
6474 goto out;
6475 }
6476 btrfs_release_path(path);
6477
6478 if (is_data) {
6479 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
6480 if (ret) {
6481 btrfs_abort_transaction(trans, extent_root, ret);
6482 goto out;
6483 }
6484 }
6485
6486 ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
6487 num_bytes);
6488 if (ret) {
6489 btrfs_abort_transaction(trans, extent_root, ret);
6490 goto out;
6491 }
6492
6493 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
6494 if (ret) {
6495 btrfs_abort_transaction(trans, extent_root, ret);
6496 goto out;
6497 }
6498 }
6499 btrfs_release_path(path);
6500
6501 out:
6502 btrfs_free_path(path);
6503 return ret;
6504 }
6505
6506 /*
6507  * when we free a block, it is possible (and likely) that we free the last
6508 * delayed ref for that extent as well. This searches the delayed ref tree for
6509 * a given extent, and if there are no other delayed refs to be processed, it
6510 * removes it from the tree.
6511 */
6512 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
6513 struct btrfs_root *root, u64 bytenr)
6514 {
6515 struct btrfs_delayed_ref_head *head;
6516 struct btrfs_delayed_ref_root *delayed_refs;
6517 int ret = 0;
6518
6519 delayed_refs = &trans->transaction->delayed_refs;
6520 spin_lock(&delayed_refs->lock);
6521 head = btrfs_find_delayed_ref_head(trans, bytenr);
6522 if (!head)
6523 goto out_delayed_unlock;
6524
6525 spin_lock(&head->lock);
6526 if (!list_empty(&head->ref_list))
6527 goto out;
6528
6529 if (head->extent_op) {
6530 if (!head->must_insert_reserved)
6531 goto out;
6532 btrfs_free_delayed_extent_op(head->extent_op);
6533 head->extent_op = NULL;
6534 }
6535
6536 /*
6537 * waiting for the lock here would deadlock. If someone else has it
6538 * locked they are already in the process of dropping it anyway
6539 */
6540 if (!mutex_trylock(&head->mutex))
6541 goto out;
6542
6543 /*
6544 * at this point we have a head with no other entries. Go
6545 * ahead and process it.
6546 */
6547 head->node.in_tree = 0;
6548 rb_erase(&head->href_node, &delayed_refs->href_root);
6549
6550 atomic_dec(&delayed_refs->num_entries);
6551
6552 /*
6553 * we don't take a ref on the node because we're removing it from the
6554 * tree, so we just steal the ref the tree was holding.
6555 */
6556 delayed_refs->num_heads--;
6557 if (head->processing == 0)
6558 delayed_refs->num_heads_ready--;
6559 head->processing = 0;
6560 spin_unlock(&head->lock);
6561 spin_unlock(&delayed_refs->lock);
6562
6563 BUG_ON(head->extent_op);
6564 if (head->must_insert_reserved)
6565 ret = 1;
6566
6567 mutex_unlock(&head->mutex);
6568 btrfs_put_delayed_ref(&head->node);
6569 return ret;
6570 out:
6571 spin_unlock(&head->lock);
6572
6573 out_delayed_unlock:
6574 spin_unlock(&delayed_refs->lock);
6575 return 0;
6576 }
6577
6578 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6579 struct btrfs_root *root,
6580 struct extent_buffer *buf,
6581 u64 parent, int last_ref)
6582 {
6583 int pin = 1;
6584 int ret;
6585
6586 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6587 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6588 buf->start, buf->len,
6589 parent, root->root_key.objectid,
6590 btrfs_header_level(buf),
6591 BTRFS_DROP_DELAYED_REF, NULL, 0);
6592 BUG_ON(ret); /* -ENOMEM */
6593 }
6594
6595 if (!last_ref)
6596 return;
6597
6598 if (btrfs_header_generation(buf) == trans->transid) {
6599 struct btrfs_block_group_cache *cache;
6600
6601 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6602 ret = check_ref_cleanup(trans, root, buf->start);
6603 if (!ret)
6604 goto out;
6605 }
6606
6607 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
6608
6609 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
6610 pin_down_extent(root, cache, buf->start, buf->len, 1);
6611 btrfs_put_block_group(cache);
6612 goto out;
6613 }
6614
6615 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
6616
6617 btrfs_add_free_space(cache, buf->start, buf->len);
6618 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
6619 btrfs_put_block_group(cache);
6620 trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
6621 pin = 0;
6622 }
6623 out:
6624 if (pin)
6625 add_pinned_bytes(root->fs_info, buf->len,
6626 btrfs_header_level(buf),
6627 root->root_key.objectid);
6628
6629 /*
6630 * Deleting the buffer, clear the corrupt flag since it doesn't matter
6631 * anymore.
6632 */
6633 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
6634 }
6635
6636 /* Can return -ENOMEM */
6637 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6638 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
6639 u64 owner, u64 offset, int no_quota)
6640 {
6641 int ret;
6642 struct btrfs_fs_info *fs_info = root->fs_info;
6643
6644 if (btrfs_test_is_dummy_root(root))
6645 return 0;
6646
6647 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
6648
6649 /*
6650 * tree log blocks never actually go into the extent allocation
6651 * tree, just update pinning info and exit early.
6652 */
6653 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
6654 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
6655 /* unlocks the pinned mutex */
6656 btrfs_pin_extent(root, bytenr, num_bytes, 1);
6657 ret = 0;
6658 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6659 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
6660 num_bytes,
6661 parent, root_objectid, (int)owner,
6662 BTRFS_DROP_DELAYED_REF, NULL, no_quota);
6663 } else {
6664 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
6665 num_bytes,
6666 parent, root_objectid, owner,
6667 offset, BTRFS_DROP_DELAYED_REF,
6668 NULL, no_quota);
6669 }
6670 return ret;
6671 }
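
/*
 * Illustrative sketch, not part of the original file: dropping a single
 * reference to a file data extent from a subvolume tree. disk_bytenr,
 * disk_num_bytes and file_extent_offset are placeholders for values taken
 * from the file extent item being removed:
 *
 *	ret = btrfs_free_extent(trans, root, disk_bytenr, disk_num_bytes,
 *				0, root->root_key.objectid,
 *				btrfs_ino(inode), file_extent_offset, 0);
 *
 * This only queues a delayed ref; the extent item itself is updated or
 * deleted later by __btrfs_free_extent() when the delayed refs are run.
 */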
6672
6673 /*
6674  * when we wait for progress in the block group caching, it's because
6675 * our allocation attempt failed at least once. So, we must sleep
6676 * and let some progress happen before we try again.
6677 *
6678 * This function will sleep at least once waiting for new free space to
6679 * show up, and then it will check the block group free space numbers
6680 * for our min num_bytes. Another option is to have it go ahead
6681 * and look in the rbtree for a free extent of a given size, but this
6682 * is a good start.
6683 *
6684 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
6685 * any of the information in this block group.
6686 */
6687 static noinline void
6688 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
6689 u64 num_bytes)
6690 {
6691 struct btrfs_caching_control *caching_ctl;
6692
6693 caching_ctl = get_caching_control(cache);
6694 if (!caching_ctl)
6695 return;
6696
6697 wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
6698 (cache->free_space_ctl->free_space >= num_bytes));
6699
6700 put_caching_control(caching_ctl);
6701 }
6702
6703 static noinline int
6704 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
6705 {
6706 struct btrfs_caching_control *caching_ctl;
6707 int ret = 0;
6708
6709 caching_ctl = get_caching_control(cache);
6710 if (!caching_ctl)
6711 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
6712
6713 wait_event(caching_ctl->wait, block_group_cache_done(cache));
6714 if (cache->cached == BTRFS_CACHE_ERROR)
6715 ret = -EIO;
6716 put_caching_control(caching_ctl);
6717 return ret;
6718 }
6719
6720 int __get_raid_index(u64 flags)
6721 {
6722 if (flags & BTRFS_BLOCK_GROUP_RAID10)
6723 return BTRFS_RAID_RAID10;
6724 else if (flags & BTRFS_BLOCK_GROUP_RAID1)
6725 return BTRFS_RAID_RAID1;
6726 else if (flags & BTRFS_BLOCK_GROUP_DUP)
6727 return BTRFS_RAID_DUP;
6728 else if (flags & BTRFS_BLOCK_GROUP_RAID0)
6729 return BTRFS_RAID_RAID0;
6730 else if (flags & BTRFS_BLOCK_GROUP_RAID5)
6731 return BTRFS_RAID_RAID5;
6732 else if (flags & BTRFS_BLOCK_GROUP_RAID6)
6733 return BTRFS_RAID_RAID6;
6734
6735 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
6736 }
6737
6738 int get_block_group_index(struct btrfs_block_group_cache *cache)
6739 {
6740 return __get_raid_index(cache->flags);
6741 }
6742
6743 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
6744 [BTRFS_RAID_RAID10] = "raid10",
6745 [BTRFS_RAID_RAID1] = "raid1",
6746 [BTRFS_RAID_DUP] = "dup",
6747 [BTRFS_RAID_RAID0] = "raid0",
6748 [BTRFS_RAID_SINGLE] = "single",
6749 [BTRFS_RAID_RAID5] = "raid5",
6750 [BTRFS_RAID_RAID6] = "raid6",
6751 };
6752
6753 static const char *get_raid_name(enum btrfs_raid_types type)
6754 {
6755 if (type >= BTRFS_NR_RAID_TYPES)
6756 return NULL;
6757
6758 return btrfs_raid_type_names[type];
6759 }
6760
6761 enum btrfs_loop_type {
6762 LOOP_CACHING_NOWAIT = 0,
6763 LOOP_CACHING_WAIT = 1,
6764 LOOP_ALLOC_CHUNK = 2,
6765 LOOP_NO_EMPTY_SIZE = 3,
6766 };
6767
6768 static inline void
6769 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
6770 int delalloc)
6771 {
6772 if (delalloc)
6773 down_read(&cache->data_rwsem);
6774 }
6775
6776 static inline void
6777 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
6778 int delalloc)
6779 {
6780 btrfs_get_block_group(cache);
6781 if (delalloc)
6782 down_read(&cache->data_rwsem);
6783 }
6784
6785 static struct btrfs_block_group_cache *
6786 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
6787 struct btrfs_free_cluster *cluster,
6788 int delalloc)
6789 {
6790 struct btrfs_block_group_cache *used_bg;
6791 bool locked = false;
6792 again:
6793 spin_lock(&cluster->refill_lock);
6794 if (locked) {
6795 if (used_bg == cluster->block_group)
6796 return used_bg;
6797
6798 up_read(&used_bg->data_rwsem);
6799 btrfs_put_block_group(used_bg);
6800 }
6801
6802 used_bg = cluster->block_group;
6803 if (!used_bg)
6804 return NULL;
6805
6806 if (used_bg == block_group)
6807 return used_bg;
6808
6809 btrfs_get_block_group(used_bg);
6810
6811 if (!delalloc)
6812 return used_bg;
6813
6814 if (down_read_trylock(&used_bg->data_rwsem))
6815 return used_bg;
6816
6817 spin_unlock(&cluster->refill_lock);
6818 down_read(&used_bg->data_rwsem);
6819 locked = true;
6820 goto again;
6821 }
6822
6823 static inline void
6824 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
6825 int delalloc)
6826 {
6827 if (delalloc)
6828 up_read(&cache->data_rwsem);
6829 btrfs_put_block_group(cache);
6830 }
6831
6832 /*
6833  * walks the btree of allocated extents and finds a hole of a given size.
6834 * The key ins is changed to record the hole:
6835 * ins->objectid == start position
6836 * ins->flags = BTRFS_EXTENT_ITEM_KEY
6837 * ins->offset == the size of the hole.
6838 * Any available blocks before search_start are skipped.
6839 *
6840  * If there is no suitable free space, we will record the max size of the
6841  * free space extents found so far.
6842 */
6843 static noinline int find_free_extent(struct btrfs_root *orig_root,
6844 u64 num_bytes, u64 empty_size,
6845 u64 hint_byte, struct btrfs_key *ins,
6846 u64 flags, int delalloc)
6847 {
6848 int ret = 0;
6849 struct btrfs_root *root = orig_root->fs_info->extent_root;
6850 struct btrfs_free_cluster *last_ptr = NULL;
6851 struct btrfs_block_group_cache *block_group = NULL;
6852 u64 search_start = 0;
6853 u64 max_extent_size = 0;
6854 int empty_cluster = 2 * 1024 * 1024;
6855 struct btrfs_space_info *space_info;
6856 int loop = 0;
6857 int index = __get_raid_index(flags);
6858 int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
6859 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
6860 bool failed_cluster_refill = false;
6861 bool failed_alloc = false;
6862 bool use_cluster = true;
6863 bool have_caching_bg = false;
6864
6865 WARN_ON(num_bytes < root->sectorsize);
6866 ins->type = BTRFS_EXTENT_ITEM_KEY;
6867 ins->objectid = 0;
6868 ins->offset = 0;
6869
6870 trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
6871
6872 space_info = __find_space_info(root->fs_info, flags);
6873 if (!space_info) {
6874 btrfs_err(root->fs_info, "No space info for %llu", flags);
6875 return -ENOSPC;
6876 }
6877
6878 /*
6879 * If the space info is for both data and metadata it means we have a
6880 * small filesystem and we can't use the clustering stuff.
6881 */
6882 if (btrfs_mixed_space_info(space_info))
6883 use_cluster = false;
6884
6885 if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
6886 last_ptr = &root->fs_info->meta_alloc_cluster;
6887 if (!btrfs_test_opt(root, SSD))
6888 empty_cluster = 64 * 1024;
6889 }
6890
6891 if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
6892 btrfs_test_opt(root, SSD)) {
6893 last_ptr = &root->fs_info->data_alloc_cluster;
6894 }
6895
6896 if (last_ptr) {
6897 spin_lock(&last_ptr->lock);
6898 if (last_ptr->block_group)
6899 hint_byte = last_ptr->window_start;
6900 spin_unlock(&last_ptr->lock);
6901 }
6902
6903 search_start = max(search_start, first_logical_byte(root, 0));
6904 search_start = max(search_start, hint_byte);
6905
6906 if (!last_ptr)
6907 empty_cluster = 0;
6908
6909 if (search_start == hint_byte) {
6910 block_group = btrfs_lookup_block_group(root->fs_info,
6911 search_start);
6912 /*
6913 * we don't want to use the block group if it doesn't match our
6914 		 * allocation bits, or if it's not cached.
6915 *
6916 * However if we are re-searching with an ideal block group
6917 * picked out then we don't care that the block group is cached.
6918 */
6919 if (block_group && block_group_bits(block_group, flags) &&
6920 block_group->cached != BTRFS_CACHE_NO) {
6921 down_read(&space_info->groups_sem);
6922 if (list_empty(&block_group->list) ||
6923 block_group->ro) {
6924 /*
6925 * someone is removing this block group,
6926 * we can't jump into the have_block_group
6927 * target because our list pointers are not
6928 * valid
6929 */
6930 btrfs_put_block_group(block_group);
6931 up_read(&space_info->groups_sem);
6932 } else {
6933 index = get_block_group_index(block_group);
6934 btrfs_lock_block_group(block_group, delalloc);
6935 goto have_block_group;
6936 }
6937 } else if (block_group) {
6938 btrfs_put_block_group(block_group);
6939 }
6940 }
6941 search:
6942 have_caching_bg = false;
6943 down_read(&space_info->groups_sem);
6944 list_for_each_entry(block_group, &space_info->block_groups[index],
6945 list) {
6946 u64 offset;
6947 int cached;
6948
6949 btrfs_grab_block_group(block_group, delalloc);
6950 search_start = block_group->key.objectid;
6951
6952 /*
6953 * this can happen if we end up cycling through all the
6954 * raid types, but we want to make sure we only allocate
6955 * for the proper type.
6956 */
6957 if (!block_group_bits(block_group, flags)) {
6958 u64 extra = BTRFS_BLOCK_GROUP_DUP |
6959 BTRFS_BLOCK_GROUP_RAID1 |
6960 BTRFS_BLOCK_GROUP_RAID5 |
6961 BTRFS_BLOCK_GROUP_RAID6 |
6962 BTRFS_BLOCK_GROUP_RAID10;
6963
6964 /*
6965 * if they asked for extra copies and this block group
6966 * doesn't provide them, bail. This does allow us to
6967 * fill raid0 from raid1.
6968 */
6969 if ((flags & extra) && !(block_group->flags & extra))
6970 goto loop;
6971 }
6972
6973 have_block_group:
6974 cached = block_group_cache_done(block_group);
6975 if (unlikely(!cached)) {
6976 ret = cache_block_group(block_group, 0);
6977 BUG_ON(ret < 0);
6978 ret = 0;
6979 }
6980
6981 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
6982 goto loop;
6983 if (unlikely(block_group->ro))
6984 goto loop;
6985
6986 /*
6987 		 * OK, we want to try and use the cluster allocator, so
6988 		 * let's look there
6989 */
6990 if (last_ptr) {
6991 struct btrfs_block_group_cache *used_block_group;
6992 unsigned long aligned_cluster;
6993 /*
6994 * the refill lock keeps out other
6995 * people trying to start a new cluster
6996 */
6997 used_block_group = btrfs_lock_cluster(block_group,
6998 last_ptr,
6999 delalloc);
7000 if (!used_block_group)
7001 goto refill_cluster;
7002
7003 if (used_block_group != block_group &&
7004 (used_block_group->ro ||
7005 !block_group_bits(used_block_group, flags)))
7006 goto release_cluster;
7007
7008 offset = btrfs_alloc_from_cluster(used_block_group,
7009 last_ptr,
7010 num_bytes,
7011 used_block_group->key.objectid,
7012 &max_extent_size);
7013 if (offset) {
7014 /* we have a block, we're done */
7015 spin_unlock(&last_ptr->refill_lock);
7016 trace_btrfs_reserve_extent_cluster(root,
7017 used_block_group,
7018 search_start, num_bytes);
7019 if (used_block_group != block_group) {
7020 btrfs_release_block_group(block_group,
7021 delalloc);
7022 block_group = used_block_group;
7023 }
7024 goto checks;
7025 }
7026
7027 WARN_ON(last_ptr->block_group != used_block_group);
7028 release_cluster:
7029 /* If we are on LOOP_NO_EMPTY_SIZE, we can't
7030 			 * set up a new cluster, so let's just skip it
7031 * and let the allocator find whatever block
7032 * it can find. If we reach this point, we
7033 * will have tried the cluster allocator
7034 * plenty of times and not have found
7035 * anything, so we are likely way too
7036 * fragmented for the clustering stuff to find
7037 * anything.
7038 *
7039 * However, if the cluster is taken from the
7040 * current block group, release the cluster
7041 * first, so that we stand a better chance of
7042 * succeeding in the unclustered
7043 * allocation. */
7044 if (loop >= LOOP_NO_EMPTY_SIZE &&
7045 used_block_group != block_group) {
7046 spin_unlock(&last_ptr->refill_lock);
7047 btrfs_release_block_group(used_block_group,
7048 delalloc);
7049 goto unclustered_alloc;
7050 }
7051
7052 /*
7053 * this cluster didn't work out, free it and
7054 * start over
7055 */
7056 btrfs_return_cluster_to_free_space(NULL, last_ptr);
7057
7058 if (used_block_group != block_group)
7059 btrfs_release_block_group(used_block_group,
7060 delalloc);
7061 refill_cluster:
7062 if (loop >= LOOP_NO_EMPTY_SIZE) {
7063 spin_unlock(&last_ptr->refill_lock);
7064 goto unclustered_alloc;
7065 }
7066
7067 aligned_cluster = max_t(unsigned long,
7068 empty_cluster + empty_size,
7069 block_group->full_stripe_len);
7070
7071 /* allocate a cluster in this block group */
7072 ret = btrfs_find_space_cluster(root, block_group,
7073 last_ptr, search_start,
7074 num_bytes,
7075 aligned_cluster);
7076 if (ret == 0) {
7077 /*
7078 * now pull our allocation out of this
7079 * cluster
7080 */
7081 offset = btrfs_alloc_from_cluster(block_group,
7082 last_ptr,
7083 num_bytes,
7084 search_start,
7085 &max_extent_size);
7086 if (offset) {
7087 /* we found one, proceed */
7088 spin_unlock(&last_ptr->refill_lock);
7089 trace_btrfs_reserve_extent_cluster(root,
7090 block_group, search_start,
7091 num_bytes);
7092 goto checks;
7093 }
7094 } else if (!cached && loop > LOOP_CACHING_NOWAIT
7095 && !failed_cluster_refill) {
7096 spin_unlock(&last_ptr->refill_lock);
7097
7098 failed_cluster_refill = true;
7099 wait_block_group_cache_progress(block_group,
7100 num_bytes + empty_cluster + empty_size);
7101 goto have_block_group;
7102 }
7103
7104 /*
7105 * at this point we either didn't find a cluster
7106 * or we weren't able to allocate a block from our
7107 * cluster. Free the cluster we've been trying
7108 * to use, and go to the next block group
7109 */
7110 btrfs_return_cluster_to_free_space(NULL, last_ptr);
7111 spin_unlock(&last_ptr->refill_lock);
7112 goto loop;
7113 }
7114
7115 unclustered_alloc:
7116 spin_lock(&block_group->free_space_ctl->tree_lock);
7117 if (cached &&
7118 block_group->free_space_ctl->free_space <
7119 num_bytes + empty_cluster + empty_size) {
7120 if (block_group->free_space_ctl->free_space >
7121 max_extent_size)
7122 max_extent_size =
7123 block_group->free_space_ctl->free_space;
7124 spin_unlock(&block_group->free_space_ctl->tree_lock);
7125 goto loop;
7126 }
7127 spin_unlock(&block_group->free_space_ctl->tree_lock);
7128
7129 offset = btrfs_find_space_for_alloc(block_group, search_start,
7130 num_bytes, empty_size,
7131 &max_extent_size);
7132 /*
7133 * If we didn't find a chunk, and we haven't failed on this
7134 * block group before, and this block group is in the middle of
7135 * caching and we are ok with waiting, then go ahead and wait
7136 * for progress to be made, and set failed_alloc to true.
7137 *
7138 * If failed_alloc is true then we've already waited on this
7139 * block group once and should move on to the next block group.
7140 */
7141 if (!offset && !failed_alloc && !cached &&
7142 loop > LOOP_CACHING_NOWAIT) {
7143 wait_block_group_cache_progress(block_group,
7144 num_bytes + empty_size);
7145 failed_alloc = true;
7146 goto have_block_group;
7147 } else if (!offset) {
7148 if (!cached)
7149 have_caching_bg = true;
7150 goto loop;
7151 }
7152 checks:
7153 search_start = ALIGN(offset, root->stripesize);
7154
7155 /* move on to the next group */
7156 if (search_start + num_bytes >
7157 block_group->key.objectid + block_group->key.offset) {
7158 btrfs_add_free_space(block_group, offset, num_bytes);
7159 goto loop;
7160 }
7161
7162 if (offset < search_start)
7163 btrfs_add_free_space(block_group, offset,
7164 search_start - offset);
7165 BUG_ON(offset > search_start);
7166
7167 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
7168 alloc_type, delalloc);
7169 if (ret == -EAGAIN) {
7170 btrfs_add_free_space(block_group, offset, num_bytes);
7171 goto loop;
7172 }
7173
7174 		/* we are all good, let's return */
7175 ins->objectid = search_start;
7176 ins->offset = num_bytes;
7177
7178 trace_btrfs_reserve_extent(orig_root, block_group,
7179 search_start, num_bytes);
7180 btrfs_release_block_group(block_group, delalloc);
7181 break;
7182 loop:
7183 failed_cluster_refill = false;
7184 failed_alloc = false;
7185 BUG_ON(index != get_block_group_index(block_group));
7186 btrfs_release_block_group(block_group, delalloc);
7187 }
7188 up_read(&space_info->groups_sem);
7189
7190 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7191 goto search;
7192
7193 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7194 goto search;
7195
7196 /*
7197 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7198 * caching kthreads as we move along
7199 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7200 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7201 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7202 * again
7203 */
7204 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7205 index = 0;
7206 loop++;
7207 if (loop == LOOP_ALLOC_CHUNK) {
7208 struct btrfs_trans_handle *trans;
7209 int exist = 0;
7210
7211 trans = current->journal_info;
7212 if (trans)
7213 exist = 1;
7214 else
7215 trans = btrfs_join_transaction(root);
7216
7217 if (IS_ERR(trans)) {
7218 ret = PTR_ERR(trans);
7219 goto out;
7220 }
7221
7222 ret = do_chunk_alloc(trans, root, flags,
7223 CHUNK_ALLOC_FORCE);
7224 /*
7225 			 * Do not bail out on ENOSPC, since there is
7226 			 * more we can try.
7227 */
7228 if (ret < 0 && ret != -ENOSPC)
7229 btrfs_abort_transaction(trans,
7230 root, ret);
7231 else
7232 ret = 0;
7233 if (!exist)
7234 btrfs_end_transaction(trans, root);
7235 if (ret)
7236 goto out;
7237 }
7238
7239 if (loop == LOOP_NO_EMPTY_SIZE) {
7240 empty_size = 0;
7241 empty_cluster = 0;
7242 }
7243
7244 goto search;
7245 } else if (!ins->objectid) {
7246 ret = -ENOSPC;
7247 } else if (ins->objectid) {
7248 ret = 0;
7249 }
7250 out:
7251 if (ret == -ENOSPC)
7252 ins->offset = max_extent_size;
7253 return ret;
7254 }
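
/*
 * Illustrative note, not from the original file: on success the ins key
 * returned by find_free_extent() describes the reserved hole, e.g. for a
 * 1 MiB data allocation:
 *
 *	ins->objectid == start of the reserved range (logical bytenr)
 *	ins->type     == BTRFS_EXTENT_ITEM_KEY
 *	ins->offset   == 0x100000 (the size that was reserved)
 *
 * On -ENOSPC, ins->offset instead carries max_extent_size, the largest
 * contiguous free space seen, which btrfs_reserve_extent() below uses to
 * bound its retry with a smaller size.
 */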
7255
7256 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
7257 int dump_block_groups)
7258 {
7259 struct btrfs_block_group_cache *cache;
7260 int index = 0;
7261
7262 spin_lock(&info->lock);
7263 printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
7264 info->flags,
7265 info->total_bytes - info->bytes_used - info->bytes_pinned -
7266 info->bytes_reserved - info->bytes_readonly,
7267 (info->full) ? "" : "not ");
7268 printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
7269 "reserved=%llu, may_use=%llu, readonly=%llu\n",
7270 info->total_bytes, info->bytes_used, info->bytes_pinned,
7271 info->bytes_reserved, info->bytes_may_use,
7272 info->bytes_readonly);
7273 spin_unlock(&info->lock);
7274
7275 if (!dump_block_groups)
7276 return;
7277
7278 down_read(&info->groups_sem);
7279 again:
7280 list_for_each_entry(cache, &info->block_groups[index], list) {
7281 spin_lock(&cache->lock);
7282 printk(KERN_INFO "BTRFS: "
7283 "block group %llu has %llu bytes, "
7284 "%llu used %llu pinned %llu reserved %s\n",
7285 cache->key.objectid, cache->key.offset,
7286 btrfs_block_group_used(&cache->item), cache->pinned,
7287 cache->reserved, cache->ro ? "[readonly]" : "");
7288 btrfs_dump_free_space(cache, bytes);
7289 spin_unlock(&cache->lock);
7290 }
7291 if (++index < BTRFS_NR_RAID_TYPES)
7292 goto again;
7293 up_read(&info->groups_sem);
7294 }
7295
7296 int btrfs_reserve_extent(struct btrfs_root *root,
7297 u64 num_bytes, u64 min_alloc_size,
7298 u64 empty_size, u64 hint_byte,
7299 struct btrfs_key *ins, int is_data, int delalloc)
7300 {
7301 bool final_tried = false;
7302 u64 flags;
7303 int ret;
7304
7305 flags = btrfs_get_alloc_profile(root, is_data);
7306 again:
7307 WARN_ON(num_bytes < root->sectorsize);
7308 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
7309 flags, delalloc);
7310
7311 if (ret == -ENOSPC) {
7312 if (!final_tried && ins->offset) {
7313 num_bytes = min(num_bytes >> 1, ins->offset);
7314 num_bytes = round_down(num_bytes, root->sectorsize);
7315 num_bytes = max(num_bytes, min_alloc_size);
7316 if (num_bytes == min_alloc_size)
7317 final_tried = true;
7318 goto again;
7319 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
7320 struct btrfs_space_info *sinfo;
7321
7322 sinfo = __find_space_info(root->fs_info, flags);
7323 btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
7324 flags, num_bytes);
7325 if (sinfo)
7326 dump_space_info(sinfo, num_bytes, 1);
7327 }
7328 }
7329
7330 return ret;
7331 }
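
/*
 * Worked example, illustrative and not from the original file, of the retry
 * loop above: asking for 8 MiB with min_alloc_size == 256 KiB and repeatedly
 * hitting -ENOSPC halves the request each time (capped by the ins->offset
 * hint and rounded down to the sectorsize), i.e. at most 4 MiB, 2 MiB,
 * 1 MiB, 512 KiB and finally 256 KiB; that last attempt is made with
 * final_tried set, so a further -ENOSPC is returned to the caller (with a
 * space dump if ENOSPC_DEBUG is enabled).
 */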
7332
7333 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
7334 u64 start, u64 len,
7335 int pin, int delalloc)
7336 {
7337 struct btrfs_block_group_cache *cache;
7338 int ret = 0;
7339
7340 cache = btrfs_lookup_block_group(root->fs_info, start);
7341 if (!cache) {
7342 btrfs_err(root->fs_info, "Unable to find block group for %llu",
7343 start);
7344 return -ENOSPC;
7345 }
7346
7347 if (pin)
7348 pin_down_extent(root, cache, start, len, 1);
7349 else {
7350 if (btrfs_test_opt(root, DISCARD))
7351 ret = btrfs_discard_extent(root, start, len, NULL);
7352 btrfs_add_free_space(cache, start, len);
7353 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
7354 }
7355
7356 btrfs_put_block_group(cache);
7357
7358 trace_btrfs_reserved_extent_free(root, start, len);
7359
7360 return ret;
7361 }
7362
7363 int btrfs_free_reserved_extent(struct btrfs_root *root,
7364 u64 start, u64 len, int delalloc)
7365 {
7366 return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
7367 }
7368
7369 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
7370 u64 start, u64 len)
7371 {
7372 return __btrfs_free_reserved_extent(root, start, len, 1, 0);
7373 }
7374
7375 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7376 struct btrfs_root *root,
7377 u64 parent, u64 root_objectid,
7378 u64 flags, u64 owner, u64 offset,
7379 struct btrfs_key *ins, int ref_mod)
7380 {
7381 int ret;
7382 struct btrfs_fs_info *fs_info = root->fs_info;
7383 struct btrfs_extent_item *extent_item;
7384 struct btrfs_extent_inline_ref *iref;
7385 struct btrfs_path *path;
7386 struct extent_buffer *leaf;
7387 int type;
7388 u32 size;
7389
7390 if (parent > 0)
7391 type = BTRFS_SHARED_DATA_REF_KEY;
7392 else
7393 type = BTRFS_EXTENT_DATA_REF_KEY;
7394
7395 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7396
7397 path = btrfs_alloc_path();
7398 if (!path)
7399 return -ENOMEM;
7400
7401 path->leave_spinning = 1;
7402 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7403 ins, size);
7404 if (ret) {
7405 btrfs_free_path(path);
7406 return ret;
7407 }
7408
7409 leaf = path->nodes[0];
7410 extent_item = btrfs_item_ptr(leaf, path->slots[0],
7411 struct btrfs_extent_item);
7412 btrfs_set_extent_refs(leaf, extent_item, ref_mod);
7413 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7414 btrfs_set_extent_flags(leaf, extent_item,
7415 flags | BTRFS_EXTENT_FLAG_DATA);
7416
7417 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7418 btrfs_set_extent_inline_ref_type(leaf, iref, type);
7419 if (parent > 0) {
7420 struct btrfs_shared_data_ref *ref;
7421 ref = (struct btrfs_shared_data_ref *)(iref + 1);
7422 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7423 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
7424 } else {
7425 struct btrfs_extent_data_ref *ref;
7426 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
7427 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
7428 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
7429 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
7430 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
7431 }
7432
7433 btrfs_mark_buffer_dirty(path->nodes[0]);
7434 btrfs_free_path(path);
7435
7436 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
7437 ins->offset);
7438 if (ret)
7439 return ret;
7440
7441 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
7442 if (ret) { /* -ENOENT, logic error */
7443 btrfs_err(fs_info, "update block group failed for %llu %llu",
7444 ins->objectid, ins->offset);
7445 BUG();
7446 }
7447 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
7448 return ret;
7449 }
7450
7451 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7452 struct btrfs_root *root,
7453 u64 parent, u64 root_objectid,
7454 u64 flags, struct btrfs_disk_key *key,
7455 int level, struct btrfs_key *ins,
7456 int no_quota)
7457 {
7458 int ret;
7459 struct btrfs_fs_info *fs_info = root->fs_info;
7460 struct btrfs_extent_item *extent_item;
7461 struct btrfs_tree_block_info *block_info;
7462 struct btrfs_extent_inline_ref *iref;
7463 struct btrfs_path *path;
7464 struct extent_buffer *leaf;
7465 u32 size = sizeof(*extent_item) + sizeof(*iref);
7466 u64 num_bytes = ins->offset;
7467 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7468 SKINNY_METADATA);
7469
7470 if (!skinny_metadata)
7471 size += sizeof(*block_info);
7472
7473 path = btrfs_alloc_path();
7474 if (!path) {
7475 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7476 root->nodesize);
7477 return -ENOMEM;
7478 }
7479
7480 path->leave_spinning = 1;
7481 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7482 ins, size);
7483 if (ret) {
7484 btrfs_free_path(path);
7485 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7486 root->nodesize);
7487 return ret;
7488 }
7489
7490 leaf = path->nodes[0];
7491 extent_item = btrfs_item_ptr(leaf, path->slots[0],
7492 struct btrfs_extent_item);
7493 btrfs_set_extent_refs(leaf, extent_item, 1);
7494 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7495 btrfs_set_extent_flags(leaf, extent_item,
7496 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
7497
7498 if (skinny_metadata) {
7499 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7500 num_bytes = root->nodesize;
7501 } else {
7502 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
7503 btrfs_set_tree_block_key(leaf, block_info, key);
7504 btrfs_set_tree_block_level(leaf, block_info, level);
7505 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
7506 }
7507
7508 if (parent > 0) {
7509 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
7510 btrfs_set_extent_inline_ref_type(leaf, iref,
7511 BTRFS_SHARED_BLOCK_REF_KEY);
7512 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7513 } else {
7514 btrfs_set_extent_inline_ref_type(leaf, iref,
7515 BTRFS_TREE_BLOCK_REF_KEY);
7516 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
7517 }
7518
7519 btrfs_mark_buffer_dirty(leaf);
7520 btrfs_free_path(path);
7521
7522 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
7523 num_bytes);
7524 if (ret)
7525 return ret;
7526
7527 ret = update_block_group(trans, root, ins->objectid, root->nodesize,
7528 1);
7529 if (ret) { /* -ENOENT, logic error */
7530 btrfs_err(fs_info, "update block group failed for %llu %llu",
7531 ins->objectid, ins->offset);
7532 BUG();
7533 }
7534
7535 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
7536 return ret;
7537 }
7538
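/*
 * Record a newly allocated data extent by queueing a delayed ref; the
 * extent item itself is inserted later, when the delayed refs are run
 * and alloc_reserved_file_extent() is called for it.
 */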
7539 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7540 struct btrfs_root *root,
7541 u64 root_objectid, u64 owner,
7542 u64 offset, struct btrfs_key *ins)
7543 {
7544 int ret;
7545
7546 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
7547
7548 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
7549 ins->offset, 0,
7550 root_objectid, owner, offset,
7551 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
7552 return ret;
7553 }
7554
7555 /*
7556 * this is used by the tree logging recovery code. It records that
7557 * an extent has been allocated and makes sure to clear the free
7558 * space cache bits as well
7559 */
7560 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
7561 struct btrfs_root *root,
7562 u64 root_objectid, u64 owner, u64 offset,
7563 struct btrfs_key *ins)
7564 {
7565 int ret;
7566 struct btrfs_block_group_cache *block_group;
7567
7568 /*
7569 * Mixed block groups will exclude before processing the log so we only
7570 	 * need to do the exclude dance if this fs isn't mixed.
7571 */
7572 if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
7573 ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
7574 if (ret)
7575 return ret;
7576 }
7577
7578 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
7579 if (!block_group)
7580 return -EINVAL;
7581
7582 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
7583 RESERVE_ALLOC_NO_ACCOUNT, 0);
7584 BUG_ON(ret); /* logic error */
7585 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
7586 0, owner, offset, ins, 1);
7587 btrfs_put_block_group(block_group);
7588 return ret;
7589 }
7590
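/*
 * Set up a freshly allocated tree block: stamp the transid, lock and
 * clean the buffer, and mark it dirty in either the log tree's or the
 * transaction's dirty pages depending on which root owns it.
 */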
7591 static struct extent_buffer *
7592 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7593 u64 bytenr, int level)
7594 {
7595 struct extent_buffer *buf;
7596
7597 buf = btrfs_find_create_tree_block(root, bytenr);
7598 if (!buf)
7599 return ERR_PTR(-ENOMEM);
7600 btrfs_set_header_generation(buf, trans->transid);
7601 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
7602 btrfs_tree_lock(buf);
7603 clean_tree_block(trans, root->fs_info, buf);
7604 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
7605
7606 btrfs_set_lock_blocking(buf);
7607 btrfs_set_buffer_uptodate(buf);
7608
7609 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
7610 buf->log_index = root->log_transid % 2;
7611 /*
7612 * we allow two log transactions at a time, use different
7613 		 * EXTENT bits to differentiate dirty pages.
7614 */
7615 if (buf->log_index == 0)
7616 set_extent_dirty(&root->dirty_log_pages, buf->start,
7617 buf->start + buf->len - 1, GFP_NOFS);
7618 else
7619 set_extent_new(&root->dirty_log_pages, buf->start,
7620 buf->start + buf->len - 1, GFP_NOFS);
7621 } else {
7622 buf->log_index = -1;
7623 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
7624 buf->start + buf->len - 1, GFP_NOFS);
7625 }
7626 trans->blocks_used++;
7627 /* this returns a buffer locked for blocking */
7628 return buf;
7629 }
7630
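/*
 * Pick the block reservation to charge for a new tree block. If the
 * root's rsv cannot cover it, fall back to reserving fresh metadata
 * bytes and, as a last resort, borrow from the global reserve.
 */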
7631 static struct btrfs_block_rsv *
7632 use_block_rsv(struct btrfs_trans_handle *trans,
7633 struct btrfs_root *root, u32 blocksize)
7634 {
7635 struct btrfs_block_rsv *block_rsv;
7636 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
7637 int ret;
7638 bool global_updated = false;
7639
7640 block_rsv = get_block_rsv(trans, root);
7641
7642 if (unlikely(block_rsv->size == 0))
7643 goto try_reserve;
7644 again:
7645 ret = block_rsv_use_bytes(block_rsv, blocksize);
7646 if (!ret)
7647 return block_rsv;
7648
7649 if (block_rsv->failfast)
7650 return ERR_PTR(ret);
7651
7652 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
7653 global_updated = true;
7654 update_global_block_rsv(root->fs_info);
7655 goto again;
7656 }
7657
7658 if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
7659 static DEFINE_RATELIMIT_STATE(_rs,
7660 DEFAULT_RATELIMIT_INTERVAL * 10,
7661 /*DEFAULT_RATELIMIT_BURST*/ 1);
7662 if (__ratelimit(&_rs))
7663 WARN(1, KERN_DEBUG
7664 "BTRFS: block rsv returned %d\n", ret);
7665 }
7666 try_reserve:
7667 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
7668 BTRFS_RESERVE_NO_FLUSH);
7669 if (!ret)
7670 return block_rsv;
7671 /*
7672 	 * If we couldn't reserve metadata bytes, try to use some from
7673 	 * the global reserve if this rsv's space_info is the same as the
7674 	 * global reserve's.
7675 */
7676 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
7677 block_rsv->space_info == global_rsv->space_info) {
7678 ret = block_rsv_use_bytes(global_rsv, blocksize);
7679 if (!ret)
7680 return global_rsv;
7681 }
7682 return ERR_PTR(ret);
7683 }
7684
7685 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
7686 struct btrfs_block_rsv *block_rsv, u32 blocksize)
7687 {
7688 block_rsv_add_bytes(block_rsv, blocksize, 0);
7689 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
7690 }
7691
7692 /*
7693 * finds a free extent and does all the dirty work required for allocation
7694 * returns the tree buffer or an ERR_PTR on error.
7695 */
7696 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7697 struct btrfs_root *root,
7698 u64 parent, u64 root_objectid,
7699 struct btrfs_disk_key *key, int level,
7700 u64 hint, u64 empty_size)
7701 {
7702 struct btrfs_key ins;
7703 struct btrfs_block_rsv *block_rsv;
7704 struct extent_buffer *buf;
7705 struct btrfs_delayed_extent_op *extent_op;
7706 u64 flags = 0;
7707 int ret;
7708 u32 blocksize = root->nodesize;
7709 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7710 SKINNY_METADATA);
7711
7712 if (btrfs_test_is_dummy_root(root)) {
7713 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7714 level);
7715 if (!IS_ERR(buf))
7716 root->alloc_bytenr += blocksize;
7717 return buf;
7718 }
7719
7720 block_rsv = use_block_rsv(trans, root, blocksize);
7721 if (IS_ERR(block_rsv))
7722 return ERR_CAST(block_rsv);
7723
7724 ret = btrfs_reserve_extent(root, blocksize, blocksize,
7725 empty_size, hint, &ins, 0, 0);
7726 if (ret)
7727 goto out_unuse;
7728
7729 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
7730 if (IS_ERR(buf)) {
7731 ret = PTR_ERR(buf);
7732 goto out_free_reserved;
7733 }
7734
7735 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
7736 if (parent == 0)
7737 parent = ins.objectid;
7738 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
7739 } else
7740 BUG_ON(parent > 0);
7741
7742 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
7743 extent_op = btrfs_alloc_delayed_extent_op();
7744 if (!extent_op) {
7745 ret = -ENOMEM;
7746 goto out_free_buf;
7747 }
7748 if (key)
7749 memcpy(&extent_op->key, key, sizeof(extent_op->key));
7750 else
7751 memset(&extent_op->key, 0, sizeof(extent_op->key));
7752 extent_op->flags_to_set = flags;
7753 if (skinny_metadata)
7754 extent_op->update_key = 0;
7755 else
7756 extent_op->update_key = 1;
7757 extent_op->update_flags = 1;
7758 extent_op->is_data = 0;
7759 extent_op->level = level;
7760
7761 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
7762 ins.objectid, ins.offset,
7763 parent, root_objectid, level,
7764 BTRFS_ADD_DELAYED_EXTENT,
7765 extent_op, 0);
7766 if (ret)
7767 goto out_free_delayed;
7768 }
7769 return buf;
7770
7771 out_free_delayed:
7772 btrfs_free_delayed_extent_op(extent_op);
7773 out_free_buf:
7774 free_extent_buffer(buf);
7775 out_free_reserved:
7776 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
7777 out_unuse:
7778 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
7779 return ERR_PTR(ret);
7780 }
7781
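/*
 * State carried while walking a subvolume tree for deletion. refs[] and
 * flags[] cache the reference count and backref flags of the block at
 * each level, and stage is either DROP_REFERENCE or UPDATE_BACKREF
 * (defined below).
 */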
7782 struct walk_control {
7783 u64 refs[BTRFS_MAX_LEVEL];
7784 u64 flags[BTRFS_MAX_LEVEL];
7785 struct btrfs_key update_progress;
7786 int stage;
7787 int level;
7788 int shared_level;
7789 int update_ref;
7790 int keep_locks;
7791 int reada_slot;
7792 int reada_count;
7793 int for_reloc;
7794 };
7795
7796 #define DROP_REFERENCE 1
7797 #define UPDATE_BACKREF 2
7798
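/*
 * Readahead the tree blocks we are likely to visit next while walking
 * down, growing reada_count when the walk has caught up with the
 * previous readahead window and shrinking it otherwise.
 */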
7799 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7800 struct btrfs_root *root,
7801 struct walk_control *wc,
7802 struct btrfs_path *path)
7803 {
7804 u64 bytenr;
7805 u64 generation;
7806 u64 refs;
7807 u64 flags;
7808 u32 nritems;
7809 u32 blocksize;
7810 struct btrfs_key key;
7811 struct extent_buffer *eb;
7812 int ret;
7813 int slot;
7814 int nread = 0;
7815
7816 if (path->slots[wc->level] < wc->reada_slot) {
7817 wc->reada_count = wc->reada_count * 2 / 3;
7818 wc->reada_count = max(wc->reada_count, 2);
7819 } else {
7820 wc->reada_count = wc->reada_count * 3 / 2;
7821 wc->reada_count = min_t(int, wc->reada_count,
7822 BTRFS_NODEPTRS_PER_BLOCK(root));
7823 }
7824
7825 eb = path->nodes[wc->level];
7826 nritems = btrfs_header_nritems(eb);
7827 blocksize = root->nodesize;
7828
7829 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
7830 if (nread >= wc->reada_count)
7831 break;
7832
7833 cond_resched();
7834 bytenr = btrfs_node_blockptr(eb, slot);
7835 generation = btrfs_node_ptr_generation(eb, slot);
7836
7837 if (slot == path->slots[wc->level])
7838 goto reada;
7839
7840 if (wc->stage == UPDATE_BACKREF &&
7841 generation <= root->root_key.offset)
7842 continue;
7843
7844 /* We don't lock the tree block, it's OK to be racy here */
7845 ret = btrfs_lookup_extent_info(trans, root, bytenr,
7846 wc->level - 1, 1, &refs,
7847 &flags);
7848 /* We don't care about errors in readahead. */
7849 if (ret < 0)
7850 continue;
7851 BUG_ON(refs == 0);
7852
7853 if (wc->stage == DROP_REFERENCE) {
7854 if (refs == 1)
7855 goto reada;
7856
7857 if (wc->level == 1 &&
7858 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7859 continue;
7860 if (!wc->update_ref ||
7861 generation <= root->root_key.offset)
7862 continue;
7863 btrfs_node_key_to_cpu(eb, &key, slot);
7864 ret = btrfs_comp_cpu_keys(&key,
7865 &wc->update_progress);
7866 if (ret < 0)
7867 continue;
7868 } else {
7869 if (wc->level == 1 &&
7870 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7871 continue;
7872 }
7873 reada:
7874 readahead_tree_block(root, bytenr);
7875 nread++;
7876 }
7877 wc->reada_slot = slot;
7878 }
7879
7880 /*
7881 * TODO: Modify related function to add related node/leaf to dirty_extent_root,
7882 * for later qgroup accounting.
7883 *
7884  * Currently, this function does nothing.
7885 */
7886 static int account_leaf_items(struct btrfs_trans_handle *trans,
7887 struct btrfs_root *root,
7888 struct extent_buffer *eb)
7889 {
7890 int nr = btrfs_header_nritems(eb);
7891 int i, extent_type;
7892 struct btrfs_key key;
7893 struct btrfs_file_extent_item *fi;
7894 u64 bytenr, num_bytes;
7895
7896 for (i = 0; i < nr; i++) {
7897 btrfs_item_key_to_cpu(eb, &key, i);
7898
7899 if (key.type != BTRFS_EXTENT_DATA_KEY)
7900 continue;
7901
7902 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
7903 /* filter out non qgroup-accountable extents */
7904 extent_type = btrfs_file_extent_type(eb, fi);
7905
7906 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
7907 continue;
7908
7909 bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
7910 if (!bytenr)
7911 continue;
7912
7913 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
7914 }
7915 return 0;
7916 }
7917
7918 /*
7919 * Walk up the tree from the bottom, freeing leaves and any interior
7920 * nodes which have had all slots visited. If a node (leaf or
7921  * interior) is freed, the node above it will have its slot
7922 * incremented. The root node will never be freed.
7923 *
7924 * At the end of this function, we should have a path which has all
7925 * slots incremented to the next position for a search. If we need to
7926 * read a new node it will be NULL and the node above it will have the
7927 * correct slot selected for a later read.
7928 *
7929  * If we increment the root node's slot counter past the number of
7930 * elements, 1 is returned to signal completion of the search.
7931 */
7932 static int adjust_slots_upwards(struct btrfs_root *root,
7933 struct btrfs_path *path, int root_level)
7934 {
7935 int level = 0;
7936 int nr, slot;
7937 struct extent_buffer *eb;
7938
7939 if (root_level == 0)
7940 return 1;
7941
7942 while (level <= root_level) {
7943 eb = path->nodes[level];
7944 nr = btrfs_header_nritems(eb);
7945 path->slots[level]++;
7946 slot = path->slots[level];
7947 if (slot >= nr || level == 0) {
7948 /*
7949 * Don't free the root - we will detect this
7950 * condition after our loop and return a
7951 * positive value for caller to stop walking the tree.
7952 */
7953 if (level != root_level) {
7954 btrfs_tree_unlock_rw(eb, path->locks[level]);
7955 path->locks[level] = 0;
7956
7957 free_extent_buffer(eb);
7958 path->nodes[level] = NULL;
7959 path->slots[level] = 0;
7960 }
7961 } else {
7962 /*
7963 * We have a valid slot to walk back down
7964 * from. Stop here so caller can process these
7965 * new nodes.
7966 */
7967 break;
7968 }
7969
7970 level++;
7971 }
7972
7973 eb = path->nodes[root_level];
7974 if (path->slots[root_level] >= btrfs_header_nritems(eb))
7975 return 1;
7976
7977 return 0;
7978 }
7979
7980 /*
7981 * root_eb is the subtree root and is locked before this function is called.
7982  * TODO: Modify this function to add all nodes (including completely shared
7983  * nodes) to dirty_extent_root so they get accounted in qgroup.
7984 */
7985 static int account_shared_subtree(struct btrfs_trans_handle *trans,
7986 struct btrfs_root *root,
7987 struct extent_buffer *root_eb,
7988 u64 root_gen,
7989 int root_level)
7990 {
7991 int ret = 0;
7992 int level;
7993 struct extent_buffer *eb = root_eb;
7994 struct btrfs_path *path = NULL;
7995
7996 BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
7997 BUG_ON(root_eb == NULL);
7998
7999 if (!root->fs_info->quota_enabled)
8000 return 0;
8001
8002 if (!extent_buffer_uptodate(root_eb)) {
8003 ret = btrfs_read_buffer(root_eb, root_gen);
8004 if (ret)
8005 goto out;
8006 }
8007
8008 if (root_level == 0) {
8009 ret = account_leaf_items(trans, root, root_eb);
8010 goto out;
8011 }
8012
8013 path = btrfs_alloc_path();
8014 if (!path)
8015 return -ENOMEM;
8016
8017 /*
8018 * Walk down the tree. Missing extent blocks are filled in as
8019 * we go. Metadata is accounted every time we read a new
8020 * extent block.
8021 *
8022 * When we reach a leaf, we account for file extent items in it,
8023 * walk back up the tree (adjusting slot pointers as we go)
8024 * and restart the search process.
8025 */
8026 extent_buffer_get(root_eb); /* For path */
8027 path->nodes[root_level] = root_eb;
8028 path->slots[root_level] = 0;
8029 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
8030 walk_down:
8031 level = root_level;
8032 while (level >= 0) {
8033 if (path->nodes[level] == NULL) {
8034 int parent_slot;
8035 u64 child_gen;
8036 u64 child_bytenr;
8037
8038 /* We need to get child blockptr/gen from
8039 * parent before we can read it. */
8040 eb = path->nodes[level + 1];
8041 parent_slot = path->slots[level + 1];
8042 child_bytenr = btrfs_node_blockptr(eb, parent_slot);
8043 child_gen = btrfs_node_ptr_generation(eb, parent_slot);
8044
8045 eb = read_tree_block(root, child_bytenr, child_gen);
8046 if (IS_ERR(eb)) {
8047 ret = PTR_ERR(eb);
8048 goto out;
8049 } else if (!extent_buffer_uptodate(eb)) {
8050 free_extent_buffer(eb);
8051 ret = -EIO;
8052 goto out;
8053 }
8054
8055 path->nodes[level] = eb;
8056 path->slots[level] = 0;
8057
8058 btrfs_tree_read_lock(eb);
8059 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
8060 path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
8061 }
8062
8063 if (level == 0) {
8064 ret = account_leaf_items(trans, root, path->nodes[level]);
8065 if (ret)
8066 goto out;
8067
8068 /* Nonzero return here means we completed our search */
8069 ret = adjust_slots_upwards(root, path, root_level);
8070 if (ret)
8071 break;
8072
8073 /* Restart search with new slots */
8074 goto walk_down;
8075 }
8076
8077 level--;
8078 }
8079
8080 ret = 0;
8081 out:
8082 btrfs_free_path(path);
8083
8084 return ret;
8085 }
8086
8087 /*
8088 * helper to process tree block while walking down the tree.
8089 *
8090 * when wc->stage == UPDATE_BACKREF, this function updates
8091 * back refs for pointers in the block.
8092 *
8093 * NOTE: return value 1 means we should stop walking down.
8094 */
8095 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8096 struct btrfs_root *root,
8097 struct btrfs_path *path,
8098 struct walk_control *wc, int lookup_info)
8099 {
8100 int level = wc->level;
8101 struct extent_buffer *eb = path->nodes[level];
8102 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8103 int ret;
8104
8105 if (wc->stage == UPDATE_BACKREF &&
8106 btrfs_header_owner(eb) != root->root_key.objectid)
8107 return 1;
8108
8109 /*
8110 * when reference count of tree block is 1, it won't increase
8111 * again. once full backref flag is set, we never clear it.
8112 */
8113 if (lookup_info &&
8114 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8115 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8116 BUG_ON(!path->locks[level]);
8117 ret = btrfs_lookup_extent_info(trans, root,
8118 eb->start, level, 1,
8119 &wc->refs[level],
8120 &wc->flags[level]);
8121 BUG_ON(ret == -ENOMEM);
8122 if (ret)
8123 return ret;
8124 BUG_ON(wc->refs[level] == 0);
8125 }
8126
8127 if (wc->stage == DROP_REFERENCE) {
8128 if (wc->refs[level] > 1)
8129 return 1;
8130
8131 if (path->locks[level] && !wc->keep_locks) {
8132 btrfs_tree_unlock_rw(eb, path->locks[level]);
8133 path->locks[level] = 0;
8134 }
8135 return 0;
8136 }
8137
8138 /* wc->stage == UPDATE_BACKREF */
8139 if (!(wc->flags[level] & flag)) {
8140 BUG_ON(!path->locks[level]);
8141 ret = btrfs_inc_ref(trans, root, eb, 1);
8142 BUG_ON(ret); /* -ENOMEM */
8143 ret = btrfs_dec_ref(trans, root, eb, 0);
8144 BUG_ON(ret); /* -ENOMEM */
8145 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
8146 eb->len, flag,
8147 btrfs_header_level(eb), 0);
8148 BUG_ON(ret); /* -ENOMEM */
8149 wc->flags[level] |= flag;
8150 }
8151
8152 /*
8153 * the block is shared by multiple trees, so it's not good to
8154 * keep the tree lock
8155 */
8156 if (path->locks[level] && level > 0) {
8157 btrfs_tree_unlock_rw(eb, path->locks[level]);
8158 path->locks[level] = 0;
8159 }
8160 return 0;
8161 }
8162
8163 /*
8164 * helper to process tree block pointer.
8165 *
8166 * when wc->stage == DROP_REFERENCE, this function checks
8167 * reference count of the block pointed to. if the block
8168 * is shared and we need update back refs for the subtree
8169 * rooted at the block, this function changes wc->stage to
8170 * UPDATE_BACKREF. if the block is shared and there is no
8171 * need to update back, this function drops the reference
8172 * to the block.
8173 *
8174 * NOTE: return value 1 means we should stop walking down.
8175 */
8176 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8177 struct btrfs_root *root,
8178 struct btrfs_path *path,
8179 struct walk_control *wc, int *lookup_info)
8180 {
8181 u64 bytenr;
8182 u64 generation;
8183 u64 parent;
8184 u32 blocksize;
8185 struct btrfs_key key;
8186 struct extent_buffer *next;
8187 int level = wc->level;
8188 int reada = 0;
8189 int ret = 0;
8190 bool need_account = false;
8191
8192 generation = btrfs_node_ptr_generation(path->nodes[level],
8193 path->slots[level]);
8194 /*
8195 * if the lower level block was created before the snapshot
8196 * was created, we know there is no need to update back refs
8197 * for the subtree
8198 */
8199 if (wc->stage == UPDATE_BACKREF &&
8200 generation <= root->root_key.offset) {
8201 *lookup_info = 1;
8202 return 1;
8203 }
8204
8205 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8206 blocksize = root->nodesize;
8207
8208 next = btrfs_find_tree_block(root->fs_info, bytenr);
8209 if (!next) {
8210 next = btrfs_find_create_tree_block(root, bytenr);
8211 if (!next)
8212 return -ENOMEM;
8213 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8214 level - 1);
8215 reada = 1;
8216 }
8217 btrfs_tree_lock(next);
8218 btrfs_set_lock_blocking(next);
8219
8220 ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
8221 &wc->refs[level - 1],
8222 &wc->flags[level - 1]);
8223 if (ret < 0) {
8224 btrfs_tree_unlock(next);
8225 return ret;
8226 }
8227
8228 if (unlikely(wc->refs[level - 1] == 0)) {
8229 btrfs_err(root->fs_info, "Missing references.");
8230 BUG();
8231 }
8232 *lookup_info = 0;
8233
8234 if (wc->stage == DROP_REFERENCE) {
8235 if (wc->refs[level - 1] > 1) {
8236 need_account = true;
8237 if (level == 1 &&
8238 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8239 goto skip;
8240
8241 if (!wc->update_ref ||
8242 generation <= root->root_key.offset)
8243 goto skip;
8244
8245 btrfs_node_key_to_cpu(path->nodes[level], &key,
8246 path->slots[level]);
8247 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8248 if (ret < 0)
8249 goto skip;
8250
8251 wc->stage = UPDATE_BACKREF;
8252 wc->shared_level = level - 1;
8253 }
8254 } else {
8255 if (level == 1 &&
8256 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8257 goto skip;
8258 }
8259
8260 if (!btrfs_buffer_uptodate(next, generation, 0)) {
8261 btrfs_tree_unlock(next);
8262 free_extent_buffer(next);
8263 next = NULL;
8264 *lookup_info = 1;
8265 }
8266
8267 if (!next) {
8268 if (reada && level == 1)
8269 reada_walk_down(trans, root, wc, path);
8270 next = read_tree_block(root, bytenr, generation);
8271 if (IS_ERR(next)) {
8272 return PTR_ERR(next);
8273 } else if (!extent_buffer_uptodate(next)) {
8274 free_extent_buffer(next);
8275 return -EIO;
8276 }
8277 btrfs_tree_lock(next);
8278 btrfs_set_lock_blocking(next);
8279 }
8280
8281 level--;
8282 BUG_ON(level != btrfs_header_level(next));
8283 path->nodes[level] = next;
8284 path->slots[level] = 0;
8285 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8286 wc->level = level;
8287 if (wc->level == 1)
8288 wc->reada_slot = 0;
8289 return 0;
8290 skip:
8291 wc->refs[level - 1] = 0;
8292 wc->flags[level - 1] = 0;
8293 if (wc->stage == DROP_REFERENCE) {
8294 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8295 parent = path->nodes[level]->start;
8296 } else {
8297 BUG_ON(root->root_key.objectid !=
8298 btrfs_header_owner(path->nodes[level]));
8299 parent = 0;
8300 }
8301
8302 if (need_account) {
8303 ret = account_shared_subtree(trans, root, next,
8304 generation, level - 1);
8305 if (ret) {
8306 printk_ratelimited(KERN_ERR "BTRFS: %s Error "
8307 "%d accounting shared subtree. Quota "
8308 "is out of sync, rescan required.\n",
8309 root->fs_info->sb->s_id, ret);
8310 }
8311 }
8312 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
8313 root->root_key.objectid, level - 1, 0, 0);
8314 BUG_ON(ret); /* -ENOMEM */
8315 }
8316 btrfs_tree_unlock(next);
8317 free_extent_buffer(next);
8318 *lookup_info = 1;
8319 return 1;
8320 }
8321
8322 /*
8323 * helper to process tree block while walking up the tree.
8324 *
8325 * when wc->stage == DROP_REFERENCE, this function drops
8326 * reference count on the block.
8327 *
8328 * when wc->stage == UPDATE_BACKREF, this function changes
8329 * wc->stage back to DROP_REFERENCE if we changed wc->stage
8330 * to UPDATE_BACKREF previously while processing the block.
8331 *
8332 * NOTE: return value 1 means we should stop walking up.
8333 */
8334 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8335 struct btrfs_root *root,
8336 struct btrfs_path *path,
8337 struct walk_control *wc)
8338 {
8339 int ret;
8340 int level = wc->level;
8341 struct extent_buffer *eb = path->nodes[level];
8342 u64 parent = 0;
8343
8344 if (wc->stage == UPDATE_BACKREF) {
8345 BUG_ON(wc->shared_level < level);
8346 if (level < wc->shared_level)
8347 goto out;
8348
8349 ret = find_next_key(path, level + 1, &wc->update_progress);
8350 if (ret > 0)
8351 wc->update_ref = 0;
8352
8353 wc->stage = DROP_REFERENCE;
8354 wc->shared_level = -1;
8355 path->slots[level] = 0;
8356
8357 /*
8358 * check reference count again if the block isn't locked.
8359 * we should start walking down the tree again if reference
8360 * count is one.
8361 */
8362 if (!path->locks[level]) {
8363 BUG_ON(level == 0);
8364 btrfs_tree_lock(eb);
8365 btrfs_set_lock_blocking(eb);
8366 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8367
8368 ret = btrfs_lookup_extent_info(trans, root,
8369 eb->start, level, 1,
8370 &wc->refs[level],
8371 &wc->flags[level]);
8372 if (ret < 0) {
8373 btrfs_tree_unlock_rw(eb, path->locks[level]);
8374 path->locks[level] = 0;
8375 return ret;
8376 }
8377 BUG_ON(wc->refs[level] == 0);
8378 if (wc->refs[level] == 1) {
8379 btrfs_tree_unlock_rw(eb, path->locks[level]);
8380 path->locks[level] = 0;
8381 return 1;
8382 }
8383 }
8384 }
8385
8386 /* wc->stage == DROP_REFERENCE */
8387 BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
8388
8389 if (wc->refs[level] == 1) {
8390 if (level == 0) {
8391 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8392 ret = btrfs_dec_ref(trans, root, eb, 1);
8393 else
8394 ret = btrfs_dec_ref(trans, root, eb, 0);
8395 BUG_ON(ret); /* -ENOMEM */
8396 ret = account_leaf_items(trans, root, eb);
8397 if (ret) {
8398 printk_ratelimited(KERN_ERR "BTRFS: %s Error "
8399 "%d accounting leaf items. Quota "
8400 "is out of sync, rescan required.\n",
8401 root->fs_info->sb->s_id, ret);
8402 }
8403 }
8404 /* make block locked assertion in clean_tree_block happy */
8405 if (!path->locks[level] &&
8406 btrfs_header_generation(eb) == trans->transid) {
8407 btrfs_tree_lock(eb);
8408 btrfs_set_lock_blocking(eb);
8409 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8410 }
8411 clean_tree_block(trans, root->fs_info, eb);
8412 }
8413
8414 if (eb == root->node) {
8415 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8416 parent = eb->start;
8417 else
8418 BUG_ON(root->root_key.objectid !=
8419 btrfs_header_owner(eb));
8420 } else {
8421 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8422 parent = path->nodes[level + 1]->start;
8423 else
8424 BUG_ON(root->root_key.objectid !=
8425 btrfs_header_owner(path->nodes[level + 1]));
8426 }
8427
8428 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
8429 out:
8430 wc->refs[level] = 0;
8431 wc->flags[level] = 0;
8432 return 0;
8433 }
8434
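/*
 * Walk down from wc->level towards the leaves, calling walk_down_proc()
 * and do_walk_down() at each level until we reach a leaf, run out of
 * slots in the current node, or one of them tells us to stop.
 */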
8435 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
8436 struct btrfs_root *root,
8437 struct btrfs_path *path,
8438 struct walk_control *wc)
8439 {
8440 int level = wc->level;
8441 int lookup_info = 1;
8442 int ret;
8443
8444 while (level >= 0) {
8445 ret = walk_down_proc(trans, root, path, wc, lookup_info);
8446 if (ret > 0)
8447 break;
8448
8449 if (level == 0)
8450 break;
8451
8452 if (path->slots[level] >=
8453 btrfs_header_nritems(path->nodes[level]))
8454 break;
8455
8456 ret = do_walk_down(trans, root, path, wc, &lookup_info);
8457 if (ret > 0) {
8458 path->slots[level]++;
8459 continue;
8460 } else if (ret < 0)
8461 return ret;
8462 level = wc->level;
8463 }
8464 return 0;
8465 }
8466
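/*
 * Walk back up the tree, freeing fully processed blocks via
 * walk_up_proc(). Returns 0 when the caller should walk back down
 * (a new slot was found or walk_up_proc() asked us to stop), and 1
 * once the whole tree has been processed.
 */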
8467 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
8468 struct btrfs_root *root,
8469 struct btrfs_path *path,
8470 struct walk_control *wc, int max_level)
8471 {
8472 int level = wc->level;
8473 int ret;
8474
8475 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
8476 while (level < max_level && path->nodes[level]) {
8477 wc->level = level;
8478 if (path->slots[level] + 1 <
8479 btrfs_header_nritems(path->nodes[level])) {
8480 path->slots[level]++;
8481 return 0;
8482 } else {
8483 ret = walk_up_proc(trans, root, path, wc);
8484 if (ret > 0)
8485 return 0;
8486
8487 if (path->locks[level]) {
8488 btrfs_tree_unlock_rw(path->nodes[level],
8489 path->locks[level]);
8490 path->locks[level] = 0;
8491 }
8492 free_extent_buffer(path->nodes[level]);
8493 path->nodes[level] = NULL;
8494 level++;
8495 }
8496 }
8497 return 1;
8498 }
8499
8500 /*
8501 * drop a subvolume tree.
8502 *
8503  * this function traverses the tree, freeing any blocks that are only
8504  * referenced by the tree.
8505  *
8506  * when a shared tree block is found, this function decreases its
8507  * reference count by one. if update_ref is true, this function
8508  * also makes sure backrefs for the shared block and all lower level
8509 * blocks are properly updated.
8510 *
8511 * If called with for_reloc == 0, may exit early with -EAGAIN
8512 */
8513 int btrfs_drop_snapshot(struct btrfs_root *root,
8514 struct btrfs_block_rsv *block_rsv, int update_ref,
8515 int for_reloc)
8516 {
8517 struct btrfs_path *path;
8518 struct btrfs_trans_handle *trans;
8519 struct btrfs_root *tree_root = root->fs_info->tree_root;
8520 struct btrfs_root_item *root_item = &root->root_item;
8521 struct walk_control *wc;
8522 struct btrfs_key key;
8523 int err = 0;
8524 int ret;
8525 int level;
8526 bool root_dropped = false;
8527
8528 btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
8529
8530 path = btrfs_alloc_path();
8531 if (!path) {
8532 err = -ENOMEM;
8533 goto out;
8534 }
8535
8536 wc = kzalloc(sizeof(*wc), GFP_NOFS);
8537 if (!wc) {
8538 btrfs_free_path(path);
8539 err = -ENOMEM;
8540 goto out;
8541 }
8542
8543 trans = btrfs_start_transaction(tree_root, 0);
8544 if (IS_ERR(trans)) {
8545 err = PTR_ERR(trans);
8546 goto out_free;
8547 }
8548
8549 if (block_rsv)
8550 trans->block_rsv = block_rsv;
8551
8552 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
8553 level = btrfs_header_level(root->node);
8554 path->nodes[level] = btrfs_lock_root_node(root);
8555 btrfs_set_lock_blocking(path->nodes[level]);
8556 path->slots[level] = 0;
8557 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8558 memset(&wc->update_progress, 0,
8559 sizeof(wc->update_progress));
8560 } else {
8561 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
8562 memcpy(&wc->update_progress, &key,
8563 sizeof(wc->update_progress));
8564
8565 level = root_item->drop_level;
8566 BUG_ON(level == 0);
8567 path->lowest_level = level;
8568 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
8569 path->lowest_level = 0;
8570 if (ret < 0) {
8571 err = ret;
8572 goto out_end_trans;
8573 }
8574 WARN_ON(ret > 0);
8575
8576 /*
8577 * unlock our path, this is safe because only this
8578 * function is allowed to delete this snapshot
8579 */
8580 btrfs_unlock_up_safe(path, 0);
8581
8582 level = btrfs_header_level(root->node);
8583 while (1) {
8584 btrfs_tree_lock(path->nodes[level]);
8585 btrfs_set_lock_blocking(path->nodes[level]);
8586 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8587
8588 ret = btrfs_lookup_extent_info(trans, root,
8589 path->nodes[level]->start,
8590 level, 1, &wc->refs[level],
8591 &wc->flags[level]);
8592 if (ret < 0) {
8593 err = ret;
8594 goto out_end_trans;
8595 }
8596 BUG_ON(wc->refs[level] == 0);
8597
8598 if (level == root_item->drop_level)
8599 break;
8600
8601 btrfs_tree_unlock(path->nodes[level]);
8602 path->locks[level] = 0;
8603 WARN_ON(wc->refs[level] != 1);
8604 level--;
8605 }
8606 }
8607
8608 wc->level = level;
8609 wc->shared_level = -1;
8610 wc->stage = DROP_REFERENCE;
8611 wc->update_ref = update_ref;
8612 wc->keep_locks = 0;
8613 wc->for_reloc = for_reloc;
8614 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
8615
8616 while (1) {
8617
8618 ret = walk_down_tree(trans, root, path, wc);
8619 if (ret < 0) {
8620 err = ret;
8621 break;
8622 }
8623
8624 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
8625 if (ret < 0) {
8626 err = ret;
8627 break;
8628 }
8629
8630 if (ret > 0) {
8631 BUG_ON(wc->stage != DROP_REFERENCE);
8632 break;
8633 }
8634
8635 if (wc->stage == DROP_REFERENCE) {
8636 level = wc->level;
8637 btrfs_node_key(path->nodes[level],
8638 &root_item->drop_progress,
8639 path->slots[level]);
8640 root_item->drop_level = level;
8641 }
8642
8643 BUG_ON(wc->level == 0);
8644 if (btrfs_should_end_transaction(trans, tree_root) ||
8645 (!for_reloc && btrfs_need_cleaner_sleep(root))) {
8646 ret = btrfs_update_root(trans, tree_root,
8647 &root->root_key,
8648 root_item);
8649 if (ret) {
8650 btrfs_abort_transaction(trans, tree_root, ret);
8651 err = ret;
8652 goto out_end_trans;
8653 }
8654
8655 btrfs_end_transaction_throttle(trans, tree_root);
8656 if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
8657 pr_debug("BTRFS: drop snapshot early exit\n");
8658 err = -EAGAIN;
8659 goto out_free;
8660 }
8661
8662 trans = btrfs_start_transaction(tree_root, 0);
8663 if (IS_ERR(trans)) {
8664 err = PTR_ERR(trans);
8665 goto out_free;
8666 }
8667 if (block_rsv)
8668 trans->block_rsv = block_rsv;
8669 }
8670 }
8671 btrfs_release_path(path);
8672 if (err)
8673 goto out_end_trans;
8674
8675 ret = btrfs_del_root(trans, tree_root, &root->root_key);
8676 if (ret) {
8677 btrfs_abort_transaction(trans, tree_root, ret);
8678 goto out_end_trans;
8679 }
8680
8681 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
8682 ret = btrfs_find_root(tree_root, &root->root_key, path,
8683 NULL, NULL);
8684 if (ret < 0) {
8685 btrfs_abort_transaction(trans, tree_root, ret);
8686 err = ret;
8687 goto out_end_trans;
8688 } else if (ret > 0) {
8689 /* if we fail to delete the orphan item this time
8690 * around, it'll get picked up the next time.
8691 *
8692 * The most common failure here is just -ENOENT.
8693 */
8694 btrfs_del_orphan_item(trans, tree_root,
8695 root->root_key.objectid);
8696 }
8697 }
8698
8699 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
8700 btrfs_add_dropped_root(trans, root);
8701 } else {
8702 free_extent_buffer(root->node);
8703 free_extent_buffer(root->commit_root);
8704 btrfs_put_fs_root(root);
8705 }
8706 root_dropped = true;
8707 out_end_trans:
8708 btrfs_end_transaction_throttle(trans, tree_root);
8709 out_free:
8710 kfree(wc);
8711 btrfs_free_path(path);
8712 out:
8713 /*
8714 * So if we need to stop dropping the snapshot for whatever reason we
8715 * need to make sure to add it back to the dead root list so that we
8716 	 * keep trying to do the work later. This also cleans up the root if we
8717 	 * don't have it in the radix (like when we recover after a power fail
8718 * or unmount) so we don't leak memory.
8719 */
8720 if (!for_reloc && root_dropped == false)
8721 btrfs_add_dead_root(root);
8722 if (err && err != -EAGAIN)
8723 btrfs_std_error(root->fs_info, err);
8724 return err;
8725 }
8726
8727 /*
8728 * drop subtree rooted at tree block 'node'.
8729 *
8730 * NOTE: this function will unlock and release tree block 'node'
8731 * only used by relocation code
8732 */
8733 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
8734 struct btrfs_root *root,
8735 struct extent_buffer *node,
8736 struct extent_buffer *parent)
8737 {
8738 struct btrfs_path *path;
8739 struct walk_control *wc;
8740 int level;
8741 int parent_level;
8742 int ret = 0;
8743 int wret;
8744
8745 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
8746
8747 path = btrfs_alloc_path();
8748 if (!path)
8749 return -ENOMEM;
8750
8751 wc = kzalloc(sizeof(*wc), GFP_NOFS);
8752 if (!wc) {
8753 btrfs_free_path(path);
8754 return -ENOMEM;
8755 }
8756
8757 btrfs_assert_tree_locked(parent);
8758 parent_level = btrfs_header_level(parent);
8759 extent_buffer_get(parent);
8760 path->nodes[parent_level] = parent;
8761 path->slots[parent_level] = btrfs_header_nritems(parent);
8762
8763 btrfs_assert_tree_locked(node);
8764 level = btrfs_header_level(node);
8765 path->nodes[level] = node;
8766 path->slots[level] = 0;
8767 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8768
8769 wc->refs[parent_level] = 1;
8770 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8771 wc->level = level;
8772 wc->shared_level = -1;
8773 wc->stage = DROP_REFERENCE;
8774 wc->update_ref = 0;
8775 wc->keep_locks = 1;
8776 wc->for_reloc = 1;
8777 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
8778
8779 while (1) {
8780 wret = walk_down_tree(trans, root, path, wc);
8781 if (wret < 0) {
8782 ret = wret;
8783 break;
8784 }
8785
8786 wret = walk_up_tree(trans, root, path, wc, parent_level);
8787 if (wret < 0)
8788 ret = wret;
8789 if (wret != 0)
8790 break;
8791 }
8792
8793 kfree(wc);
8794 btrfs_free_path(path);
8795 return ret;
8796 }
8797
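/*
 * Work out which profile a block group should be converted to when it
 * is rewritten: honor an active restripe target if there is one,
 * otherwise pick a profile that fits the current number of rw devices.
 */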
8798 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
8799 {
8800 u64 num_devices;
8801 u64 stripped;
8802
8803 /*
8804 * if restripe for this chunk_type is on pick target profile and
8805 * return, otherwise do the usual balance
8806 */
8807 stripped = get_restripe_target(root->fs_info, flags);
8808 if (stripped)
8809 return extended_to_chunk(stripped);
8810
8811 num_devices = root->fs_info->fs_devices->rw_devices;
8812
8813 stripped = BTRFS_BLOCK_GROUP_RAID0 |
8814 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
8815 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
8816
8817 if (num_devices == 1) {
8818 stripped |= BTRFS_BLOCK_GROUP_DUP;
8819 stripped = flags & ~stripped;
8820
8821 /* turn raid0 into single device chunks */
8822 if (flags & BTRFS_BLOCK_GROUP_RAID0)
8823 return stripped;
8824
8825 /* turn mirroring into duplication */
8826 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
8827 BTRFS_BLOCK_GROUP_RAID10))
8828 return stripped | BTRFS_BLOCK_GROUP_DUP;
8829 } else {
8830 /* they already had raid on here, just return */
8831 if (flags & stripped)
8832 return flags;
8833
8834 stripped |= BTRFS_BLOCK_GROUP_DUP;
8835 stripped = flags & ~stripped;
8836
8837 /* switch duplicated blocks with raid1 */
8838 if (flags & BTRFS_BLOCK_GROUP_DUP)
8839 return stripped | BTRFS_BLOCK_GROUP_RAID1;
8840
8841 /* this is drive concat, leave it alone */
8842 }
8843
8844 return flags;
8845 }
8846
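/*
 * Take a read-only reference on a block group. Fails with -ENOSPC if
 * setting it read-only would leave the space_info without the minimum
 * headroom (which is waived when force is set).
 */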
8847 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
8848 {
8849 struct btrfs_space_info *sinfo = cache->space_info;
8850 u64 num_bytes;
8851 u64 min_allocable_bytes;
8852 int ret = -ENOSPC;
8853
8854 /*
8855 	 * We need some metadata space and system metadata space for
8856 	 * allocating chunks in some corner cases, so keep a minimum free
8857 	 * unless we are forced to mark the block group read-only.
8858 */
8859 if ((sinfo->flags &
8860 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
8861 !force)
8862 min_allocable_bytes = 1 * 1024 * 1024;
8863 else
8864 min_allocable_bytes = 0;
8865
8866 spin_lock(&sinfo->lock);
8867 spin_lock(&cache->lock);
8868
8869 if (cache->ro) {
8870 cache->ro++;
8871 ret = 0;
8872 goto out;
8873 }
8874
8875 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8876 cache->bytes_super - btrfs_block_group_used(&cache->item);
8877
8878 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
8879 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
8880 min_allocable_bytes <= sinfo->total_bytes) {
8881 sinfo->bytes_readonly += num_bytes;
8882 cache->ro++;
8883 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
8884 ret = 0;
8885 }
8886 out:
8887 spin_unlock(&cache->lock);
8888 spin_unlock(&sinfo->lock);
8889 return ret;
8890 }
8891
8892 int btrfs_inc_block_group_ro(struct btrfs_root *root,
8893 struct btrfs_block_group_cache *cache)
8894
8895 {
8896 struct btrfs_trans_handle *trans;
8897 u64 alloc_flags;
8898 int ret;
8899
8900 again:
8901 trans = btrfs_join_transaction(root);
8902 if (IS_ERR(trans))
8903 return PTR_ERR(trans);
8904
8905 /*
8906 * we're not allowed to set block groups readonly after the dirty
8907 * block groups cache has started writing. If it already started,
8908 * back off and let this transaction commit
8909 */
8910 mutex_lock(&root->fs_info->ro_block_group_mutex);
8911 if (trans->transaction->dirty_bg_run) {
8912 u64 transid = trans->transid;
8913
8914 mutex_unlock(&root->fs_info->ro_block_group_mutex);
8915 btrfs_end_transaction(trans, root);
8916
8917 ret = btrfs_wait_for_commit(root, transid);
8918 if (ret)
8919 return ret;
8920 goto again;
8921 }
8922
8923 /*
8924 * if we are changing raid levels, try to allocate a corresponding
8925 * block group with the new raid level.
8926 */
8927 alloc_flags = update_block_group_flags(root, cache->flags);
8928 if (alloc_flags != cache->flags) {
8929 ret = do_chunk_alloc(trans, root, alloc_flags,
8930 CHUNK_ALLOC_FORCE);
8931 /*
8932 * ENOSPC is allowed here, we may have enough space
8933 * already allocated at the new raid level to
8934 * carry on
8935 */
8936 if (ret == -ENOSPC)
8937 ret = 0;
8938 if (ret < 0)
8939 goto out;
8940 }
8941
8942 ret = inc_block_group_ro(cache, 0);
8943 if (!ret)
8944 goto out;
8945 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
8946 ret = do_chunk_alloc(trans, root, alloc_flags,
8947 CHUNK_ALLOC_FORCE);
8948 if (ret < 0)
8949 goto out;
8950 ret = inc_block_group_ro(cache, 0);
8951 out:
8952 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
8953 alloc_flags = update_block_group_flags(root, cache->flags);
8954 lock_chunks(root->fs_info->chunk_root);
8955 check_system_chunk(trans, root, alloc_flags);
8956 unlock_chunks(root->fs_info->chunk_root);
8957 }
8958 mutex_unlock(&root->fs_info->ro_block_group_mutex);
8959
8960 btrfs_end_transaction(trans, root);
8961 return ret;
8962 }
8963
8964 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8965 struct btrfs_root *root, u64 type)
8966 {
8967 u64 alloc_flags = get_alloc_profile(root, type);
8968 return do_chunk_alloc(trans, root, alloc_flags,
8969 CHUNK_ALLOC_FORCE);
8970 }
8971
8972 /*
8973  * helper to account the unused space of all the readonly block groups in the
8974 * space_info. takes mirrors into account.
8975 */
8976 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8977 {
8978 struct btrfs_block_group_cache *block_group;
8979 u64 free_bytes = 0;
8980 int factor;
8981
8982 	/* It's df, we don't care if it's racy */
8983 if (list_empty(&sinfo->ro_bgs))
8984 return 0;
8985
8986 spin_lock(&sinfo->lock);
8987 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
8988 spin_lock(&block_group->lock);
8989
8990 if (!block_group->ro) {
8991 spin_unlock(&block_group->lock);
8992 continue;
8993 }
8994
8995 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
8996 BTRFS_BLOCK_GROUP_RAID10 |
8997 BTRFS_BLOCK_GROUP_DUP))
8998 factor = 2;
8999 else
9000 factor = 1;
9001
9002 free_bytes += (block_group->key.offset -
9003 btrfs_block_group_used(&block_group->item)) *
9004 factor;
9005
9006 spin_unlock(&block_group->lock);
9007 }
9008 spin_unlock(&sinfo->lock);
9009
9010 return free_bytes;
9011 }
9012
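/*
 * Drop one read-only reference on a block group; when the last one goes
 * away, subtract its unused bytes from the space_info's bytes_readonly
 * and take it off the ro_bgs list.
 */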
9013 void btrfs_dec_block_group_ro(struct btrfs_root *root,
9014 struct btrfs_block_group_cache *cache)
9015 {
9016 struct btrfs_space_info *sinfo = cache->space_info;
9017 u64 num_bytes;
9018
9019 BUG_ON(!cache->ro);
9020
9021 spin_lock(&sinfo->lock);
9022 spin_lock(&cache->lock);
9023 if (!--cache->ro) {
9024 num_bytes = cache->key.offset - cache->reserved -
9025 cache->pinned - cache->bytes_super -
9026 btrfs_block_group_used(&cache->item);
9027 sinfo->bytes_readonly -= num_bytes;
9028 list_del_init(&cache->ro_list);
9029 }
9030 spin_unlock(&cache->lock);
9031 spin_unlock(&sinfo->lock);
9032 }
9033
9034 /*
9035  * checks to see if it's even possible to relocate this block group.
9036  *
9037  * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
9038  * ok to go ahead and try.
9039 */
9040 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
9041 {
9042 struct btrfs_block_group_cache *block_group;
9043 struct btrfs_space_info *space_info;
9044 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
9045 struct btrfs_device *device;
9046 struct btrfs_trans_handle *trans;
9047 u64 min_free;
9048 u64 dev_min = 1;
9049 u64 dev_nr = 0;
9050 u64 target;
9051 int index;
9052 int full = 0;
9053 int ret = 0;
9054
9055 block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
9056
9057 /* odd, couldn't find the block group, leave it alone */
9058 if (!block_group)
9059 return -1;
9060
9061 min_free = btrfs_block_group_used(&block_group->item);
9062
9063 /* no bytes used, we're good */
9064 if (!min_free)
9065 goto out;
9066
9067 space_info = block_group->space_info;
9068 spin_lock(&space_info->lock);
9069
9070 full = space_info->full;
9071
9072 /*
9073 * if this is the last block group we have in this space, we can't
9074 * relocate it unless we're able to allocate a new chunk below.
9075 *
9076 * Otherwise, we need to make sure we have room in the space to handle
9077 * all of the extents from this block group. If we can, we're good
9078 */
9079 if ((space_info->total_bytes != block_group->key.offset) &&
9080 (space_info->bytes_used + space_info->bytes_reserved +
9081 space_info->bytes_pinned + space_info->bytes_readonly +
9082 min_free < space_info->total_bytes)) {
9083 spin_unlock(&space_info->lock);
9084 goto out;
9085 }
9086 spin_unlock(&space_info->lock);
9087
9088 /*
9089 * ok we don't have enough space, but maybe we have free space on our
9090 * devices to allocate new chunks for relocation, so loop through our
9091 * alloc devices and guess if we have enough space. if this block
9092 * group is going to be restriped, run checks against the target
9093 * profile instead of the current one.
9094 */
9095 ret = -1;
9096
9097 /*
9098 * index:
9099 * 0: raid10
9100 * 1: raid1
9101 * 2: dup
9102 * 3: raid0
9103 * 4: single
9104 */
9105 target = get_restripe_target(root->fs_info, block_group->flags);
9106 if (target) {
9107 index = __get_raid_index(extended_to_chunk(target));
9108 } else {
9109 /*
9110 * this is just a balance, so if we were marked as full
9111 * we know there is no space for a new chunk
9112 */
9113 if (full)
9114 goto out;
9115
9116 index = get_block_group_index(block_group);
9117 }
9118
9119 if (index == BTRFS_RAID_RAID10) {
9120 dev_min = 4;
9121 /* Divide by 2 */
9122 min_free >>= 1;
9123 } else if (index == BTRFS_RAID_RAID1) {
9124 dev_min = 2;
9125 } else if (index == BTRFS_RAID_DUP) {
9126 /* Multiply by 2 */
9127 min_free <<= 1;
9128 } else if (index == BTRFS_RAID_RAID0) {
9129 dev_min = fs_devices->rw_devices;
9130 min_free = div64_u64(min_free, dev_min);
9131 }
9132
9133 /* We need to do this so that we can look at pending chunks */
9134 trans = btrfs_join_transaction(root);
9135 if (IS_ERR(trans)) {
9136 ret = PTR_ERR(trans);
9137 goto out;
9138 }
9139
9140 mutex_lock(&root->fs_info->chunk_mutex);
9141 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9142 u64 dev_offset;
9143
9144 /*
9145 * check to make sure we can actually find a chunk with enough
9146 * space to fit our block group in.
9147 */
9148 if (device->total_bytes > device->bytes_used + min_free &&
9149 !device->is_tgtdev_for_dev_replace) {
9150 ret = find_free_dev_extent(trans, device, min_free,
9151 &dev_offset, NULL);
9152 if (!ret)
9153 dev_nr++;
9154
9155 if (dev_nr >= dev_min)
9156 break;
9157
9158 ret = -1;
9159 }
9160 }
9161 mutex_unlock(&root->fs_info->chunk_mutex);
9162 btrfs_end_transaction(trans, root);
9163 out:
9164 btrfs_put_block_group(block_group);
9165 return ret;
9166 }
9167
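/*
 * Find the first BLOCK_GROUP_ITEM at or after *key in the extent tree,
 * leaving the path positioned on it.
 */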
9168 static int find_first_block_group(struct btrfs_root *root,
9169 struct btrfs_path *path, struct btrfs_key *key)
9170 {
9171 int ret = 0;
9172 struct btrfs_key found_key;
9173 struct extent_buffer *leaf;
9174 int slot;
9175
9176 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9177 if (ret < 0)
9178 goto out;
9179
9180 while (1) {
9181 slot = path->slots[0];
9182 leaf = path->nodes[0];
9183 if (slot >= btrfs_header_nritems(leaf)) {
9184 ret = btrfs_next_leaf(root, path);
9185 if (ret == 0)
9186 continue;
9187 if (ret < 0)
9188 goto out;
9189 break;
9190 }
9191 btrfs_item_key_to_cpu(leaf, &found_key, slot);
9192
9193 if (found_key.objectid >= key->objectid &&
9194 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9195 ret = 0;
9196 goto out;
9197 }
9198 path->slots[0]++;
9199 }
9200 out:
9201 return ret;
9202 }
9203
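/*
 * Drop the inode references that block groups hold on their free space
 * cache inodes.
 */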
9204 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9205 {
9206 struct btrfs_block_group_cache *block_group;
9207 u64 last = 0;
9208
9209 while (1) {
9210 struct inode *inode;
9211
9212 block_group = btrfs_lookup_first_block_group(info, last);
9213 while (block_group) {
9214 spin_lock(&block_group->lock);
9215 if (block_group->iref)
9216 break;
9217 spin_unlock(&block_group->lock);
9218 block_group = next_block_group(info->tree_root,
9219 block_group);
9220 }
9221 if (!block_group) {
9222 if (last == 0)
9223 break;
9224 last = 0;
9225 continue;
9226 }
9227
9228 inode = block_group->inode;
9229 block_group->iref = 0;
9230 block_group->inode = NULL;
9231 spin_unlock(&block_group->lock);
9232 iput(inode);
9233 last = block_group->key.objectid + block_group->key.offset;
9234 btrfs_put_block_group(block_group);
9235 }
9236 }
9237
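/*
 * Tear down all in-memory block group and space_info structures; this
 * runs during the final stages of unmount.
 */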
9238 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9239 {
9240 struct btrfs_block_group_cache *block_group;
9241 struct btrfs_space_info *space_info;
9242 struct btrfs_caching_control *caching_ctl;
9243 struct rb_node *n;
9244
9245 down_write(&info->commit_root_sem);
9246 while (!list_empty(&info->caching_block_groups)) {
9247 caching_ctl = list_entry(info->caching_block_groups.next,
9248 struct btrfs_caching_control, list);
9249 list_del(&caching_ctl->list);
9250 put_caching_control(caching_ctl);
9251 }
9252 up_write(&info->commit_root_sem);
9253
9254 spin_lock(&info->unused_bgs_lock);
9255 while (!list_empty(&info->unused_bgs)) {
9256 block_group = list_first_entry(&info->unused_bgs,
9257 struct btrfs_block_group_cache,
9258 bg_list);
9259 list_del_init(&block_group->bg_list);
9260 btrfs_put_block_group(block_group);
9261 }
9262 spin_unlock(&info->unused_bgs_lock);
9263
9264 spin_lock(&info->block_group_cache_lock);
9265 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9266 block_group = rb_entry(n, struct btrfs_block_group_cache,
9267 cache_node);
9268 rb_erase(&block_group->cache_node,
9269 &info->block_group_cache_tree);
9270 RB_CLEAR_NODE(&block_group->cache_node);
9271 spin_unlock(&info->block_group_cache_lock);
9272
9273 down_write(&block_group->space_info->groups_sem);
9274 list_del(&block_group->list);
9275 up_write(&block_group->space_info->groups_sem);
9276
9277 if (block_group->cached == BTRFS_CACHE_STARTED)
9278 wait_block_group_cache_done(block_group);
9279
9280 /*
9281 * We haven't cached this block group, which means we could
9282 * possibly have excluded extents on this block group.
9283 */
9284 if (block_group->cached == BTRFS_CACHE_NO ||
9285 block_group->cached == BTRFS_CACHE_ERROR)
9286 free_excluded_extents(info->extent_root, block_group);
9287
9288 btrfs_remove_free_space_cache(block_group);
9289 btrfs_put_block_group(block_group);
9290
9291 spin_lock(&info->block_group_cache_lock);
9292 }
9293 spin_unlock(&info->block_group_cache_lock);
9294
9295 /* now that all the block groups are freed, go through and
9296 * free all the space_info structs. This is only called during
9297 * the final stages of unmount, and so we know nobody is
9298 * using them. We call synchronize_rcu() once before we start,
9299 * just to be on the safe side.
9300 */
9301 synchronize_rcu();
9302
9303 release_global_block_rsv(info);
9304
9305 while (!list_empty(&info->space_info)) {
9306 int i;
9307
9308 space_info = list_entry(info->space_info.next,
9309 struct btrfs_space_info,
9310 list);
9311 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
9312 if (WARN_ON(space_info->bytes_pinned > 0 ||
9313 space_info->bytes_reserved > 0 ||
9314 space_info->bytes_may_use > 0)) {
9315 dump_space_info(space_info, 0, 0);
9316 }
9317 }
9318 list_del(&space_info->list);
9319 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9320 struct kobject *kobj;
9321 kobj = space_info->block_group_kobjs[i];
9322 space_info->block_group_kobjs[i] = NULL;
9323 if (kobj) {
9324 kobject_del(kobj);
9325 kobject_put(kobj);
9326 }
9327 }
9328 kobject_del(&space_info->kobj);
9329 kobject_put(&space_info->kobj);
9330 }
9331 return 0;
9332 }
9333
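/*
 * Add the block group to its space_info's list for its raid index and,
 * for the first group of that type, create the per-raid sysfs kobject.
 */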
9334 static void __link_block_group(struct btrfs_space_info *space_info,
9335 struct btrfs_block_group_cache *cache)
9336 {
9337 int index = get_block_group_index(cache);
9338 bool first = false;
9339
9340 down_write(&space_info->groups_sem);
9341 if (list_empty(&space_info->block_groups[index]))
9342 first = true;
9343 list_add_tail(&cache->list, &space_info->block_groups[index]);
9344 up_write(&space_info->groups_sem);
9345
9346 if (first) {
9347 struct raid_kobject *rkobj;
9348 int ret;
9349
9350 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9351 if (!rkobj)
9352 goto out_err;
9353 rkobj->raid_type = index;
9354 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9355 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9356 "%s", get_raid_name(index));
9357 if (ret) {
9358 kobject_put(&rkobj->kobj);
9359 goto out_err;
9360 }
9361 space_info->block_group_kobjs[index] = &rkobj->kobj;
9362 }
9363
9364 return;
9365 out_err:
9366 pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
9367 }
9368
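/*
 * Allocate and initialize an in-memory block group cache entry for the
 * given range, including its free space ctl and the free space tree
 * thresholds.
 */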
9369 static struct btrfs_block_group_cache *
9370 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
9371 {
9372 struct btrfs_block_group_cache *cache;
9373
9374 cache = kzalloc(sizeof(*cache), GFP_NOFS);
9375 if (!cache)
9376 return NULL;
9377
9378 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
9379 GFP_NOFS);
9380 if (!cache->free_space_ctl) {
9381 kfree(cache);
9382 return NULL;
9383 }
9384
9385 cache->key.objectid = start;
9386 cache->key.offset = size;
9387 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9388
9389 cache->sectorsize = root->sectorsize;
9390 cache->fs_info = root->fs_info;
9391 cache->full_stripe_len = btrfs_full_stripe_len(root,
9392 &root->fs_info->mapping_tree,
9393 start);
9394 set_free_space_tree_thresholds(cache);
9395
9396 atomic_set(&cache->count, 1);
9397 spin_lock_init(&cache->lock);
9398 init_rwsem(&cache->data_rwsem);
9399 INIT_LIST_HEAD(&cache->list);
9400 INIT_LIST_HEAD(&cache->cluster_list);
9401 INIT_LIST_HEAD(&cache->bg_list);
9402 INIT_LIST_HEAD(&cache->ro_list);
9403 INIT_LIST_HEAD(&cache->dirty_list);
9404 INIT_LIST_HEAD(&cache->io_list);
9405 btrfs_init_free_space_ctl(cache);
9406 atomic_set(&cache->trimming, 0);
9407 mutex_init(&cache->free_space_lock);
9408
9409 return cache;
9410 }
9411
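/*
 * Walk every BLOCK_GROUP_ITEM in the extent tree and build the in-memory
 * block group caches and space_info accounting; completely full or
 * completely empty groups are marked cached right away.
 */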
9412 int btrfs_read_block_groups(struct btrfs_root *root)
9413 {
9414 struct btrfs_path *path;
9415 int ret;
9416 struct btrfs_block_group_cache *cache;
9417 struct btrfs_fs_info *info = root->fs_info;
9418 struct btrfs_space_info *space_info;
9419 struct btrfs_key key;
9420 struct btrfs_key found_key;
9421 struct extent_buffer *leaf;
9422 int need_clear = 0;
9423 u64 cache_gen;
9424
9425 root = info->extent_root;
9426 key.objectid = 0;
9427 key.offset = 0;
9428 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9429 path = btrfs_alloc_path();
9430 if (!path)
9431 return -ENOMEM;
9432 path->reada = 1;
9433
9434 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
9435 if (btrfs_test_opt(root, SPACE_CACHE) &&
9436 btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
9437 need_clear = 1;
9438 if (btrfs_test_opt(root, CLEAR_CACHE))
9439 need_clear = 1;
9440
9441 while (1) {
9442 ret = find_first_block_group(root, path, &key);
9443 if (ret > 0)
9444 break;
9445 if (ret != 0)
9446 goto error;
9447
9448 leaf = path->nodes[0];
9449 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
9450
9451 cache = btrfs_create_block_group_cache(root, found_key.objectid,
9452 found_key.offset);
9453 if (!cache) {
9454 ret = -ENOMEM;
9455 goto error;
9456 }
9457
9458 if (need_clear) {
9459 /*
9460 * When we mount with an old space cache, we need to
9461 * set BTRFS_DC_CLEAR and set the dirty flag.
9462 *
9463 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
9464 * truncate the old free space cache inode and
9465 * set up a new one.
9466 * b) Setting the 'dirty flag' makes sure that we flush
9467 * the new space cache info onto disk.
9468 */
9469 if (btrfs_test_opt(root, SPACE_CACHE))
9470 cache->disk_cache_state = BTRFS_DC_CLEAR;
9471 }
9472
9473 read_extent_buffer(leaf, &cache->item,
9474 btrfs_item_ptr_offset(leaf, path->slots[0]),
9475 sizeof(cache->item));
9476 cache->flags = btrfs_block_group_flags(&cache->item);
9477
9478 key.objectid = found_key.objectid + found_key.offset;
9479 btrfs_release_path(path);
9480
9481 /*
9482 * We need to exclude the super stripes now so that the space
9483 * info has super bytes accounted for, otherwise we'll think
9484 * we have more space than we actually do.
9485 */
9486 ret = exclude_super_stripes(root, cache);
9487 if (ret) {
9488 /*
9489 * We may have excluded something, so call this just in
9490 * case.
9491 */
9492 free_excluded_extents(root, cache);
9493 btrfs_put_block_group(cache);
9494 goto error;
9495 }
9496
9497 /*
9498 * Check for two cases: either we are full, and therefore
9499 * don't need to bother with the caching work since we won't
9500 * find any space, or we are empty, and we can just add all
9501 * the space in and be done with it. This saves us a lot of
9502 * time, particularly in the full case.
9503 */
9504 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
9505 cache->last_byte_to_unpin = (u64)-1;
9506 cache->cached = BTRFS_CACHE_FINISHED;
9507 free_excluded_extents(root, cache);
9508 } else if (btrfs_block_group_used(&cache->item) == 0) {
9509 cache->last_byte_to_unpin = (u64)-1;
9510 cache->cached = BTRFS_CACHE_FINISHED;
9511 add_new_free_space(cache, root->fs_info,
9512 found_key.objectid,
9513 found_key.objectid +
9514 found_key.offset);
9515 free_excluded_extents(root, cache);
9516 }
9517
9518 ret = btrfs_add_block_group_cache(root->fs_info, cache);
9519 if (ret) {
9520 btrfs_remove_free_space_cache(cache);
9521 btrfs_put_block_group(cache);
9522 goto error;
9523 }
9524
9525 ret = update_space_info(info, cache->flags, found_key.offset,
9526 btrfs_block_group_used(&cache->item),
9527 &space_info);
9528 if (ret) {
9529 btrfs_remove_free_space_cache(cache);
9530 spin_lock(&info->block_group_cache_lock);
9531 rb_erase(&cache->cache_node,
9532 &info->block_group_cache_tree);
9533 RB_CLEAR_NODE(&cache->cache_node);
9534 spin_unlock(&info->block_group_cache_lock);
9535 btrfs_put_block_group(cache);
9536 goto error;
9537 }
9538
9539 cache->space_info = space_info;
9540 spin_lock(&cache->space_info->lock);
9541 cache->space_info->bytes_readonly += cache->bytes_super;
9542 spin_unlock(&cache->space_info->lock);
9543
9544 __link_block_group(space_info, cache);
9545
9546 set_avail_alloc_bits(root->fs_info, cache->flags);
9547 if (btrfs_chunk_readonly(root, cache->key.objectid)) {
9548 inc_block_group_ro(cache, 1);
9549 } else if (btrfs_block_group_used(&cache->item) == 0) {
9550 spin_lock(&info->unused_bgs_lock);
9551 /* Should always be true but just in case. */
9552 if (list_empty(&cache->bg_list)) {
9553 btrfs_get_block_group(cache);
9554 list_add_tail(&cache->bg_list,
9555 &info->unused_bgs);
9556 }
9557 spin_unlock(&info->unused_bgs_lock);
9558 }
9559 }
9560
9561 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
9562 if (!(get_alloc_profile(root, space_info->flags) &
9563 (BTRFS_BLOCK_GROUP_RAID10 |
9564 BTRFS_BLOCK_GROUP_RAID1 |
9565 BTRFS_BLOCK_GROUP_RAID5 |
9566 BTRFS_BLOCK_GROUP_RAID6 |
9567 BTRFS_BLOCK_GROUP_DUP)))
9568 continue;
9569 /*
9570 * Avoid allocating from un-mirrored block groups if there are
9571 * mirrored block groups.
9572 */
9573 list_for_each_entry(cache,
9574 &space_info->block_groups[BTRFS_RAID_RAID0],
9575 list)
9576 inc_block_group_ro(cache, 1);
9577 list_for_each_entry(cache,
9578 &space_info->block_groups[BTRFS_RAID_SINGLE],
9579 list)
9580 inc_block_group_ro(cache, 1);
9581 }
9582
9583 init_global_block_rsv(info);
9584 ret = 0;
9585 error:
9586 btrfs_free_path(path);
9587 return ret;
9588 }
9589
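/*
 * Insert the block group items (and free space tree entries) for every
 * block group created in this transaction and still queued on
 * trans->new_bgs.  Pending block group flushing is disabled while we
 * iterate so the tree operations below cannot re-enter this function.
 */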
9590 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
9591 struct btrfs_root *root)
9592 {
9593 struct btrfs_block_group_cache *block_group, *tmp;
9594 struct btrfs_root *extent_root = root->fs_info->extent_root;
9595 struct btrfs_block_group_item item;
9596 struct btrfs_key key;
9597 int ret = 0;
9598 bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
9599
9600 trans->can_flush_pending_bgs = false;
9601 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
9602 if (ret)
9603 goto next;
9604
9605 spin_lock(&block_group->lock);
9606 memcpy(&item, &block_group->item, sizeof(item));
9607 memcpy(&key, &block_group->key, sizeof(key));
9608 spin_unlock(&block_group->lock);
9609
9610 ret = btrfs_insert_item(trans, extent_root, &key, &item,
9611 sizeof(item));
9612 if (ret)
9613 btrfs_abort_transaction(trans, extent_root, ret);
9614 ret = btrfs_finish_chunk_alloc(trans, extent_root,
9615 key.objectid, key.offset);
9616 if (ret)
9617 btrfs_abort_transaction(trans, extent_root, ret);
9618 add_block_group_free_space(trans, root->fs_info, block_group);
9619 /* already aborted the transaction if it failed. */
9620 next:
9621 list_del_init(&block_group->bg_list);
9622 }
9623 trans->can_flush_pending_bgs = can_flush_pending_bgs;
9624 }
9625
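/*
 * Create a new block group for the chunk at chunk_offset.  The group is
 * fully set up in memory, linked to its space_info and queued on
 * trans->new_bgs; its block group item is inserted later by
 * btrfs_create_pending_block_groups().
 */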
9626 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
9627 struct btrfs_root *root, u64 bytes_used,
9628 u64 type, u64 chunk_objectid, u64 chunk_offset,
9629 u64 size)
9630 {
9631 int ret;
9632 struct btrfs_root *extent_root;
9633 struct btrfs_block_group_cache *cache;
9634
9635 extent_root = root->fs_info->extent_root;
9636
9637 btrfs_set_log_full_commit(root->fs_info, trans);
9638
9639 cache = btrfs_create_block_group_cache(root, chunk_offset, size);
9640 if (!cache)
9641 return -ENOMEM;
9642
9643 btrfs_set_block_group_used(&cache->item, bytes_used);
9644 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
9645 btrfs_set_block_group_flags(&cache->item, type);
9646
9647 cache->flags = type;
9648 cache->last_byte_to_unpin = (u64)-1;
9649 cache->cached = BTRFS_CACHE_FINISHED;
9650 cache->needs_free_space = 1;
9651 ret = exclude_super_stripes(root, cache);
9652 if (ret) {
9653 /*
9654 * We may have excluded something, so call this just in
9655 * case.
9656 */
9657 free_excluded_extents(root, cache);
9658 btrfs_put_block_group(cache);
9659 return ret;
9660 }
9661
9662 add_new_free_space(cache, root->fs_info, chunk_offset,
9663 chunk_offset + size);
9664
9665 free_excluded_extents(root, cache);
9666
9667 /*
9668 * Call to ensure the corresponding space_info object is created and
9669 * assigned to our block group, but don't update its counters just yet.
9670 * We want our bg to be added to the rbtree with its ->space_info set.
9671 */
9672 ret = update_space_info(root->fs_info, cache->flags, 0, 0,
9673 &cache->space_info);
9674 if (ret) {
9675 btrfs_remove_free_space_cache(cache);
9676 btrfs_put_block_group(cache);
9677 return ret;
9678 }
9679
9680 ret = btrfs_add_block_group_cache(root->fs_info, cache);
9681 if (ret) {
9682 btrfs_remove_free_space_cache(cache);
9683 btrfs_put_block_group(cache);
9684 return ret;
9685 }
9686
9687 /*
9688 * Now that our block group has its ->space_info set and is inserted in
9689 * the rbtree, update the space info's counters.
9690 */
9691 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
9692 &cache->space_info);
9693 if (ret) {
9694 btrfs_remove_free_space_cache(cache);
9695 spin_lock(&root->fs_info->block_group_cache_lock);
9696 rb_erase(&cache->cache_node,
9697 &root->fs_info->block_group_cache_tree);
9698 RB_CLEAR_NODE(&cache->cache_node);
9699 spin_unlock(&root->fs_info->block_group_cache_lock);
9700 btrfs_put_block_group(cache);
9701 return ret;
9702 }
9703 update_global_block_rsv(root->fs_info);
9704
9705 spin_lock(&cache->space_info->lock);
9706 cache->space_info->bytes_readonly += cache->bytes_super;
9707 spin_unlock(&cache->space_info->lock);
9708
9709 __link_block_group(cache->space_info, cache);
9710
9711 list_add_tail(&cache->bg_list, &trans->new_bgs);
9712
9713 set_avail_alloc_bits(extent_root->fs_info, type);
9714
9715 return 0;
9716 }
9717
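/*
 * Counterpart to set_avail_alloc_bits(): clear the extended profile
 * bits of @flags from the avail_{data,metadata,system}_alloc_bits
 * masks.  Called once the last block group with that profile has been
 * removed from its space_info.
 */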
9718 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
9719 {
9720 u64 extra_flags = chunk_to_extended(flags) &
9721 BTRFS_EXTENDED_PROFILE_MASK;
9722
9723 write_seqlock(&fs_info->profiles_lock);
9724 if (flags & BTRFS_BLOCK_GROUP_DATA)
9725 fs_info->avail_data_alloc_bits &= ~extra_flags;
9726 if (flags & BTRFS_BLOCK_GROUP_METADATA)
9727 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
9728 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
9729 fs_info->avail_system_alloc_bits &= ~extra_flags;
9730 write_sequnlock(&fs_info->profiles_lock);
9731 }
9732
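/*
 * Remove a read-only block group that no longer contains any allocated
 * extents: drop its free space cache inode, unlink it from the rbtree,
 * the per-profile lists and sysfs, delete its free space tree entries
 * and finally delete the block group item from the extent tree.  The
 * chunk's extent map is only dropped here if no trim is running against
 * the group, see the comments further below.
 */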
9733 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9734 struct btrfs_root *root, u64 group_start,
9735 struct extent_map *em)
9736 {
9737 struct btrfs_path *path;
9738 struct btrfs_block_group_cache *block_group;
9739 struct btrfs_free_cluster *cluster;
9740 struct btrfs_root *tree_root = root->fs_info->tree_root;
9741 struct btrfs_key key;
9742 struct inode *inode;
9743 struct kobject *kobj = NULL;
9744 int ret;
9745 int index;
9746 int factor;
9747 struct btrfs_caching_control *caching_ctl = NULL;
9748 bool remove_em;
9749
9750 root = root->fs_info->extent_root;
9751
9752 block_group = btrfs_lookup_block_group(root->fs_info, group_start);
9753 BUG_ON(!block_group);
9754 BUG_ON(!block_group->ro);
9755
9756 /*
9757 * Free the reserved super bytes from this block group before
9758 * removing it.
9759 */
9760 free_excluded_extents(root, block_group);
9761
9762 memcpy(&key, &block_group->key, sizeof(key));
9763 index = get_block_group_index(block_group);
9764 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
9765 BTRFS_BLOCK_GROUP_RAID1 |
9766 BTRFS_BLOCK_GROUP_RAID10))
9767 factor = 2;
9768 else
9769 factor = 1;
9770
9771 /* make sure this block group isn't part of an allocation cluster */
9772 cluster = &root->fs_info->data_alloc_cluster;
9773 spin_lock(&cluster->refill_lock);
9774 btrfs_return_cluster_to_free_space(block_group, cluster);
9775 spin_unlock(&cluster->refill_lock);
9776
9777 /*
9778 * make sure this block group isn't part of a metadata
9779 * allocation cluster
9780 */
9781 cluster = &root->fs_info->meta_alloc_cluster;
9782 spin_lock(&cluster->refill_lock);
9783 btrfs_return_cluster_to_free_space(block_group, cluster);
9784 spin_unlock(&cluster->refill_lock);
9785
9786 path = btrfs_alloc_path();
9787 if (!path) {
9788 ret = -ENOMEM;
9789 goto out;
9790 }
9791
9792 /*
9793 * get the inode first so any iput calls done for the io_list
9794 * aren't the final iput (no unlinks allowed now)
9795 */
9796 inode = lookup_free_space_inode(tree_root, block_group, path);
9797
9798 mutex_lock(&trans->transaction->cache_write_mutex);
9799 /*
9800 * Make sure our free space cache IO is done before removing the
9801 * free space inode.
9802 */
9803 spin_lock(&trans->transaction->dirty_bgs_lock);
9804 if (!list_empty(&block_group->io_list)) {
9805 list_del_init(&block_group->io_list);
9806
9807 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
9808
9809 spin_unlock(&trans->transaction->dirty_bgs_lock);
9810 btrfs_wait_cache_io(root, trans, block_group,
9811 &block_group->io_ctl, path,
9812 block_group->key.objectid);
9813 btrfs_put_block_group(block_group);
9814 spin_lock(&trans->transaction->dirty_bgs_lock);
9815 }
9816
9817 if (!list_empty(&block_group->dirty_list)) {
9818 list_del_init(&block_group->dirty_list);
9819 btrfs_put_block_group(block_group);
9820 }
9821 spin_unlock(&trans->transaction->dirty_bgs_lock);
9822 mutex_unlock(&trans->transaction->cache_write_mutex);
9823
9824 if (!IS_ERR(inode)) {
9825 ret = btrfs_orphan_add(trans, inode);
9826 if (ret) {
9827 btrfs_add_delayed_iput(inode);
9828 goto out;
9829 }
9830 clear_nlink(inode);
9831 /* One for the block group's ref */
9832 spin_lock(&block_group->lock);
9833 if (block_group->iref) {
9834 block_group->iref = 0;
9835 block_group->inode = NULL;
9836 spin_unlock(&block_group->lock);
9837 iput(inode);
9838 } else {
9839 spin_unlock(&block_group->lock);
9840 }
9841 /* One for our lookup ref */
9842 btrfs_add_delayed_iput(inode);
9843 }
9844
9845 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
9846 key.offset = block_group->key.objectid;
9847 key.type = 0;
9848
9849 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
9850 if (ret < 0)
9851 goto out;
9852 if (ret > 0)
9853 btrfs_release_path(path);
9854 if (ret == 0) {
9855 ret = btrfs_del_item(trans, tree_root, path);
9856 if (ret)
9857 goto out;
9858 btrfs_release_path(path);
9859 }
9860
9861 spin_lock(&root->fs_info->block_group_cache_lock);
9862 rb_erase(&block_group->cache_node,
9863 &root->fs_info->block_group_cache_tree);
9864 RB_CLEAR_NODE(&block_group->cache_node);
9865
9866 if (root->fs_info->first_logical_byte == block_group->key.objectid)
9867 root->fs_info->first_logical_byte = (u64)-1;
9868 spin_unlock(&root->fs_info->block_group_cache_lock);
9869
9870 down_write(&block_group->space_info->groups_sem);
9871 /*
9872 * we must use list_del_init so people can check to see if they
9873 * are still on the list after taking the semaphore
9874 */
9875 list_del_init(&block_group->list);
9876 if (list_empty(&block_group->space_info->block_groups[index])) {
9877 kobj = block_group->space_info->block_group_kobjs[index];
9878 block_group->space_info->block_group_kobjs[index] = NULL;
9879 clear_avail_alloc_bits(root->fs_info, block_group->flags);
9880 }
9881 up_write(&block_group->space_info->groups_sem);
9882 if (kobj) {
9883 kobject_del(kobj);
9884 kobject_put(kobj);
9885 }
9886
9887 if (block_group->has_caching_ctl)
9888 caching_ctl = get_caching_control(block_group);
9889 if (block_group->cached == BTRFS_CACHE_STARTED)
9890 wait_block_group_cache_done(block_group);
9891 if (block_group->has_caching_ctl) {
9892 down_write(&root->fs_info->commit_root_sem);
9893 if (!caching_ctl) {
9894 struct btrfs_caching_control *ctl;
9895
9896 list_for_each_entry(ctl,
9897 &root->fs_info->caching_block_groups, list)
9898 if (ctl->block_group == block_group) {
9899 caching_ctl = ctl;
9900 atomic_inc(&caching_ctl->count);
9901 break;
9902 }
9903 }
9904 if (caching_ctl)
9905 list_del_init(&caching_ctl->list);
9906 up_write(&root->fs_info->commit_root_sem);
9907 if (caching_ctl) {
9908 /* Once for the caching bgs list and once for us. */
9909 put_caching_control(caching_ctl);
9910 put_caching_control(caching_ctl);
9911 }
9912 }
9913
9914 spin_lock(&trans->transaction->dirty_bgs_lock);
9915 if (!list_empty(&block_group->dirty_list)) {
9916 WARN_ON(1);
9917 }
9918 if (!list_empty(&block_group->io_list)) {
9919 WARN_ON(1);
9920 }
9921 spin_unlock(&trans->transaction->dirty_bgs_lock);
9922 btrfs_remove_free_space_cache(block_group);
9923
9924 spin_lock(&block_group->space_info->lock);
9925 list_del_init(&block_group->ro_list);
9926
9927 if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
9928 WARN_ON(block_group->space_info->total_bytes
9929 < block_group->key.offset);
9930 WARN_ON(block_group->space_info->bytes_readonly
9931 < block_group->key.offset);
9932 WARN_ON(block_group->space_info->disk_total
9933 < block_group->key.offset * factor);
9934 }
9935 block_group->space_info->total_bytes -= block_group->key.offset;
9936 block_group->space_info->bytes_readonly -= block_group->key.offset;
9937 block_group->space_info->disk_total -= block_group->key.offset * factor;
9938
9939 spin_unlock(&block_group->space_info->lock);
9940
9941 memcpy(&key, &block_group->key, sizeof(key));
9942
9943 lock_chunks(root);
9944 if (!list_empty(&em->list)) {
9945 /* We're in the transaction->pending_chunks list. */
9946 free_extent_map(em);
9947 }
9948 spin_lock(&block_group->lock);
9949 block_group->removed = 1;
9950 /*
9951 * At this point trimming can't start on this block group, because we
9952 * removed the block group from the fs_info->block_group_cache_tree rbtree,
9953 * so no one can find it anymore, and even if someone already got this
9954 * block group before we removed it from the rbtree, they have already
9955 * incremented block_group->trimming - if they didn't, they won't find
9956 * any free space entries because we already removed them all when we
9957 * called btrfs_remove_free_space_cache().
9958 *
9959 * And we must not remove the extent map from the fs_info->mapping_tree
9960 * to prevent the same logical address range and physical device space
9961 * ranges from being reused for a new block group. This is because our
9962 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
9963 * completely transactionless, so while it is trimming a range the
9964 * currently running transaction might finish and a new one start,
9965 * allowing for new block groups to be created that can reuse the same
9966 * physical device locations unless we take this special care.
9967 *
9968 * There may also be an implicit trim operation if the file system
9969 * is mounted with -odiscard. The same protections must remain
9970 * in place until the extents have been discarded completely when
9971 * the transaction commit has completed.
9972 */
9973 remove_em = (atomic_read(&block_group->trimming) == 0);
9974 /*
9975 * Make sure a trimmer task always sees the em in the pinned_chunks list
9976 * if it sees block_group->removed == 1 (needs to lock block_group->lock
9977 * before checking block_group->removed).
9978 */
9979 if (!remove_em) {
9980 /*
9981 * Our em might be in trans->transaction->pending_chunks which
9982 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
9983 * and so is the fs_info->pinned_chunks list.
9984 *
9985 * So at this point we must be holding the chunk_mutex to avoid
9986 * any races with chunk allocation (more specifically at
9987 * volumes.c:contains_pending_extent()), to ensure it always
9988 * sees the em, either in the pending_chunks list or in the
9989 * pinned_chunks list.
9990 */
9991 list_move_tail(&em->list, &root->fs_info->pinned_chunks);
9992 }
9993 spin_unlock(&block_group->lock);
9994
9995 if (remove_em) {
9996 struct extent_map_tree *em_tree;
9997
9998 em_tree = &root->fs_info->mapping_tree.map_tree;
9999 write_lock(&em_tree->lock);
10000 /*
10001 * The em might be in the pending_chunks list, so make sure the
10002 * chunk mutex is locked, since remove_extent_mapping() will
10003 * delete us from that list.
10004 */
10005 remove_extent_mapping(em_tree, em);
10006 write_unlock(&em_tree->lock);
10007 /* once for the tree */
10008 free_extent_map(em);
10009 }
10010
10011 unlock_chunks(root);
10012
10013 ret = remove_block_group_free_space(trans, root->fs_info, block_group);
10014 if (ret)
10015 goto out;
10016
10017 btrfs_put_block_group(block_group);
10018 btrfs_put_block_group(block_group);
10019
10020 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10021 if (ret > 0)
10022 ret = -EIO;
10023 if (ret < 0)
10024 goto out;
10025
10026 ret = btrfs_del_item(trans, root, path);
10027 out:
10028 btrfs_free_path(path);
10029 return ret;
10030 }
10031
10032 /*
10033 * Process the unused_bgs list and remove any that don't have any allocated
10034 * space inside of them.
10035 */
10036 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10037 {
10038 struct btrfs_block_group_cache *block_group;
10039 struct btrfs_space_info *space_info;
10040 struct btrfs_root *root = fs_info->extent_root;
10041 struct btrfs_trans_handle *trans;
10042 int ret = 0;
10043
10044 if (!fs_info->open)
10045 return;
10046
10047 spin_lock(&fs_info->unused_bgs_lock);
10048 while (!list_empty(&fs_info->unused_bgs)) {
10049 u64 start, end;
10050 int trimming;
10051
10052 block_group = list_first_entry(&fs_info->unused_bgs,
10053 struct btrfs_block_group_cache,
10054 bg_list);
10055 space_info = block_group->space_info;
10056 list_del_init(&block_group->bg_list);
10057 if (ret || btrfs_mixed_space_info(space_info)) {
10058 btrfs_put_block_group(block_group);
10059 continue;
10060 }
10061 spin_unlock(&fs_info->unused_bgs_lock);
10062
10063 mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
10064
10065 /* Don't want to race with allocators so take the groups_sem */
10066 down_write(&space_info->groups_sem);
10067 spin_lock(&block_group->lock);
10068 if (block_group->reserved ||
10069 btrfs_block_group_used(&block_group->item) ||
10070 block_group->ro) {
10071 /*
10072 * We want to bail if we made new allocations or have
10073 * outstanding allocations in this block group. We do
10074 * the ro check in case balance is currently acting on
10075 * this block group.
10076 */
10077 spin_unlock(&block_group->lock);
10078 up_write(&space_info->groups_sem);
10079 goto next;
10080 }
10081 spin_unlock(&block_group->lock);
10082
10083 /* We don't want to force the issue, only flip if it's ok. */
10084 ret = inc_block_group_ro(block_group, 0);
10085 up_write(&space_info->groups_sem);
10086 if (ret < 0) {
10087 ret = 0;
10088 goto next;
10089 }
10090
10091 /*
10092 * Want to do this before we do anything else so we can recover
10093 * properly if we fail to join the transaction.
10094 */
10095 /* 1 for btrfs_orphan_reserve_metadata() */
10096 trans = btrfs_start_transaction(root, 1);
10097 if (IS_ERR(trans)) {
10098 btrfs_dec_block_group_ro(root, block_group);
10099 ret = PTR_ERR(trans);
10100 goto next;
10101 }
10102
10103 /*
10104 * We could have pending pinned extents for this block group,
10105 * just delete them, we don't care about them anymore.
10106 */
10107 start = block_group->key.objectid;
10108 end = start + block_group->key.offset - 1;
10109 /*
10110 * Hold the unused_bg_unpin_mutex lock to avoid racing with
10111 * btrfs_finish_extent_commit(). If we are at transaction N,
10112 * another task might be running finish_extent_commit() for the
10113 * previous transaction N - 1, and have seen a range belonging
10114 * to the block group in freed_extents[] before we were able to
10115 * clear the whole block group range from freed_extents[]. This
10116 * means that task can look up the block group after we
10117 * unpinned it from freed_extents[] and removed it, leading to
10118 * a BUG_ON() at btrfs_unpin_extent_range().
10119 */
10120 mutex_lock(&fs_info->unused_bg_unpin_mutex);
10121 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10122 EXTENT_DIRTY, GFP_NOFS);
10123 if (ret) {
10124 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10125 btrfs_dec_block_group_ro(root, block_group);
10126 goto end_trans;
10127 }
10128 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10129 EXTENT_DIRTY, GFP_NOFS);
10130 if (ret) {
10131 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10132 btrfs_dec_block_group_ro(root, block_group);
10133 goto end_trans;
10134 }
10135 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10136
10137 /* Reset pinned so btrfs_put_block_group doesn't complain */
10138 spin_lock(&space_info->lock);
10139 spin_lock(&block_group->lock);
10140
10141 space_info->bytes_pinned -= block_group->pinned;
10142 space_info->bytes_readonly += block_group->pinned;
10143 percpu_counter_add(&space_info->total_bytes_pinned,
10144 -block_group->pinned);
10145 block_group->pinned = 0;
10146
10147 spin_unlock(&block_group->lock);
10148 spin_unlock(&space_info->lock);
10149
10150 /* DISCARD can flip during remount */
10151 trimming = btrfs_test_opt(root, DISCARD);
10152
10153 /* Implicit trim during transaction commit. */
10154 if (trimming)
10155 btrfs_get_block_group_trimming(block_group);
10156
10157 /*
10158 * btrfs_remove_chunk() will abort the transaction if things go
10159 * horribly wrong.
10160 */
10161 ret = btrfs_remove_chunk(trans, root,
10162 block_group->key.objectid);
10163
10164 if (ret) {
10165 if (trimming)
10166 btrfs_put_block_group_trimming(block_group);
10167 goto end_trans;
10168 }
10169
10170 /*
10171 * If we're not mounted with -odiscard, we can just forget
10172 * about this block group. Otherwise we'll need to wait
10173 * until transaction commit to do the actual discard.
10174 */
10175 if (trimming) {
10176 WARN_ON(!list_empty(&block_group->bg_list));
10177 spin_lock(&trans->transaction->deleted_bgs_lock);
10178 list_move(&block_group->bg_list,
10179 &trans->transaction->deleted_bgs);
10180 spin_unlock(&trans->transaction->deleted_bgs_lock);
10181 btrfs_get_block_group(block_group);
10182 }
10183 end_trans:
10184 btrfs_end_transaction(trans, root);
10185 next:
10186 mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
10187 btrfs_put_block_group(block_group);
10188 spin_lock(&fs_info->unused_bgs_lock);
10189 }
10190 spin_unlock(&fs_info->unused_bgs_lock);
10191 }
10192
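/*
 * Create the basic space_info objects (system, plus metadata/data or a
 * single mixed one, depending on the MIXED_GROUPS feature) with zero
 * sizes, so they exist even before any block groups are read or
 * created.
 */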
10193 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10194 {
10195 struct btrfs_space_info *space_info;
10196 struct btrfs_super_block *disk_super;
10197 u64 features;
10198 u64 flags;
10199 int mixed = 0;
10200 int ret;
10201
10202 disk_super = fs_info->super_copy;
10203 if (!btrfs_super_root(disk_super))
10204 return 1;
10205
10206 features = btrfs_super_incompat_flags(disk_super);
10207 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10208 mixed = 1;
10209
10210 flags = BTRFS_BLOCK_GROUP_SYSTEM;
10211 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10212 if (ret)
10213 goto out;
10214
10215 if (mixed) {
10216 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10217 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10218 } else {
10219 flags = BTRFS_BLOCK_GROUP_METADATA;
10220 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10221 if (ret)
10222 goto out;
10223
10224 flags = BTRFS_BLOCK_GROUP_DATA;
10225 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10226 }
10227 out:
10228 return ret;
10229 }
10230
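/*
 * Cleanup-path helper: unpin the byte range without adding the space
 * back to the in-memory free space caches (note the 'false' passed to
 * unpin_extent_range()).
 */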
10231 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
10232 {
10233 return unpin_extent_range(root, start, end, false);
10234 }
10235
10236 /*
10237 * It used to be that old block groups would be left around forever.
10238 * Iterating over them would be enough to trim unused space. Since we
10239 * now automatically remove them, we also need to iterate over unallocated
10240 * space.
10241 *
10242 * We don't want a transaction for this since the discard may take a
10243 * substantial amount of time. We don't require that a transaction be
10244 * running, but we do need to take a running transaction into account
10245 * to ensure that we're not discarding chunks that were released in
10246 * the current transaction.
10247 *
10248 * Holding the chunks lock will prevent other threads from allocating
10249 * or releasing chunks, but it won't prevent a running transaction
10250 * from committing and releasing the memory that the pending chunks
10251 * list head uses. For that, we need to take a reference to the
10252 * transaction.
10253 */
10254 static int btrfs_trim_free_extents(struct btrfs_device *device,
10255 u64 minlen, u64 *trimmed)
10256 {
10257 u64 start = 0, len = 0;
10258 int ret;
10259
10260 *trimmed = 0;
10261
10262 /* Not writeable = nothing to do. */
10263 if (!device->writeable)
10264 return 0;
10265
10266 /* No free space = nothing to do. */
10267 if (device->total_bytes <= device->bytes_used)
10268 return 0;
10269
10270 ret = 0;
10271
10272 while (1) {
10273 struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
10274 struct btrfs_transaction *trans;
10275 u64 bytes;
10276
10277 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
10278 if (ret)
10279 return ret;
10280
10281 down_read(&fs_info->commit_root_sem);
10282
10283 spin_lock(&fs_info->trans_lock);
10284 trans = fs_info->running_transaction;
10285 if (trans)
10286 atomic_inc(&trans->use_count);
10287 spin_unlock(&fs_info->trans_lock);
10288
10289 ret = find_free_dev_extent_start(trans, device, minlen, start,
10290 &start, &len);
10291 if (trans)
10292 btrfs_put_transaction(trans);
10293
10294 if (ret) {
10295 up_read(&fs_info->commit_root_sem);
10296 mutex_unlock(&fs_info->chunk_mutex);
10297 if (ret == -ENOSPC)
10298 ret = 0;
10299 break;
10300 }
10301
10302 ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
10303 up_read(&fs_info->commit_root_sem);
10304 mutex_unlock(&fs_info->chunk_mutex);
10305
10306 if (ret)
10307 break;
10308
10309 start += len;
10310 *trimmed += bytes;
10311
10312 if (fatal_signal_pending(current)) {
10313 ret = -ERESTARTSYS;
10314 break;
10315 }
10316
10317 cond_resched();
10318 }
10319
10320 return ret;
10321 }
10322
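/*
 * Backend of the FITRIM ioctl: walk every block group overlapping the
 * requested range and discard its free space, then discard the
 * unallocated space on each writeable device.  range->len is updated to
 * the total number of bytes trimmed.
 *
 * Roughly, userspace exercises this path with something like the
 * following (illustrative sketch only):
 *
 *	struct fstrim_range range = { .start = 0, .len = ULLONG_MAX };
 *	ioctl(fd, FITRIM, &range);
 */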
10323 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
10324 {
10325 struct btrfs_fs_info *fs_info = root->fs_info;
10326 struct btrfs_block_group_cache *cache = NULL;
10327 struct btrfs_device *device;
10328 struct list_head *devices;
10329 u64 group_trimmed;
10330 u64 start;
10331 u64 end;
10332 u64 trimmed = 0;
10333 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
10334 int ret = 0;
10335
10336 /*
10337 * Try to trim all FS space; our block group may start from a non-zero offset.
10338 */
10339 if (range->len == total_bytes)
10340 cache = btrfs_lookup_first_block_group(fs_info, range->start);
10341 else
10342 cache = btrfs_lookup_block_group(fs_info, range->start);
10343
10344 while (cache) {
10345 if (cache->key.objectid >= (range->start + range->len)) {
10346 btrfs_put_block_group(cache);
10347 break;
10348 }
10349
10350 start = max(range->start, cache->key.objectid);
10351 end = min(range->start + range->len,
10352 cache->key.objectid + cache->key.offset);
10353
10354 if (end - start >= range->minlen) {
10355 if (!block_group_cache_done(cache)) {
10356 ret = cache_block_group(cache, 0);
10357 if (ret) {
10358 btrfs_put_block_group(cache);
10359 break;
10360 }
10361 ret = wait_block_group_cache_done(cache);
10362 if (ret) {
10363 btrfs_put_block_group(cache);
10364 break;
10365 }
10366 }
10367 ret = btrfs_trim_block_group(cache,
10368 &group_trimmed,
10369 start,
10370 end,
10371 range->minlen);
10372
10373 trimmed += group_trimmed;
10374 if (ret) {
10375 btrfs_put_block_group(cache);
10376 break;
10377 }
10378 }
10379
10380 cache = next_block_group(fs_info->tree_root, cache);
10381 }
10382
10383 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
10384 devices = &root->fs_info->fs_devices->alloc_list;
10385 list_for_each_entry(device, devices, dev_alloc_list) {
10386 ret = btrfs_trim_free_extents(device, range->minlen,
10387 &group_trimmed);
10388 if (ret)
10389 break;
10390
10391 trimmed += group_trimmed;
10392 }
10393 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
10394
10395 range->len = trimmed;
10396 return ret;
10397 }
10398
10399 /*
10400 * btrfs_{start,end}_write_no_snapshoting() are similar to
10401 * mnt_{want,drop}_write(): they are used to prevent some tasks from writing
10402 * data into the page cache through nocow before the subvolume is snapshotted
10403 * and only flushing it to disk after the snapshot is created, or to prevent
10404 * operations while snapshotting is ongoing that would make the snapshot
10405 * inconsistent (writes followed by expanding truncates, for example).
10406 */
10407 void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
10408 {
10409 percpu_counter_dec(&root->subv_writers->counter);
10410 /*
10411 * Make sure counter is updated before we wake up
10412 * waiters.
10413 */
10414 smp_mb();
10415 if (waitqueue_active(&root->subv_writers->wait))
10416 wake_up(&root->subv_writers->wait);
10417 }
10418
10419 int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
10420 {
10421 if (atomic_read(&root->will_be_snapshoted))
10422 return 0;
10423
10424 percpu_counter_inc(&root->subv_writers->counter);
10425 /*
10426 * Make sure counter is updated before we check for snapshot creation.
10427 */
10428 smp_mb();
10429 if (atomic_read(&root->will_be_snapshoted)) {
10430 btrfs_end_write_no_snapshoting(root);
10431 return 0;
10432 }
10433 return 1;
10434 }
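/*
 * A typical caller pairs these two helpers around a nocow write,
 * roughly like the following sketch (error handling omitted):
 *
 *	if (btrfs_start_write_no_snapshoting(root)) {
 *		... do the nocow write ...
 *		btrfs_end_write_no_snapshoting(root);
 *	} else {
 *		... fall back to a cow write, a snapshot is coming ...
 *	}
 */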