/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
static void lock_chunks(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
	mutex_unlock(&root->fs_info->chunk_mutex);
}
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		rcu_string_free(device->name);
		kfree(device);
	}
	kfree(fs_devices);
}
static void btrfs_kobject_uevent(struct block_device *bdev,
				 enum kobject_action action)
{
	int ret;

	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
	if (ret)
		pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
			action,
			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
			&disk_to_dev(bdev->bd_disk)->kobj);
}
void btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
}
static noinline struct btrfs_device *__find_device(struct list_head *head,
						   u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}
static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct buffer_head **bh)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		printk(KERN_INFO "btrfs: open %s failed\n", device_path);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, 4096);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*bh = btrfs_read_dev_super(*bdev);
	if (!*bh) {
		ret = -EINVAL;
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	*bh = NULL;
	return ret;
}
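
/*
 * Usage sketch (illustrative only, not an additional code path in this
 * file): every successful btrfs_get_bdev_and_sb() is paired with brelse()
 * on the buffer head and blkdev_put() with the same flags:
 *
 *	ret = btrfs_get_bdev_and_sb(path, FMODE_READ,
 *				    root->fs_info->bdev_holder, 0, &bdev, &bh);
 *	if (!ret) {
 *		disk_super = (struct btrfs_super_block *)bh->b_data;
 *		devid = btrfs_stack_device_id(&disk_super->dev_item);
 *		brelse(bh);
 *		blkdev_put(bdev, FMODE_READ);
 *	}
 *
 * which is exactly the pattern btrfs_find_device_by_path() follows below.
 */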
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			 struct bio *head, struct bio *tail)
{
	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}
/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_fs_info *fs_info;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long limit;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = blk_get_backing_dev_info(device->bdev);
	fs_info = device->dev_root->fs_info;
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		     pending_bios != &device->pending_sync_bios &&
		     device->pending_sync_bios.head) ||
		    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		     device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;

		if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
		    waitqueue_active(&fs_info->async_submit_wait))
			wake_up(&fs_info->async_submit_wait);

		BUG_ON(atomic_read(&cur->bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		btrfsic_submit_bio(cur->bi_rw, cur);
		num_run++;
		batch_run++;
		if (need_resched())
			cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				if (need_resched())
					cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_requeue_work(&device->work);
			goto done;
		}
		/* unplug every 64 requests just for good measure */
		if (batch_run % 64 == 0) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
}
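
/*
 * Note on the plugging dance above: blk_finish_plug()/blk_start_plug() is
 * used as a manual unplug, so that a batch of sync requests reaches the
 * elevator before more async bios are queued behind it. A minimal sketch
 * of the same idea (illustrative only, bios[]/nr are hypothetical):
 *
 *	struct blk_plug plug;
 *	blk_start_plug(&plug);
 *	for (i = 0; i < nr; i++) {
 *		submit_bio(rw, bios[i]);
 *		if (i && i % 64 == 0) {
 *			blk_finish_plug(&plug);	// flush periodically
 *			blk_start_plug(&plug);
 *		}
 *	}
 *	blk_finish_plug(&plug);
 */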
static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}
static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
		if (!fs_devices)
			return -ENOMEM;
		INIT_LIST_HEAD(&fs_devices->devices);
		INIT_LIST_HEAD(&fs_devices->alloc_list);
		list_add(&fs_devices->list, &fs_uuids);
		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
		fs_devices->latest_devid = devid;
		fs_devices->latest_trans = found_transid;
		mutex_init(&fs_devices->device_list_mutex);
		device = NULL;
	} else {
		device = __find_device(&fs_devices->devices, devid,
				       disk_super->dev_item.uuid);
	}
	if (!device) {
		if (fs_devices->opened)
			return -EBUSY;

		device = kzalloc(sizeof(*device), GFP_NOFS);
		if (!device) {
			/* we can safely leave the fs_devices entry around */
			return -ENOMEM;
		}
		device->devid = devid;
		device->dev_stats_valid = 0;
		device->work.func = pending_bios_fn;
		memcpy(device->uuid, disk_super->dev_item.uuid,
		       BTRFS_UUID_SIZE);
		spin_lock_init(&device->io_lock);

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			kfree(device);
			return -ENOMEM;
		}
		rcu_assign_pointer(device->name, name);
		INIT_LIST_HEAD(&device->dev_alloc_list);

		/* init readahead state */
		spin_lock_init(&device->reada_lock);
		device->reada_curr_zone = NULL;
		atomic_set(&device->reada_in_flight, 0);
		device->reada_next = 0;
		INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
		INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		mutex_unlock(&fs_devices->device_list_mutex);

		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	} else if (!device->name || strcmp(device->name->str, path)) {
		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name)
			return -ENOMEM;
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (device->missing) {
			fs_devices->missing_devices--;
			device->missing = 0;
		}
	}

	if (found_transid > fs_devices->latest_trans) {
		fs_devices->latest_devid = devid;
		fs_devices->latest_trans = found_transid;
	}
	*fs_devices_ret = fs_devices;
	return 0;
}
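
/*
 * device_list_add() is driven from btrfs_scan_one_device() below: every
 * scanned super block either creates a new fs_devices entry (first device
 * of a new fsid), registers a new device under an existing fsid, or merely
 * refreshes the recorded path of an already known device. Only the copy
 * with the highest generation becomes latest_devid/latest_trans.
 */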
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!fs_devices)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&fs_devices->devices);
	INIT_LIST_HEAD(&fs_devices->alloc_list);
	INIT_LIST_HEAD(&fs_devices->list);
	mutex_init(&fs_devices->device_list_mutex);
	fs_devices->latest_devid = orig->latest_devid;
	fs_devices->latest_trans = orig->latest_trans;
	fs_devices->total_devices = orig->total_devices;
	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = kzalloc(sizeof(*device), GFP_NOFS);
		if (!device)
			goto error;

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
		if (!name) {
			kfree(device);
			goto error;
		}
		rcu_assign_pointer(device->name, name);

		device->devid = orig_dev->devid;
		device->work.func = pending_bios_fn;
		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
		spin_lock_init(&device->io_lock);
		INIT_LIST_HEAD(&device->dev_list);
		INIT_LIST_HEAD(&device->dev_alloc_list);

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}
void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
			       struct btrfs_fs_devices *fs_devices, int step)
{
	struct btrfs_device *device, *next;

	struct block_device *latest_bdev = NULL;
	u64 latest_devid = 0;
	u64 latest_transid = 0;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (device->in_fs_metadata) {
			if (!device->is_tgtdev_for_dev_replace &&
			    (!latest_transid ||
			     device->generation > latest_transid)) {
				latest_devid = device->devid;
				latest_transid = device->generation;
				latest_bdev = device->bdev;
			}
			continue;
		}

		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
			/*
			 * In the first step, keep the device which has
			 * the correct fsid and the devid that is used
			 * for the dev_replace procedure.
			 * In the second step, the dev_replace state is
			 * read from the device tree and it is known
			 * whether the procedure is really active or
			 * not, which means whether this device is
			 * used or whether it should be removed.
			 */
			if (step == 0 || device->is_tgtdev_for_dev_replace)
				continue;
		}
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			device->writeable = 0;
			if (!device->is_tgtdev_for_dev_replace)
				fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		rcu_string_free(device->name);
		kfree(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	fs_devices->latest_bdev = latest_bdev;
	fs_devices->latest_devid = latest_devid;
	fs_devices->latest_trans = latest_transid;

	mutex_unlock(&uuid_mutex);
}
static void __free_device(struct work_struct *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, rcu_work);

	if (device->bdev)
		blkdev_put(device->bdev, device->mode);

	rcu_string_free(device->name);
	kfree(device);
}
static void free_device(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);

	INIT_WORK(&device->rcu_work, __free_device);
	schedule_work(&device->rcu_work);
}
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		struct btrfs_device *new_device;
		struct rcu_string *name;

		if (device->bdev)
			fs_devices->open_devices--;

		if (device->writeable && !device->is_tgtdev_for_dev_replace) {
			list_del_init(&device->dev_alloc_list);
			fs_devices->rw_devices--;
		}

		if (device->can_discard)
			fs_devices->num_can_discard--;

		new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
		BUG_ON(!new_device); /* -ENOMEM */
		memcpy(new_device, device, sizeof(*new_device));

		/* Safe because we are under uuid_mutex */
		if (device->name) {
			name = rcu_string_strdup(device->name->str, GFP_NOFS);
			BUG_ON(device->name && !name); /* -ENOMEM */
			rcu_assign_pointer(new_device->name, name);
		}
		new_device->bdev = NULL;
		new_device->writeable = 0;
		new_device->in_fs_metadata = 0;
		new_device->can_discard = 0;
		list_replace_rcu(&device->dev_list, &new_device->dev_list);

		call_rcu(&device->rcu, free_device);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}
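
/*
 * The kmalloc/memcpy/list_replace_rcu() sequence above is the classic RCU
 * update pattern: readers walking fs_devices->devices under
 * rcu_read_lock() may still hold a pointer to the old btrfs_device, so the
 * list node is replaced by a cleaned-up copy (bdev and writeable zeroed)
 * and the old structure is only freed after a grace period via
 * call_rcu(&device->rcu, free_device).
 */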
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct block_device *latest_bdev = NULL;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 latest_devid = 0;
	u64 latest_transid = 0;
	u64 devid;
	int seeding = 1;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, head, dev_list) {
		if (device->bdev)
			continue;
		if (!device->name)
			continue;

		ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
					    &bdev, &bh);
		if (ret)
			continue;

		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		if (devid != device->devid)
			goto error_brelse;

		if (memcmp(device->uuid, disk_super->dev_item.uuid,
			   BTRFS_UUID_SIZE))
			goto error_brelse;

		device->generation = btrfs_super_generation(disk_super);
		if (!latest_transid || device->generation > latest_transid) {
			latest_devid = devid;
			latest_transid = device->generation;
			latest_bdev = bdev;
		}

		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
			device->writeable = 0;
		} else {
			device->writeable = !bdev_read_only(bdev);
			seeding = 0;
		}

		q = bdev_get_queue(bdev);
		if (blk_queue_discard(q)) {
			device->can_discard = 1;
			fs_devices->num_can_discard++;
		}

		device->bdev = bdev;
		device->in_fs_metadata = 0;
		device->mode = flags;

		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
			fs_devices->rotating = 1;

		fs_devices->open_devices++;
		if (device->writeable && !device->is_tgtdev_for_dev_replace) {
			fs_devices->rw_devices++;
			list_add(&device->dev_alloc_list,
				 &fs_devices->alloc_list);
		}
		brelse(bh);
		continue;

error_brelse:
		brelse(bh);
		blkdev_put(bdev, flags);
		continue;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->seeding = seeding;
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_bdev;
	fs_devices->latest_devid = latest_devid;
	fs_devices->latest_trans = latest_transid;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		ret = __btrfs_open_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}
/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
	struct page *page;
	void *p;
	int ret = -EINVAL;
	u64 devid;
	u64 transid;
	u64 total_devices;
	u64 bytenr;
	pgoff_t index;

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);
	flags |= FMODE_EXCL;
	mutex_lock(&uuid_mutex);

	bdev = blkdev_get_by_path(path, flags, holder);

	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		printk(KERN_INFO "btrfs: open %s failed\n", path);
		goto error;
	}

	/* make sure our super fits in the device */
	if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
		goto error_bdev_put;

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
		goto error_bdev_put;

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_CACHE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
		goto error_bdev_put;

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				   index, GFP_NOFS);

	if (IS_ERR_OR_NULL(page))
		goto error_bdev_put;

	p = kmap(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + (bytenr & ~PAGE_CACHE_MASK);

	if (btrfs_super_bytenr(disk_super) != bytenr ||
	    disk_super->magic != cpu_to_le64(BTRFS_MAGIC))
		goto error_unmap;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	transid = btrfs_super_generation(disk_super);
	total_devices = btrfs_super_num_devices(disk_super);

	if (disk_super->label[0]) {
		if (disk_super->label[BTRFS_LABEL_SIZE - 1])
			disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
		printk(KERN_INFO "device label %s ", disk_super->label);
	} else {
		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
	}

	printk(KERN_CONT "devid %llu transid %llu %s\n",
	       (unsigned long long)devid, (unsigned long long)transid, path);

	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
	if (!ret && fs_devices_ret)
		(*fs_devices_ret)->total_devices = total_devices;

error_unmap:
	kunmap(page);
	page_cache_release(page);

error_bdev_put:
	blkdev_put(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}
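
/*
 * The page-cache math above deserves a worked example. The first btrfs
 * super block lives at btrfs_sb_offset(0) == 64KiB. With 4KiB pages:
 *
 *	bytenr = 65536;
 *	index  = bytenr >> PAGE_CACHE_SHIFT;	-> page 16
 *	offset = bytenr & ~PAGE_CACHE_MASK;	-> 0 within that page
 *
 * The straddle check rejects any layout where the super block would cross
 * a page boundary, so a single read_cache_page_gfp() is always enough.
 */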
/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

	if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 2;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
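
/*
 * The four overlap cases above, for a query range [start, end] (inclusive)
 * against a dev extent [key.offset, extent_end):
 *
 *	extent covers the whole range  -> count end - start + 1, stop
 *	extent overlaps the front      -> count extent_end - start
 *	extent entirely inside         -> count extent_end - key.offset
 *	extent overlaps the tail       -> count end - key.offset + 1, stop
 *
 * e.g. start=100, end=199 and an extent [150, 180) adds 30 bytes.
 */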
/*
 * find_free_dev_extent - find free space in the specified device
 * @device:	the device which we search the free space in
 * @num_bytes:	the size of the free space that we need
 * @start:	store the start of the free space.
 * @len:	the size of the free space that we find, or the size of the max
 *		free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find one. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_start;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/* FIXME use last free of some kind */

	/* we don't want to overwrite the superblock on the drive,
	 * so we make sure to start at an offset of at least 1MB
	 */
	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

	max_hole_start = search_start;
	max_hole_size = 0;
	hole_size = 0;

	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
		ret = -ENOSPC;
		goto error;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}
	path->reada = 2;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start)
		hole_size = search_end - search_start;

	if (hole_size > max_hole_size) {
		max_hole_start = search_start;
		max_hole_size = hole_size;
	}

	if (hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
error:
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}
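
/*
 * In short: the loop walks the device's dev extents in offset order and
 * measures the gap in front of each one. For example, with extents at
 * [1M, 5M) and [9M, 12M) on a 16M device and num_bytes = 3M:
 *
 *	gap before the 1M extent: none (the search starts at 1M)
 *	gap [5M, 9M) = 4M >= 3M   -> return with *start = 5M
 *
 * If no gap is large enough, the trailing hole up to total_bytes is the
 * last candidate, and the largest hole found is reported with -ENOSPC.
 */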
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto out;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		btrfs_error(root->fs_info, ret, "Slot search failed");
		goto out;
	}

	if (device->bytes_used > 0) {
		u64 len = btrfs_dev_extent_length(leaf, extent);
		device->bytes_used -= len;
		spin_lock(&root->fs_info->free_chunk_lock);
		root->fs_info->free_chunk_space += len;
		spin_unlock(&root->fs_info->free_chunk_lock);
	}
	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_error(root->fs_info, ret,
			    "Failed to remove dev extent item");
	}
out:
	btrfs_free_path(path);
	return ret;
}
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
			   struct btrfs_device *device,
			   u64 chunk_tree, u64 chunk_objectid,
			   u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!device->in_fs_metadata);
	WARN_ON(device->is_tgtdev_for_dev_replace);
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
		    BTRFS_UUID_SIZE);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}
static noinline int find_next_chunk(struct btrfs_root *root,
				    u64 objectid, u64 *offset)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_key found_key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0); /* Corruption */

	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
	if (ret) {
		*offset = 0;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		if (found_key.objectid != objectid)
			*offset = 0;
		else {
			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
					       struct btrfs_chunk);
			*offset = found_key.offset +
				btrfs_chunk_length(path->nodes[0], chunk);
		}
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0); /* Corruption */

	ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*objectid = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*objectid = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = (unsigned long)btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = (unsigned long)btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
static int btrfs_rm_dev_item(struct btrfs_root *root,
			     struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;
	lock_chunks(root);

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret)
		goto out;
out:
	btrfs_free_path(path);
	unlock_chunks(root);
	btrfs_commit_transaction(trans, root);
	return ret;
}
int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
	struct btrfs_device *device;
	struct btrfs_device *next_device;
	struct block_device *bdev;
	struct buffer_head *bh = NULL;
	struct btrfs_super_block *disk_super;
	struct btrfs_fs_devices *cur_devices;
	u64 all_avail;
	u64 devid;
	u64 num_devices;
	u8 *dev_uuid;
	unsigned seq;
	int ret = 0;
	bool clear_super = false;

	mutex_lock(&uuid_mutex);

	do {
		seq = read_seqbegin(&root->fs_info->profiles_lock);

		all_avail = root->fs_info->avail_data_alloc_bits |
			    root->fs_info->avail_system_alloc_bits |
			    root->fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&root->fs_info->profiles_lock, seq));

	num_devices = root->fs_info->fs_devices->num_devices;
	btrfs_dev_replace_lock(&root->fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
		WARN_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_unlock(&root->fs_info->dev_replace);

	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
		printk(KERN_ERR "btrfs: unable to go below four devices "
		       "on raid10\n");
		ret = -EINVAL;
		goto out;
	}

	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
		printk(KERN_ERR "btrfs: unable to go below two "
		       "devices on raid1\n");
		ret = -EINVAL;
		goto out;
	}

	if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
	    root->fs_info->fs_devices->rw_devices <= 2) {
		printk(KERN_ERR "btrfs: unable to go below two "
		       "devices on raid5\n");
		ret = -EINVAL;
		goto out;
	}
	if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
	    root->fs_info->fs_devices->rw_devices <= 3) {
		printk(KERN_ERR "btrfs: unable to go below three "
		       "devices on raid6\n");
		ret = -EINVAL;
		goto out;
	}

	if (strcmp(device_path, "missing") == 0) {
		struct list_head *devices;
		struct btrfs_device *tmp;

		device = NULL;
		devices = &root->fs_info->fs_devices->devices;
		/*
		 * It is safe to read the devices since the volume_mutex
		 * is held.
		 */
		list_for_each_entry(tmp, devices, dev_list) {
			if (tmp->in_fs_metadata &&
			    !tmp->is_tgtdev_for_dev_replace &&
			    !tmp->bdev) {
				device = tmp;
				break;
			}
		}
		bdev = NULL;
		bh = NULL;
		disk_super = NULL;
		if (!device) {
			printk(KERN_ERR "btrfs: no missing devices found to "
			       "remove\n");
			goto out;
		}
	} else {
		ret = btrfs_get_bdev_and_sb(device_path,
					    FMODE_WRITE | FMODE_EXCL,
					    root->fs_info->bdev_holder, 0,
					    &bdev, &bh);
		if (ret)
			goto out;
		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		dev_uuid = disk_super->dev_item.uuid;
		device = btrfs_find_device(root->fs_info, devid, dev_uuid,
					   disk_super->fsid);
		if (!device) {
			ret = -ENOENT;
			goto error_brelse;
		}
	}

	if (device->is_tgtdev_for_dev_replace) {
		pr_err("btrfs: unable to remove the dev_replace target dev\n");
		ret = -EINVAL;
		goto error_brelse;
	}

	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
		printk(KERN_ERR "btrfs: unable to remove the only writeable "
		       "device\n");
		ret = -EINVAL;
		goto error_brelse;
	}

	if (device->writeable) {
		lock_chunks(root);
		list_del_init(&device->dev_alloc_list);
		unlock_chunks(root);
		root->fs_info->fs_devices->rw_devices--;
		clear_super = true;
	}

	ret = btrfs_shrink_device(device, 0);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
	if (ret)
		goto error_undo;

	spin_lock(&root->fs_info->free_chunk_lock);
	root->fs_info->free_chunk_space = device->total_bytes -
		device->bytes_used;
	spin_unlock(&root->fs_info->free_chunk_lock);

	device->in_fs_metadata = 0;
	btrfs_scrub_cancel_dev(root->fs_info, device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers.
	 */

	cur_devices = device->fs_devices;
	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	device->fs_devices->num_devices--;
	device->fs_devices->total_devices--;

	if (device->missing)
		root->fs_info->fs_devices->missing_devices--;

	next_device = list_entry(root->fs_info->fs_devices->devices.next,
				 struct btrfs_device, dev_list);
	if (device->bdev == root->fs_info->sb->s_bdev)
		root->fs_info->sb->s_bdev = next_device->bdev;
	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
		root->fs_info->fs_devices->latest_bdev = next_device->bdev;

	if (device->bdev)
		device->fs_devices->open_devices--;

	call_rcu(&device->rcu, free_device);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);

	if (cur_devices->open_devices == 0) {
		struct btrfs_fs_devices *fs_devices;
		fs_devices = root->fs_info->fs_devices;
		while (fs_devices) {
			if (fs_devices->seed == cur_devices)
				break;
			fs_devices = fs_devices->seed;
		}
		fs_devices->seed = cur_devices->seed;
		cur_devices->seed = NULL;
		lock_chunks(root);
		__btrfs_close_devices(cur_devices);
		unlock_chunks(root);
		free_fs_devices(cur_devices);
	}

	root->fs_info->num_tolerated_disk_barrier_failures =
		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);

	/*
	 * at this point, the device is zero sized.  We want to
	 * remove it from the devices list and zero out the old super
	 */
	if (clear_super && disk_super) {
		/* make sure this device isn't detected as part of
		 * the FS anymore
		 */
		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
	}

	ret = 0;

	/* Notify udev that device has changed */
	if (bdev)
		btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

error_brelse:
	brelse(bh);
	if (bdev)
		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
	mutex_unlock(&uuid_mutex);
	return ret;
error_undo:
	if (device->writeable) {
		lock_chunks(root);
		list_add(&device->dev_alloc_list,
			 &root->fs_info->fs_devices->alloc_list);
		unlock_chunks(root);
		root->fs_info->fs_devices->rw_devices++;
	}
	goto error_brelse;
}
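
/*
 * The profile checks at the top of btrfs_rm_device() encode the minimum
 * device counts per RAID profile (counting the device about to go away):
 *
 *	RAID10 -> more than 4 devices required
 *	RAID1  -> more than 2
 *	RAID5  -> more than 2 rw devices
 *	RAID6  -> more than 3 rw devices
 *
 * e.g. removing a device from a 2-device RAID1 filesystem fails with
 * -EINVAL until the data is converted to a profile that fits on one device.
 */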
void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
				 struct btrfs_device *srcdev)
{
	WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
	list_del_rcu(&srcdev->dev_list);
	list_del_rcu(&srcdev->dev_alloc_list);
	fs_info->fs_devices->num_devices--;
	if (srcdev->missing) {
		fs_info->fs_devices->missing_devices--;
		fs_info->fs_devices->rw_devices++;
	}
	if (srcdev->can_discard)
		fs_info->fs_devices->num_can_discard--;
	if (srcdev->bdev)
		fs_info->fs_devices->open_devices--;

	call_rcu(&srcdev->rcu, free_device);
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
				      struct btrfs_device *tgtdev)
{
	struct btrfs_device *next_device;

	WARN_ON(!tgtdev);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	if (tgtdev->bdev) {
		btrfs_scratch_superblock(tgtdev);
		fs_info->fs_devices->open_devices--;
	}
	fs_info->fs_devices->num_devices--;
	if (tgtdev->can_discard)
		fs_info->fs_devices->num_can_discard++;

	next_device = list_entry(fs_info->fs_devices->devices.next,
				 struct btrfs_device, dev_list);
	if (tgtdev->bdev == fs_info->sb->s_bdev)
		fs_info->sb->s_bdev = next_device->bdev;
	if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
		fs_info->fs_devices->latest_bdev = next_device->bdev;
	list_del_rcu(&tgtdev->dev_list);

	call_rcu(&tgtdev->rcu, free_device);

	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
}
int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
			      struct btrfs_device **device)
{
	int ret = 0;
	struct btrfs_super_block *disk_super;
	u64 devid;
	u8 *dev_uuid;
	struct block_device *bdev;
	struct buffer_head *bh;

	*device = NULL;
	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
				    root->fs_info->bdev_holder, 0, &bdev, &bh);
	if (ret)
		return ret;
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_uuid = disk_super->dev_item.uuid;
	*device = btrfs_find_device(root->fs_info, devid, dev_uuid,
				    disk_super->fsid);
	brelse(bh);
	if (!*device)
		ret = -ENOENT;
	blkdev_put(bdev, FMODE_READ);
	return ret;
}
int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
					 char *device_path,
					 struct btrfs_device **device)
{
	*device = NULL;
	if (strcmp(device_path, "missing") == 0) {
		struct list_head *devices;
		struct btrfs_device *tmp;

		devices = &root->fs_info->fs_devices->devices;
		/*
		 * It is safe to read the devices since the volume_mutex
		 * is held by the caller.
		 */
		list_for_each_entry(tmp, devices, dev_list) {
			if (tmp->in_fs_metadata && !tmp->bdev) {
				*device = tmp;
				break;
			}
		}

		if (!*device) {
			pr_err("btrfs: no missing device found\n");
			return -ENOENT;
		}

		return 0;
	} else {
		return btrfs_find_device_by_path(root, device_path, device);
	}
}
/*
 * does all the dirty work required for changing file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_root *root)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	BUG_ON(!mutex_is_locked(&uuid_mutex));
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!seed_devices)
		return -ENOMEM;

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	list_for_each_entry(device, &seed_devices->devices, dev_list) {
		device->fs_devices = seed_devices;
	}

	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->total_devices = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}
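
/*
 * Sprouting in one picture: the read-only seed filesystem keeps its old
 * fsid under a new, private btrfs_fs_devices (seed_devices), all current
 * devices are moved onto it, and the mounted filesystem gets a freshly
 * generated fsid with an initially empty device list that new writable
 * devices are added to. fs_devices->seed chains the sprout to its seed.
 */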
/*
 * store the expected generation for seed devices in device items.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_UUID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	u64 devid;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	root = root->fs_info->chunk_root;
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(path);
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid,
				   (unsigned long)btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid,
				   (unsigned long)btrfs_device_fsid(dev_item),
				   BTRFS_UUID_SIZE);
		device = btrfs_find_device(root->fs_info, devid, dev_uuid,
					   fs_uuid);
		BUG_ON(!device); /* Logic error */

		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
{
	struct request_queue *q;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct list_head *devices;
	struct super_block *sb = root->fs_info->sb;
	struct rcu_string *name;
	u64 total_bytes;
	int seeding_dev = 0;
	int ret = 0;

	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
		return -EROFS;

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  root->fs_info->bdev_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (root->fs_info->fs_devices->seeding) {
		seeding_dev = 1;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
	}

	filemap_write_and_wait(bdev->bd_inode->i_mapping);

	devices = &root->fs_info->fs_devices->devices;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_for_each_entry(device, devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			mutex_unlock(
				&root->fs_info->fs_devices->device_list_mutex);
			goto error;
		}
	}
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	device = kzalloc(sizeof(*device), GFP_NOFS);
	if (!device) {
		/* we can safely leave the fs_devices entry around */
		ret = -ENOMEM;
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_NOFS);
	if (!name) {
		kfree(device);
		ret = -ENOMEM;
		goto error;
	}
	rcu_assign_pointer(device->name, name);

	ret = find_next_devid(root, &device->devid);
	if (ret) {
		rcu_string_free(device->name);
		kfree(device);
		goto error;
	}

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		rcu_string_free(device->name);
		kfree(device);
		ret = PTR_ERR(trans);
		goto error;
	}

	lock_chunks(root);

	q = bdev_get_queue(bdev);
	if (blk_queue_discard(q))
		device->can_discard = 1;
	device->writeable = 1;
	device->work.func = pending_bios_fn;
	generate_random_uuid(device->uuid);
	spin_lock_init(&device->io_lock);
	device->generation = trans->transid;
	device->io_width = root->sectorsize;
	device->io_align = root->sectorsize;
	device->sector_size = root->sectorsize;
	device->total_bytes = i_size_read(bdev->bd_inode);
	device->disk_total_bytes = device->total_bytes;
	device->dev_root = root->fs_info->dev_root;
	device->bdev = bdev;
	device->in_fs_metadata = 1;
	device->is_tgtdev_for_dev_replace = 0;
	device->mode = FMODE_EXCL;
	set_blocksize(device->bdev, 4096);

	if (seeding_dev) {
		sb->s_flags &= ~MS_RDONLY;
		ret = btrfs_prepare_sprout(root);
		BUG_ON(ret); /* -ENOMEM */
	}

	device->fs_devices = root->fs_info->fs_devices;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
	list_add(&device->dev_alloc_list,
		 &root->fs_info->fs_devices->alloc_list);
	root->fs_info->fs_devices->num_devices++;
	root->fs_info->fs_devices->open_devices++;
	root->fs_info->fs_devices->rw_devices++;
	root->fs_info->fs_devices->total_devices++;
	if (device->can_discard)
		root->fs_info->fs_devices->num_can_discard++;
	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;

	spin_lock(&root->fs_info->free_chunk_lock);
	root->fs_info->free_chunk_space += device->total_bytes;
	spin_unlock(&root->fs_info->free_chunk_lock);

	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
		root->fs_info->fs_devices->rotating = 1;

	total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
	btrfs_set_super_total_bytes(root->fs_info->super_copy,
				    total_bytes + device->total_bytes);

	total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
	btrfs_set_super_num_devices(root->fs_info->super_copy,
				    total_bytes + 1);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	if (seeding_dev) {
		ret = init_first_rw_device(trans, root, device);
		if (ret) {
			btrfs_abort_transaction(trans, root, ret);
			goto error_trans;
		}
		ret = btrfs_finish_sprout(trans, root);
		if (ret) {
			btrfs_abort_transaction(trans, root, ret);
			goto error_trans;
		}
	} else {
		ret = btrfs_add_device(trans, root, device);
		if (ret) {
			btrfs_abort_transaction(trans, root, ret);
			goto error_trans;
		}
	}

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(root->fs_info);

	unlock_chunks(root);
	root->fs_info->num_tolerated_disk_barrier_failures =
		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
	ret = btrfs_commit_transaction(trans, root);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);

		if (ret) /* transaction commit */
			return ret;

		ret = btrfs_relocate_sys_chunks(root);
		if (ret < 0)
			btrfs_error(root->fs_info, ret,
				    "Failed to relocate sys chunks after "
				    "device initialization. This can be fixed "
				    "using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			return PTR_ERR(trans);
		}
		ret = btrfs_commit_transaction(trans, root);
	}

	return ret;

error_trans:
	unlock_chunks(root);
	btrfs_end_transaction(trans, root);
	rcu_string_free(device->name);
	kfree(device);
error:
	blkdev_put(bdev, FMODE_EXCL);
	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	return ret;
}
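
/*
 * Rough flow of btrfs_init_new_device(): open the block device exclusively,
 * allocate and describe the btrfs_device, publish it on the device and
 * alloc lists under device_list_mutex, grow the superblock totals
 * (total_bytes, num_devices), and persist the new dev item in the chunk
 * tree within one transaction. The seeding path additionally rewrites the
 * fsid via btrfs_prepare_sprout() and then relocates the system chunks.
 */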
int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
				  struct btrfs_device **device_out)
{
	struct request_queue *q;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *devices;
	struct rcu_string *name;
	int ret = 0;

	*device_out = NULL;
	if (fs_info->fs_devices->seeding)
		return -EINVAL;

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	filemap_write_and_wait(bdev->bd_inode->i_mapping);

	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			goto error;
		}
	}

	device = kzalloc(sizeof(*device), GFP_NOFS);
	if (!device) {
		ret = -ENOMEM;
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_NOFS);
	if (!name) {
		kfree(device);
		ret = -ENOMEM;
		goto error;
	}
	rcu_assign_pointer(device->name, name);

	q = bdev_get_queue(bdev);
	if (blk_queue_discard(q))
		device->can_discard = 1;
	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	device->writeable = 1;
	device->work.func = pending_bios_fn;
	generate_random_uuid(device->uuid);
	device->devid = BTRFS_DEV_REPLACE_DEVID;
	spin_lock_init(&device->io_lock);
	device->generation = 0;
	device->io_width = root->sectorsize;
	device->io_align = root->sectorsize;
	device->sector_size = root->sectorsize;
	device->total_bytes = i_size_read(bdev->bd_inode);
	device->disk_total_bytes = device->total_bytes;
	device->dev_root = fs_info->dev_root;
	device->bdev = bdev;
	device->in_fs_metadata = 1;
	device->is_tgtdev_for_dev_replace = 1;
	device->mode = FMODE_EXCL;
	set_blocksize(device->bdev, 4096);
	device->fs_devices = fs_info->fs_devices;
	list_add(&device->dev_list, &fs_info->fs_devices->devices);
	fs_info->fs_devices->num_devices++;
	fs_info->fs_devices->open_devices++;
	if (device->can_discard)
		fs_info->fs_devices->num_can_discard++;
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	*device_out = device;
	return ret;

error:
	blkdev_put(bdev, FMODE_EXCL);
	return ret;
}
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
					      struct btrfs_device *tgtdev)
{
	WARN_ON(fs_info->fs_devices->rw_devices == 0);
	tgtdev->io_width = fs_info->dev_root->sectorsize;
	tgtdev->io_align = fs_info->dev_root->sectorsize;
	tgtdev->sector_size = fs_info->dev_root->sectorsize;
	tgtdev->dev_root = fs_info->dev_root;
	tgtdev->in_fs_metadata = 1;
}
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
					struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	root = device->dev_root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
	btrfs_mark_buffer_dirty(leaf);

out:
	btrfs_free_path(path);
	return ret;
}
static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
			       struct btrfs_device *device, u64 new_size)
{
	struct btrfs_super_block *super_copy =
		device->dev_root->fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 diff = new_size - device->total_bytes;

	if (!device->writeable)
		return -EACCES;
	if (new_size <= device->total_bytes ||
	    device->is_tgtdev_for_dev_replace)
		return -EINVAL;

	btrfs_set_super_total_bytes(super_copy, old_total + diff);
	device->fs_devices->total_rw_bytes += diff;

	device->total_bytes = new_size;
	device->disk_total_bytes = new_size;
	btrfs_clear_space_info_full(device->dev_root->fs_info);

	return btrfs_update_device(trans, device);
}
int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	int ret;
	lock_chunks(device->dev_root);
	ret = __btrfs_grow_device(trans, device, new_size);
	unlock_chunks(device->dev_root);
	return ret;
}
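
/*
 * Growing is pure accounting: for new_size > total_bytes the difference is
 * added to the superblock's total_bytes and to total_rw_bytes, and the
 * enlarged device item is written back via btrfs_update_device(). E.g.
 * growing a device from 100GiB to 150GiB bumps both counters by 50GiB; no
 * chunk is moved, the allocator simply sees the extra room.
 */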
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    u64 chunk_tree, u64 chunk_objectid,
			    u64 chunk_offset)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	root = root->fs_info->chunk_root;
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = chunk_objectid;
	key.offset = chunk_offset;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	else if (ret > 0) { /* Logic error or corruption */
		btrfs_error(root->fs_info, -ENOENT,
			    "Failed lookup while freeing chunk.");
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret < 0)
		btrfs_error(root->fs_info, ret,
			    "Failed to delete chunk item.");
out:
	btrfs_free_path(path);
	return ret;
}
static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
			       chunk_offset)
{
	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		if (key.objectid == chunk_objectid &&
		    key.offset == chunk_offset) {
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	return ret;
}

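/*
 * Chunk relocation is a two-step affair: step one moves all live
 * extents out of the chunk via the block group relocation code; step
 * two, inside a transaction, deletes the device extents, the chunk
 * item, the sys_chunk_array entry (for SYSTEM chunks), the block
 * group and the cached extent mapping.
 */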
static int btrfs_relocate_chunk(struct btrfs_root *root,
				u64 chunk_tree, u64 chunk_objectid,
				u64 chunk_offset)
{
	struct extent_map_tree *em_tree;
	struct btrfs_root *extent_root;
	struct btrfs_trans_handle *trans;
	struct extent_map *em;
	struct map_lookup *map;
	int ret;
	int i;

	root = root->fs_info->chunk_root;
	extent_root = root->fs_info->extent_root;
	em_tree = &root->fs_info->mapping_tree.map_tree;

	ret = btrfs_can_relocate(extent_root, chunk_offset);
	if (ret)
		return -ENOSPC;

	/* step one, relocate all the extents inside this chunk */
	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
	if (ret)
		return ret;

	trans = btrfs_start_transaction(root, 0);
	BUG_ON(IS_ERR(trans));

	lock_chunks(root);

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	BUG_ON(!em || em->start > chunk_offset ||
	       em->start + em->len < chunk_offset);
	map = (struct map_lookup *)em->bdev;

	for (i = 0; i < map->num_stripes; i++) {
		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
					    map->stripes[i].physical);
		BUG_ON(ret);

		if (map->stripes[i].dev) {
			ret = btrfs_update_device(trans, map->stripes[i].dev);
			BUG_ON(ret);
		}
	}
	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
			       chunk_offset);
	BUG_ON(ret);

	trace_btrfs_chunk_free(root, map, chunk_offset, em->len);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
		BUG_ON(ret);
	}

	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
	BUG_ON(ret);

	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	/* once for the tree */
	free_extent_map(em);
	/* once for us */
	free_extent_map(em);

	unlock_chunks(root);
	btrfs_end_transaction(trans, root);
	return 0;
}

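/*
 * Walk the chunk tree from the end and relocate every SYSTEM chunk.
 * -ENOSPC failures are only counted; the whole scan is retried once
 * before giving up, since earlier relocations may have freed room.
 */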
static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
{
	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_tree = chunk_root->root_key.objectid;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0)
			goto error;
		BUG_ON(ret == 0); /* Corruption */

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
						   found_key.objectid,
						   found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else if (ret)
				BUG();
		}

		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		WARN_ON(1);
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}

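/*
 * Persist the balance control as the single BTRFS_BALANCE_OBJECTID
 * item in the tree root.  Keeping the flags and the per-type args on
 * disk is what allows an interrupted balance to be resumed after a
 * remount (see btrfs_recover_balance()).
 */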
static int insert_balance_item(struct btrfs_root *root,
			       struct btrfs_balance_control *bctl)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_BALANCE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));

	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
	btrfs_set_balance_data(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
	btrfs_set_balance_meta(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
	btrfs_set_balance_sys(leaf, item, &disk_bargs);

	btrfs_set_balance_flags(leaf, item, bctl->flags);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans, root);
	if (err && !ret)
		ret = err;
	return ret;
}

static int del_balance_item(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_BALANCE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	err = btrfs_commit_transaction(trans, root);
	if (err && !ret)
		ret = err;
	return ret;
}

/*
 * This is a heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
	/*
	 * Turn on soft mode for chunk types that were being converted.
	 */
	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

	/*
	 * Turn on usage filter if it is not already used.  The idea is
	 * that chunks that we have already balanced should be
	 * reasonably full.  Don't do it for chunks that are being
	 * converted - that will keep us from relocating unconverted
	 * (albeit full) chunks.
	 */
	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->data.usage = 90;
	}
	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->sys.usage = 90;
	}
	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->meta.usage = 90;
	}
}

/*
 * Should be called with both balance and volume mutexes held to
 * serialize other volume operations (add_dev/rm_dev/resize) with
 * restriper.  Same goes for unset_balance_control.
 */
static void set_balance_control(struct btrfs_balance_control *bctl)
{
	struct btrfs_fs_info *fs_info = bctl->fs_info;

	BUG_ON(fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = bctl;
	spin_unlock(&fs_info->balance_lock);
}

static void unset_balance_control(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	BUG_ON(!fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
}

/*
 * Balance filters.  Return 1 if chunk should be filtered out
 * (should not be balanced).
 */
static int chunk_profiles_filter(u64 chunk_type,
				 struct btrfs_balance_args *bargs)
{
	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->profiles & chunk_type)
		return 0;

	return 1;
}

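/*
 * Usage filter: balance only chunks that are less full than the
 * user-supplied percentage.  As an illustrative example, with
 * bargs->usage == 90 and a 1GiB chunk, user_thresh comes out at 90%
 * of 1GiB (about 921MiB); a chunk using less than that is balanced,
 * a fuller one is filtered out.
 */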
static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group_cache *cache;
	u64 chunk_used, user_thresh;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = btrfs_block_group_used(&cache->item);

	if (bargs->usage == 0)
		user_thresh = 1;
	else if (bargs->usage > 100)
		user_thresh = cache->key.offset;
	else
		user_thresh = div_factor_fine(cache->key.offset,
					      bargs->usage);

	if (chunk_used < user_thresh)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

static int chunk_devid_filter(struct extent_buffer *leaf,
			      struct btrfs_chunk *chunk,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	int i;

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
			return 0;
	}

	return 1;
}

/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       u64 chunk_offset,
			       struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	u64 stripe_offset;
	u64 stripe_length;
	int factor;
	int i;

	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
		return 0;

	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
		factor = num_stripes / 2;
	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
		factor = num_stripes - 1;
	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
		factor = num_stripes - 2;
	} else {
		factor = num_stripes;
	}

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
			continue;

		stripe_offset = btrfs_stripe_offset(leaf, stripe);
		stripe_length = btrfs_chunk_length(leaf, chunk);
		do_div(stripe_length, factor);

		if (stripe_offset < bargs->pend &&
		    stripe_offset + stripe_length > bargs->pstart)
			return 0;
	}

	return 1;
}

/* [vstart, vend) */
static int chunk_vrange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       u64 chunk_offset,
			       struct btrfs_balance_args *bargs)
{
	if (chunk_offset < bargs->vend &&
	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
		/* at least part of the chunk is inside this vrange */
		return 0;

	return 1;
}

static int chunk_soft_convert_filter(u64 chunk_type,
				     struct btrfs_balance_args *bargs)
{
	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return 0;

	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;

	if (bargs->target == chunk_type)
		return 1;

	return 0;
}

static int should_balance_chunk(struct btrfs_root *root,
				struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
	}

	/* usage filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
		return 0;
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	return 1;
}

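/*
 * Core balance loop.  Step one makes some room on every writeable
 * device.  Step two walks the chunk tree backwards twice: a counting
 * pass that only records how many chunks are expected to move, then
 * a second pass that actually relocates them, honouring pause and
 * cancel requests between chunks.
 */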
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct list_head *devices;
	struct btrfs_device *device;
	u64 old_size;
	u64 size_to_free;
	struct btrfs_chunk *chunk;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	int slot;
	int ret;
	int enospc_errors = 0;
	bool counting = true;

	/* step one make some room on all the devices */
	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		old_size = device->total_bytes;
		size_to_free = div_factor(old_size, 1);
		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
		if (!device->writeable ||
		    device->total_bytes - device->bytes_used > size_to_free ||
		    device->is_tgtdev_for_dev_replace)
			continue;

		ret = btrfs_shrink_device(device, old_size - size_to_free);
		if (ret == -ENOSPC)
			break;
		BUG_ON(ret);

		trans = btrfs_start_transaction(dev_root, 0);
		BUG_ON(IS_ERR(trans));

		ret = btrfs_grow_device(trans, device, old_size);
		BUG_ON(ret);

		btrfs_end_transaction(trans, dev_root);
	}

	/* step two, relocate all the chunks */
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0)
			goto error;

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
			BUG(); /* FIXME break ? */

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
		if (ret) {
			ret = 0;
			break;
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid)
			break;

		/* chunk zero is special */
		if (found_key.offset == 0)
			break;

		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);

		if (!counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.considered++;
			spin_unlock(&fs_info->balance_lock);
		}

		ret = should_balance_chunk(chunk_root, leaf, chunk,
					   found_key.offset);
		btrfs_release_path(path);
		if (!ret)
			goto loop;

		if (counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.expected++;
			spin_unlock(&fs_info->balance_lock);
			goto loop;
		}

		ret = btrfs_relocate_chunk(chunk_root,
					   chunk_root->root_key.objectid,
					   found_key.objectid,
					   found_key.offset);
		if (ret && ret != -ENOSPC)
			goto error;
		if (ret == -ENOSPC) {
			enospc_errors++;
		} else {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.completed++;
			spin_unlock(&fs_info->balance_lock);
		}
loop:
		key.offset = found_key.offset - 1;
	}

	if (counting) {
		btrfs_release_path(path);
		counting = false;
		goto again;
	}
error:
	btrfs_free_path(path);
	if (enospc_errors) {
		printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
		       enospc_errors);
		if (!ret)
			ret = -ENOSPC;
	}

	return ret;
}

/**
 * alloc_profile_is_valid - see if a given profile is valid and reduced
 * @flags: profile to validate
 * @extended: if true @flags is treated as an extended profile
 */
static int alloc_profile_is_valid(u64 flags, int extended)
{
	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
			       BTRFS_BLOCK_GROUP_PROFILE_MASK);

	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;

	/* 1) check that all other bits are zeroed */
	if (flags & ~mask)
		return 0;

	/* 2) see if profile is reduced */
	if (flags == 0)
		return !extended; /* "0" is valid for usual profiles */

	/* true if exactly one bit set */
	return (flags & (flags - 1)) == 0;
}

static inline int balance_need_close(struct btrfs_fs_info *fs_info)
{
	/* cancel requested || normal exit path */
	return atomic_read(&fs_info->balance_cancel_req) ||
		(atomic_read(&fs_info->balance_pause_req) == 0 &&
		 atomic_read(&fs_info->balance_cancel_req) == 0);
}

static void __cancel_balance(struct btrfs_fs_info *fs_info)
{
	int ret;

	unset_balance_control(fs_info);
	ret = del_balance_item(fs_info->tree_root);
	if (ret)
		btrfs_abort_transaction(NULL, fs_info->tree_root, ret);

	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
}

void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
			       struct btrfs_ioctl_balance_args *bargs);

/*
 * Should be called with both balance and volume mutexes held
 */
int btrfs_balance(struct btrfs_balance_control *bctl,
		  struct btrfs_ioctl_balance_args *bargs)
{
	struct btrfs_fs_info *fs_info = bctl->fs_info;
	u64 allowed;
	int mixed = 0;
	int ret;
	u64 num_devices;
	unsigned seq;

	if (btrfs_fs_closing(fs_info) ||
	    atomic_read(&fs_info->balance_pause_req) ||
	    atomic_read(&fs_info->balance_cancel_req)) {
		ret = -EINVAL;
		goto out;
	}

	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	/*
	 * In case of mixed groups both data and meta should be picked,
	 * and identical options should be given for both of them.
	 */
	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
	if (mixed && (bctl->flags & allowed)) {
		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
			printk(KERN_ERR "btrfs: with mixed groups data and "
			       "metadata balance options must be the same\n");
			ret = -EINVAL;
			goto out;
		}
	}

	num_devices = fs_info->fs_devices->num_devices;
	btrfs_dev_replace_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		BUG_ON(num_devices < 1);
		num_devices--;
	}
	btrfs_dev_replace_unlock(&fs_info->dev_replace);
	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
	if (num_devices == 1)
		allowed |= BTRFS_BLOCK_GROUP_DUP;
	else if (num_devices < 4)
		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
	else
		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
				BTRFS_BLOCK_GROUP_RAID10 |
				BTRFS_BLOCK_GROUP_RAID5 |
				BTRFS_BLOCK_GROUP_RAID6);

	if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
	    (!alloc_profile_is_valid(bctl->data.target, 1) ||
	     (bctl->data.target & ~allowed))) {
		printk(KERN_ERR "btrfs: unable to start balance with target "
		       "data profile %llu\n",
		       (unsigned long long)bctl->data.target);
		ret = -EINVAL;
		goto out;
	}
	if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
	    (!alloc_profile_is_valid(bctl->meta.target, 1) ||
	     (bctl->meta.target & ~allowed))) {
		printk(KERN_ERR "btrfs: unable to start balance with target "
		       "metadata profile %llu\n",
		       (unsigned long long)bctl->meta.target);
		ret = -EINVAL;
		goto out;
	}
	if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
	    (!alloc_profile_is_valid(bctl->sys.target, 1) ||
	     (bctl->sys.target & ~allowed))) {
		printk(KERN_ERR "btrfs: unable to start balance with target "
		       "system profile %llu\n",
		       (unsigned long long)bctl->sys.target);
		ret = -EINVAL;
		goto out;
	}

	/* allow dup'ed data chunks only in mixed mode */
	if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
	    (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
		printk(KERN_ERR "btrfs: dup for data is not allowed\n");
		ret = -EINVAL;
		goto out;
	}

	/* allow to reduce meta or sys integrity only if force set */
	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
			BTRFS_BLOCK_GROUP_RAID10 |
			BTRFS_BLOCK_GROUP_RAID5 |
			BTRFS_BLOCK_GROUP_RAID6;
	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_system_alloc_bits & allowed) &&
		     !(bctl->sys.target & allowed)) ||
		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_metadata_alloc_bits & allowed) &&
		     !(bctl->meta.target & allowed))) {
			if (bctl->flags & BTRFS_BALANCE_FORCE) {
				printk(KERN_INFO "btrfs: force reducing metadata "
				       "integrity\n");
			} else {
				printk(KERN_ERR "btrfs: balance will reduce metadata "
				       "integrity, use force if you want this\n");
				ret = -EINVAL;
				goto out;
			}
		}
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		int num_tolerated_disk_barrier_failures;
		u64 target = bctl->sys.target;

		num_tolerated_disk_barrier_failures =
			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
		if (num_tolerated_disk_barrier_failures > 0 &&
		    (target &
		     (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
		      BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
			num_tolerated_disk_barrier_failures = 0;
		else if (num_tolerated_disk_barrier_failures > 1 &&
			 (target &
			  (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
			num_tolerated_disk_barrier_failures = 1;

		fs_info->num_tolerated_disk_barrier_failures =
			num_tolerated_disk_barrier_failures;
	}

	ret = insert_balance_item(fs_info->tree_root, bctl);
	if (ret && ret != -EEXIST)
		goto out;

	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
		BUG_ON(ret == -EEXIST);
		set_balance_control(bctl);
	} else {
		BUG_ON(ret != -EEXIST);
		spin_lock(&fs_info->balance_lock);
		update_balance_args(bctl);
		spin_unlock(&fs_info->balance_lock);
	}

	atomic_inc(&fs_info->balance_running);
	mutex_unlock(&fs_info->balance_mutex);

	ret = __btrfs_balance(fs_info);

	mutex_lock(&fs_info->balance_mutex);
	atomic_dec(&fs_info->balance_running);

	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		fs_info->num_tolerated_disk_barrier_failures =
			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
	}

	if (bargs) {
		memset(bargs, 0, sizeof(*bargs));
		update_ioctl_balance_args(fs_info, 0, bargs);
	}

	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
	    balance_need_close(fs_info)) {
		__cancel_balance(fs_info);
	}

	wake_up(&fs_info->balance_wait_q);

	return ret;
out:
	if (bctl->flags & BTRFS_BALANCE_RESUME)
		__cancel_balance(fs_info);
	else {
		kfree(bctl);
		atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
	}
	return ret;
}

static int balance_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	int ret = 0;

	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&fs_info->balance_mutex);

	if (fs_info->balance_ctl) {
		printk(KERN_INFO "btrfs: continuing balance\n");
		ret = btrfs_balance(fs_info->balance_ctl, NULL);
	}

	mutex_unlock(&fs_info->balance_mutex);
	mutex_unlock(&fs_info->volume_mutex);

	return ret;
}

int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
	struct task_struct *tsk;

	spin_lock(&fs_info->balance_lock);
	if (!fs_info->balance_ctl) {
		spin_unlock(&fs_info->balance_lock);
		return 0;
	}
	spin_unlock(&fs_info->balance_lock);

	if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
		printk(KERN_INFO "btrfs: force skipping balance\n");
		return 0;
	}

	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	return 0;
}

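/*
 * Read the balance item back from the tree root during mount and
 * reinstall the balance control with BTRFS_BALANCE_RESUME set.  The
 * relocation itself is restarted later from balance_kthread() via
 * btrfs_resume_balance_async().
 */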
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_BALANCE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) { /* ret = -ENOENT; */
		ret = 0;
		goto out;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl) {
		ret = -ENOMEM;
		goto out;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	bctl->fs_info = fs_info;
	bctl->flags = btrfs_balance_flags(leaf, item);
	bctl->flags |= BTRFS_BALANCE_RESUME;

	btrfs_balance_data(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
	btrfs_balance_meta(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
	btrfs_balance_sys(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);

	WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));

	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&fs_info->balance_mutex);

	set_balance_control(bctl);

	mutex_unlock(&fs_info->balance_mutex);
	mutex_unlock(&fs_info->volume_mutex);
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	if (atomic_read(&fs_info->balance_running)) {
		atomic_inc(&fs_info->balance_pause_req);
		mutex_unlock(&fs_info->balance_mutex);

		wait_event(fs_info->balance_wait_q,
			   atomic_read(&fs_info->balance_running) == 0);

		mutex_lock(&fs_info->balance_mutex);
		/* we are good with balance_ctl ripped off from under us */
		BUG_ON(atomic_read(&fs_info->balance_running));
		atomic_dec(&fs_info->balance_pause_req);
	} else {
		ret = -ENOTCONN;
	}

	mutex_unlock(&fs_info->balance_mutex);
	return ret;
}

int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->balance_cancel_req);
	/*
	 * if we are running just wait and return, balance item is
	 * deleted in btrfs_balance in this case
	 */
	if (atomic_read(&fs_info->balance_running)) {
		mutex_unlock(&fs_info->balance_mutex);
		wait_event(fs_info->balance_wait_q,
			   atomic_read(&fs_info->balance_running) == 0);
		mutex_lock(&fs_info->balance_mutex);
	} else {
		/* __cancel_balance needs volume_mutex */
		mutex_unlock(&fs_info->balance_mutex);
		mutex_lock(&fs_info->volume_mutex);
		mutex_lock(&fs_info->balance_mutex);

		if (fs_info->balance_ctl)
			__cancel_balance(fs_info);

		mutex_unlock(&fs_info->volume_mutex);
	}

	BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
	atomic_dec(&fs_info->balance_cancel_req);
	mutex_unlock(&fs_info->balance_mutex);
	return 0;
}

/*
 * shrinking a device means finding all of the device extents past
 * the new size, and then following the back refs to the chunks.
 * The chunk relocation code actually frees the device extent
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_tree;
	u64 chunk_objectid;
	u64 chunk_offset;
	int ret;
	int slot;
	int failed = 0;
	bool retried = false;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 old_size = device->total_bytes;
	u64 diff = device->total_bytes - new_size;

	if (device->is_tgtdev_for_dev_replace)
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;

	lock_chunks(root);

	device->total_bytes = new_size;
	if (device->writeable) {
		device->fs_devices->total_rw_bytes -= diff;
		spin_lock(&root->fs_info->free_chunk_lock);
		root->fs_info->free_chunk_space -= diff;
		spin_unlock(&root->fs_info->free_chunk_lock);
	}
	unlock_chunks(root);

again:
	key.objectid = device->devid;
	key.offset = (u64)-1;
	key.type = BTRFS_DEV_EXTENT_KEY;

	do {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto done;

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret < 0)
			goto done;
		if (ret) {
			ret = 0;
			btrfs_release_path(path);
			break;
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

		if (key.objectid != device->devid) {
			btrfs_release_path(path);
			break;
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (key.offset + length <= new_size) {
			btrfs_release_path(path);
			break;
		}

		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
		btrfs_release_path(path);

		ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
					   chunk_offset);
		if (ret && ret != -ENOSPC)
			goto done;
		if (ret == -ENOSPC)
			failed++;
	} while (key.offset-- > 0);

	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		ret = -ENOSPC;
		lock_chunks(root);

		device->total_bytes = old_size;
		if (device->writeable)
			device->fs_devices->total_rw_bytes += diff;
		spin_lock(&root->fs_info->free_chunk_lock);
		root->fs_info->free_chunk_space += diff;
		spin_unlock(&root->fs_info->free_chunk_lock);
		unlock_chunks(root);
		goto done;
	}

	/* Shrinking succeeded, else we would be at "done". */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto done;
	}

	lock_chunks(root);

	device->disk_total_bytes = new_size;
	/* Now btrfs_update_device() will change the on-disk size. */
	ret = btrfs_update_device(trans, device);
	if (ret) {
		unlock_chunks(root);
		btrfs_end_transaction(trans, root);
		goto done;
	}

	WARN_ON(diff > old_total);
	btrfs_set_super_total_bytes(super_copy, old_total - diff);
	unlock_chunks(root);
	btrfs_end_transaction(trans, root);
done:
	btrfs_free_path(path);
	return ret;
}

static int btrfs_add_system_chunk(struct btrfs_root *root,
				  struct btrfs_key *key,
				  struct btrfs_chunk *chunk, int item_size)
{
	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
	struct btrfs_disk_key disk_key;
	u32 array_size;
	u8 *ptr;

	array_size = btrfs_super_sys_array_size(super_copy);
	if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
		return -EFBIG;

	ptr = super_copy->sys_chunk_array + array_size;
	btrfs_cpu_key_to_disk(&disk_key, key);
	memcpy(ptr, &disk_key, sizeof(disk_key));
	ptr += sizeof(disk_key);
	memcpy(ptr, chunk, item_size);
	item_size += sizeof(disk_key);
	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
	return 0;
}

/*
 * sort the devices in descending order by max_avail, total_avail
 */
static int btrfs_cmp_device_info(const void *a, const void *b)
{
	const struct btrfs_device_info *di_a = a;
	const struct btrfs_device_info *di_b = b;

	if (di_a->max_avail > di_b->max_avail)
		return -1;
	if (di_a->max_avail < di_b->max_avail)
		return 1;
	if (di_a->total_avail > di_b->total_avail)
		return -1;
	if (di_a->total_avail < di_b->total_avail)
		return 1;
	return 0;
}

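/*
 * Per-profile allocation constraints, indexed by BTRFS_RAID_*.
 * __btrfs_alloc_chunk() pulls sub_stripes, dev_stripes, devs_max,
 * devs_min, devs_increment and ncopies from this table instead of
 * open-coding them per profile.
 */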
struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.devs_increment	= 2,
		.ncopies	= 2,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.devs_increment	= 1,
		.ncopies	= 1,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.devs_increment	= 1,
		.ncopies	= 2,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.devs_increment	= 1,
		.ncopies	= 3,
	},
};

static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
{
	/* TODO allow them to set a preferred stripe size */
	return 64 * 1024;
}

static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	u64 features;

	if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
		return;

	features = btrfs_super_incompat_flags(info->super_copy);
	if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
		return;

	features |= BTRFS_FEATURE_INCOMPAT_RAID56;
	btrfs_set_super_incompat_flags(info->super_copy, features);
	printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
}

static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
			       struct btrfs_root *extent_root,
			       struct map_lookup **map_ret,
			       u64 *num_bytes_out, u64 *stripe_size_out,
			       u64 start, u64 type)
{
	struct btrfs_fs_info *info = extent_root->fs_info;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct list_head *cur;
	struct map_lookup *map = NULL;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_device_info *devices_info = NULL;
	u64 total_avail;
	int num_stripes;	/* total number of stripes to allocate */
	int data_stripes;	/* number of stripes that count for
				   block group size */
	int sub_stripes;	/* sub_stripes info for map */
	int dev_stripes;	/* stripes per dev */
	int devs_max;		/* max devs to use */
	int devs_min;		/* min devs needed */
	int devs_increment;	/* ndevs has to be a multiple of this */
	int ncopies;		/* how many copies of the data there are */
	int ret;
	u64 max_stripe_size;
	u64 max_chunk_size;
	u64 stripe_size;
	u64 num_bytes;
	u64 raid_stripe_len = BTRFS_STRIPE_LEN;
	int ndevs;
	int i;
	int j;
	int index;

	BUG_ON(!alloc_profile_is_valid(type, 0));

	if (list_empty(&fs_devices->alloc_list))
		return -ENOSPC;

	index = __get_raid_index(type);

	sub_stripes = btrfs_raid_array[index].sub_stripes;
	dev_stripes = btrfs_raid_array[index].dev_stripes;
	devs_max = btrfs_raid_array[index].devs_max;
	devs_min = btrfs_raid_array[index].devs_min;
	devs_increment = btrfs_raid_array[index].devs_increment;
	ncopies = btrfs_raid_array[index].ncopies;

	if (type & BTRFS_BLOCK_GROUP_DATA) {
		max_stripe_size = 1024 * 1024 * 1024;
		max_chunk_size = 10 * max_stripe_size;
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		/* for larger filesystems, use larger metadata chunks */
		if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
			max_stripe_size = 1024 * 1024 * 1024;
		else
			max_stripe_size = 256 * 1024 * 1024;
		max_chunk_size = max_stripe_size;
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		max_stripe_size = 32 * 1024 * 1024;
		max_chunk_size = 2 * max_stripe_size;
	} else {
		printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
		       type);
		BUG_ON(1);
	}

	/* we don't want a chunk larger than 10% of writeable space */
	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
			     max_chunk_size);

	devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
			       GFP_NOFS);
	if (!devices_info)
		return -ENOMEM;

	cur = fs_devices->alloc_list.next;

	/*
	 * in the first pass through the devices list, we gather information
	 * about the available holes on each device.
	 */
	ndevs = 0;
	while (cur != &fs_devices->alloc_list) {
		struct btrfs_device *device;
		u64 max_avail;
		u64 dev_offset;

		device = list_entry(cur, struct btrfs_device, dev_alloc_list);

		cur = cur->next;

		if (!device->writeable) {
			WARN(1, KERN_ERR
			       "btrfs: read-only device in alloc_list\n");
			continue;
		}

		if (!device->in_fs_metadata ||
		    device->is_tgtdev_for_dev_replace)
			continue;

		if (device->total_bytes > device->bytes_used)
			total_avail = device->total_bytes - device->bytes_used;
		else
			total_avail = 0;

		/* If there is no space on this device, skip it. */
		if (total_avail == 0)
			continue;

		ret = find_free_dev_extent(device,
					   max_stripe_size * dev_stripes,
					   &dev_offset, &max_avail);
		if (ret && ret != -ENOSPC)
			goto error;

		if (ret == 0)
			max_avail = max_stripe_size * dev_stripes;

		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
			continue;

		if (ndevs == fs_devices->rw_devices) {
			WARN(1, "%s: found more than %llu devices\n",
			     __func__, fs_devices->rw_devices);
			break;
		}
		devices_info[ndevs].dev_offset = dev_offset;
		devices_info[ndevs].max_avail = max_avail;
		devices_info[ndevs].total_avail = total_avail;
		devices_info[ndevs].dev = device;
		++ndevs;
	}

	/*
	 * now sort the devices by hole size / available space
	 */
	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
	     btrfs_cmp_device_info, NULL);

	/* round down to number of usable stripes */
	ndevs -= ndevs % devs_increment;

	if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
		ret = -ENOSPC;
		goto error;
	}

	if (devs_max && ndevs > devs_max)
		ndevs = devs_max;
	/*
	 * the primary goal is to maximize the number of stripes, so use as many
	 * devices as possible, even if the stripes are not maximum sized.
	 */
	stripe_size = devices_info[ndevs-1].max_avail;
	num_stripes = ndevs * dev_stripes;

	/*
	 * this will have to be fixed for RAID1 and RAID10 over
	 * more drives
	 */
	data_stripes = num_stripes / ncopies;

	if (stripe_size * ndevs > max_chunk_size * ncopies) {
		stripe_size = max_chunk_size * ncopies;
		do_div(stripe_size, ndevs);
	}
	if (type & BTRFS_BLOCK_GROUP_RAID5) {
		raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
				 btrfs_super_stripesize(info->super_copy));
		data_stripes = num_stripes - 1;
	}
	if (type & BTRFS_BLOCK_GROUP_RAID6) {
		raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
				 btrfs_super_stripesize(info->super_copy));
		data_stripes = num_stripes - 2;
	}
	do_div(stripe_size, dev_stripes);

	/* align to BTRFS_STRIPE_LEN */
	do_div(stripe_size, raid_stripe_len);
	stripe_size *= raid_stripe_len;

	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		ret = -ENOMEM;
		goto error;
	}
	map->num_stripes = num_stripes;

	for (i = 0; i < ndevs; ++i) {
		for (j = 0; j < dev_stripes; ++j) {
			int s = i * dev_stripes + j;
			map->stripes[s].dev = devices_info[i].dev;
			map->stripes[s].physical = devices_info[i].dev_offset +
						   j * stripe_size;
		}
	}
	map->sector_size = extent_root->sectorsize;
	map->stripe_len = raid_stripe_len;
	map->io_align = raid_stripe_len;
	map->io_width = raid_stripe_len;
	map->type = type;
	map->sub_stripes = sub_stripes;

	*map_ret = map;
	num_bytes = stripe_size * data_stripes;

	*stripe_size_out = stripe_size;
	*num_bytes_out = num_bytes;

	trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);

	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto error;
	}
	em->bdev = (struct block_device *)map;
	em->start = start;
	em->len = num_bytes;
	em->block_start = 0;
	em->block_len = em->len;

	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);
	if (ret) {
		free_extent_map(em);
		goto error;
	}

	for (i = 0; i < map->num_stripes; ++i) {
		struct btrfs_device *device;
		u64 dev_offset;

		device = map->stripes[i].dev;
		dev_offset = map->stripes[i].physical;

		ret = btrfs_alloc_dev_extent(trans, device,
				info->chunk_root->root_key.objectid,
				BTRFS_FIRST_CHUNK_TREE_OBJECTID,
				start, dev_offset, stripe_size);
		if (ret)
			goto error_dev_extent;
	}

	ret = btrfs_make_block_group(trans, extent_root, 0, type,
				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
				     start, num_bytes);
	if (ret) {
		i = map->num_stripes - 1;
		goto error_dev_extent;
	}

	free_extent_map(em);
	check_raid56_incompat_flag(extent_root->fs_info, type);

	kfree(devices_info);
	return 0;

error_dev_extent:
	for (; i >= 0; i--) {
		struct btrfs_device *device;
		int err;

		device = map->stripes[i].dev;
		err = btrfs_free_dev_extent(trans, device, start);
		if (err) {
			btrfs_abort_transaction(trans, extent_root, err);
			break;
		}
	}
	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	/* One for our allocation */
	free_extent_map(em);
	/* One for the tree reference */
	free_extent_map(em);
error:
	kfree(devices_info);
	return ret;
}

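/*
 * Second phase of chunk allocation: charge stripe_size to each stripe
 * device, build the on-disk chunk item from the map and insert it
 * into the chunk tree; SYSTEM chunks are additionally mirrored into
 * the superblock's sys_chunk_array.
 */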
static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
				struct btrfs_root *extent_root,
				struct map_lookup *map, u64 chunk_offset,
				u64 chunk_size, u64 stripe_size)
{
	u64 dev_offset;
	struct btrfs_key key;
	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
	struct btrfs_device *device;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	size_t item_size = btrfs_chunk_item_size(map->num_stripes);
	int index = 0;
	int ret;

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk)
		return -ENOMEM;

	index = 0;
	while (index < map->num_stripes) {
		device = map->stripes[index].dev;
		device->bytes_used += stripe_size;
		ret = btrfs_update_device(trans, device);
		if (ret)
			goto out_free;
		index++;
	}

	spin_lock(&extent_root->fs_info->free_chunk_lock);
	extent_root->fs_info->free_chunk_space -= (stripe_size *
						   map->num_stripes);
	spin_unlock(&extent_root->fs_info->free_chunk_lock);

	index = 0;
	stripe = &chunk->stripe;
	while (index < map->num_stripes) {
		device = map->stripes[index].dev;
		dev_offset = map->stripes[index].physical;

		btrfs_set_stack_stripe_devid(stripe, device->devid);
		btrfs_set_stack_stripe_offset(stripe, dev_offset);
		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
		stripe++;
		index++;
	}

	btrfs_set_stack_chunk_length(chunk, chunk_size);
	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = chunk_offset;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);

	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		/*
		 * TODO: Cleanup of inserted chunk root in case of
		 * failure.
		 */
		ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
					     item_size);
	}

out_free:
	kfree(chunk);
	return ret;
}

/*
 * Chunk allocation falls into two parts.  The first part does the work
 * that makes the newly allocated chunk usable, but does not do any
 * operation that modifies the chunk tree.  The second part does the
 * work that requires modifying the chunk tree.  This division is
 * important for the bootstrap process of adding storage to a seed btrfs.
 */
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
		      struct btrfs_root *extent_root, u64 type)
{
	u64 chunk_offset;
	u64 chunk_size;
	u64 stripe_size;
	struct map_lookup *map;
	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
	int ret;

	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
			      &chunk_offset);
	if (ret)
		return ret;

	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
				  &stripe_size, chunk_offset, type);
	if (ret)
		return ret;

	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
				   chunk_size, stripe_size);
	return ret;
}

static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_device *device)
{
	u64 chunk_offset;
	u64 sys_chunk_offset;
	u64 chunk_size;
	u64 sys_chunk_size;
	u64 stripe_size;
	u64 sys_stripe_size;
	u64 alloc_profile;
	struct map_lookup *map;
	struct map_lookup *sys_map;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	int ret;

	ret = find_next_chunk(fs_info->chunk_root,
			      BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
	if (ret)
		return ret;

	alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
				  &stripe_size, chunk_offset, alloc_profile);
	if (ret)
		return ret;

	sys_chunk_offset = chunk_offset + chunk_size;

	alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
				  &sys_chunk_size, &sys_stripe_size,
				  sys_chunk_offset, alloc_profile);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out;
	}

	ret = btrfs_add_device(trans, fs_info->chunk_root, device);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out;
	}

	/*
	 * Modifying the chunk tree needs allocating new blocks from both
	 * the system block group and the metadata block group.  So we can
	 * only do operations that require modifying the chunk tree after
	 * both block groups have been created.
	 */
	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
				   chunk_size, stripe_size);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out;
	}

	ret = __finish_chunk_alloc(trans, extent_root, sys_map,
				   sys_chunk_offset, sys_chunk_size,
				   sys_stripe_size);
	if (ret)
		btrfs_abort_transaction(trans, root, ret);
out:
	return ret;
}

*root
, u64 chunk_offset
)
4146 struct extent_map
*em
;
4147 struct map_lookup
*map
;
4148 struct btrfs_mapping_tree
*map_tree
= &root
->fs_info
->mapping_tree
;
4152 read_lock(&map_tree
->map_tree
.lock
);
4153 em
= lookup_extent_mapping(&map_tree
->map_tree
, chunk_offset
, 1);
4154 read_unlock(&map_tree
->map_tree
.lock
);
4158 if (btrfs_test_opt(root
, DEGRADED
)) {
4159 free_extent_map(em
);
4163 map
= (struct map_lookup
*)em
->bdev
;
4164 for (i
= 0; i
< map
->num_stripes
; i
++) {
4165 if (!map
->stripes
[i
].dev
->writeable
) {
4170 free_extent_map(em
);
void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
{
	extent_map_tree_init(&tree->map_tree);
}

void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
{
	struct extent_map *em;

	while (1) {
		write_lock(&tree->map_tree.lock);
		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
		if (em)
			remove_extent_mapping(&tree->map_tree, em);
		write_unlock(&tree->map_tree.lock);
		if (!em)
			break;
		kfree(em->bdev);
		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
}

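/*
 * How many copies of the data at @logical exist, derived from the
 * block group profile: num_stripes for DUP/RAID1, sub_stripes for
 * RAID10, 2 for RAID5 (data plus parity reconstruction), 3 for RAID6,
 * otherwise 1, plus one extra while a device replace is running.
 */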
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct extent_map_tree *em_tree = &map_tree->map_tree;
	int ret;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, len);
	read_unlock(&em_tree->lock);
	BUG_ON(!em);

	BUG_ON(em->start > logical || em->start + em->len < logical);
	map = (struct map_lookup *)em->bdev;
	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
		ret = map->num_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		ret = map->sub_stripes;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		ret = 2;
	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
		ret = 3;
	else
		ret = 1;
	free_extent_map(em);

	btrfs_dev_replace_lock(&fs_info->dev_replace);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
		ret++;
	btrfs_dev_replace_unlock(&fs_info->dev_replace);

	return ret;
}

unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
				    struct btrfs_mapping_tree *map_tree,
				    u64 logical)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct extent_map_tree *em_tree = &map_tree->map_tree;
	unsigned long len = root->sectorsize;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, len);
	read_unlock(&em_tree->lock);
	BUG_ON(!em);

	BUG_ON(em->start > logical || em->start + em->len < logical);
	map = (struct map_lookup *)em->bdev;
	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
			 BTRFS_BLOCK_GROUP_RAID6)) {
		len = map->stripe_len * nr_data_stripes(map);
	}
	free_extent_map(em);
	return len;
}

int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
			   u64 logical, u64 len, int mirror_num)
{
	struct extent_map *em;
	struct map_lookup *map;
	struct extent_map_tree *em_tree = &map_tree->map_tree;
	int ret = 0;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, len);
	read_unlock(&em_tree->lock);
	BUG_ON(!em);

	BUG_ON(em->start > logical || em->start + em->len < logical);
	map = (struct map_lookup *)em->bdev;
	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
			 BTRFS_BLOCK_GROUP_RAID6))
		ret = 1;
	free_extent_map(em);
	return ret;
}

static int find_live_mirror(struct btrfs_fs_info *fs_info,
			    struct map_lookup *map, int first, int num,
			    int optimal, int dev_replace_is_ongoing)
{
	int i;
	int tolerance;
	struct btrfs_device *srcdev;

	if (dev_replace_is_ongoing &&
	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
		srcdev = fs_info->dev_replace.srcdev;
	else
		srcdev = NULL;

	/*
	 * try to avoid the drive that is the source drive for a
	 * dev-replace procedure, only choose it if no other non-missing
	 * mirror is available
	 */
	for (tolerance = 0; tolerance < 2; tolerance++) {
		if (map->stripes[optimal].dev->bdev &&
		    (tolerance || map->stripes[optimal].dev != srcdev))
			return optimal;
		for (i = first; i < first + num; i++) {
			if (map->stripes[i].dev->bdev &&
			    (tolerance || map->stripes[i].dev != srcdev))
				return i;
		}
	}

	/* we couldn't find one that doesn't fail.  Just return something
	 * and the io error handling code will clean up eventually
	 */
	return optimal;
}

static inline int parity_smaller(u64 a, u64 b)
{
	return a > b;
}

/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
{
	struct btrfs_bio_stripe s;
	int i;
	u64 l;
	int again = 1;

	while (again) {
		again = 0;
		for (i = 0; i < bbio->num_stripes - 1; i++) {
			if (parity_smaller(raid_map[i], raid_map[i+1])) {
				s = bbio->stripes[i];
				l = raid_map[i];
				bbio->stripes[i] = bbio->stripes[i+1];
				raid_map[i] = raid_map[i+1];
				bbio->stripes[i+1] = s;
				raid_map[i+1] = l;
				again = 1;
			}
		}
	}
}

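/*
 * Map a logical range to the physical stripes that back it.  This is
 * the central routing function of the volume layer: it computes the
 * stripe geometry per profile, honours mirror selection and device
 * replace, builds the btrfs_bio describing every disk involved and,
 * for RAID5/6 full-stripe writes, also returns the raid_map used to
 * locate parity.
 */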
4346 static int __btrfs_map_block(struct btrfs_fs_info
*fs_info
, int rw
,
4347 u64 logical
, u64
*length
,
4348 struct btrfs_bio
**bbio_ret
,
4349 int mirror_num
, u64
**raid_map_ret
)
4351 struct extent_map
*em
;
4352 struct map_lookup
*map
;
4353 struct btrfs_mapping_tree
*map_tree
= &fs_info
->mapping_tree
;
4354 struct extent_map_tree
*em_tree
= &map_tree
->map_tree
;
4357 u64 stripe_end_offset
;
4362 u64
*raid_map
= NULL
;
4368 struct btrfs_bio
*bbio
= NULL
;
4369 struct btrfs_dev_replace
*dev_replace
= &fs_info
->dev_replace
;
4370 int dev_replace_is_ongoing
= 0;
4371 int num_alloc_stripes
;
4372 int patch_the_first_stripe_for_dev_replace
= 0;
4373 u64 physical_to_patch_in_first_stripe
= 0;
4374 u64 raid56_full_stripe_start
= (u64
)-1;
4376 read_lock(&em_tree
->lock
);
4377 em
= lookup_extent_mapping(em_tree
, logical
, *length
);
4378 read_unlock(&em_tree
->lock
);
4381 printk(KERN_CRIT
"btrfs: unable to find logical %llu len %llu\n",
4382 (unsigned long long)logical
,
4383 (unsigned long long)*length
);
4387 BUG_ON(em
->start
> logical
|| em
->start
+ em
->len
< logical
);
4388 map
= (struct map_lookup
*)em
->bdev
;
4389 offset
= logical
- em
->start
;
4391 if (mirror_num
> map
->num_stripes
)
4394 stripe_len
= map
->stripe_len
;
4397 * stripe_nr counts the total number of stripes we have to stride
4398 * to get to this block
4400 do_div(stripe_nr
, stripe_len
);
4402 stripe_offset
= stripe_nr
* stripe_len
;
4403 BUG_ON(offset
< stripe_offset
);
4405 /* stripe_offset is the offset of this block in its stripe*/
4406 stripe_offset
= offset
- stripe_offset
;
4408 /* if we're here for raid56, we need to know the stripe aligned start */
4409 if (map
->type
& (BTRFS_BLOCK_GROUP_RAID5
| BTRFS_BLOCK_GROUP_RAID6
)) {
4410 unsigned long full_stripe_len
= stripe_len
* nr_data_stripes(map
);
4411 raid56_full_stripe_start
= offset
;
4413 /* allow a write of a full stripe, but make sure we don't
4414 * allow straddling of stripes
4416 do_div(raid56_full_stripe_start
, full_stripe_len
);
4417 raid56_full_stripe_start
*= full_stripe_len
;
4420 if (rw
& REQ_DISCARD
) {
4421 /* we don't discard raid56 yet */
4423 (BTRFS_BLOCK_GROUP_RAID5
| BTRFS_BLOCK_GROUP_RAID6
)) {
4427 *length
= min_t(u64
, em
->len
- offset
, *length
);
4428 } else if (map
->type
& BTRFS_BLOCK_GROUP_PROFILE_MASK
) {
4430 /* For writes to RAID[56], allow a full stripeset across all disks.
4431 For other RAID types and for RAID[56] reads, just allow a single
4432 stripe (on a single disk). */
4433 if (map
->type
& (BTRFS_BLOCK_GROUP_RAID5
| BTRFS_BLOCK_GROUP_RAID6
) &&
4435 max_len
= stripe_len
* nr_data_stripes(map
) -
4436 (offset
- raid56_full_stripe_start
);
4438 /* we limit the length of each bio to what fits in a stripe */
4439 max_len
= stripe_len
- stripe_offset
;
4441 *length
= min_t(u64
, em
->len
- offset
, max_len
);
4443 *length
= em
->len
- offset
;
4446 /* This is for when we're called from btrfs_merge_bio_hook() and all
4447 it cares about is the length */
4451 btrfs_dev_replace_lock(dev_replace
);
4452 dev_replace_is_ongoing
= btrfs_dev_replace_is_ongoing(dev_replace
);
4453 if (!dev_replace_is_ongoing
)
4454 btrfs_dev_replace_unlock(dev_replace
);
4456 if (dev_replace_is_ongoing
&& mirror_num
== map
->num_stripes
+ 1 &&
4457 !(rw
& (REQ_WRITE
| REQ_DISCARD
| REQ_GET_READ_MIRRORS
)) &&
4458 dev_replace
->tgtdev
!= NULL
) {
4460 * in dev-replace case, for repair case (that's the only
4461 * case where the mirror is selected explicitly when
4462 * calling btrfs_map_block), blocks left of the left cursor
4463 * can also be read from the target drive.
4464 * For REQ_GET_READ_MIRRORS, the target drive is added as
4465 * the last one to the array of stripes. For READ, it also
4466 * needs to be supported using the same mirror number.
4467 * If the requested block is not left of the left cursor,
4468 * EIO is returned. This can happen because btrfs_num_copies()
4469 * returns one more in the dev-replace case.
4471 u64 tmp_length
= *length
;
4472 struct btrfs_bio
*tmp_bbio
= NULL
;
4473 int tmp_num_stripes
;
4474 u64 srcdev_devid
= dev_replace
->srcdev
->devid
;
4475 int index_srcdev
= 0;
4477 u64 physical_of_found
= 0;
4479 ret
= __btrfs_map_block(fs_info
, REQ_GET_READ_MIRRORS
,
4480 logical
, &tmp_length
, &tmp_bbio
, 0, NULL
);
4482 WARN_ON(tmp_bbio
!= NULL
);
4486 tmp_num_stripes
= tmp_bbio
->num_stripes
;
4487 if (mirror_num
> tmp_num_stripes
) {
4489 * REQ_GET_READ_MIRRORS does not contain this
4490 * mirror, that means that the requested area
4491 * is not left of the left cursor
4499 * process the rest of the function using the mirror_num
4500 * of the source drive. Therefore look it up first.
4501 * At the end, patch the device pointer to the one of the
4504 for (i
= 0; i
< tmp_num_stripes
; i
++) {
4505 if (tmp_bbio
->stripes
[i
].dev
->devid
== srcdev_devid
) {
4507 * In case of DUP, in order to keep it
4508 * simple, only add the mirror with the
4509 * lowest physical address
4512 physical_of_found
<=
4513 tmp_bbio
->stripes
[i
].physical
)
4518 tmp_bbio
->stripes
[i
].physical
;
4523 mirror_num
= index_srcdev
+ 1;
4524 patch_the_first_stripe_for_dev_replace
= 1;
4525 physical_to_patch_in_first_stripe
= physical_of_found
;
4534 } else if (mirror_num
> map
->num_stripes
) {
4540 stripe_nr_orig
= stripe_nr
;
4541 stripe_nr_end
= (offset
+ *length
+ map
->stripe_len
- 1) &
4542 (~(map
->stripe_len
- 1));
4543 do_div(stripe_nr_end
, map
->stripe_len
);
4544 stripe_end_offset
= stripe_nr_end
* map
->stripe_len
-
4547 if (map
->type
& BTRFS_BLOCK_GROUP_RAID0
) {
4548 if (rw
& REQ_DISCARD
)
4549 num_stripes
= min_t(u64
, map
->num_stripes
,
4550 stripe_nr_end
- stripe_nr_orig
);
4551 stripe_index
= do_div(stripe_nr
, map
->num_stripes
);
4552 } else if (map
->type
& BTRFS_BLOCK_GROUP_RAID1
) {
4553 if (rw
& (REQ_WRITE
| REQ_DISCARD
| REQ_GET_READ_MIRRORS
))
4554 num_stripes
= map
->num_stripes
;
4555 else if (mirror_num
)
4556 stripe_index
= mirror_num
- 1;
4558 stripe_index
= find_live_mirror(fs_info
, map
, 0,
4560 current
->pid
% map
->num_stripes
,
4561 dev_replace_is_ongoing
);
4562 mirror_num
= stripe_index
+ 1;
4565 } else if (map
->type
& BTRFS_BLOCK_GROUP_DUP
) {
4566 if (rw
& (REQ_WRITE
| REQ_DISCARD
| REQ_GET_READ_MIRRORS
)) {
4567 num_stripes
= map
->num_stripes
;
4568 } else if (mirror_num
) {
4569 stripe_index
= mirror_num
- 1;
4574 } else if (map
->type
& BTRFS_BLOCK_GROUP_RAID10
) {
4575 int factor
= map
->num_stripes
/ map
->sub_stripes
;
4577 stripe_index
= do_div(stripe_nr
, factor
);
4578 stripe_index
*= map
->sub_stripes
;
4580 if (rw
& (REQ_WRITE
| REQ_GET_READ_MIRRORS
))
4581 num_stripes
= map
->sub_stripes
;
4582 else if (rw
& REQ_DISCARD
)
4583 num_stripes
= min_t(u64
, map
->sub_stripes
*
4584 (stripe_nr_end
- stripe_nr_orig
),
4586 else if (mirror_num
)
4587 stripe_index
+= mirror_num
- 1;
4589 int old_stripe_index
= stripe_index
;
4590 stripe_index
= find_live_mirror(fs_info
, map
,
4592 map
->sub_stripes
, stripe_index
+
4593 current
->pid
% map
->sub_stripes
,
4594 dev_replace_is_ongoing
);
4595 mirror_num
= stripe_index
- old_stripe_index
+ 1;
4598 } else if (map
->type
& (BTRFS_BLOCK_GROUP_RAID5
|
4599 BTRFS_BLOCK_GROUP_RAID6
)) {
4602 if (bbio_ret
&& ((rw
& REQ_WRITE
) || mirror_num
> 1)
4606 /* push stripe_nr back to the start of the full stripe */
4607 stripe_nr
= raid56_full_stripe_start
;
4608 do_div(stripe_nr
, stripe_len
);
4610 stripe_index
= do_div(stripe_nr
, nr_data_stripes(map
));
4612 /* RAID[56] write or recovery. Return all stripes */
4613 num_stripes
= map
->num_stripes
;
4614 max_errors
= nr_parity_stripes(map
);
4616 raid_map
= kmalloc(sizeof(u64
) * num_stripes
,
4623 /* Work out the disk rotation on this stripe-set */
4625 rot
= do_div(tmp
, num_stripes
);
4627 /* Fill in the logical address of each stripe */
4628 tmp
= stripe_nr
* nr_data_stripes(map
);
4629 for (i
= 0; i
< nr_data_stripes(map
); i
++)
4630 raid_map
[(i
+rot
) % num_stripes
] =
4631 em
->start
+ (tmp
+ i
) * map
->stripe_len
;
4633 raid_map
[(i
+rot
) % map
->num_stripes
] = RAID5_P_STRIPE
;
4634 if (map
->type
& BTRFS_BLOCK_GROUP_RAID6
)
4635 raid_map
[(i
+rot
+1) % num_stripes
] =
4638 *length
= map
->stripe_len
;
		} else {
			/*
			 * Mirror #0 or #1 means the original data block.
			 * Mirror #2 is RAID5 parity block.
			 * Mirror #3 is RAID6 Q block.
			 */
			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
			if (mirror_num > 1)
				stripe_index = nr_data_stripes(map) +
						mirror_num - 2;

			/* We distribute the parity blocks across stripes */
			tmp = stripe_nr + stripe_index;
			stripe_index = do_div(tmp, map->num_stripes);
		}
	} else {
		/*
		 * after this do_div call, stripe_nr is the number of stripes
		 * on this device we have to walk to find the data, and
		 * stripe_index is the number of our device in the stripe array
		 */
		stripe_index = do_div(stripe_nr, map->num_stripes);
		mirror_num = stripe_index + 1;
	}
	BUG_ON(stripe_index >= map->num_stripes);

	num_alloc_stripes = num_stripes;
	if (dev_replace_is_ongoing) {
		if (rw & (REQ_WRITE | REQ_DISCARD))
			num_alloc_stripes <<= 1;
		if (rw & REQ_GET_READ_MIRRORS)
			num_alloc_stripes++;
	}
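	/*
	 * Room is reserved beyond num_stripes for the dev-replace stripes
	 * appended further below: every write or discard may be mirrored
	 * to the replace target (at most doubling the stripes), and
	 * REQ_GET_READ_MIRRORS may add the target as one extra mirror.
	 */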
	bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
	if (!bbio) {
		ret = -ENOMEM;
		goto out;
	}
	atomic_set(&bbio->error, 0);
	if (rw & REQ_DISCARD) {
		int factor = 0;
		int sub_stripes = 0;
		u64 stripes_per_dev = 0;
		u32 remaining_stripes = 0;
		u32 last_stripe = 0;

		if (map->type &
		    (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
			if (map->type & BTRFS_BLOCK_GROUP_RAID0)
				sub_stripes = 1;
			else
				sub_stripes = map->sub_stripes;

			factor = map->num_stripes / sub_stripes;
			stripes_per_dev = div_u64_rem(stripe_nr_end -
						      stripe_nr_orig,
						      factor,
						      &remaining_stripes);
			div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
			last_stripe *= sub_stripes;
		}

		for (i = 0; i < num_stripes; i++) {
			bbio->stripes[i].physical =
				map->stripes[stripe_index].physical +
				stripe_offset + stripe_nr * map->stripe_len;
			bbio->stripes[i].dev = map->stripes[stripe_index].dev;

			if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
					 BTRFS_BLOCK_GROUP_RAID10)) {
				bbio->stripes[i].length = stripes_per_dev *
							  map->stripe_len;

				if (i / sub_stripes < remaining_stripes)
					bbio->stripes[i].length +=
						map->stripe_len;

				/*
				 * Special for the first stripe and
				 * the last stripe:
				 *
				 * |-------|...|-------|
				 *     |----------|
				 *    off     end_off
				 */
				if (i < sub_stripes)
					bbio->stripes[i].length -=
						stripe_offset;

				if (stripe_index >= last_stripe &&
				    stripe_index <= (last_stripe +
						     sub_stripes - 1))
					bbio->stripes[i].length -=
						stripe_end_offset;

				if (i == sub_stripes - 1)
					stripe_offset = 0;
			} else
				bbio->stripes[i].length = *length;

			stripe_index++;
			if (stripe_index == map->num_stripes) {
				/* This could only happen for RAID0/10 */
				stripe_index = 0;
				stripe_nr++;
			}
		}
	} else {
		for (i = 0; i < num_stripes; i++) {
			bbio->stripes[i].physical =
				map->stripes[stripe_index].physical +
				stripe_offset +
				stripe_nr * map->stripe_len;
			bbio->stripes[i].dev =
				map->stripes[stripe_index].dev;
			stripe_index++;
		}
	}
	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
				 BTRFS_BLOCK_GROUP_RAID10 |
				 BTRFS_BLOCK_GROUP_RAID5 |
				 BTRFS_BLOCK_GROUP_DUP)) {
			max_errors = 1;
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
			max_errors = 2;
		}
	}
	if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
	    dev_replace->tgtdev != NULL) {
		int index_where_to_add;
		u64 srcdev_devid = dev_replace->srcdev->devid;

		/*
		 * duplicate the write operations while the dev replace
		 * procedure is running. Since the copying of the old disk
		 * to the new disk takes place at run time while the
		 * filesystem is mounted writable, the regular write
		 * operations to the old disk have to be duplicated to go
		 * to the new disk as well.
		 * Note that device->missing is handled by the caller, and
		 * that the write to the old disk is already set up in the
		 * stripes array.
		 */
		index_where_to_add = num_stripes;
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid == srcdev_devid) {
				/* write to new disk, too */
				struct btrfs_bio_stripe *new =
					bbio->stripes + index_where_to_add;
				struct btrfs_bio_stripe *old =
					bbio->stripes + i;

				new->physical = old->physical;
				new->length = old->length;
				new->dev = dev_replace->tgtdev;
				index_where_to_add++;
			}
		}
		num_stripes = index_where_to_add;
	} else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
		   dev_replace->tgtdev != NULL) {
		u64 srcdev_devid = dev_replace->srcdev->devid;
		int index_srcdev = 0;
		int found = 0;
		u64 physical_of_found = 0;

		/*
		 * During the dev-replace procedure, the target drive can
		 * also be used to read data in case it is needed to repair
		 * a corrupt block elsewhere. This is possible if the
		 * requested area is left of the left cursor. In this area,
		 * the target drive is a full copy of the source drive.
		 */
		for (i = 0; i < num_stripes; i++) {
			if (bbio->stripes[i].dev->devid == srcdev_devid) {
				/*
				 * In case of DUP, in order to keep it
				 * simple, only add the mirror with the
				 * lowest physical address
				 */
				if (found &&
				    physical_of_found <=
				     bbio->stripes[i].physical)
					continue;
				index_srcdev = i;
				found = 1;
				physical_of_found = bbio->stripes[i].physical;
			}
		}
		if (found) {
			u64 length = map->stripe_len;

			if (physical_of_found + length <=
			    dev_replace->cursor_left) {
				struct btrfs_bio_stripe *tgtdev_stripe =
					bbio->stripes + num_stripes;

				tgtdev_stripe->physical = physical_of_found;
				tgtdev_stripe->length =
					bbio->stripes[index_srcdev].length;
				tgtdev_stripe->dev = dev_replace->tgtdev;

				num_stripes++;
			}
		}
	}
	*bbio_ret = bbio;
	bbio->num_stripes = num_stripes;
	bbio->max_errors = max_errors;
	bbio->mirror_num = mirror_num;

	/*
	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
	 * mirror_num == num_stripes + 1 && dev_replace target drive is
	 * available as a mirror
	 */
	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
		WARN_ON(num_stripes > 1);
		bbio->stripes[0].dev = dev_replace->tgtdev;
		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
		bbio->mirror_num = map->num_stripes + 1;
	}
	if (raid_map) {
		sort_parity_stripes(bbio, raid_map);
		*raid_map_ret = raid_map;
	}
out:
	if (dev_replace_is_ongoing)
		btrfs_dev_replace_unlock(dev_replace);
	free_extent_map(em);
	return ret;
}
int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
		    u64 logical, u64 *length,
		    struct btrfs_bio **bbio_ret, int mirror_num)
{
	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
				 mirror_num, NULL);
}
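/*
 * Typical usage of btrfs_map_block() (sketch only; call site and values
 * assumed for illustration, not taken from this file):
 *
 *	struct btrfs_bio *bbio = NULL;
 *	u64 mapped_len = len;
 *
 *	ret = btrfs_map_block(fs_info, READ, logical, &mapped_len,
 *			      &bbio, 0);
 *	if (!ret) {
 *		... walk bbio->stripes[0 .. bbio->num_stripes - 1] ...
 *		kfree(bbio);
 *	}
 *
 * Passing NULL for raid_map_ret means callers of this wrapper never see
 * the RAID5/6 parity mapping.
 */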
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
		     u64 chunk_start, u64 physical, u64 devid,
		     u64 **logical, int *naddrs, int *stripe_len)
{
	struct extent_map_tree *em_tree = &map_tree->map_tree;
	struct extent_map *em;
	struct map_lookup *map;
	u64 *buf;
	u64 bytenr;
	u64 length;
	u64 stripe_nr;
	u64 rmap_len;
	int i, j, nr = 0;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_start, 1);
	read_unlock(&em_tree->lock);

	BUG_ON(!em || em->start != chunk_start);
	map = (struct map_lookup *)em->bdev;

	length = em->len;
	rmap_len = map->stripe_len;

	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		do_div(length, map->num_stripes / map->sub_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
		do_div(length, map->num_stripes);
	else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
			      BTRFS_BLOCK_GROUP_RAID6)) {
		do_div(length, nr_data_stripes(map));
		rmap_len = map->stripe_len * nr_data_stripes(map);
	}

	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
	BUG_ON(!buf); /* -ENOMEM */

	for (i = 0; i < map->num_stripes; i++) {
		if (devid && map->stripes[i].dev->devid != devid)
			continue;
		if (map->stripes[i].physical > physical ||
		    map->stripes[i].physical + length <= physical)
			continue;

		stripe_nr = physical - map->stripes[i].physical;
		do_div(stripe_nr, map->stripe_len);

		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
			stripe_nr = stripe_nr * map->num_stripes + i;
			do_div(stripe_nr, map->sub_stripes);
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
			stripe_nr = stripe_nr * map->num_stripes + i;
		} /* else if RAID[56], multiply by nr_data_stripes().
		   * Alternatively, just use rmap_len below instead of
		   * map->stripe_len */

		bytenr = chunk_start + stripe_nr * rmap_len;
		WARN_ON(nr >= map->num_stripes);
		for (j = 0; j < nr; j++) {
			if (buf[j] == bytenr)
				break;
		}
		if (j == nr) {
			WARN_ON(nr >= map->num_stripes);
			buf[nr++] = bytenr;
		}
	}

	*logical = buf;
	*naddrs = nr;
	*stripe_len = rmap_len;

	free_extent_map(em);
	return 0;
}
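/*
 * Example of the reverse mapping above (values assumed: 2-device RAID0,
 * stripe_len = 64K): a physical address 64K into device 1's part of the
 * chunk is stripe_nr 1 on that device, which maps back to logical stripe
 * 1 * 2 + 1 = 3, i.e. bytenr = chunk_start + 192K.
 */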
static void *merge_stripe_index_into_bio_private(void *bi_private,
						 unsigned int stripe_index)
{
	/*
	 * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
	 * limited to [0, 3] and fits into the two low bits of the pointer.
	 * The alternative solution (instead of stealing bits from the
	 * pointer) would be to allocate an intermediate structure
	 * that contains the old private pointer plus the stripe_index.
	 */
	BUG_ON((((uintptr_t)bi_private) & 3) != 0);
	BUG_ON(stripe_index > 3);
	return (void *)(((uintptr_t)bi_private) | stripe_index);
}

static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
{
	return (struct btrfs_bio *)(((uintptr_t)bi_private) &
				    ~((uintptr_t)3));
}

static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
{
	return (unsigned int)((uintptr_t)bi_private) & 3;
}
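/*
 * Example of the pointer tagging above: a 4-byte-aligned bi_private such
 * as 0x...1000 merged with stripe_index 2 yields 0x...1002; masking with
 * ~3 and 3 recovers 0x...1000 and 2 again. This only works because the
 * btrfs_bio is at least 4-byte aligned, which the first BUG_ON asserts.
 */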
static void btrfs_end_bio(struct bio *bio, int err)
{
	struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
	int is_orig_bio = 0;

	if (err) {
		atomic_inc(&bbio->error);
		if (err == -EIO || err == -EREMOTEIO) {
			unsigned int stripe_index =
				extract_stripe_index_from_bio_private(
					bio->bi_private);
			struct btrfs_device *dev;

			BUG_ON(stripe_index >= bbio->num_stripes);
			dev = bbio->stripes[stripe_index].dev;
			if (dev->bdev) {
				if (bio->bi_rw & WRITE)
					btrfs_dev_stat_inc(dev,
						BTRFS_DEV_STAT_WRITE_ERRS);
				else
					btrfs_dev_stat_inc(dev,
						BTRFS_DEV_STAT_READ_ERRS);
				if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
					btrfs_dev_stat_inc(dev,
						BTRFS_DEV_STAT_FLUSH_ERRS);
				btrfs_dev_stat_print_on_error(dev);
			}
		}
	}

	if (bio == bbio->orig_bio)
		is_orig_bio = 1;

	if (atomic_dec_and_test(&bbio->stripes_pending)) {
		if (!is_orig_bio) {
			bio_put(bio);
			bio = bbio->orig_bio;
		}
		bio->bi_private = bbio->private;
		bio->bi_end_io = bbio->end_io;
		bio->bi_bdev = (struct block_device *)
					(unsigned long)bbio->mirror_num;
		/* only send an error to the higher layers if it is
		 * beyond the tolerance of the btrfs bio
		 */
		if (atomic_read(&bbio->error) > bbio->max_errors) {
			err = -EIO;
		} else {
			/*
			 * this bio is actually up to date, we didn't
			 * go over the max number of errors
			 */
			set_bit(BIO_UPTODATE, &bio->bi_flags);
			err = 0;
		}
		kfree(bbio);

		bio_endio(bio, err);
	} else if (!is_orig_bio) {
		bio_put(bio);
	}
}
struct async_sched {
	struct bio *bio;
	int rw;
	struct btrfs_fs_info *info;
	struct btrfs_work work;
};
/*
 * see run_scheduled_bios for a description of why bios are collected for
 * async submit.
 *
 * This will add one bio to the pending list for a device and make sure
 * the work struct is scheduled.
 */
noinline void btrfs_schedule_bio(struct btrfs_root *root,
				 struct btrfs_device *device,
				 int rw, struct bio *bio)
{
	int should_queue = 1;
	struct btrfs_pending_bios *pending_bios;

	if (device->missing || !device->bdev) {
		bio_endio(bio, -EIO);
		return;
	}

	/* don't bother with additional async steps for reads, right now */
	if (!(rw & REQ_WRITE)) {
		bio_get(bio);
		btrfsic_submit_bio(rw, bio);
		bio_put(bio);
		return;
	}

	/*
	 * nr_async_bios allows us to reliably return congestion to the
	 * higher layers.  Otherwise, the async bio makes it appear we have
	 * made progress against dirty pages when we've really just put it
	 * on a queue for later
	 */
	atomic_inc(&root->fs_info->nr_async_bios);
	WARN_ON(bio->bi_next);
	bio->bi_next = NULL;
	bio->bi_rw |= rw;

	spin_lock(&device->io_lock);
	if (bio->bi_rw & REQ_SYNC)
		pending_bios = &device->pending_sync_bios;
	else
		pending_bios = &device->pending_bios;

	if (pending_bios->tail)
		pending_bios->tail->bi_next = bio;

	pending_bios->tail = bio;
	if (!pending_bios->head)
		pending_bios->head = bio;
	if (device->running_pending)
		should_queue = 0;

	spin_unlock(&device->io_lock);

	if (should_queue)
		btrfs_queue_worker(&root->fs_info->submit_workers,
				   &device->work);
}
static int bio_size_ok(struct block_device *bdev, struct bio *bio,
		       sector_t sector)
{
	struct bio_vec *prev;
	struct request_queue *q = bdev_get_queue(bdev);
	unsigned short max_sectors = queue_max_sectors(q);
	struct bvec_merge_data bvm = {
		.bi_bdev = bdev,
		.bi_sector = sector,
		.bi_rw = bio->bi_rw,
	};

	if (bio->bi_vcnt == 0) {
		WARN_ON(1);
		return 1;
	}

	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
	if ((bio->bi_size >> 9) > max_sectors)
		return 0;

	if (!q->merge_bvec_fn)
		return 1;

	bvm.bi_size = bio->bi_size - prev->bv_len;
	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
		return 0;
	return 1;
}
static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
			      struct bio *bio, u64 physical, int dev_nr,
			      int rw, int async)
{
	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;

	bio->bi_private = bbio;
	bio->bi_private = merge_stripe_index_into_bio_private(
			bio->bi_private, (unsigned int)dev_nr);
	bio->bi_end_io = btrfs_end_bio;
	bio->bi_sector = physical >> 9;
#ifdef DEBUG
	{
		struct rcu_string *name;

		rcu_read_lock();
		name = rcu_dereference(dev->name);
		pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
			 "(%s id %llu), size=%u\n", rw,
			 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
			 name->str, dev->devid, bio->bi_size);
		rcu_read_unlock();
	}
#endif
	bio->bi_bdev = dev->bdev;
	if (async)
		btrfs_schedule_bio(root, dev, rw, bio);
	else
		btrfsic_submit_bio(rw, bio);
}
static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
			      struct bio *first_bio, struct btrfs_device *dev,
			      int dev_nr, int rw, int async)
{
	struct bio_vec *bvec = first_bio->bi_io_vec;
	struct bio *bio;
	int nr_vecs = bio_get_nr_vecs(dev->bdev);
	u64 physical = bbio->stripes[dev_nr].physical;

again:
	bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
	if (!bio)
		return -ENOMEM;

	while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
		if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
				 bvec->bv_offset) < bvec->bv_len) {
			u64 len = bio->bi_size;

			atomic_inc(&bbio->stripes_pending);
			submit_stripe_bio(root, bbio, bio, physical, dev_nr,
					  rw, async);
			physical += len;
			goto again;
		}
		bvec++;
	}

	submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
	return 0;
}
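/*
 * Account one failed stripe on the btrfs_bio; if it was the last
 * outstanding stripe, restore the caller's completion context and
 * finish the original bio with -EIO.
 */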
static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
	atomic_inc(&bbio->error);
	if (atomic_dec_and_test(&bbio->stripes_pending)) {
		bio->bi_private = bbio->private;
		bio->bi_end_io = bbio->end_io;
		bio->bi_bdev = (struct block_device *)
			(unsigned long)bbio->mirror_num;
		bio->bi_sector = logical >> 9;
		kfree(bbio);
		bio_endio(bio, -EIO);
	}
}
int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
		  int mirror_num, int async_submit)
{
	struct btrfs_device *dev;
	struct bio *first_bio = bio;
	u64 logical = (u64)bio->bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	u64 *raid_map = NULL;
	int ret;
	int dev_nr = 0;
	int total_devs = 1;
	struct btrfs_bio *bbio = NULL;

	length = bio->bi_size;
	map_length = length;

	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
				mirror_num, &raid_map);
	if (ret) /* -ENOMEM */
		return ret;

	total_devs = bbio->num_stripes;
	bbio->orig_bio = first_bio;
	bbio->private = first_bio->bi_private;
	bbio->end_io = first_bio->bi_end_io;
	atomic_set(&bbio->stripes_pending, bbio->num_stripes);

	if (raid_map) {
		/* In this case, map_length has been set to the length of
		   a single stripe; not the whole write */
		if (rw & WRITE) {
			return raid56_parity_write(root, bio, bbio,
						   raid_map, map_length);
		} else {
			return raid56_parity_recover(root, bio, bbio,
						     raid_map, map_length,
						     mirror_num);
		}
	}

	if (map_length < length) {
		printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
		       "len %llu\n", (unsigned long long)logical,
		       (unsigned long long)length,
		       (unsigned long long)map_length);
		BUG();
	}

	while (dev_nr < total_devs) {
		dev = bbio->stripes[dev_nr].dev;
		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
			bbio_error(bbio, first_bio, logical);
			dev_nr++;
			continue;
		}

		/*
		 * Check and see if we're ok with this bio based on its size
		 * and offset with the given device.
		 */
		if (!bio_size_ok(dev->bdev, first_bio,
				 bbio->stripes[dev_nr].physical >> 9)) {
			ret = breakup_stripe_bio(root, bbio, first_bio, dev,
						 dev_nr, rw, async_submit);
			BUG_ON(ret);
			dev_nr++;
			continue;
		}

		if (dev_nr < total_devs - 1) {
			bio = bio_clone(first_bio, GFP_NOFS);
			BUG_ON(!bio); /* -ENOMEM */
		} else {
			bio = first_bio;
		}

		submit_stripe_bio(root, bbio, bio,
				  bbio->stripes[dev_nr].physical, dev_nr, rw,
				  async_submit);
		dev_nr++;
	}
	return 0;
}
struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
				       u8 *uuid, u8 *fsid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;

	cur_devices = fs_info->fs_devices;
	while (cur_devices) {
		if (!fsid ||
		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
			device = __find_device(&cur_devices->devices,
					       devid, uuid);
			if (device)
				return device;
		}
		cur_devices = cur_devices->seed;
	}
	return NULL;
}
static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;

	device = kzalloc(sizeof(*device), GFP_NOFS);
	if (!device)
		return NULL;
	list_add(&device->dev_list,
		 &fs_devices->devices);
	device->dev_root = root->fs_info->dev_root;
	device->devid = devid;
	device->work.func = pending_bios_fn;
	device->fs_devices = fs_devices;
	device->missing = 1;
	fs_devices->num_devices++;
	fs_devices->missing_devices++;
	spin_lock_init(&device->io_lock);
	INIT_LIST_HEAD(&device->dev_alloc_list);
	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);

	return device;
}
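/*
 * Turn one on-disk chunk item into an extent_map in the logical to
 * physical mapping tree, looking up the device behind every stripe (or
 * creating a placeholder via add_missing_dev() on degraded mounts).
 */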
static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
			  struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	u64 logical;
	u64 length;
	u64 devid;
	u8 uuid[BTRFS_UUID_SIZE];
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
	read_unlock(&map_tree->map_tree.lock);

	/* already mapped? */
	if (em && em->start <= logical && em->start + em->len > logical) {
		free_extent_map(em);
		return 0;
	} else if (em) {
		free_extent_map(em);
	}

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (!map) {
		free_extent_map(em);
		return -ENOMEM;
	}

	em->bdev = (struct block_device *)map;
	em->start = logical;
	em->len = length;
	em->orig_start = 0;
	em->block_start = 0;
	em->block_len = em->len;

	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
	map->type = btrfs_chunk_type(leaf, chunk);
	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
		map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
							uuid, NULL);
		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
			kfree(map);
			free_extent_map(em);
			return -EIO;
		}
		if (!map->stripes[i].dev) {
			map->stripes[i].dev =
				add_missing_dev(root, devid, uuid);
			if (!map->stripes[i].dev) {
				kfree(map);
				free_extent_map(em);
				return -EIO;
			}
		}
		map->stripes[i].dev->in_fs_metadata = 1;
	}

	write_lock(&map_tree->map_tree.lock);
	ret = add_extent_mapping(&map_tree->map_tree, em);
	write_unlock(&map_tree->map_tree.lock);
	BUG_ON(ret); /* Tree corruption */
	free_extent_map(em);

	return 0;
}
static void fill_device_from_item(struct extent_buffer *leaf,
				  struct btrfs_dev_item *dev_item,
				  struct btrfs_device *device)
{
	unsigned long ptr;

	device->devid = btrfs_device_id(leaf, dev_item);
	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
	device->total_bytes = device->disk_total_bytes;
	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
	device->type = btrfs_device_type(leaf, dev_item);
	device->io_align = btrfs_device_io_align(leaf, dev_item);
	device->io_width = btrfs_device_io_width(leaf, dev_item);
	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
	device->is_tgtdev_for_dev_replace = 0;

	ptr = (unsigned long)btrfs_device_uuid(dev_item);
	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}
static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	BUG_ON(!mutex_is_locked(&uuid_mutex));

	fs_devices = root->fs_info->fs_devices->seed;
	while (fs_devices) {
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
			ret = 0;
			goto out;
		}
		fs_devices = fs_devices->seed;
	}

	fs_devices = find_fsid(fsid);
	if (!fs_devices) {
		ret = -ENOENT;
		goto out;
	}

	fs_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(fs_devices)) {
		ret = PTR_ERR(fs_devices);
		goto out;
	}

	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
				   root->fs_info->bdev_holder);
	if (ret) {
		free_fs_devices(fs_devices);
		goto out;
	}

	if (!fs_devices->seeding) {
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
		ret = -EINVAL;
		goto out;
	}

	fs_devices->seed = root->fs_info->fs_devices->seed;
	root->fs_info->fs_devices->seed = fs_devices;
out:
	return ret;
}
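/*
 * Read one device item from the chunk tree and bind it to the in-memory
 * device; items that carry a foreign fsid belong to a seed filesystem
 * and are resolved through open_seed_devices() above.
 */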
static int read_one_dev(struct btrfs_root *root,
			struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
{
	struct btrfs_device *device;
	u64 devid;
	int ret;
	u8 fs_uuid[BTRFS_UUID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];

	devid = btrfs_device_id(leaf, dev_item);
	read_extent_buffer(leaf, dev_uuid,
			   (unsigned long)btrfs_device_uuid(dev_item),
			   BTRFS_UUID_SIZE);
	read_extent_buffer(leaf, fs_uuid,
			   (unsigned long)btrfs_device_fsid(dev_item),
			   BTRFS_UUID_SIZE);

	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
		ret = open_seed_devices(root, fs_uuid);
		if (ret && !btrfs_test_opt(root, DEGRADED))
			return ret;
	}

	device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
	if (!device || !device->bdev) {
		if (!btrfs_test_opt(root, DEGRADED))
			return -EIO;

		if (!device) {
			printk(KERN_WARNING "warning devid %llu missing\n",
			       (unsigned long long)devid);
			device = add_missing_dev(root, devid, dev_uuid);
			if (!device)
				return -ENOMEM;
		} else if (!device->missing) {
			/*
			 * this happens when a device that was properly setup
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, and so we have to set
			 * device->missing to one here
			 */
			root->fs_info->fs_devices->missing_devices++;
			device->missing = 1;
		}
	}

	if (device->fs_devices != root->fs_info->fs_devices) {
		BUG_ON(device->writeable);
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
	}

	fill_device_from_item(leaf, dev_item, device);
	device->dev_root = root->fs_info->dev_root;
	device->in_fs_metadata = 1;
	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
		device->fs_devices->total_rw_bytes += device->total_bytes;
		spin_lock(&root->fs_info->free_chunk_lock);
		root->fs_info->free_chunk_space += device->total_bytes -
			device->bytes_used;
		spin_unlock(&root->fs_info->free_chunk_lock);
	}
	ret = 0;
	return ret;
}
int btrfs_read_sys_array(struct btrfs_root *root)
{
	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
	struct extent_buffer *sb;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	unsigned long sb_ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
					  BTRFS_SUPER_INFO_SIZE);
	if (!sb)
		return -ENOMEM;
	btrfs_set_buffer_uptodate(sb);
	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
	/*
	 * The sb extent buffer is artificial and just used to read the
	 * system array. The btrfs_set_buffer_uptodate() call does not
	 * properly mark all of its pages up-to-date when the page is
	 * larger: the extent does not cover the whole page and
	 * consequently check_page_uptodate does not find all the page's
	 * extents up-to-date (the hole beyond sb), and
	 * write_extent_buffer then triggers a WARN_ON.
	 *
	 * Regular short extents go through the
	 * mark_extent_buffer_dirty/writeback cycle, but sb spans only
	 * this function. Add an explicit SetPageUptodate call to silence
	 * the warning, e.g. on PowerPC 64.
	 */
	if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
		SetPageUptodate(sb->pages[0]);

	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key); ptr += len;
		sb_ptr += len;
		cur += len;

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)sb_ptr;
			ret = read_one_chunk(root, &key, sb, chunk);
			if (ret)
				break;
			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
			len = btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		ptr += len;
		sb_ptr += len;
		cur += len;
	}
	free_extent_buffer(sb);
	return ret;
}
int btrfs_read_chunk_tree(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&uuid_mutex);
	lock_chunks(root);

	/* first we search for all of the device items, and then we
	 * read in all of the chunk items.  This way we can create chunk
	 * mappings that reference all of the devices that are found.
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
again:
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
			if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
				break;
			if (found_key.type == BTRFS_DEV_ITEM_KEY) {
				struct btrfs_dev_item *dev_item;
				dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
				ret = read_one_dev(root, leaf, dev_item);
				if (ret)
					goto error;
			}
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(root, &found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}
	if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
		key.objectid = 0;
		btrfs_release_path(path);
		goto again;
	}
	ret = 0;
error:
	unlock_chunks(root);
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}
static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_dev_stat_reset(dev, i);
}
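/*
 * Load the persistent per-device I/O error counters from the device
 * tree at mount time; a device without a dev_stats item simply starts
 * with all counters at zero.
 */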
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct extent_buffer *eb;
	int slot;
	int ret = 0;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int i;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		int item_size;
		struct btrfs_dev_stats_item *ptr;

		key.objectid = 0;
		key.type = BTRFS_DEV_STATS_KEY;
		key.offset = device->devid;
		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
		if (ret) {
			__btrfs_reset_dev_stats(device);
			device->dev_stats_valid = 1;
			btrfs_release_path(path);
			continue;
		}
		slot = path->slots[0];
		eb = path->nodes[0];
		btrfs_item_key_to_cpu(eb, &found_key, slot);
		item_size = btrfs_item_size_nr(eb, slot);

		ptr = btrfs_item_ptr(eb, slot,
				     struct btrfs_dev_stats_item);

		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (item_size >= (1 + i) * sizeof(__le64))
				btrfs_dev_stat_set(device, i,
					btrfs_dev_stats_value(eb, ptr, i));
			else
				btrfs_dev_stat_reset(device, i);
		}

		device->dev_stats_valid = 1;
		btrfs_dev_stat_print_on_load(device);
		btrfs_release_path(path);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_root *dev_root,
				struct btrfs_device *device)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = 0;
	key.type = BTRFS_DEV_STATS_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	BUG_ON(!path);
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
			      ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}
/*
 * called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->dev_stats_valid || !device->dev_stats_dirty)
			continue;

		ret = update_dev_stat_item(trans, dev_root, device);
		if (!ret)
			device->dev_stats_dirty = 0;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}
void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	printk_ratelimited_in_rcu(KERN_ERR
		"btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev,
					       BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev,
					       BTRFS_DEV_STAT_GENERATION_ERRS));
}
static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
	       rcu_str_deref(dev->name),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
int btrfs_get_dev_stats(struct btrfs_root *root,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		printk(KERN_WARNING
		       "btrfs: get dev_stats failed, device not found\n");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		printk(KERN_WARNING
		       "btrfs: get dev_stats failed, not yet valid\n");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_reset(dev, i);
		}
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}
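/*
 * Wipe the magic of the primary superblock so the device is no longer
 * recognized as part of a btrfs filesystem, e.g. after the device has
 * been removed or replaced.
 */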
int btrfs_scratch_superblock(struct btrfs_device *device)
{
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;

	bh = btrfs_read_dev_super(device->bdev);
	if (!bh)
		return -EINVAL;
	disk_super = (struct btrfs_super_block *)bh->b_data;

	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
	set_buffer_dirty(bh);
	sync_dirty_buffer(bh);
	brelse(bh);

	return 0;
}