fs/btrfs/volumes.c

   1 /*
   2  * Copyright (C) 2007 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 02111-1307, USA.
  17  */
  18 #include <linux/sched.h>
  19 #include <linux/bio.h>
  20 #include <linux/slab.h>
  21 #include <linux/buffer_head.h>
  22 #include <linux/blkdev.h>
  23 #include <linux/random.h>
  24 #include <linux/iocontext.h>
  25 #include <linux/capability.h>
  26 #include <linux/ratelimit.h>
  27 #include <linux/kthread.h>
  28 #include "compat.h"
  29 #include "ctree.h"
  30 #include "extent_map.h"
  31 #include "disk-io.h"
  32 #include "transaction.h"
  33 #include "print-tree.h"
  34 #include "volumes.h"
  35 #include "async-thread.h"
  36 #include "check-integrity.h"
  37 #include "rcu-string.h"
  38 #include "math.h"
  39 #include "dev-replace.h"
  40
  41 static int init_first_rw_device(struct btrfs_trans_handle *trans,
  42                                 struct btrfs_root *root,
  43                                 struct btrfs_device *device);
  44 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
  45 static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
  46 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
  47
  48 static DEFINE_MUTEX(uuid_mutex);
  49 static LIST_HEAD(fs_uuids);
  50
  51 static void lock_chunks(struct btrfs_root *root)
  52 {
  53         mutex_lock(&root->fs_info->chunk_mutex);
  54 }
  55
  56 static void unlock_chunks(struct btrfs_root *root)
  57 {
  58         mutex_unlock(&root->fs_info->chunk_mutex);
  59 }
  60
  61 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
  62 {
  63         struct btrfs_device *device;
  64         WARN_ON(fs_devices->opened);
  65         while (!list_empty(&fs_devices->devices)) {
  66                 device = list_entry(fs_devices->devices.next,
  67                                     struct btrfs_device, dev_list);
  68                 list_del(&device->dev_list);
  69                 rcu_string_free(device->name);
  70                 kfree(device);
  71         }
  72         kfree(fs_devices);
  73 }
  74
  75 static void btrfs_kobject_uevent(struct block_device *bdev,
  76                                  enum kobject_action action)
  77 {
  78         int ret;
  79
  80         ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
  81         if (ret)
  82                 pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
  83                         action,
  84                         kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
  85                         &disk_to_dev(bdev->bd_disk)->kobj);
  86 }
  87
  88 void btrfs_cleanup_fs_uuids(void)
  89 {
  90         struct btrfs_fs_devices *fs_devices;
  91
  92         while (!list_empty(&fs_uuids)) {
  93                 fs_devices = list_entry(fs_uuids.next,
  94                                         struct btrfs_fs_devices, list);
  95                 list_del(&fs_devices->list);
  96                 free_fs_devices(fs_devices);
  97         }
  98 }
  99
 100 static noinline struct btrfs_device *__find_device(struct list_head *head,
 101                                                    u64 devid, u8 *uuid)
 102 {
 103         struct btrfs_device *dev;
 104
 105         list_for_each_entry(dev, head, dev_list) {
 106                 if (dev->devid == devid &&
 107                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
 108                         return dev;
 109                 }
 110         }
 111         return NULL;
 112 }
 113
 114 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 115 {
 116         struct btrfs_fs_devices *fs_devices;
 117
 118         list_for_each_entry(fs_devices, &fs_uuids, list) {
 119                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 120                         return fs_devices;
 121         }
 122         return NULL;
 123 }
 124
 125 static int
 126 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 127                       int flush, struct block_device **bdev,
 128                       struct buffer_head **bh)
 129 {
 130         int ret;
 131
 132         *bdev = blkdev_get_by_path(device_path, flags, holder);
 133
 134         if (IS_ERR(*bdev)) {
 135                 ret = PTR_ERR(*bdev);
 136                 printk(KERN_INFO "btrfs: open %s failed\n", device_path);
 137                 goto error;
 138         }
 139
 140         if (flush)
 141                 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
 142         ret = set_blocksize(*bdev, 4096);
 143         if (ret) {
 144                 blkdev_put(*bdev, flags);
 145                 goto error;
 146         }
 147         invalidate_bdev(*bdev);
 148         *bh = btrfs_read_dev_super(*bdev);
 149         if (!*bh) {
 150                 ret = -EINVAL;
 151                 blkdev_put(*bdev, flags);
 152                 goto error;
 153         }
 154
 155         return 0;
 156
 157 error:
 158         *bdev = NULL;
 159         *bh = NULL;
 160         return ret;
 161 }
 162
 163 static void requeue_list(struct btrfs_pending_bios *pending_bios,
 164                         struct bio *head, struct bio *tail)
 165 {
 166
 167         struct bio *old_head;
 168
 169         old_head = pending_bios->head;
 170         pending_bios->head = head;
 171         if (pending_bios->tail)
 172                 tail->bi_next = old_head;
 173         else
 174                 pending_bios->tail = tail;
 175 }
 176
 177 /*
 178  * we try to collect pending bios for a device so we don't get a large
 179  * number of procs sending bios down to the same device.  This greatly
 180  * improves the scheduler's ability to collect and merge the bios.
 181  *
 182  * But, it also turns into a long list of bios to process and that is sure
 183  * to eventually make the worker thread block.  The solution here is to
 184  * make some progress and then put this work struct back at the end of
 185  * the list if the block device is congested.  This way, multiple devices
 186  * can make progress from a single worker thread.
      *
      * @device: device whose pending_sync_bios and pending_bios lists are
      *          drained and submitted through btrfsic_submit_bio().
      *
      * The list heads are detached and re-attached under device->io_lock;
      * the bios themselves are submitted with the lock dropped.
 187  */
 188 static noinline void run_scheduled_bios(struct btrfs_device *device)
 189 {
 190         struct bio *pending;
 191         struct backing_dev_info *bdi;
 192         struct btrfs_fs_info *fs_info;
 193         struct btrfs_pending_bios *pending_bios;
 194         struct bio *tail;
 195         struct bio *cur;
 196         int again = 0;
             /* bios submitted since the current list was picked */
 197         unsigned long num_run;
             /* bios submitted since this function was entered */
 198         unsigned long batch_run = 0;
 199         unsigned long limit;
 200         unsigned long last_waited = 0;
             /* set after a pass over the sync list so the next pass takes the regular list */
 201         int force_reg = 0;
 202         int sync_pending = 0;
 203         struct blk_plug plug;
 204
 205         /*
 206          * this function runs all the bios we've collected for
 207          * a particular device.  We don't want to wander off to
 208          * another device without first sending all of these down.
 209          * So, set up a plug here and finish it off before we return
 210          */
 211         blk_start_plug(&plug);
 212
 213         bdi = blk_get_backing_dev_info(device->bdev);
 214         fs_info = device->dev_root->fs_info;
             /* waiters on async_submit_wait are woken below 2/3 of the submit limit */
 215         limit = btrfs_async_submit_limit(fs_info);
 216         limit = limit * 2 / 3;
 217
 218 loop:
 219         spin_lock(&device->io_lock);
 220
             /* every jump to loop_lock arrives with device->io_lock held */
 221 loop_lock:
 222         num_run = 0;
 223
 224         /* take all the bios off the list at once and process them
 225          * later on (without the lock held).  But, remember the
 226          * tail and other pointers so the bios can be properly reinserted
 227          * into the list if we hit congestion
 228          */
 229         if (!force_reg && device->pending_sync_bios.head) {
 230                 pending_bios = &device->pending_sync_bios;
 231                 force_reg = 1;
 232         } else {
 233                 pending_bios = &device->pending_bios;
 234                 force_reg = 0;
 235         }
 236
 237         pending = pending_bios->head;
 238         tail = pending_bios->tail;
 239         WARN_ON(pending && !tail);
 240
 241         /*
 242          * if pending was null this time around, no bios need processing
 243          * at all and we can stop.  Otherwise it'll loop back up again
 244          * and do an additional check so no bios are missed.
 245          *
 246          * device->running_pending is used to synchronize with the
 247          * schedule_bio code.
 248          */
 249         if (device->pending_sync_bios.head == NULL &&
 250             device->pending_bios.head == NULL) {
 251                 again = 0;
 252                 device->running_pending = 0;
 253         } else {
 254                 again = 1;
 255                 device->running_pending = 1;
 256         }
 257
             /* detach the chosen list; its bios now live only in pending/tail */
 258         pending_bios->head = NULL;
 259         pending_bios->tail = NULL;
 260
 261         spin_unlock(&device->io_lock);
 262
 263         while (pending) {
 264
 265                 rmb();
 266                 /* we want to work on both lists, but do more bios on the
 267                  * sync list than the regular list
                      *
                      * When switching, the unsubmitted remainder is put
                      * back on its list and we restart at loop_lock with
                      * io_lock held.
 268                  */
 269                 if ((num_run > 32 &&
 270                     pending_bios != &device->pending_sync_bios &&
 271                     device->pending_sync_bios.head) ||
 272                    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
 273                     device->pending_bios.head)) {
 274                         spin_lock(&device->io_lock);
 275                         requeue_list(pending_bios, pending, tail);
 276                         goto loop_lock;
 277                 }
 278
                     /* unlink the first bio from the private chain */
 279                 cur = pending;
 280                 pending = pending->bi_next;
 281                 cur->bi_next = NULL;
 282
 283                 if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
 284                     waitqueue_active(&fs_info->async_submit_wait))
 285                         wake_up(&fs_info->async_submit_wait);
 286
 287                 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 288
 289                 /*
 290                  * if we're doing the sync list, record that our
 291                  * plug has some sync requests on it
 292                  *
 293                  * If we're doing the regular list and there are
 294                  * sync requests sitting around, unplug before
 295                  * we add more
 296                  */
 297                 if (pending_bios == &device->pending_sync_bios) {
 298                         sync_pending = 1;
 299                 } else if (sync_pending) {
 300                         blk_finish_plug(&plug);
 301                         blk_start_plug(&plug);
 302                         sync_pending = 0;
 303                 }
 304
 305                 btrfsic_submit_bio(cur->bi_rw, cur);
 306                 num_run++;
 307                 batch_run++;
 308                 if (need_resched())
 309                         cond_resched();
 310
 311                 /*
 312                  * we made progress, there is more work to do and the bdi
 313                  * is now congested.  Back off and let other work structs
 314                  * run instead
 315                  */
 316                 if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
 317                     fs_info->fs_devices->open_devices > 1) {
 318                         struct io_context *ioc;
 319
 320                         ioc = current->io_context;
 321
 322                         /*
 323                          * the main goal here is that we don't want to
 324                          * block if we're going to be able to submit
 325                          * more requests without blocking.
 326                          *
 327                          * This code does two great things, it pokes into
 328                          * the elevator code from a filesystem _and_
 329                          * it makes assumptions about how batching works.
 330                          */
 331                         if (ioc && ioc->nr_batch_requests > 0 &&
 332                             time_before(jiffies, ioc->last_waited + HZ/50UL) &&
 333                             (last_waited == 0 ||
 334                              ioc->last_waited == last_waited)) {
 335                                 /*
 336                                  * we want to go through our batch of
 337                                  * requests and stop.  So, we copy out
 338                                  * the ioc->last_waited time and test
 339                                  * against it before looping
 340                                  */
 341                                 last_waited = ioc->last_waited;
 342                                 if (need_resched())
 343                                         cond_resched();
 344                                 continue;
 345                         }
                             /*
                              * give up for now: put the remainder back,
                              * mark the device as still having work and
                              * requeue this work item
                              */
 346                         spin_lock(&device->io_lock);
 347                         requeue_list(pending_bios, pending, tail);
 348                         device->running_pending = 1;
 349
 350                         spin_unlock(&device->io_lock);
 351                         btrfs_requeue_work(&device->work);
 352                         goto done;
 353                 }
 354                 /* unplug every 64 requests just for good measure */
 355                 if (batch_run % 64 == 0) {
 356                         blk_finish_plug(&plug);
 357                         blk_start_plug(&plug);
 358                         sync_pending = 0;
 359                 }
 360         }
 361
 362         cond_resched();
             /* a list was non-empty when we last looked under the lock */
 363         if (again)
 364                 goto loop;
 365
             /* final check under the lock for bios queued in the meantime */
 366         spin_lock(&device->io_lock);
 367         if (device->pending_bios.head || device->pending_sync_bios.head)
 368                 goto loop_lock;
 369         spin_unlock(&device->io_lock);
 370
 371 done:
 372         blk_finish_plug(&plug);
 373 }
 374
 375 static void pending_bios_fn(struct btrfs_work *work)
 376 {
 377         struct btrfs_device *device;
 378
 379         device = container_of(work, struct btrfs_device, work);
 380         run_scheduled_bios(device);
 381 }
 382
 383 static noinline int device_list_add(const char *path,
 384                            struct btrfs_super_block *disk_super,
 385                            u64 devid, struct btrfs_fs_devices **fs_devices_ret)
 386 {
 387         struct btrfs_device *device;
 388         struct btrfs_fs_devices *fs_devices;
 389         struct rcu_string *name;
 390         u64 found_transid = btrfs_super_generation(disk_super);
 391
 392         fs_devices = find_fsid(disk_super->fsid);
 393         if (!fs_devices) {
 394                 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
 395                 if (!fs_devices)
 396                         return -ENOMEM;
 397                 INIT_LIST_HEAD(&fs_devices->devices);
 398                 INIT_LIST_HEAD(&fs_devices->alloc_list);
 399                 list_add(&fs_devices->list, &fs_uuids);
 400                 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 401                 fs_devices->latest_devid = devid;
 402                 fs_devices->latest_trans = found_transid;
 403                 mutex_init(&fs_devices->device_list_mutex);
 404                 device = NULL;
 405         } else {
 406                 device = __find_device(&fs_devices->devices, devid,
 407                                        disk_super->dev_item.uuid);
 408         }
 409         if (!device) {
 410                 if (fs_devices->opened)
 411                         return -EBUSY;
 412
 413                 device = kzalloc(sizeof(*device), GFP_NOFS);
 414                 if (!device) {
 415                         /* we can safely leave the fs_devices entry around */
 416                         return -ENOMEM;
 417                 }
 418                 device->devid = devid;
 419                 device->dev_stats_valid = 0;
 420                 device->work.func = pending_bios_fn;
 421                 memcpy(device->uuid, disk_super->dev_item.uuid,
 422                        BTRFS_UUID_SIZE);
 423                 spin_lock_init(&device->io_lock);
 424
 425                 name = rcu_string_strdup(path, GFP_NOFS);
 426                 if (!name) {
 427                         kfree(device);
 428                         return -ENOMEM;
 429                 }
 430                 rcu_assign_pointer(device->name, name);
 431                 INIT_LIST_HEAD(&device->dev_alloc_list);
 432
 433                 /* init readahead state */
 434                 spin_lock_init(&device->reada_lock);
 435                 device->reada_curr_zone = NULL;
 436                 atomic_set(&device->reada_in_flight, 0);
 437                 device->reada_next = 0;
 438                 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
 439                 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
 440
 441                 mutex_lock(&fs_devices->device_list_mutex);
 442                 list_add_rcu(&device->dev_list, &fs_devices->devices);
 443                 mutex_unlock(&fs_devices->device_list_mutex);
 444
 445                 device->fs_devices = fs_devices;
 446                 fs_devices->num_devices++;
 447         } else if (!device->name || strcmp(device->name->str, path)) {
 448                 name = rcu_string_strdup(path, GFP_NOFS);
 449                 if (!name)
 450                         return -ENOMEM;
 451                 rcu_string_free(device->name);
 452                 rcu_assign_pointer(device->name, name);
 453                 if (device->missing) {
 454                         fs_devices->missing_devices--;
 455                         device->missing = 0;
 456                 }
 457         }
 458
 459         if (found_transid > fs_devices->latest_trans) {
 460                 fs_devices->latest_devid = devid;
 461                 fs_devices->latest_trans = found_transid;
 462         }
 463         *fs_devices_ret = fs_devices;
 464         return 0;
 465 }
 466
 467 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 468 {
 469         struct btrfs_fs_devices *fs_devices;
 470         struct btrfs_device *device;
 471         struct btrfs_device *orig_dev;
 472
 473         fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
 474         if (!fs_devices)
 475                 return ERR_PTR(-ENOMEM);
 476
 477         INIT_LIST_HEAD(&fs_devices->devices);
 478         INIT_LIST_HEAD(&fs_devices->alloc_list);
 479         INIT_LIST_HEAD(&fs_devices->list);
 480         mutex_init(&fs_devices->device_list_mutex);
 481         fs_devices->latest_devid = orig->latest_devid;
 482         fs_devices->latest_trans = orig->latest_trans;
 483         fs_devices->total_devices = orig->total_devices;
 484         memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
 485
 486         /* We have held the volume lock, it is safe to get the devices. */
 487         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
 488                 struct rcu_string *name;
 489
 490                 device = kzalloc(sizeof(*device), GFP_NOFS);
 491                 if (!device)
 492                         goto error;
 493
 494                 /*
 495                  * This is ok to do without rcu read locked because we hold the
 496                  * uuid mutex so nothing we touch in here is going to disappear.
 497                  */
 498                 name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
 499                 if (!name) {
 500                         kfree(device);
 501                         goto error;
 502                 }
 503                 rcu_assign_pointer(device->name, name);
 504
 505                 device->devid = orig_dev->devid;
 506                 device->work.func = pending_bios_fn;
 507                 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
 508                 spin_lock_init(&device->io_lock);
 509                 INIT_LIST_HEAD(&device->dev_list);
 510                 INIT_LIST_HEAD(&device->dev_alloc_list);
 511
 512                 list_add(&device->dev_list, &fs_devices->devices);
 513                 device->fs_devices = fs_devices;
 514                 fs_devices->num_devices++;
 515         }
 516         return fs_devices;
 517 error:
 518         free_fs_devices(fs_devices);
 519         return ERR_PTR(-ENOMEM);
 520 }
 521
 522 void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
 523                                struct btrfs_fs_devices *fs_devices, int step)
 524 {
 525         struct btrfs_device *device, *next;
 526
 527         struct block_device *latest_bdev = NULL;
 528         u64 latest_devid = 0;
 529         u64 latest_transid = 0;
 530
 531         mutex_lock(&uuid_mutex);
 532 again:
 533         /* This is the initialized path, it is safe to release the devices. */
 534         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 535                 if (device->in_fs_metadata) {
 536                         if (!device->is_tgtdev_for_dev_replace &&
 537                             (!latest_transid ||
 538                              device->generation > latest_transid)) {
 539                                 latest_devid = device->devid;
 540                                 latest_transid = device->generation;
 541                                 latest_bdev = device->bdev;
 542                         }
 543                         continue;
 544                 }
 545
 546                 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
 547                         /*
 548                          * In the first step, keep the device which has
 549                          * the correct fsid and the devid that is used
 550                          * for the dev_replace procedure.
 551                          * In the second step, the dev_replace state is
 552                          * read from the device tree and it is known
 553                          * whether the procedure is really active or
 554                          * not, which means whether this device is
 555                          * used or whether it should be removed.
 556                          */
 557                         if (step == 0 || device->is_tgtdev_for_dev_replace) {
 558                                 continue;
 559                         }
 560                 }
 561                 if (device->bdev) {
 562                         blkdev_put(device->bdev, device->mode);
 563                         device->bdev = NULL;
 564                         fs_devices->open_devices--;
 565                 }
 566                 if (device->writeable) {
 567                         list_del_init(&device->dev_alloc_list);
 568                         device->writeable = 0;
 569                         if (!device->is_tgtdev_for_dev_replace)
 570                                 fs_devices->rw_devices--;
 571                 }
 572                 list_del_init(&device->dev_list);
 573                 fs_devices->num_devices--;
 574                 rcu_string_free(device->name);
 575                 kfree(device);
 576         }
 577
 578         if (fs_devices->seed) {
 579                 fs_devices = fs_devices->seed;
 580                 goto again;
 581         }
 582
 583         fs_devices->latest_bdev = latest_bdev;
 584         fs_devices->latest_devid = latest_devid;
 585         fs_devices->latest_trans = latest_transid;
 586
 587         mutex_unlock(&uuid_mutex);
 588 }
 589
 590 static void __free_device(struct work_struct *work)
 591 {
 592         struct btrfs_device *device;
 593
 594         device = container_of(work, struct btrfs_device, rcu_work);
 595
 596         if (device->bdev)
 597                 blkdev_put(device->bdev, device->mode);
 598
 599         rcu_string_free(device->name);
 600         kfree(device);
 601 }
 602
 603 static void free_device(struct rcu_head *head)
 604 {
 605         struct btrfs_device *device;
 606
 607         device = container_of(head, struct btrfs_device, rcu);
 608
 609         INIT_WORK(&device->rcu_work, __free_device);
 610         schedule_work(&device->rcu_work);
 611 }
 612
 613 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 614 {
 615         struct btrfs_device *device;
 616
 617         if (--fs_devices->opened > 0)
 618                 return 0;
 619
 620         mutex_lock(&fs_devices->device_list_mutex);
 621         list_for_each_entry(device, &fs_devices->devices, dev_list) {
 622                 struct btrfs_device *new_device;
 623                 struct rcu_string *name;
 624
 625                 if (device->bdev)
 626                         fs_devices->open_devices--;
 627
 628                 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
 629                         list_del_init(&device->dev_alloc_list);
 630                         fs_devices->rw_devices--;
 631                 }
 632
 633                 if (device->can_discard)
 634                         fs_devices->num_can_discard--;
 635
 636                 new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
 637                 BUG_ON(!new_device); /* -ENOMEM */
 638                 memcpy(new_device, device, sizeof(*new_device));
 639
 640                 /* Safe because we are under uuid_mutex */
 641                 if (device->name) {
 642                         name = rcu_string_strdup(device->name->str, GFP_NOFS);
 643                         BUG_ON(device->name && !name); /* -ENOMEM */
 644                         rcu_assign_pointer(new_device->name, name);
 645                 }
 646                 new_device->bdev = NULL;
 647                 new_device->writeable = 0;
 648                 new_device->in_fs_metadata = 0;
 649                 new_device->can_discard = 0;
 650                 list_replace_rcu(&device->dev_list, &new_device->dev_list);
 651
 652                 call_rcu(&device->rcu, free_device);
 653         }
 654         mutex_unlock(&fs_devices->device_list_mutex);
 655
 656         WARN_ON(fs_devices->open_devices);
 657         WARN_ON(fs_devices->rw_devices);
 658         fs_devices->opened = 0;
 659         fs_devices->seeding = 0;
 660
 661         return 0;
 662 }
 663
 664 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 665 {
 666         struct btrfs_fs_devices *seed_devices = NULL;
 667         int ret;
 668
 669         mutex_lock(&uuid_mutex);
 670         ret = __btrfs_close_devices(fs_devices);
 671         if (!fs_devices->opened) {
 672                 seed_devices = fs_devices->seed;
 673                 fs_devices->seed = NULL;
 674         }
 675         mutex_unlock(&uuid_mutex);
 676
 677         while (seed_devices) {
 678                 fs_devices = seed_devices;
 679                 seed_devices = fs_devices->seed;
 680                 __btrfs_close_devices(fs_devices);
 681                 free_fs_devices(fs_devices);
 682         }
 683         return ret;
 684 }
 685
 686 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 687                                 fmode_t flags, void *holder)
 688 {
 689         struct request_queue *q;
 690         struct block_device *bdev;
 691         struct list_head *head = &fs_devices->devices;
 692         struct btrfs_device *device;
 693         struct block_device *latest_bdev = NULL;
 694         struct buffer_head *bh;
 695         struct btrfs_super_block *disk_super;
 696         u64 latest_devid = 0;
 697         u64 latest_transid = 0;
 698         u64 devid;
 699         int seeding = 1;
 700         int ret = 0;
 701
 702         flags |= FMODE_EXCL;
 703
 704         list_for_each_entry(device, head, dev_list) {
 705                 if (device->bdev)
 706                         continue;
 707                 if (!device->name)
 708                         continue;
 709
 710                 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
 711                                             &bdev, &bh);
 712                 if (ret)
 713                         continue;
 714
 715                 disk_super = (struct btrfs_super_block *)bh->b_data;
 716                 devid = btrfs_stack_device_id(&disk_super->dev_item);
 717                 if (devid != device->devid)
 718                         goto error_brelse;
 719
 720                 if (memcmp(device->uuid, disk_super->dev_item.uuid,
 721                            BTRFS_UUID_SIZE))
 722                         goto error_brelse;
 723
 724                 device->generation = btrfs_super_generation(disk_super);
 725                 if (!latest_transid || device->generation > latest_transid) {
 726                         latest_devid = devid;
 727                         latest_transid = device->generation;
 728                         latest_bdev = bdev;
 729                 }
 730
 731                 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 732                         device->writeable = 0;
 733                 } else {
 734                         device->writeable = !bdev_read_only(bdev);
 735                         seeding = 0;
 736                 }
 737
 738                 q = bdev_get_queue(bdev);
 739                 if (blk_queue_discard(q)) {
 740                         device->can_discard = 1;
 741                         fs_devices->num_can_discard++;
 742                 }
 743
 744                 device->bdev = bdev;
 745                 device->in_fs_metadata = 0;
 746                 device->mode = flags;
 747
 748                 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
 749                         fs_devices->rotating = 1;
 750
 751                 fs_devices->open_devices++;
 752                 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
 753                         fs_devices->rw_devices++;
 754                         list_add(&device->dev_alloc_list,
 755                                  &fs_devices->alloc_list);
 756                 }
 757                 brelse(bh);
 758                 continue;
 759
 760 error_brelse:
 761                 brelse(bh);
 762                 blkdev_put(bdev, flags);
 763                 continue;
 764         }
 765         if (fs_devices->open_devices == 0) {
 766                 ret = -EINVAL;
 767                 goto out;
 768         }
 769         fs_devices->seeding = seeding;
 770         fs_devices->opened = 1;
 771         fs_devices->latest_bdev = latest_bdev;
 772         fs_devices->latest_devid = latest_devid;
 773         fs_devices->latest_trans = latest_transid;
 774         fs_devices->total_rw_bytes = 0;
 775 out:
 776         return ret;
 777 }
 778
 779 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 780                        fmode_t flags, void *holder)
 781 {
 782         int ret;
 783
 784         mutex_lock(&uuid_mutex);
 785         if (fs_devices->opened) {
 786                 fs_devices->opened++;
 787                 ret = 0;
 788         } else {
 789                 ret = __btrfs_open_devices(fs_devices, flags, holder);
 790         }
 791         mutex_unlock(&uuid_mutex);
 792         return ret;
 793 }
 794
 795 /*
 796  * Look for a btrfs signature on a device. This may be called out of the mount path
 797  * and we are not allowed to call set_blocksize during the scan. The superblock
 798  * is read via pagecache
 799  */
 800 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 801                           struct btrfs_fs_devices **fs_devices_ret)
 802 {
 803         struct btrfs_super_block *disk_super;
 804         struct block_device *bdev;
 805         struct page *page;
 806         void *p;
 807         int ret = -EINVAL;
 808         u64 devid;
 809         u64 transid;
 810         u64 total_devices;
 811         u64 bytenr;
 812         pgoff_t index;
 813
 814         /*
 815          * we would like to check all the supers, but that would make
 816          * a btrfs mount succeed after a mkfs from a different FS.
 817          * So, we need to add a special mount option to scan for
 818          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
 819          */
 820         bytenr = btrfs_sb_offset(0);
 821         flags |= FMODE_EXCL;
 822         mutex_lock(&uuid_mutex);
 823
 824         bdev = blkdev_get_by_path(path, flags, holder);
 825
 826         if (IS_ERR(bdev)) {
 827                 ret = PTR_ERR(bdev);
 828                 printk(KERN_INFO "btrfs: open %s failed\n", path);
 829                 goto error;
 830         }
 831
 832         /* make sure our super fits in the device */
 833         if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
 834                 goto error_bdev_put;
 835
 836         /* make sure our super fits in the page */
 837         if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
 838                 goto error_bdev_put;
 839
 840         /* make sure our super doesn't straddle pages on disk */
 841         index = bytenr >> PAGE_CACHE_SHIFT;
 842         if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
 843                 goto error_bdev_put;
 844
 845         /* pull in the page with our super */
 846         page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
 847                                    index, GFP_NOFS);
 848
 849         if (IS_ERR_OR_NULL(page))
 850                 goto error_bdev_put;
 851
 852         p = kmap(page);
 853
 854         /* align our pointer to the offset of the super block */
 855         disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
 856
 857         if (btrfs_super_bytenr(disk_super) != bytenr ||
 858             strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 859                     sizeof(disk_super->magic)))
 860                 goto error_unmap;
 861
 862         devid = btrfs_stack_device_id(&disk_super->dev_item);
 863         transid = btrfs_super_generation(disk_super);
 864         total_devices = btrfs_super_num_devices(disk_super);
 865
 866         if (disk_super->label[0]) {
 867                 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
 868                         disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
 869                 printk(KERN_INFO "device label %s ", disk_super->label);
 870         } else {
 871                 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
 872         }
 873
 874         printk(KERN_CONT "devid %llu transid %llu %s\n",
 875                (unsigned long long)devid, (unsigned long long)transid, path);
 876
 877         ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 878         if (!ret && fs_devices_ret)
 879                 (*fs_devices_ret)->total_devices = total_devices;
 880
 881 error_unmap:
 882         kunmap(page);
 883         page_cache_release(page);
 884
 885 error_bdev_put:
 886         blkdev_put(bdev, flags);
 887 error:
 888         mutex_unlock(&uuid_mutex);
 889         return ret;
 890 }
 891
/*
 * Helper to account the used device space in a range.
 *
 * Adds up how many bytes of the inclusive range [@start, @end] on @device are
 * covered by dev extent items and stores the sum in @length.
 *
 * @device:  device whose dev extents are examined (looked up in
 *           device->dev_root)
 * @start:   first byte of the range
 * @end:     last byte of the range (inclusive, see the "+ 1" below)
 * @length:  output; reset to 0 on entry
 *
 * Returns 0 on success (also when @start lies at or beyond
 * device->total_bytes or the device is a dev-replace target, in which case
 * *@length stays 0), -ENOMEM if no path can be allocated, or a negative
 * error from the tree search helpers.
 */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                /*
                 * No extent starts exactly at @start; step back one item so
                 * that an extent beginning before @start is examined too.
                 */
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        /* ran off this leaf: move on, or stop at the end */
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                /* extent_end is exclusive: one past the extent's last byte */
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        /* one extent covers the whole range */
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        /* extent overlaps the front of the range */
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        /* extent lies inside the range */
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        /* extent starts inside and reaches @end or beyond */
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        /* extent starts after the range: nothing more to add */
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}
 975
/*
 * find_free_dev_extent - find free space in the specified device
 * @device:     the device which we search the free space in
 * @num_bytes:  the size of the free space that we need
 * @start:      store the start of the free space.
 * @len:        the size of the free space that we find, or the size of the
 *              max free space if we don't find suitable free space; may be
 *              NULL, in which case no size is reported
 *
 * This uses a pretty simple search; the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents.
 *
 * @start is used to store the start of the free space if we find one. But if
 * we don't find suitable free space, it is used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * *@start (and *@len, if given) are written on every return path, including
 * the error paths.
 *
 * Returns 0 if a hole of at least @num_bytes was found, -ENOSPC if not (or
 * if the device is a dev-replace target or too small to search), -ENOMEM if
 * no path can be allocated, or a negative error from the tree search helpers.
 */
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /* FIXME use last free of some kind */

        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
        search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

        max_hole_start = search_start;
        max_hole_size = 0;
        hole_size = 0;

        if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
                ret = -ENOSPC;
                goto error;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto error;
        }
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                /*
                 * No extent starts exactly at search_start; step back one
                 * item so that an extent beginning earlier is examined too.
                 */
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        /* ran off this leaf: move on, or stop at the end */
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                /* a gap between search_start and this extent is a hole */
                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is greater than which we need,
                         * it must be the max free space that we have found
                         * until now, so max_hole_start must point to the start
                         * of this free space and the length of this free space
                         * is stored in max_hole_size. Thus, we return
                         * max_hole_start and max_hole_size and go back to the
                         * caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                /* continue the search just past this extent */
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        /*
         * At this point, search_start should be the end of
         * allocated dev extents, and when shrinking the device,
         * search_end may be smaller than search_start.
         */
        if (search_end > search_start)
                hole_size = search_end - search_start;

        if (hole_size > max_hole_size) {
                max_hole_start = search_start;
                max_hole_size = hole_size;
        }

        /* See above. */
        if (hole_size < num_bytes)
                ret = -ENOSPC;
        else
                ret = 0;

out:
        btrfs_free_path(path);
error:
        /* report the best hole seen, whether or not it was large enough */
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
}
1132
1133 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1134                           struct btrfs_device *device,
1135                           u64 start)
1136 {
1137         int ret;
1138         struct btrfs_path *path;
1139         struct btrfs_root *root = device->dev_root;
1140         struct btrfs_key key;
1141         struct btrfs_key found_key;
1142         struct extent_buffer *leaf = NULL;
1143         struct btrfs_dev_extent *extent = NULL;
1144
1145         path = btrfs_alloc_path();
1146         if (!path)
1147                 return -ENOMEM;
1148
1149         key.objectid = device->devid;
1150         key.offset = start;
1151         key.type = BTRFS_DEV_EXTENT_KEY;
1152 again:
1153         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1154         if (ret > 0) {
1155                 ret = btrfs_previous_item(root, path, key.objectid,
1156                                           BTRFS_DEV_EXTENT_KEY);
1157                 if (ret)
1158                         goto out;
1159                 leaf = path->nodes[0];
1160                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1161                 extent = btrfs_item_ptr(leaf, path->slots[0],
1162                                         struct btrfs_dev_extent);
1163                 BUG_ON(found_key.offset > start || found_key.offset +
1164                        btrfs_dev_extent_length(leaf, extent) < start);
1165                 key = found_key;
1166                 btrfs_release_path(path);
1167                 goto again;
1168         } else if (ret == 0) {
1169                 leaf = path->nodes[0];
1170                 extent = btrfs_item_ptr(leaf, path->slots[0],
1171                                         struct btrfs_dev_extent);
1172         } else {
1173                 btrfs_error(root->fs_info, ret, "Slot search failed");
1174                 goto out;
1175         }
1176
1177         if (device->bytes_used > 0) {
1178                 u64 len = btrfs_dev_extent_length(leaf, extent);
1179                 device->bytes_used -= len;
1180                 spin_lock(&root->fs_info->free_chunk_lock);
1181                 root->fs_info->free_chunk_space += len;
1182                 spin_unlock(&root->fs_info->free_chunk_lock);
1183         }
1184         ret = btrfs_del_item(trans, root, path);
1185         if (ret) {
1186                 btrfs_error(root->fs_info, ret,
1187                             "Failed to remove dev extent item");
1188         }
1189 out:
1190         btrfs_free_path(path);
1191         return ret;
1192 }
1193
1194 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1195                            struct btrfs_device *device,
1196                            u64 chunk_tree, u64 chunk_objectid,
1197                            u64 chunk_offset, u64 start, u64 num_bytes)
1198 {
1199         int ret;
1200         struct btrfs_path *path;
1201         struct btrfs_root *root = device->dev_root;
1202         struct btrfs_dev_extent *extent;
1203         struct extent_buffer *leaf;
1204         struct btrfs_key key;
1205
1206         WARN_ON(!device->in_fs_metadata);
1207         WARN_ON(device->is_tgtdev_for_dev_replace);
1208         path = btrfs_alloc_path();
1209         if (!path)
1210                 return -ENOMEM;
1211
1212         key.objectid = device->devid;
1213         key.offset = start;
1214         key.type = BTRFS_DEV_EXTENT_KEY;
1215         ret = btrfs_insert_empty_item(trans, root, path, &key,
1216                                       sizeof(*extent));
1217         if (ret)
1218                 goto out;
1219
1220         leaf = path->nodes[0];
1221         extent = btrfs_item_ptr(leaf, path->slots[0],
1222                                 struct btrfs_dev_extent);
1223         btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
1224         btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
1225         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1226
1227         write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
1228                     (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
1229                     BTRFS_UUID_SIZE);
1230
1231         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1232         btrfs_mark_buffer_dirty(leaf);
1233 out:
1234         btrfs_free_path(path);
1235         return ret;
1236 }
1237
1238 static noinline int find_next_chunk(struct btrfs_root *root,
1239                                     u64 objectid, u64 *offset)
1240 {
1241         struct btrfs_path *path;
1242         int ret;
1243         struct btrfs_key key;
1244         struct btrfs_chunk *chunk;
1245         struct btrfs_key found_key;
1246
1247         path = btrfs_alloc_path();
1248         if (!path)
1249                 return -ENOMEM;
1250
1251         key.objectid = objectid;
1252         key.offset = (u64)-1;
1253         key.type = BTRFS_CHUNK_ITEM_KEY;
1254
1255         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1256         if (ret < 0)
1257                 goto error;
1258
1259         BUG_ON(ret == 0); /* Corruption */
1260
1261         ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
1262         if (ret) {
1263                 *offset = 0;
1264         } else {
1265                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1266                                       path->slots[0]);
1267                 if (found_key.objectid != objectid)
1268                         *offset = 0;
1269                 else {
1270                         chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
1271                                                struct btrfs_chunk);
1272                         *offset = found_key.offset +
1273                                 btrfs_chunk_length(path->nodes[0], chunk);
1274                 }
1275         }
1276         ret = 0;
1277 error:
1278         btrfs_free_path(path);
1279         return ret;
1280 }
1281
1282 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
1283 {
1284         int ret;
1285         struct btrfs_key key;
1286         struct btrfs_key found_key;
1287         struct btrfs_path *path;
1288
1289         root = root->fs_info->chunk_root;
1290
1291         path = btrfs_alloc_path();
1292         if (!path)
1293                 return -ENOMEM;
1294
1295         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1296         key.type = BTRFS_DEV_ITEM_KEY;
1297         key.offset = (u64)-1;
1298
1299         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1300         if (ret < 0)
1301                 goto error;
1302
1303         BUG_ON(ret == 0); /* Corruption */
1304
1305         ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
1306                                   BTRFS_DEV_ITEM_KEY);
1307         if (ret) {
1308                 *objectid = 1;
1309         } else {
1310                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1311                                       path->slots[0]);
1312                 *objectid = found_key.offset + 1;
1313         }
1314         ret = 0;
1315 error:
1316         btrfs_free_path(path);
1317         return ret;
1318 }
1319
1320 /*
1321  * the device information is stored in the chunk root
1322  * the btrfs_device struct should be fully filled in
1323  */
1324 int btrfs_add_device(struct btrfs_trans_handle *trans,
1325                      struct btrfs_root *root,
1326                      struct btrfs_device *device)
1327 {
1328         int ret;
1329         struct btrfs_path *path;
1330         struct btrfs_dev_item *dev_item;
1331         struct extent_buffer *leaf;
1332         struct btrfs_key key;
1333         unsigned long ptr;
1334
1335         root = root->fs_info->chunk_root;
1336
1337         path = btrfs_alloc_path();
1338         if (!path)
1339                 return -ENOMEM;
1340
1341         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1342         key.type = BTRFS_DEV_ITEM_KEY;
1343         key.offset = device->devid;
1344
1345         ret = btrfs_insert_empty_item(trans, root, path, &key,
1346                                       sizeof(*dev_item));
1347         if (ret)
1348                 goto out;
1349
1350         leaf = path->nodes[0];
1351         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1352
1353         btrfs_set_device_id(leaf, dev_item, device->devid);
1354         btrfs_set_device_generation(leaf, dev_item, 0);
1355         btrfs_set_device_type(leaf, dev_item, device->type);
1356         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1357         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1358         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1359         btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1360         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1361         btrfs_set_device_group(leaf, dev_item, 0);
1362         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1363         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1364         btrfs_set_device_start_offset(leaf, dev_item, 0);
1365
1366         ptr = (unsigned long)btrfs_device_uuid(dev_item);
1367         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1368         ptr = (unsigned long)btrfs_device_fsid(dev_item);
1369         write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
1370         btrfs_mark_buffer_dirty(leaf);
1371
1372         ret = 0;
1373 out:
1374         btrfs_free_path(path);
1375         return ret;
1376 }
1377
1378 static int btrfs_rm_dev_item(struct btrfs_root *root,
1379                              struct btrfs_device *device)
1380 {
1381         int ret;
1382         struct btrfs_path *path;
1383         struct btrfs_key key;
1384         struct btrfs_trans_handle *trans;
1385
1386         root = root->fs_info->chunk_root;
1387
1388         path = btrfs_alloc_path();
1389         if (!path)
1390                 return -ENOMEM;
1391
1392         trans = btrfs_start_transaction(root, 0);
1393         if (IS_ERR(trans)) {
1394                 btrfs_free_path(path);
1395                 return PTR_ERR(trans);
1396         }
1397         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1398         key.type = BTRFS_DEV_ITEM_KEY;
1399         key.offset = device->devid;
1400         lock_chunks(root);
1401
1402         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1403         if (ret < 0)
1404                 goto out;
1405
1406         if (ret > 0) {
1407                 ret = -ENOENT;
1408                 goto out;
1409         }
1410
1411         ret = btrfs_del_item(trans, root, path);
1412         if (ret)
1413                 goto out;
1414 out:
1415         btrfs_free_path(path);
1416         unlock_chunks(root);
1417         btrfs_commit_transaction(trans, root);
1418         return ret;
1419 }
1420
1421 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1422 {
1423         struct btrfs_device *device;
1424         struct btrfs_device *next_device;
1425         struct block_device *bdev;
1426         struct buffer_head *bh = NULL;
1427         struct btrfs_super_block *disk_super;
1428         struct btrfs_fs_devices *cur_devices;
1429         u64 all_avail;
1430         u64 devid;
1431         u64 num_devices;
1432         u8 *dev_uuid;
1433         int ret = 0;
1434         bool clear_super = false;
1435
1436         mutex_lock(&uuid_mutex);
1437
1438         all_avail = root->fs_info->avail_data_alloc_bits |
1439                 root->fs_info->avail_system_alloc_bits |
1440                 root->fs_info->avail_metadata_alloc_bits;
1441
1442         num_devices = root->fs_info->fs_devices->num_devices;
1443         btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1444         if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1445                 WARN_ON(num_devices < 1);
1446                 num_devices--;
1447         }
1448         btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1449
1450         if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1451                 printk(KERN_ERR "btrfs: unable to go below four devices "
1452                        "on raid10\n");
1453                 ret = -EINVAL;
1454                 goto out;
1455         }
1456
1457         if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1458                 printk(KERN_ERR "btrfs: unable to go below two "
1459                        "devices on raid1\n");
1460                 ret = -EINVAL;
1461                 goto out;
1462         }
1463
1464         if (strcmp(device_path, "missing") == 0) {
1465                 struct list_head *devices;
1466                 struct btrfs_device *tmp;
1467
1468                 device = NULL;
1469                 devices = &root->fs_info->fs_devices->devices;
1470                 /*
1471                  * It is safe to read the devices since the volume_mutex
1472                  * is held.
1473                  */
1474                 list_for_each_entry(tmp, devices, dev_list) {
1475                         if (tmp->in_fs_metadata &&
1476                             !tmp->is_tgtdev_for_dev_replace &&
1477                             !tmp->bdev) {
1478                                 device = tmp;
1479                                 break;
1480                         }
1481                 }
1482                 bdev = NULL;
1483                 bh = NULL;
1484                 disk_super = NULL;
1485                 if (!device) {
1486                         printk(KERN_ERR "btrfs: no missing devices found to "
1487                                "remove\n");
1488                         goto out;
1489                 }
1490         } else {
1491                 ret = btrfs_get_bdev_and_sb(device_path,
1492                                             FMODE_WRITE | FMODE_EXCL,
1493                                             root->fs_info->bdev_holder, 0,
1494                                             &bdev, &bh);
1495                 if (ret)
1496                         goto out;
1497                 disk_super = (struct btrfs_super_block *)bh->b_data;
1498                 devid = btrfs_stack_device_id(&disk_super->dev_item);
1499                 dev_uuid = disk_super->dev_item.uuid;
1500                 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1501                                            disk_super->fsid);
1502                 if (!device) {
1503                         ret = -ENOENT;
1504                         goto error_brelse;
1505                 }
1506         }
1507
1508         if (device->is_tgtdev_for_dev_replace) {
1509                 pr_err("btrfs: unable to remove the dev_replace target dev\n");
1510                 ret = -EINVAL;
1511                 goto error_brelse;
1512         }
1513
1514         if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1515                 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1516                        "device\n");
1517                 ret = -EINVAL;
1518                 goto error_brelse;
1519         }
1520
1521         if (device->writeable) {
1522                 lock_chunks(root);
1523                 list_del_init(&device->dev_alloc_list);
1524                 unlock_chunks(root);
1525                 root->fs_info->fs_devices->rw_devices--;
1526                 clear_super = true;
1527         }
1528
1529         ret = btrfs_shrink_device(device, 0);
1530         if (ret)
1531                 goto error_undo;
1532
1533         /*
1534          * TODO: the superblock still includes this device in its num_devices
1535          * counter although write_all_supers() is not locked out. This
1536          * could give a filesystem state which requires a degraded mount.
1537          */
1538         ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1539         if (ret)
1540                 goto error_undo;
1541
1542         spin_lock(&root->fs_info->free_chunk_lock);
1543         root->fs_info->free_chunk_space = device->total_bytes -
1544                 device->bytes_used;
1545         spin_unlock(&root->fs_info->free_chunk_lock);
1546
1547         device->in_fs_metadata = 0;
1548         btrfs_scrub_cancel_dev(root->fs_info, device);
1549
1550         /*
1551          * the device list mutex makes sure that we don't change
1552          * the device list while someone else is writing out all
1553          * the device supers.
1554          */
1555
1556         cur_devices = device->fs_devices;
1557         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1558         list_del_rcu(&device->dev_list);
1559
1560         device->fs_devices->num_devices--;
1561         device->fs_devices->total_devices--;
1562
1563         if (device->missing)
1564                 root->fs_info->fs_devices->missing_devices--;
1565
1566         next_device = list_entry(root->fs_info->fs_devices->devices.next,
1567                                  struct btrfs_device, dev_list);
1568         if (device->bdev == root->fs_info->sb->s_bdev)
1569                 root->fs_info->sb->s_bdev = next_device->bdev;
1570         if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1571                 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1572
1573         if (device->bdev)
1574                 device->fs_devices->open_devices--;
1575
1576         call_rcu(&device->rcu, free_device);
1577         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1578
1579         num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1580         btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1581
1582         if (cur_devices->open_devices == 0) {
1583                 struct btrfs_fs_devices *fs_devices;
1584                 fs_devices = root->fs_info->fs_devices;
1585                 while (fs_devices) {
1586                         if (fs_devices->seed == cur_devices)
1587                                 break;
1588                         fs_devices = fs_devices->seed;
1589                 }
1590                 fs_devices->seed = cur_devices->seed;
1591                 cur_devices->seed = NULL;
1592                 lock_chunks(root);
1593                 __btrfs_close_devices(cur_devices);
1594                 unlock_chunks(root);
1595                 free_fs_devices(cur_devices);
1596         }
1597
1598         root->fs_info->num_tolerated_disk_barrier_failures =
1599                 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1600
1601         /*
1602          * at this point, the device is zero sized.  We want to
1603          * remove it from the devices list and zero out the old super
1604          */
1605         if (clear_super && disk_super) {
1606                 /* make sure this device isn't detected as part of
1607                  * the FS anymore
1608                  */
1609                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1610                 set_buffer_dirty(bh);
1611                 sync_dirty_buffer(bh);
1612         }
1613
1614         ret = 0;
1615
1616         /* Notify udev that device has changed */
1617         if (bdev)
1618                 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1619
1620 error_brelse:
1621         brelse(bh);
1622         if (bdev)
1623                 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1624 out:
1625         mutex_unlock(&uuid_mutex);
1626         return ret;
1627 error_undo:
1628         if (device->writeable) {
1629                 lock_chunks(root);
1630                 list_add(&device->dev_alloc_list,
1631                          &root->fs_info->fs_devices->alloc_list);
1632                 unlock_chunks(root);
1633                 root->fs_info->fs_devices->rw_devices++;
1634         }
1635         goto error_brelse;
1636 }
1637
/*
 * Unlink the source device of a device-replace operation from the
 * filesystem's device lists and adjust the fs_devices counters.
 *
 * The caller is expected to hold fs_devices->device_list_mutex; this is
 * only checked with a WARN_ON.  The btrfs_device is not freed here: it is
 * handed to call_rcu() with free_device as the callback.
 */
1638 void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1639                                  struct btrfs_device *srcdev)
1640 {
1641         WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1642         list_del_rcu(&srcdev->dev_list);
1643         list_del_rcu(&srcdev->dev_alloc_list);
1644         fs_info->fs_devices->num_devices--;
             /*
              * NOTE(review): removing a missing source also increments
              * rw_devices.  Presumably this accounts for the replace target,
              * which btrfs_init_dev_replace_tgtdev() adds without touching
              * rw_devices -- confirm against the dev-replace caller.
              */
1645         if (srcdev->missing) {
1646                 fs_info->fs_devices->missing_devices--;
1647                 fs_info->fs_devices->rw_devices++;
1648         }
1649         if (srcdev->can_discard)
1650                 fs_info->fs_devices->num_can_discard--;
1651         if (srcdev->bdev)
1652                 fs_info->fs_devices->open_devices--;
1653
1654         call_rcu(&srcdev->rcu, free_device);
1655 }
1656
/*
 * Tear down the target device of a device-replace operation.
 *
 * Takes fs_devices->device_list_mutex itself.  If the target has an open
 * block device its superblock is scratched via btrfs_scratch_superblock().
 * The counters that btrfs_init_dev_replace_tgtdev() incremented when the
 * target was added (num_devices, open_devices, num_can_discard) are
 * decremented again, sb->s_bdev / latest_bdev are repointed if they
 * referenced the target, and the device is unlinked and released through
 * call_rcu() with free_device as the callback.
 */
1657 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1658                                       struct btrfs_device *tgtdev)
1659 {
1660         struct btrfs_device *next_device;
1661
1662         WARN_ON(!tgtdev);
1663         mutex_lock(&fs_info->fs_devices->device_list_mutex);
1664         if (tgtdev->bdev) {
1665                 btrfs_scratch_superblock(tgtdev);
1666                 fs_info->fs_devices->open_devices--;
1667         }
1668         fs_info->fs_devices->num_devices--;
             /*
              * The device is going away, so drop its contribution to the
              * discard-capable count (this used to increment by mistake).
              */
1669         if (tgtdev->can_discard)
1670                 fs_info->fs_devices->num_can_discard--;
1671
             /*
              * NOTE(review): devices.next is sampled while tgtdev is still
              * linked on the list (list_del_rcu() happens below), so
              * next_device can be tgtdev itself -- confirm this is harmless
              * for the s_bdev / latest_bdev fix-ups.
              */
1672         next_device = list_entry(fs_info->fs_devices->devices.next,
1673                                  struct btrfs_device, dev_list);
1674         if (tgtdev->bdev == fs_info->sb->s_bdev)
1675                 fs_info->sb->s_bdev = next_device->bdev;
1676         if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1677                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1678         list_del_rcu(&tgtdev->dev_list);
1679
1680         call_rcu(&tgtdev->rcu, free_device);
1681
1682         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1683 }
1684
/*
 * Look up the btrfs_device that corresponds to the block device at
 * @device_path.
 *
 * The device is opened with FMODE_READ and its superblock is read; the
 * devid, device UUID and fsid found there are passed to
 * btrfs_find_device().  Both the buffer head and the block device are
 * released again before returning.
 *
 * Returns 0 with *device set on success, -ENOENT when btrfs_find_device()
 * finds nothing (*device is then NULL), or the error returned by
 * btrfs_get_bdev_and_sb().
 */
1685 int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1686                               struct btrfs_device **device)
1687 {
1688         int ret = 0;
1689         struct btrfs_super_block *disk_super;
1690         u64 devid;
1691         u8 *dev_uuid;
1692         struct block_device *bdev;
1693         struct buffer_head *bh;
1694
1695         *device = NULL;
1696         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1697                                     root->fs_info->bdev_holder, 0, &bdev, &bh);
1698         if (ret)
1699                 return ret;
             /* dev_uuid points into bh->b_data, so look up before brelse() */
1700         disk_super = (struct btrfs_super_block *)bh->b_data;
1701         devid = btrfs_stack_device_id(&disk_super->dev_item);
1702         dev_uuid = disk_super->dev_item.uuid;
1703         *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1704                                     disk_super->fsid);
1705         brelse(bh);
1706         if (!*device)
1707                 ret = -ENOENT;
1708         blkdev_put(bdev, FMODE_READ);
1709         return ret;
1710 }
1711
/*
 * Resolve @device_path to a btrfs_device, treating the literal string
 * "missing" specially.
 *
 * For "missing", the first device on fs_devices->devices that has
 * in_fs_metadata set but no bdev is returned; if there is none an error is
 * logged and -ENOENT is returned.  Any other path is forwarded to
 * btrfs_find_device_by_path().
 *
 * Returns 0 with *device set on success, otherwise a negative errno with
 * *device left NULL.
 */
1712 int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1713                                          char *device_path,
1714                                          struct btrfs_device **device)
1715 {
1716         *device = NULL;
1717         if (strcmp(device_path, "missing") == 0) {
1718                 struct list_head *devices;
1719                 struct btrfs_device *tmp;
1720
1721                 devices = &root->fs_info->fs_devices->devices;
1722                 /*
1723                  * It is safe to read the devices since the volume_mutex
1724                  * is held by the caller.
1725                  */
1726                 list_for_each_entry(tmp, devices, dev_list) {
1727                         if (tmp->in_fs_metadata && !tmp->bdev) {
1728                                 *device = tmp;
1729                                 break;
1730                         }
1731                 }
1732
1733                 if (!*device) {
1734                         pr_err("btrfs: no missing device found\n");
1735                         return -ENOENT;
1736                 }
1737
1738                 return 0;
1739         } else {
1740                 return btrfs_find_device_by_path(root, device_path, device);
1741         }
1742 }
1743
1744 /*
1745  * does all the dirty work required for changing file system's UUID.
      *
      * Must be called with uuid_mutex held (enforced with BUG_ON) and only
      * makes sense while fs_devices->seeding is set, otherwise -EINVAL is
      * returned.
      *
      * In outline:
      *  - a clone of the current fs_devices is registered on fs_uuids;
      *  - a new "seed" fs_devices is filled from a copy of the current one
      *    and takes over its device list and alloc list;
      *  - the current fs_devices is emptied, gets the seed chained in via
      *    ->seed and receives a freshly generated fsid;
      *  - the new fsid is copied to fs_info and to the in-memory super
      *    block, and BTRFS_SUPER_FLAG_SEEDING is cleared there.
      *
      * Returns 0 on success, -EINVAL, -ENOMEM, or the error from
      * clone_fs_devices().
1746  */
1747 static int btrfs_prepare_sprout(struct btrfs_root *root)
1748 {
1749         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1750         struct btrfs_fs_devices *old_devices;
1751         struct btrfs_fs_devices *seed_devices;
1752         struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1753         struct btrfs_device *device;
1754         u64 super_flags;
1755
1756         BUG_ON(!mutex_is_locked(&uuid_mutex));
1757         if (!fs_devices->seeding)
1758                 return -EINVAL;
1759
1760         seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1761         if (!seed_devices)
1762                 return -ENOMEM;
1763
1764         old_devices = clone_fs_devices(fs_devices);
1765         if (IS_ERR(old_devices)) {
1766                 kfree(seed_devices);
1767                 return PTR_ERR(old_devices);
1768         }
1769
1770         list_add(&old_devices->list, &fs_uuids);
1771
             /*
              * Start from a byte copy of the live fs_devices, then
              * re-initialise the members that must not be shared with it
              * (list heads and the mutex).
              */
1772         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1773         seed_devices->opened = 1;
1774         INIT_LIST_HEAD(&seed_devices->devices);
1775         INIT_LIST_HEAD(&seed_devices->alloc_list);
1776         mutex_init(&seed_devices->device_list_mutex);
1777
             /* move every device over to the seed fs_devices */
1778         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1779         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1780                               synchronize_rcu);
1781         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1782
1783         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1784         list_for_each_entry(device, &seed_devices->devices, dev_list) {
1785                 device->fs_devices = seed_devices;
1786         }
1787
             /* the sprouted fs_devices starts out empty, with the seed chained */
1788         fs_devices->seeding = 0;
1789         fs_devices->num_devices = 0;
1790         fs_devices->open_devices = 0;
1791         fs_devices->total_devices = 0;
1792         fs_devices->seed = seed_devices;
1793
1794         generate_random_uuid(fs_devices->fsid);
1795         memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1796         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1797         super_flags = btrfs_super_flags(disk_super) &
1798                       ~BTRFS_SUPER_FLAG_SEEDING;
1799         btrfs_set_super_flags(disk_super, super_flags);
1800
1801         return 0;
1802 }
1803
1804 /*
1805  * store the expected generation for seed devices in device items.
      *
      * Walks every BTRFS_DEV_ITEM_KEY item under BTRFS_DEV_ITEMS_OBJECTID in
      * the chunk root.  For each item the matching in-memory device is
      * looked up (a miss is a BUG); if that device belongs to an fs_devices
      * with ->seeding set, its in-memory generation is written into the
      * item and the leaf is marked dirty.
      *
      * Returns 0 on success, -ENOMEM if no path could be allocated, or a
      * negative error from btrfs_search_slot() / btrfs_next_leaf().
1806  */
1807 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1808                                struct btrfs_root *root)
1809 {
1810         struct btrfs_path *path;
1811         struct extent_buffer *leaf;
1812         struct btrfs_dev_item *dev_item;
1813         struct btrfs_device *device;
1814         struct btrfs_key key;
1815         u8 fs_uuid[BTRFS_UUID_SIZE];
1816         u8 dev_uuid[BTRFS_UUID_SIZE];
1817         u64 devid;
1818         int ret;
1819
1820         path = btrfs_alloc_path();
1821         if (!path)
1822                 return -ENOMEM;
1823
1824         root = root->fs_info->chunk_root;
1825         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1826         key.offset = 0;
1827         key.type = BTRFS_DEV_ITEM_KEY;
1828
1829         while (1) {
1830                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1831                 if (ret < 0)
1832                         goto error;
1833
1834                 leaf = path->nodes[0];
1835 next_slot:
1836                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1837                         ret = btrfs_next_leaf(root, path);
1838                         if (ret > 0)
1839                                 break;
1840                         if (ret < 0)
1841                                 goto error;
                             /*
                              * Remember the first key of the next leaf, drop
                              * the path and restart the search from that key
                              * instead of modifying the leaf reached through
                              * btrfs_next_leaf() directly.
                              */
1842                         leaf = path->nodes[0];
1843                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1844                         btrfs_release_path(path);
1845                         continue;
1846                 }
1847
                     /* stop at the first item that is no longer a dev item */
1848                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1849                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1850                     key.type != BTRFS_DEV_ITEM_KEY)
1851                         break;
1852
1853                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1854                                           struct btrfs_dev_item);
1855                 devid = btrfs_device_id(leaf, dev_item);
1856                 read_extent_buffer(leaf, dev_uuid,
1857                                    (unsigned long)btrfs_device_uuid(dev_item),
1858                                    BTRFS_UUID_SIZE);
1859                 read_extent_buffer(leaf, fs_uuid,
1860                                    (unsigned long)btrfs_device_fsid(dev_item),
1861                                    BTRFS_UUID_SIZE);
1862                 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1863                                            fs_uuid);
1864                 BUG_ON(!device); /* Logic error */
1865
1866                 if (device->fs_devices->seeding) {
1867                         btrfs_set_device_generation(leaf, dev_item,
1868                                                     device->generation);
1869                         btrfs_mark_buffer_dirty(leaf);
1870                 }
1871
1872                 path->slots[0]++;
1873                 goto next_slot;
1874         }
1875         ret = 0;
1876 error:
1877         btrfs_free_path(path);
1878         return ret;
1879 }
1880
/*
 * Add the block device at @device_path to a mounted filesystem.
 *
 * Outline of what the code below does:
 *  1. refuse with -EROFS on a read-only mount unless the filesystem is a
 *     seed filesystem;
 *  2. open the device exclusively for writing and flush its page cache;
 *  3. when sprouting from a seed, take sb->s_umount and uuid_mutex for the
 *     duration;
 *  4. fail with -EEXIST if the block device is already on the device list;
 *  5. allocate and initialise a btrfs_device, start a transaction and, with
 *     the chunk lock held, link the device into fs_devices and update the
 *     in-memory super block totals;
 *  6. either sprout (btrfs_prepare_sprout / init_first_rw_device /
 *     btrfs_finish_sprout) or insert the device item with
 *     btrfs_add_device();
 *  7. commit the transaction; after a sprout additionally relocate the
 *     system chunks and commit any transaction that is still running.
 *
 * Returns 0 on success or a negative errno.
 */
1881 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1882 {
1883         struct request_queue *q;
1884         struct btrfs_trans_handle *trans;
1885         struct btrfs_device *device;
1886         struct block_device *bdev;
1887         struct list_head *devices;
1888         struct super_block *sb = root->fs_info->sb;
1889         struct rcu_string *name;
1890         u64 total_bytes;
1891         int seeding_dev = 0;
1892         int ret = 0;
1893
1894         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1895                 return -EROFS;
1896
1897         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1898                                   root->fs_info->bdev_holder);
1899         if (IS_ERR(bdev))
1900                 return PTR_ERR(bdev);
1901
             /*
              * seeding_dev records that both locks were taken so that every
              * exit path below knows to drop them again.
              */
1902         if (root->fs_info->fs_devices->seeding) {
1903                 seeding_dev = 1;
1904                 down_write(&sb->s_umount);
1905                 mutex_lock(&uuid_mutex);
1906         }
1907
1908         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1909
1910         devices = &root->fs_info->fs_devices->devices;
1911
             /* reject a block device that is already part of this filesystem */
1912         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1913         list_for_each_entry(device, devices, dev_list) {
1914                 if (device->bdev == bdev) {
1915                         ret = -EEXIST;
1916                         mutex_unlock(
1917                                 &root->fs_info->fs_devices->device_list_mutex);
1918                         goto error;
1919                 }
1920         }
1921         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1922
1923         device = kzalloc(sizeof(*device), GFP_NOFS);
1924         if (!device) {
1925                 /* we can safely leave the fs_devices entry around */
1926                 ret = -ENOMEM;
1927                 goto error;
1928         }
1929
1930         name = rcu_string_strdup(device_path, GFP_NOFS);
1931         if (!name) {
1932                 kfree(device);
1933                 ret = -ENOMEM;
1934                 goto error;
1935         }
1936         rcu_assign_pointer(device->name, name);
1937
1938         ret = find_next_devid(root, &device->devid);
1939         if (ret) {
1940                 rcu_string_free(device->name);
1941                 kfree(device);
1942                 goto error;
1943         }
1944
1945         trans = btrfs_start_transaction(root, 0);
1946         if (IS_ERR(trans)) {
1947                 rcu_string_free(device->name);
1948                 kfree(device);
1949                 ret = PTR_ERR(trans);
1950                 goto error;
1951         }
1952
             /* held until just before the commit, or until error_trans */
1953         lock_chunks(root);
1954
1955         q = bdev_get_queue(bdev);
1956         if (blk_queue_discard(q))
1957                 device->can_discard = 1;
1958         device->writeable = 1;
1959         device->work.func = pending_bios_fn;
1960         generate_random_uuid(device->uuid);
1961         spin_lock_init(&device->io_lock);
1962         device->generation = trans->transid;
1963         device->io_width = root->sectorsize;
1964         device->io_align = root->sectorsize;
1965         device->sector_size = root->sectorsize;
1966         device->total_bytes = i_size_read(bdev->bd_inode);
1967         device->disk_total_bytes = device->total_bytes;
1968         device->dev_root = root->fs_info->dev_root;
1969         device->bdev = bdev;
1970         device->in_fs_metadata = 1;
1971         device->is_tgtdev_for_dev_replace = 0;
1972         device->mode = FMODE_EXCL;
1973         set_blocksize(device->bdev, 4096);
1974
             /*
              * NOTE(review): a failure of btrfs_prepare_sprout() (-EINVAL,
              * -ENOMEM or a clone_fs_devices() error) is fatal here via
              * BUG_ON, and MS_RDONLY has already been cleared by then.
              */
1975         if (seeding_dev) {
1976                 sb->s_flags &= ~MS_RDONLY;
1977                 ret = btrfs_prepare_sprout(root);
1978                 BUG_ON(ret); /* -ENOMEM */
1979         }
1980
1981         device->fs_devices = root->fs_info->fs_devices;
1982
             /* publish the device and account for it in fs_devices */
1983         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1984         list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1985         list_add(&device->dev_alloc_list,
1986                  &root->fs_info->fs_devices->alloc_list);
1987         root->fs_info->fs_devices->num_devices++;
1988         root->fs_info->fs_devices->open_devices++;
1989         root->fs_info->fs_devices->rw_devices++;
1990         root->fs_info->fs_devices->total_devices++;
1991         if (device->can_discard)
1992                 root->fs_info->fs_devices->num_can_discard++;
1993         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1994
1995         spin_lock(&root->fs_info->free_chunk_lock);
1996         root->fs_info->free_chunk_space += device->total_bytes;
1997         spin_unlock(&root->fs_info->free_chunk_lock);
1998
1999         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
2000                 root->fs_info->fs_devices->rotating = 1;
2001
2002         total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
2003         btrfs_set_super_total_bytes(root->fs_info->super_copy,
2004                                     total_bytes + device->total_bytes);
2005
             /* total_bytes is reused as scratch for the super block device count */
2006         total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
2007         btrfs_set_super_num_devices(root->fs_info->super_copy,
2008                                     total_bytes + 1);
2009         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2010
2011         if (seeding_dev) {
2012                 ret = init_first_rw_device(trans, root, device);
2013                 if (ret) {
2014                         btrfs_abort_transaction(trans, root, ret);
2015                         goto error_trans;
2016                 }
2017                 ret = btrfs_finish_sprout(trans, root);
2018                 if (ret) {
2019                         btrfs_abort_transaction(trans, root, ret);
2020                         goto error_trans;
2021                 }
2022         } else {
2023                 ret = btrfs_add_device(trans, root, device);
2024                 if (ret) {
2025                         btrfs_abort_transaction(trans, root, ret);
2026                         goto error_trans;
2027                 }
2028         }
2029
2030         /*
2031          * we've got more storage, clear any full flags on the space
2032          * infos
2033          */
2034         btrfs_clear_space_info_full(root->fs_info);
2035
2036         unlock_chunks(root);
2037         root->fs_info->num_tolerated_disk_barrier_failures =
2038                 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
2039         ret = btrfs_commit_transaction(trans, root);
2040
2041         if (seeding_dev) {
2042                 mutex_unlock(&uuid_mutex);
2043                 up_write(&sb->s_umount);
2044
2045                 if (ret) /* transaction commit */
2046                         return ret;
2047
                     /*
                      * A relocation failure is only reported via btrfs_error();
                      * ret is overwritten below by the result of the commit,
                      * unless no transaction is running or attaching fails.
                      */
2048                 ret = btrfs_relocate_sys_chunks(root);
2049                 if (ret < 0)
2050                         btrfs_error(root->fs_info, ret,
2051                                     "Failed to relocate sys chunks after "
2052                                     "device initialization. This can be fixed "
2053                                     "using the \"btrfs balance\" command.");
2054                 trans = btrfs_attach_transaction(root);
2055                 if (IS_ERR(trans)) {
2056                         if (PTR_ERR(trans) == -ENOENT)
2057                                 return 0;
2058                         return PTR_ERR(trans);
2059                 }
2060                 ret = btrfs_commit_transaction(trans, root);
2061         }
2062
2063         return ret;
2064
     /*
      * NOTE(review): error_trans frees the device although it was linked
      * onto fs_devices->devices and ->alloc_list above, and none of the
      * counters or super block totals are rolled back -- looks like stale
      * list entries are left behind; confirm.
      */
2065 error_trans:
2066         unlock_chunks(root);
2067         btrfs_end_transaction(trans, root);
2068         rcu_string_free(device->name);
2069         kfree(device);
2070 error:
2071         blkdev_put(bdev, FMODE_EXCL);
2072         if (seeding_dev) {
2073                 mutex_unlock(&uuid_mutex);
2074                 up_write(&sb->s_umount);
2075         }
2076         return ret;
2077 }
2078
/*
 * Open @device_path and set it up as the target of a device-replace
 * operation.
 *
 * The new btrfs_device gets the fixed devid BTRFS_DEV_REPLACE_DEVID, is
 * flagged with is_tgtdev_for_dev_replace and is linked onto
 * fs_devices->devices.  Unlike btrfs_init_new_device() it is not put on
 * the alloc list, rw_devices / total_devices are not touched, and no
 * transaction is involved.
 *
 * Returns 0 with *device_out set on success.  On failure *device_out is
 * NULL and the result is -EINVAL (seed filesystem), -EEXIST (block device
 * already on the device list), -ENOMEM, or the error from
 * blkdev_get_by_path().
 */
2079 int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2080                                   struct btrfs_device **device_out)
2081 {
2082         struct request_queue *q;
2083         struct btrfs_device *device;
2084         struct block_device *bdev;
2085         struct btrfs_fs_info *fs_info = root->fs_info;
2086         struct list_head *devices;
2087         struct rcu_string *name;
2088         int ret = 0;
2089
2090         *device_out = NULL;
2091         if (fs_info->fs_devices->seeding)
2092                 return -EINVAL;
2093
2094         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2095                                   fs_info->bdev_holder);
2096         if (IS_ERR(bdev))
2097                 return PTR_ERR(bdev);
2098
2099         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2100
             /*
              * NOTE(review): this walk does not take device_list_mutex,
              * unlike the equivalent check in btrfs_init_new_device();
              * presumably the caller serialises device list changes --
              * confirm.
              */
2101         devices = &fs_info->fs_devices->devices;
2102         list_for_each_entry(device, devices, dev_list) {
2103                 if (device->bdev == bdev) {
2104                         ret = -EEXIST;
2105                         goto error;
2106                 }
2107         }
2108
2109         device = kzalloc(sizeof(*device), GFP_NOFS);
2110         if (!device) {
2111                 ret = -ENOMEM;
2112                 goto error;
2113         }
2114
2115         name = rcu_string_strdup(device_path, GFP_NOFS);
2116         if (!name) {
2117                 kfree(device);
2118                 ret = -ENOMEM;
2119                 goto error;
2120         }
2121         rcu_assign_pointer(device->name, name);
2122
2123         q = bdev_get_queue(bdev);
2124         if (blk_queue_discard(q))
2125                 device->can_discard = 1;
2126         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2127         device->writeable = 1;
2128         device->work.func = pending_bios_fn;
2129         generate_random_uuid(device->uuid);
2130         device->devid = BTRFS_DEV_REPLACE_DEVID;
2131         spin_lock_init(&device->io_lock);
2132         device->generation = 0;
2133         device->io_width = root->sectorsize;
2134         device->io_align = root->sectorsize;
2135         device->sector_size = root->sectorsize;
2136         device->total_bytes = i_size_read(bdev->bd_inode);
2137         device->disk_total_bytes = device->total_bytes;
2138         device->dev_root = fs_info->dev_root;
2139         device->bdev = bdev;
2140         device->in_fs_metadata = 1;
2141         device->is_tgtdev_for_dev_replace = 1;
2142         device->mode = FMODE_EXCL;
2143         set_blocksize(device->bdev, 4096);
2144         device->fs_devices = fs_info->fs_devices;
             /* plain list_add() here, where btrfs_init_new_device() uses list_add_rcu() */
2145         list_add(&device->dev_list, &fs_info->fs_devices->devices);
2146         fs_info->fs_devices->num_devices++;
2147         fs_info->fs_devices->open_devices++;
2148         if (device->can_discard)
2149                 fs_info->fs_devices->num_can_discard++;
2150         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2151
2152         *device_out = device;
2153         return ret;
2154
2155 error:
2156         blkdev_put(bdev, FMODE_EXCL);
2157         return ret;
2158 }
2159
/*
 * Fill in the fields of an existing replace target device that depend on
 * the dev root: the three I/O size fields are set to the dev root's
 * sectorsize, dev_root is set, and in_fs_metadata is turned on.
 *
 * Warns if the filesystem currently has no read-write devices.
 */
2160 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2161                                               struct btrfs_device *tgtdev)
2162 {
2163         WARN_ON(fs_info->fs_devices->rw_devices == 0);
2164         tgtdev->io_width = fs_info->dev_root->sectorsize;
2165         tgtdev->io_align = fs_info->dev_root->sectorsize;
2166         tgtdev->sector_size = fs_info->dev_root->sectorsize;
2167         tgtdev->dev_root = fs_info->dev_root;
2168         tgtdev->in_fs_metadata = 1;
2169 }
2170
/*
 * Write the in-memory state of @device back into its device item in the
 * chunk tree.
 *
 * The item is looked up by (BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY,
 * devid).  The id, type, io_align, io_width, sector_size, total bytes
 * (taken from disk_total_bytes) and bytes_used fields are rewritten and
 * the leaf is marked dirty.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if
 * there is no exact match for the key, or a negative error from
 * btrfs_search_slot().
 */
2171 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2172                                         struct btrfs_device *device)
2173 {
2174         int ret;
2175         struct btrfs_path *path;
2176         struct btrfs_root *root;
2177         struct btrfs_dev_item *dev_item;
2178         struct extent_buffer *leaf;
2179         struct btrfs_key key;
2180
2181         root = device->dev_root->fs_info->chunk_root;
2182
2183         path = btrfs_alloc_path();
2184         if (!path)
2185                 return -ENOMEM;
2186
2187         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2188         key.type = BTRFS_DEV_ITEM_KEY;
2189         key.offset = device->devid;
2190
2191         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2192         if (ret < 0)
2193                 goto out;
2194
             /* a positive return means the exact key was not found */
2195         if (ret > 0) {
2196                 ret = -ENOENT;
2197                 goto out;
2198         }
2199
2200         leaf = path->nodes[0];
2201         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2202
2203         btrfs_set_device_id(leaf, dev_item, device->devid);
2204         btrfs_set_device_type(leaf, dev_item, device->type);
2205         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2206         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2207         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2208         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
2209         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
2210         btrfs_mark_buffer_dirty(leaf);
2211
2212 out:
2213         btrfs_free_path(path);
2214         return ret;
2215 }
2216
/*
 * Grow @device to @new_size bytes and account for the extra space.
 *
 * No locking is done here; btrfs_grow_device() wraps this call in
 * lock_chunks() / unlock_chunks().
 *
 * Returns -EACCES if the device is not writeable, -EINVAL if @new_size is
 * not larger than the current size or the device is a replace target,
 * otherwise the result of btrfs_update_device().
 */
2217 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
2218                       struct btrfs_device *device, u64 new_size)
2219 {
2220         struct btrfs_super_block *super_copy =
2221                 device->dev_root->fs_info->super_copy;
2222         u64 old_total = btrfs_super_total_bytes(super_copy);
             /*
              * Computed before new_size is validated; it is only used after
              * the new_size <= total_bytes check below has passed.
              */
2223         u64 diff = new_size - device->total_bytes;
2224
2225         if (!device->writeable)
2226                 return -EACCES;
2227         if (new_size <= device->total_bytes ||
2228             device->is_tgtdev_for_dev_replace)
2229                 return -EINVAL;
2230
2231         btrfs_set_super_total_bytes(super_copy, old_total + diff);
2232         device->fs_devices->total_rw_bytes += diff;
2233
2234         device->total_bytes = new_size;
2235         device->disk_total_bytes = new_size;
2236         btrfs_clear_space_info_full(device->dev_root->fs_info);
2237
2238         return btrfs_update_device(trans, device);
2239 }
2240
/*
 * Locked wrapper around __btrfs_grow_device(): takes the chunk lock of the
 * device's dev_root for the duration of the call and returns its result
 * unchanged.
 */
2241 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2242                       struct btrfs_device *device, u64 new_size)
2243 {
2244         int ret;
2245         lock_chunks(device->dev_root);
2246         ret = __btrfs_grow_device(trans, device, new_size);
2247         unlock_chunks(device->dev_root);
2248         return ret;
2249 }
2250
/*
 * Delete the chunk item (@chunk_objectid, BTRFS_CHUNK_ITEM_KEY,
 * @chunk_offset) from the chunk tree.
 *
 * @root is only used to reach fs_info; the operation always runs on
 * fs_info->chunk_root.  @chunk_tree is not used by this function.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if
 * the item does not exist (reported through btrfs_error()), or a negative
 * error from btrfs_search_slot() / btrfs_del_item().
 */
2251 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2252                             struct btrfs_root *root,
2253                             u64 chunk_tree, u64 chunk_objectid,
2254                             u64 chunk_offset)
2255 {
2256         int ret;
2257         struct btrfs_path *path;
2258         struct btrfs_key key;
2259
2260         root = root->fs_info->chunk_root;
2261         path = btrfs_alloc_path();
2262         if (!path)
2263                 return -ENOMEM;
2264
2265         key.objectid = chunk_objectid;
2266         key.offset = chunk_offset;
2267         key.type = BTRFS_CHUNK_ITEM_KEY;
2268
2269         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2270         if (ret < 0)
2271                 goto out;
2272         else if (ret > 0) { /* Logic error or corruption */
2273                 btrfs_error(root->fs_info, -ENOENT,
2274                             "Failed lookup while freeing chunk.");
2275                 ret = -ENOENT;
2276                 goto out;
2277         }
2278
2279         ret = btrfs_del_item(trans, root, path);
2280         if (ret < 0)
2281                 btrfs_error(root->fs_info, ret,
2282                             "Failed to delete chunk item.");
2283 out:
2284         btrfs_free_path(path);
2285         return ret;
2286 }
2287
/*
 * Remove the entry for (@chunk_objectid, @chunk_offset) from the system
 * chunk array embedded in the in-memory super block.
 *
 * The array is walked as a packed sequence of entries, each a
 * btrfs_disk_key immediately followed by a btrfs_chunk whose size depends
 * on its stripe count.  A matching entry is removed by moving the rest of
 * the array down over it and shrinking the recorded array size.
 *
 * Returns 0 when the walk completes (also when nothing matched), or -EIO
 * if an entry whose key type is not BTRFS_CHUNK_ITEM_KEY is encountered.
 */
2288 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2289                         chunk_offset)
2290 {
2291         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2292         struct btrfs_disk_key *disk_key;
2293         struct btrfs_chunk *chunk;
2294         u8 *ptr;
2295         int ret = 0;
2296         u32 num_stripes;
2297         u32 array_size;
2298         u32 len = 0;
2299         u32 cur;
2300         struct btrfs_key key;
2301
2302         array_size = btrfs_super_sys_array_size(super_copy);
2303
2304         ptr = super_copy->sys_chunk_array;
2305         cur = 0;
2306
2307         while (cur < array_size) {
2308                 disk_key = (struct btrfs_disk_key *)ptr;
2309                 btrfs_disk_key_to_cpu(&key, disk_key);
2310
                     /* len = size of this entry: the key plus the chunk behind it */
2311                 len = sizeof(*disk_key);
2312
2313                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2314                         chunk = (struct btrfs_chunk *)(ptr + len);
2315                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2316                         len += btrfs_chunk_item_size(num_stripes);
2317                 } else {
2318                         ret = -EIO;
2319                         break;
2320                 }
                     /*
                      * On a match the tail is shifted down onto ptr, so ptr
                      * and cur already address the next entry and must not
                      * be advanced.
                      */
2321                 if (key.objectid == chunk_objectid &&
2322                     key.offset == chunk_offset) {
2323                         memmove(ptr, ptr + len, array_size - (cur + len));
2324                         array_size -= len;
2325                         btrfs_set_super_sys_array_size(super_copy, array_size);
2326                 } else {
2327                         ptr += len;
2328                         cur += len;
2329                 }
2330         }
2331         return ret;
2332 }
2333
/*
 * Relocate everything stored in the chunk at @chunk_offset and then remove
 * the chunk itself.
 *
 * Steps performed below:
 *  1. btrfs_can_relocate() must return 0, otherwise -ENOSPC is returned;
 *  2. btrfs_relocate_block_group() moves the extents out of the chunk;
 *  3. inside a new transaction and under the chunk lock, the device
 *     extents of every stripe are freed, the chunk item is deleted, the
 *     entry in the super block's system chunk array is removed for SYSTEM
 *     chunks, the block group is removed, and the extent mapping is
 *     dropped from the mapping tree.
 *
 * Returns 0 on success, -ENOSPC, or the error from
 * btrfs_relocate_block_group().
 *
 * NOTE(review): every failure after step 2, including a failed
 * btrfs_start_transaction(), is a BUG_ON rather than an error return.
 */
2334 static int btrfs_relocate_chunk(struct btrfs_root *root,
2335                          u64 chunk_tree, u64 chunk_objectid,
2336                          u64 chunk_offset)
2337 {
2338         struct extent_map_tree *em_tree;
2339         struct btrfs_root *extent_root;
2340         struct btrfs_trans_handle *trans;
2341         struct extent_map *em;
2342         struct map_lookup *map;
2343         int ret;
2344         int i;
2345
2346         root = root->fs_info->chunk_root;
2347         extent_root = root->fs_info->extent_root;
2348         em_tree = &root->fs_info->mapping_tree.map_tree;
2349
2350         ret = btrfs_can_relocate(extent_root, chunk_offset);
2351         if (ret)
2352                 return -ENOSPC;
2353
2354         /* step one, relocate all the extents inside this chunk */
2355         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2356         if (ret)
2357                 return ret;
2358
2359         trans = btrfs_start_transaction(root, 0);
2360         BUG_ON(IS_ERR(trans));
2361
2362         lock_chunks(root);
2363
2364         /*
2365          * step two, delete the device extents and the
2366          * chunk tree entries
2367          */
2368         read_lock(&em_tree->lock);
2369         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2370         read_unlock(&em_tree->lock);
2371
2372         BUG_ON(!em || em->start > chunk_offset ||
2373                em->start + em->len < chunk_offset);
             /* the chunk's map_lookup is carried in the em->bdev field */
2374         map = (struct map_lookup *)em->bdev;
2375
             /*
              * NOTE(review): stripes[i].dev is passed to
              * btrfs_free_dev_extent() before the NULL check that guards
              * btrfs_update_device() -- confirm whether it can be NULL.
              */
2376         for (i = 0; i < map->num_stripes; i++) {
2377                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
2378                                             map->stripes[i].physical);
2379                 BUG_ON(ret);
2380
2381                 if (map->stripes[i].dev) {
2382                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2383                         BUG_ON(ret);
2384                 }
2385         }
2386         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2387                                chunk_offset);
2388
2389         BUG_ON(ret);
2390
2391         trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2392
2393         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2394                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2395                 BUG_ON(ret);
2396         }
2397
2398         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
2399         BUG_ON(ret);
2400
2401         write_lock(&em_tree->lock);
2402         remove_extent_mapping(em_tree, em);
2403         write_unlock(&em_tree->lock);
2404
             /* map is freed here, so clear the pointer to it in em */
2405         kfree(map);
2406         em->bdev = NULL;
2407
2408         /* once for the tree */
2409         free_extent_map(em);
2410         /* once for us */
2411         free_extent_map(em);
2412
2413         unlock_chunks(root);
2414         btrfs_end_transaction(trans, root);
2415         return 0;
2416 }
2417
2418 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2419 {
2420         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2421         struct btrfs_path *path;
2422         struct extent_buffer *leaf;
2423         struct btrfs_chunk *chunk;
2424         struct btrfs_key key;
2425         struct btrfs_key found_key;
2426         u64 chunk_tree = chunk_root->root_key.objectid;
2427         u64 chunk_type;
2428         bool retried = false;
2429         int failed = 0;
2430         int ret;
2431
2432         path = btrfs_alloc_path();
2433         if (!path)
2434                 return -ENOMEM;
2435
2436 again:
2437         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2438         key.offset = (u64)-1;
2439         key.type = BTRFS_CHUNK_ITEM_KEY;
2440
2441         while (1) {
2442                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2443                 if (ret < 0)
2444                         goto error;
2445                 BUG_ON(ret == 0); /* Corruption */
2446
2447                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2448                                           key.type);
2449                 if (ret < 0)
2450                         goto error;
2451                 if (ret > 0)
2452                         break;
2453
2454                 leaf = path->nodes[0];
2455                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2456
2457                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2458                                        struct btrfs_chunk);
2459                 chunk_type = btrfs_chunk_type(leaf, chunk);
2460                 btrfs_release_path(path);
2461
2462                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2463                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2464                                                    found_key.objectid,
2465                                                    found_key.offset);
2466                         if (ret == -ENOSPC)
2467                                 failed++;
2468                         else if (ret)
2469                                 BUG();
2470                 }
2471
2472                 if (found_key.offset == 0)
2473                         break;
2474                 key.offset = found_key.offset - 1;
2475         }
2476         ret = 0;
2477         if (failed && !retried) {
2478                 failed = 0;
2479                 retried = true;
2480                 goto again;
2481         } else if (failed && retried) {
2482                 WARN_ON(1);
2483                 ret = -ENOSPC;
2484         }
2485 error:
2486         btrfs_free_path(path);
2487         return ret;
2488 }
2489
2490 static int insert_balance_item(struct btrfs_root *root,
2491                                struct btrfs_balance_control *bctl)
2492 {
2493         struct btrfs_trans_handle *trans;
2494         struct btrfs_balance_item *item;
2495         struct btrfs_disk_balance_args disk_bargs;
2496         struct btrfs_path *path;
2497         struct extent_buffer *leaf;
2498         struct btrfs_key key;
2499         int ret, err;
2500
2501         path = btrfs_alloc_path();
2502         if (!path)
2503                 return -ENOMEM;
2504
2505         trans = btrfs_start_transaction(root, 0);
2506         if (IS_ERR(trans)) {
2507                 btrfs_free_path(path);
2508                 return PTR_ERR(trans);
2509         }
2510
2511         key.objectid = BTRFS_BALANCE_OBJECTID;
2512         key.type = BTRFS_BALANCE_ITEM_KEY;
2513         key.offset = 0;
2514
2515         ret = btrfs_insert_empty_item(trans, root, path, &key,
2516                                       sizeof(*item));
2517         if (ret)
2518                 goto out;
2519
2520         leaf = path->nodes[0];
2521         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2522
2523         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2524
2525         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2526         btrfs_set_balance_data(leaf, item, &disk_bargs);
2527         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2528         btrfs_set_balance_meta(leaf, item, &disk_bargs);
2529         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2530         btrfs_set_balance_sys(leaf, item, &disk_bargs);
2531
2532         btrfs_set_balance_flags(leaf, item, bctl->flags);
2533
2534         btrfs_mark_buffer_dirty(leaf);
2535 out:
2536         btrfs_free_path(path);
2537         err = btrfs_commit_transaction(trans, root);
2538         if (err && !ret)
2539                 ret = err;
2540         return ret;
2541 }
2542
2543 static int del_balance_item(struct btrfs_root *root)
2544 {
2545         struct btrfs_trans_handle *trans;
2546         struct btrfs_path *path;
2547         struct btrfs_key key;
2548         int ret, err;
2549
2550         path = btrfs_alloc_path();
2551         if (!path)
2552                 return -ENOMEM;
2553
2554         trans = btrfs_start_transaction(root, 0);
2555         if (IS_ERR(trans)) {
2556                 btrfs_free_path(path);
2557                 return PTR_ERR(trans);
2558         }
2559
2560         key.objectid = BTRFS_BALANCE_OBJECTID;
2561         key.type = BTRFS_BALANCE_ITEM_KEY;
2562         key.offset = 0;
2563
2564         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2565         if (ret < 0)
2566                 goto out;
2567         if (ret > 0) {
2568                 ret = -ENOENT;
2569                 goto out;
2570         }
2571
2572         ret = btrfs_del_item(trans, root, path);
2573 out:
2574         btrfs_free_path(path);
2575         err = btrfs_commit_transaction(trans, root);
2576         if (err && !ret)
2577                 ret = err;
2578         return ret;
2579 }
2580
2581 /*
2582  * This is a heuristic used to reduce the number of chunks balanced on
2583  * resume after balance was interrupted.
2584  */
2585 static void update_balance_args(struct btrfs_balance_control *bctl)
2586 {
2587         /*
2588          * Turn on soft mode for chunk types that were being converted.
2589          */
2590         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2591                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2592         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2593                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2594         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2595                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2596
2597         /*
2598          * Turn on usage filter if is not already used.  The idea is
2599          * that chunks that we have already balanced should be
2600          * reasonably full.  Don't do it for chunks that are being
2601          * converted - that will keep us from relocating unconverted
2602          * (albeit full) chunks.
2603          */
2604         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2605             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2606                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2607                 bctl->data.usage = 90;
2608         }
2609         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2610             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2611                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2612                 bctl->sys.usage = 90;
2613         }
2614         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2615             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2616                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2617                 bctl->meta.usage = 90;
2618         }
2619 }
2620
2621 /*
2622  * Should be called with both balance and volume mutexes held to
2623  * serialize other volume operations (add_dev/rm_dev/resize) with
2624  * restriper.  Same goes for unset_balance_control.
2625  */
2626 static void set_balance_control(struct btrfs_balance_control *bctl)
2627 {
2628         struct btrfs_fs_info *fs_info = bctl->fs_info;
2629
2630         BUG_ON(fs_info->balance_ctl);
2631
2632         spin_lock(&fs_info->balance_lock);
2633         fs_info->balance_ctl = bctl;
2634         spin_unlock(&fs_info->balance_lock);
2635 }
2636
2637 static void unset_balance_control(struct btrfs_fs_info *fs_info)
2638 {
2639         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2640
2641         BUG_ON(!fs_info->balance_ctl);
2642
2643         spin_lock(&fs_info->balance_lock);
2644         fs_info->balance_ctl = NULL;
2645         spin_unlock(&fs_info->balance_lock);
2646
2647         kfree(bctl);
2648 }
2649
2650 /*
2651  * Balance filters.  Return 1 if chunk should be filtered out
2652  * (should not be balanced).
2653  */
2654 static int chunk_profiles_filter(u64 chunk_type,
2655                                  struct btrfs_balance_args *bargs)
2656 {
2657         chunk_type = chunk_to_extended(chunk_type) &
2658                                 BTRFS_EXTENDED_PROFILE_MASK;
2659
2660         if (bargs->profiles & chunk_type)
2661                 return 0;
2662
2663         return 1;
2664 }
2665
2666 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2667                               struct btrfs_balance_args *bargs)
2668 {
2669         struct btrfs_block_group_cache *cache;
2670         u64 chunk_used, user_thresh;
2671         int ret = 1;
2672
2673         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2674         chunk_used = btrfs_block_group_used(&cache->item);
2675
2676         if (bargs->usage == 0)
2677                 user_thresh = 0;
2678         else if (bargs->usage > 100)
2679                 user_thresh = cache->key.offset;
2680         else
2681                 user_thresh = div_factor_fine(cache->key.offset,
2682                                               bargs->usage);
2683
2684         if (chunk_used < user_thresh)
2685                 ret = 0;
2686
2687         btrfs_put_block_group(cache);
2688         return ret;
2689 }
2690
2691 static int chunk_devid_filter(struct extent_buffer *leaf,
2692                               struct btrfs_chunk *chunk,
2693                               struct btrfs_balance_args *bargs)
2694 {
2695         struct btrfs_stripe *stripe;
2696         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2697         int i;
2698
2699         for (i = 0; i < num_stripes; i++) {
2700                 stripe = btrfs_stripe_nr(chunk, i);
2701                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2702                         return 0;
2703         }
2704
2705         return 1;
2706 }
2707
2708 /* [pstart, pend) */
2709 static int chunk_drange_filter(struct extent_buffer *leaf,
2710                                struct btrfs_chunk *chunk,
2711                                u64 chunk_offset,
2712                                struct btrfs_balance_args *bargs)
2713 {
2714         struct btrfs_stripe *stripe;
2715         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2716         u64 stripe_offset;
2717         u64 stripe_length;
2718         int factor;
2719         int i;
2720
2721         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2722                 return 0;
2723
2724         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2725              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2726                 factor = 2;
2727         else
2728                 factor = 1;
2729         factor = num_stripes / factor;
2730
2731         for (i = 0; i < num_stripes; i++) {
2732                 stripe = btrfs_stripe_nr(chunk, i);
2733                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2734                         continue;
2735
2736                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2737                 stripe_length = btrfs_chunk_length(leaf, chunk);
2738                 do_div(stripe_length, factor);
2739
2740                 if (stripe_offset < bargs->pend &&
2741                     stripe_offset + stripe_length > bargs->pstart)
2742                         return 0;
2743         }
2744
2745         return 1;
2746 }
2747
2748 /* [vstart, vend) */
2749 static int chunk_vrange_filter(struct extent_buffer *leaf,
2750                                struct btrfs_chunk *chunk,
2751                                u64 chunk_offset,
2752                                struct btrfs_balance_args *bargs)
2753 {
2754         if (chunk_offset < bargs->vend &&
2755             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2756                 /* at least part of the chunk is inside this vrange */
2757                 return 0;
2758
2759         return 1;
2760 }
2761
2762 static int chunk_soft_convert_filter(u64 chunk_type,
2763                                      struct btrfs_balance_args *bargs)
2764 {
2765         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2766                 return 0;
2767
2768         chunk_type = chunk_to_extended(chunk_type) &
2769                                 BTRFS_EXTENDED_PROFILE_MASK;
2770
2771         if (bargs->target == chunk_type)
2772                 return 1;
2773
2774         return 0;
2775 }
2776
2777 static int should_balance_chunk(struct btrfs_root *root,
2778                                 struct extent_buffer *leaf,
2779                                 struct btrfs_chunk *chunk, u64 chunk_offset)
2780 {
2781         struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2782         struct btrfs_balance_args *bargs = NULL;
2783         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2784
2785         /* type filter */
2786         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2787               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2788                 return 0;
2789         }
2790
2791         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2792                 bargs = &bctl->data;
2793         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2794                 bargs = &bctl->sys;
2795         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2796                 bargs = &bctl->meta;
2797
2798         /* profiles filter */
2799         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2800             chunk_profiles_filter(chunk_type, bargs)) {
2801                 return 0;
2802         }
2803
2804         /* usage filter */
2805         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2806             chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2807                 return 0;
2808         }
2809
2810         /* devid filter */
2811         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2812             chunk_devid_filter(leaf, chunk, bargs)) {
2813                 return 0;
2814         }
2815
2816         /* drange filter, makes sense only with devid filter */
2817         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2818             chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2819                 return 0;
2820         }
2821
2822         /* vrange filter */
2823         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2824             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2825                 return 0;
2826         }
2827
2828         /* soft profile changing mode */
2829         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2830             chunk_soft_convert_filter(chunk_type, bargs)) {
2831                 return 0;
2832         }
2833
2834         return 1;
2835 }
2836
2837 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2838 {
2839         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2840         struct btrfs_root *chunk_root = fs_info->chunk_root;
2841         struct btrfs_root *dev_root = fs_info->dev_root;
2842         struct list_head *devices;
2843         struct btrfs_device *device;
2844         u64 old_size;
2845         u64 size_to_free;
2846         struct btrfs_chunk *chunk;
2847         struct btrfs_path *path;
2848         struct btrfs_key key;
2849         struct btrfs_key found_key;
2850         struct btrfs_trans_handle *trans;
2851         struct extent_buffer *leaf;
2852         int slot;
2853         int ret;
2854         int enospc_errors = 0;
2855         bool counting = true;
2856
2857         /* step one make some room on all the devices */
2858         devices = &fs_info->fs_devices->devices;
2859         list_for_each_entry(device, devices, dev_list) {
2860                 old_size = device->total_bytes;
2861                 size_to_free = div_factor(old_size, 1);
2862                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2863                 if (!device->writeable ||
2864                     device->total_bytes - device->bytes_used > size_to_free ||
2865                     device->is_tgtdev_for_dev_replace)
2866                         continue;
2867
2868                 ret = btrfs_shrink_device(device, old_size - size_to_free);
2869                 if (ret == -ENOSPC)
2870                         break;
2871                 BUG_ON(ret);
2872
2873                 trans = btrfs_start_transaction(dev_root, 0);
2874                 BUG_ON(IS_ERR(trans));
2875
2876                 ret = btrfs_grow_device(trans, device, old_size);
2877                 BUG_ON(ret);
2878
2879                 btrfs_end_transaction(trans, dev_root);
2880         }
2881
2882         /* step two, relocate all the chunks */
2883         path = btrfs_alloc_path();
2884         if (!path) {
2885                 ret = -ENOMEM;
2886                 goto error;
2887         }
2888
2889         /* zero out stat counters */
2890         spin_lock(&fs_info->balance_lock);
2891         memset(&bctl->stat, 0, sizeof(bctl->stat));
2892         spin_unlock(&fs_info->balance_lock);
2893 again:
2894         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2895         key.offset = (u64)-1;
2896         key.type = BTRFS_CHUNK_ITEM_KEY;
2897
2898         while (1) {
2899                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2900                     atomic_read(&fs_info->balance_cancel_req)) {
2901                         ret = -ECANCELED;
2902                         goto error;
2903                 }
2904
2905                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2906                 if (ret < 0)
2907                         goto error;
2908
2909                 /*
2910                  * this shouldn't happen, it means the last relocate
2911                  * failed
2912                  */
2913                 if (ret == 0)
2914                         BUG(); /* FIXME break ? */
2915
2916                 ret = btrfs_previous_item(chunk_root, path, 0,
2917                                           BTRFS_CHUNK_ITEM_KEY);
2918                 if (ret) {
2919                         ret = 0;
2920                         break;
2921                 }
2922
2923                 leaf = path->nodes[0];
2924                 slot = path->slots[0];
2925                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2926
2927                 if (found_key.objectid != key.objectid)
2928                         break;
2929
2930                 /* chunk zero is special */
2931                 if (found_key.offset == 0)
2932                         break;
2933
2934                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2935
2936                 if (!counting) {
2937                         spin_lock(&fs_info->balance_lock);
2938                         bctl->stat.considered++;
2939                         spin_unlock(&fs_info->balance_lock);
2940                 }
2941
2942                 ret = should_balance_chunk(chunk_root, leaf, chunk,
2943                                            found_key.offset);
2944                 btrfs_release_path(path);
2945                 if (!ret)
2946                         goto loop;
2947
2948                 if (counting) {
2949                         spin_lock(&fs_info->balance_lock);
2950                         bctl->stat.expected++;
2951                         spin_unlock(&fs_info->balance_lock);
2952                         goto loop;
2953                 }
2954
2955                 ret = btrfs_relocate_chunk(chunk_root,
2956                                            chunk_root->root_key.objectid,
2957                                            found_key.objectid,
2958                                            found_key.offset);
2959                 if (ret && ret != -ENOSPC)
2960                         goto error;
2961                 if (ret == -ENOSPC) {
2962                         enospc_errors++;
2963                 } else {
2964                         spin_lock(&fs_info->balance_lock);
2965                         bctl->stat.completed++;
2966                         spin_unlock(&fs_info->balance_lock);
2967                 }
2968 loop:
2969                 key.offset = found_key.offset - 1;
2970         }
2971
2972         if (counting) {
2973                 btrfs_release_path(path);
2974                 counting = false;
2975                 goto again;
2976         }
2977 error:
2978         btrfs_free_path(path);
2979         if (enospc_errors) {
2980                 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2981                        enospc_errors);
2982                 if (!ret)
2983                         ret = -ENOSPC;
2984         }
2985
2986         return ret;
2987 }
2988
2989 /**
2990  * alloc_profile_is_valid - see if a given profile is valid and reduced
2991  * @flags: profile to validate
2992  * @extended: if true @flags is treated as an extended profile
2993  */
2994 static int alloc_profile_is_valid(u64 flags, int extended)
2995 {
2996         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
2997                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
2998
2999         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3000
3001         /* 1) check that all other bits are zeroed */
3002         if (flags & ~mask)
3003                 return 0;
3004
3005         /* 2) see if profile is reduced */
3006         if (flags == 0)
3007                 return !extended; /* "0" is valid for usual profiles */
3008
3009         /* true if exactly one bit set */
3010         return (flags & (flags - 1)) == 0;
3011 }
3012
3013 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3014 {
3015         /* cancel requested || normal exit path */
3016         return atomic_read(&fs_info->balance_cancel_req) ||
3017                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3018                  atomic_read(&fs_info->balance_cancel_req) == 0);
3019 }
3020
3021 static void __cancel_balance(struct btrfs_fs_info *fs_info)
3022 {
3023         int ret;
3024
3025         unset_balance_control(fs_info);
3026         ret = del_balance_item(fs_info->tree_root);
3027         BUG_ON(ret);
3028
3029         atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3030 }
3031
3032 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
3033                                struct btrfs_ioctl_balance_args *bargs);
3034
3035 /*
3036  * Should be called with both balance and volume mutexes held
3037  */
3038 int btrfs_balance(struct btrfs_balance_control *bctl,
3039                   struct btrfs_ioctl_balance_args *bargs)
3040 {
3041         struct btrfs_fs_info *fs_info = bctl->fs_info;
3042         u64 allowed;
3043         int mixed = 0;
3044         int ret;
3045         u64 num_devices;
3046
3047         if (btrfs_fs_closing(fs_info) ||
3048             atomic_read(&fs_info->balance_pause_req) ||
3049             atomic_read(&fs_info->balance_cancel_req)) {
3050                 ret = -EINVAL;
3051                 goto out;
3052         }
3053
3054         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
3055         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
3056                 mixed = 1;
3057
3058         /*
3059          * In case of mixed groups both data and meta should be picked,
3060          * and identical options should be given for both of them.
3061          */
3062         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
3063         if (mixed && (bctl->flags & allowed)) {
3064                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
3065                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
3066                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
3067                         printk(KERN_ERR "btrfs: with mixed groups data and "
3068                                "metadata balance options must be the same\n");
3069                         ret = -EINVAL;
3070                         goto out;
3071                 }
3072         }
3073
3074         num_devices = fs_info->fs_devices->num_devices;
3075         btrfs_dev_replace_lock(&fs_info->dev_replace);
3076         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3077                 BUG_ON(num_devices < 1);
3078                 num_devices--;
3079         }
3080         btrfs_dev_replace_unlock(&fs_info->dev_replace);
3081         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3082         if (num_devices == 1)
3083                 allowed |= BTRFS_BLOCK_GROUP_DUP;
3084         else if (num_devices < 4)
3085                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3086         else
3087                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
3088                                 BTRFS_BLOCK_GROUP_RAID10);
3089
3090         if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3091             (!alloc_profile_is_valid(bctl->data.target, 1) ||
3092              (bctl->data.target & ~allowed))) {
3093                 printk(KERN_ERR "btrfs: unable to start balance with target "
3094                        "data profile %llu\n",
3095                        (unsigned long long)bctl->data.target);
3096                 ret = -EINVAL;
3097                 goto out;
3098         }
3099         if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3100             (!alloc_profile_is_valid(bctl->meta.target, 1) ||
3101              (bctl->meta.target & ~allowed))) {
3102                 printk(KERN_ERR "btrfs: unable to start balance with target "
3103                        "metadata profile %llu\n",
3104                        (unsigned long long)bctl->meta.target);
3105                 ret = -EINVAL;
3106                 goto out;
3107         }
3108         if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3109             (!alloc_profile_is_valid(bctl->sys.target, 1) ||
3110              (bctl->sys.target & ~allowed))) {
3111                 printk(KERN_ERR "btrfs: unable to start balance with target "
3112                        "system profile %llu\n",
3113                        (unsigned long long)bctl->sys.target);
3114                 ret = -EINVAL;
3115                 goto out;
3116         }
3117
3118         /* allow dup'ed data chunks only in mixed mode */
3119         if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3120             (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
3121                 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
3122                 ret = -EINVAL;
3123                 goto out;
3124         }
3125
3126         /* allow to reduce meta or sys integrity only if force set */
3127         allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3128                         BTRFS_BLOCK_GROUP_RAID10;
3129         if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3130              (fs_info->avail_system_alloc_bits & allowed) &&
3131              !(bctl->sys.target & allowed)) ||
3132             ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3133              (fs_info->avail_metadata_alloc_bits & allowed) &&
3134              !(bctl->meta.target & allowed))) {
3135                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
3136                         printk(KERN_INFO "btrfs: force reducing metadata "
3137                                "integrity\n");
3138                 } else {
3139                         printk(KERN_ERR "btrfs: balance will reduce metadata "
3140                                "integrity, use force if you want this\n");
3141                         ret = -EINVAL;
3142                         goto out;
3143                 }
3144         }
3145
3146         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3147                 int num_tolerated_disk_barrier_failures;
3148                 u64 target = bctl->sys.target;
3149
3150                 num_tolerated_disk_barrier_failures =
3151                         btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3152                 if (num_tolerated_disk_barrier_failures > 0 &&
3153                     (target &
3154                      (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3155                       BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
3156                         num_tolerated_disk_barrier_failures = 0;
3157                 else if (num_tolerated_disk_barrier_failures > 1 &&
3158                          (target &
3159                           (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
3160                         num_tolerated_disk_barrier_failures = 1;
3161
3162                 fs_info->num_tolerated_disk_barrier_failures =
3163                         num_tolerated_disk_barrier_failures;
3164         }
3165
3166         ret = insert_balance_item(fs_info->tree_root, bctl);
3167         if (ret && ret != -EEXIST)
3168                 goto out;
3169
3170         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
3171                 BUG_ON(ret == -EEXIST);
3172                 set_balance_control(bctl);
3173         } else {
3174                 BUG_ON(ret != -EEXIST);
3175                 spin_lock(&fs_info->balance_lock);
3176                 update_balance_args(bctl);
3177                 spin_unlock(&fs_info->balance_lock);
3178         }
3179
3180         atomic_inc(&fs_info->balance_running);
3181         mutex_unlock(&fs_info->balance_mutex);
3182
3183         ret = __btrfs_balance(fs_info);
3184
3185         mutex_lock(&fs_info->balance_mutex);
3186         atomic_dec(&fs_info->balance_running);
3187
3188         if (bargs) {
3189                 memset(bargs, 0, sizeof(*bargs));
3190                 update_ioctl_balance_args(fs_info, 0, bargs);
3191         }
3192
3193         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3194             balance_need_close(fs_info)) {
3195                 __cancel_balance(fs_info);
3196         }
3197
3198         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3199                 fs_info->num_tolerated_disk_barrier_failures =
3200                         btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3201         }
3202
3203         wake_up(&fs_info->balance_wait_q);
3204
3205         return ret;
3206 out:
3207         if (bctl->flags & BTRFS_BALANCE_RESUME)
3208                 __cancel_balance(fs_info);
3209         else {
3210                 kfree(bctl);
3211                 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3212         }
3213         return ret;
3214 }
3215
3216 static int balance_kthread(void *data)
3217 {
3218         struct btrfs_fs_info *fs_info = data;
3219         int ret = 0;
3220
3221         mutex_lock(&fs_info->volume_mutex);
3222         mutex_lock(&fs_info->balance_mutex);
3223
3224         if (fs_info->balance_ctl) {
3225                 printk(KERN_INFO "btrfs: continuing balance\n");
3226                 ret = btrfs_balance(fs_info->balance_ctl, NULL);
3227         }
3228
3229         mutex_unlock(&fs_info->balance_mutex);
3230         mutex_unlock(&fs_info->volume_mutex);
3231
3232         return ret;
3233 }
3234
3235 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3236 {
3237         struct task_struct *tsk;
3238
3239         spin_lock(&fs_info->balance_lock);
3240         if (!fs_info->balance_ctl) {
3241                 spin_unlock(&fs_info->balance_lock);
3242                 return 0;
3243         }
3244         spin_unlock(&fs_info->balance_lock);
3245
3246         if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
3247                 printk(KERN_INFO "btrfs: force skipping balance\n");
3248                 return 0;
3249         }
3250
3251         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3252         if (IS_ERR(tsk))
3253                 return PTR_ERR(tsk);
3254
3255         return 0;
3256 }
3257
3258 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
3259 {
3260         struct btrfs_balance_control *bctl;
3261         struct btrfs_balance_item *item;
3262         struct btrfs_disk_balance_args disk_bargs;
3263         struct btrfs_path *path;
3264         struct extent_buffer *leaf;
3265         struct btrfs_key key;
3266         int ret;
3267
3268         path = btrfs_alloc_path();
3269         if (!path)
3270                 return -ENOMEM;
3271
3272         key.objectid = BTRFS_BALANCE_OBJECTID;
3273         key.type = BTRFS_BALANCE_ITEM_KEY;
3274         key.offset = 0;
3275
3276         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
3277         if (ret < 0)
3278                 goto out;
3279         if (ret > 0) { /* ret = -ENOENT; */
3280                 ret = 0;
3281                 goto out;
3282         }
3283
3284         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3285         if (!bctl) {
3286                 ret = -ENOMEM;
3287                 goto out;
3288         }
3289
3290         leaf = path->nodes[0];
3291         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3292
3293         bctl->fs_info = fs_info;
3294         bctl->flags = btrfs_balance_flags(leaf, item);
3295         bctl->flags |= BTRFS_BALANCE_RESUME;
3296
3297         btrfs_balance_data(leaf, item, &disk_bargs);
3298         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
3299         btrfs_balance_meta(leaf, item, &disk_bargs);
3300         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
3301         btrfs_balance_sys(leaf, item, &disk_bargs);
3302         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
3303
3304         WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
3305
3306         mutex_lock(&fs_info->volume_mutex);
3307         mutex_lock(&fs_info->balance_mutex);
3308
3309         set_balance_control(bctl);
3310
3311         mutex_unlock(&fs_info->balance_mutex);
3312         mutex_unlock(&fs_info->volume_mutex);
3313 out:
3314         btrfs_free_path(path);
3315         return ret;
3316 }
3317
3318 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
3319 {
3320         int ret = 0;
3321
3322         mutex_lock(&fs_info->balance_mutex);
3323         if (!fs_info->balance_ctl) {
3324                 mutex_unlock(&fs_info->balance_mutex);
3325                 return -ENOTCONN;
3326         }
3327
3328         if (atomic_read(&fs_info->balance_running)) {
3329                 atomic_inc(&fs_info->balance_pause_req);
3330                 mutex_unlock(&fs_info->balance_mutex);
3331
3332                 wait_event(fs_info->balance_wait_q,
3333                            atomic_read(&fs_info->balance_running) == 0);
3334
3335                 mutex_lock(&fs_info->balance_mutex);
3336                 /* we are good with balance_ctl ripped off from under us */
3337                 BUG_ON(atomic_read(&fs_info->balance_running));
3338                 atomic_dec(&fs_info->balance_pause_req);
3339         } else {
3340                 ret = -ENOTCONN;
3341         }
3342
3343         mutex_unlock(&fs_info->balance_mutex);
3344         return ret;
3345 }
3346
3347 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
3348 {
3349         mutex_lock(&fs_info->balance_mutex);
3350         if (!fs_info->balance_ctl) {
3351                 mutex_unlock(&fs_info->balance_mutex);
3352                 return -ENOTCONN;
3353         }
3354
3355         atomic_inc(&fs_info->balance_cancel_req);
3356         /*
3357          * if we are running just wait and return, balance item is
3358          * deleted in btrfs_balance in this case
3359          */
3360         if (atomic_read(&fs_info->balance_running)) {
3361                 mutex_unlock(&fs_info->balance_mutex);
3362                 wait_event(fs_info->balance_wait_q,
3363                            atomic_read(&fs_info->balance_running) == 0);
3364                 mutex_lock(&fs_info->balance_mutex);
3365         } else {
3366                 /* __cancel_balance needs volume_mutex */
3367                 mutex_unlock(&fs_info->balance_mutex);
3368                 mutex_lock(&fs_info->volume_mutex);
3369                 mutex_lock(&fs_info->balance_mutex);
3370
3371                 if (fs_info->balance_ctl)
3372                         __cancel_balance(fs_info);
3373
3374                 mutex_unlock(&fs_info->volume_mutex);
3375         }
3376
3377         BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
3378         atomic_dec(&fs_info->balance_cancel_req);
3379         mutex_unlock(&fs_info->balance_mutex);
3380         return 0;
3381 }
3382
3383 /*
3384  * shrinking a device means finding all of the device extents past
3385  * the new size, and then following the back refs to the chunks.
3386  * The chunk relocation code actually frees the device extent
3387  */
3388 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3389 {
3390         struct btrfs_trans_handle *trans;
3391         struct btrfs_root *root = device->dev_root;
3392         struct btrfs_dev_extent *dev_extent = NULL;
3393         struct btrfs_path *path;
3394         u64 length;
3395         u64 chunk_tree;
3396         u64 chunk_objectid;
3397         u64 chunk_offset;
3398         int ret;
3399         int slot;
3400         int failed = 0;
3401         bool retried = false;
3402         struct extent_buffer *l;
3403         struct btrfs_key key;
3404         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3405         u64 old_total = btrfs_super_total_bytes(super_copy);
3406         u64 old_size = device->total_bytes;
3407         u64 diff = device->total_bytes - new_size;
3408
3409         if (device->is_tgtdev_for_dev_replace)
3410                 return -EINVAL;
3411
3412         path = btrfs_alloc_path();
3413         if (!path)
3414                 return -ENOMEM;
3415
3416         path->reada = 2;
3417
3418         lock_chunks(root);
3419
3420         device->total_bytes = new_size;
3421         if (device->writeable) {
3422                 device->fs_devices->total_rw_bytes -= diff;
3423                 spin_lock(&root->fs_info->free_chunk_lock);
3424                 root->fs_info->free_chunk_space -= diff;
3425                 spin_unlock(&root->fs_info->free_chunk_lock);
3426         }
3427         unlock_chunks(root);
3428
3429 again:
3430         key.objectid = device->devid;
3431         key.offset = (u64)-1;
3432         key.type = BTRFS_DEV_EXTENT_KEY;
3433
3434         do {
3435                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3436                 if (ret < 0)
3437                         goto done;
3438
3439                 ret = btrfs_previous_item(root, path, 0, key.type);
3440                 if (ret < 0)
3441                         goto done;
3442                 if (ret) {
3443                         ret = 0;
3444                         btrfs_release_path(path);
3445                         break;
3446                 }
3447
3448                 l = path->nodes[0];
3449                 slot = path->slots[0];
3450                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
3451
3452                 if (key.objectid != device->devid) {
3453                         btrfs_release_path(path);
3454                         break;
3455                 }
3456
3457                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3458                 length = btrfs_dev_extent_length(l, dev_extent);
3459
3460                 if (key.offset + length <= new_size) {
3461                         btrfs_release_path(path);
3462                         break;
3463                 }
3464
3465                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3466                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3467                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3468                 btrfs_release_path(path);
3469
3470                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
3471                                            chunk_offset);
3472                 if (ret && ret != -ENOSPC)
3473                         goto done;
3474                 if (ret == -ENOSPC)
3475                         failed++;
3476         } while (key.offset-- > 0);
3477
3478         if (failed && !retried) {
3479                 failed = 0;
3480                 retried = true;
3481                 goto again;
3482         } else if (failed && retried) {
3483                 ret = -ENOSPC;
3484                 lock_chunks(root);
3485
3486                 device->total_bytes = old_size;
3487                 if (device->writeable)
3488                         device->fs_devices->total_rw_bytes += diff;
3489                 spin_lock(&root->fs_info->free_chunk_lock);
3490                 root->fs_info->free_chunk_space += diff;
3491                 spin_unlock(&root->fs_info->free_chunk_lock);
3492                 unlock_chunks(root);
3493                 goto done;
3494         }
3495
3496         /* Shrinking succeeded, else we would be at "done". */
3497         trans = btrfs_start_transaction(root, 0);
3498         if (IS_ERR(trans)) {
3499                 ret = PTR_ERR(trans);
3500                 goto done;
3501         }
3502
3503         lock_chunks(root);
3504
3505         device->disk_total_bytes = new_size;
3506         /* Now btrfs_update_device() will change the on-disk size. */
3507         ret = btrfs_update_device(trans, device);
3508         if (ret) {
3509                 unlock_chunks(root);
3510                 btrfs_end_transaction(trans, root);
3511                 goto done;
3512         }
3513         WARN_ON(diff > old_total);
3514         btrfs_set_super_total_bytes(super_copy, old_total - diff);
3515         unlock_chunks(root);
3516         btrfs_end_transaction(trans, root);
3517 done:
3518         btrfs_free_path(path);
3519         return ret;
3520 }
3521
3522 static int btrfs_add_system_chunk(struct btrfs_root *root,
3523                            struct btrfs_key *key,
3524                            struct btrfs_chunk *chunk, int item_size)
3525 {
3526         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3527         struct btrfs_disk_key disk_key;
3528         u32 array_size;
3529         u8 *ptr;
3530
3531         array_size = btrfs_super_sys_array_size(super_copy);
3532         if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
3533                 return -EFBIG;
3534
3535         ptr = super_copy->sys_chunk_array + array_size;
3536         btrfs_cpu_key_to_disk(&disk_key, key);
3537         memcpy(ptr, &disk_key, sizeof(disk_key));
3538         ptr += sizeof(disk_key);
3539         memcpy(ptr, chunk, item_size);
3540         item_size += sizeof(disk_key);
3541         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
3542         return 0;
3543 }
3544
3545 /*
3546  * sort the devices in descending order by max_avail, total_avail
3547  */
3548 static int btrfs_cmp_device_info(const void *a, const void *b)
3549 {
3550         const struct btrfs_device_info *di_a = a;
3551         const struct btrfs_device_info *di_b = b;
3552
3553         if (di_a->max_avail > di_b->max_avail)
3554                 return -1;
3555         if (di_a->max_avail < di_b->max_avail)
3556                 return 1;
3557         if (di_a->total_avail > di_b->total_avail)
3558                 return -1;
3559         if (di_a->total_avail < di_b->total_avail)
3560                 return 1;
3561         return 0;
3562 }
3563
3564 struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3565         [BTRFS_RAID_RAID10] = {
3566                 .sub_stripes    = 2,
3567                 .dev_stripes    = 1,
3568                 .devs_max       = 0,    /* 0 == as many as possible */
3569                 .devs_min       = 4,
3570                 .devs_increment = 2,
3571                 .ncopies        = 2,
3572         },
3573         [BTRFS_RAID_RAID1] = {
3574                 .sub_stripes    = 1,
3575                 .dev_stripes    = 1,
3576                 .devs_max       = 2,
3577                 .devs_min       = 2,
3578                 .devs_increment = 2,
3579                 .ncopies        = 2,
3580         },
3581         [BTRFS_RAID_DUP] = {
3582                 .sub_stripes    = 1,
3583                 .dev_stripes    = 2,
3584                 .devs_max       = 1,
3585                 .devs_min       = 1,
3586                 .devs_increment = 1,
3587                 .ncopies        = 2,
3588         },
3589         [BTRFS_RAID_RAID0] = {
3590                 .sub_stripes    = 1,
3591                 .dev_stripes    = 1,
3592                 .devs_max       = 0,
3593                 .devs_min       = 2,
3594                 .devs_increment = 1,
3595                 .ncopies        = 1,
3596         },
3597         [BTRFS_RAID_SINGLE] = {
3598                 .sub_stripes    = 1,
3599                 .dev_stripes    = 1,
3600                 .devs_max       = 1,
3601                 .devs_min       = 1,
3602                 .devs_increment = 1,
3603                 .ncopies        = 1,
3604         },
3605 };
3606
3607 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3608                                struct btrfs_root *extent_root,
3609                                struct map_lookup **map_ret,
3610                                u64 *num_bytes_out, u64 *stripe_size_out,
3611                                u64 start, u64 type)
3612 {
3613         struct btrfs_fs_info *info = extent_root->fs_info;
3614         struct btrfs_fs_devices *fs_devices = info->fs_devices;
3615         struct list_head *cur;
3616         struct map_lookup *map = NULL;
3617         struct extent_map_tree *em_tree;
3618         struct extent_map *em;
3619         struct btrfs_device_info *devices_info = NULL;
3620         u64 total_avail;
3621         int num_stripes;        /* total number of stripes to allocate */
3622         int sub_stripes;        /* sub_stripes info for map */
3623         int dev_stripes;        /* stripes per dev */
3624         int devs_max;           /* max devs to use */
3625         int devs_min;           /* min devs needed */
3626         int devs_increment;     /* ndevs has to be a multiple of this */
3627         int ncopies;            /* how many copies to data has */
3628         int ret;
3629         u64 max_stripe_size;
3630         u64 max_chunk_size;
3631         u64 stripe_size;
3632         u64 num_bytes;
3633         int ndevs;
3634         int i;
3635         int j;
3636         int index;
3637
3638         BUG_ON(!alloc_profile_is_valid(type, 0));
3639
3640         if (list_empty(&fs_devices->alloc_list))
3641                 return -ENOSPC;
3642
3643         index = __get_raid_index(type);
3644
3645         sub_stripes = btrfs_raid_array[index].sub_stripes;
3646         dev_stripes = btrfs_raid_array[index].dev_stripes;
3647         devs_max = btrfs_raid_array[index].devs_max;
3648         devs_min = btrfs_raid_array[index].devs_min;
3649         devs_increment = btrfs_raid_array[index].devs_increment;
3650         ncopies = btrfs_raid_array[index].ncopies;
3651
3652         if (type & BTRFS_BLOCK_GROUP_DATA) {
3653                 max_stripe_size = 1024 * 1024 * 1024;
3654                 max_chunk_size = 10 * max_stripe_size;
3655         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
3656                 /* for larger filesystems, use larger metadata chunks */
3657                 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3658                         max_stripe_size = 1024 * 1024 * 1024;
3659                 else
3660                         max_stripe_size = 256 * 1024 * 1024;
3661                 max_chunk_size = max_stripe_size;
3662         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
3663                 max_stripe_size = 32 * 1024 * 1024;
3664                 max_chunk_size = 2 * max_stripe_size;
3665         } else {
3666                 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
3667                        type);
3668                 BUG_ON(1);
3669         }
3670
3671         /* we don't want a chunk larger than 10% of writeable space */
3672         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
3673                              max_chunk_size);
3674
3675         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
3676                                GFP_NOFS);
3677         if (!devices_info)
3678                 return -ENOMEM;
3679
3680         cur = fs_devices->alloc_list.next;
3681
3682         /*
3683          * in the first pass through the devices list, we gather information
3684          * about the available holes on each device.
3685          */
3686         ndevs = 0;
3687         while (cur != &fs_devices->alloc_list) {
3688                 struct btrfs_device *device;
3689                 u64 max_avail;
3690                 u64 dev_offset;
3691
3692                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
3693
3694                 cur = cur->next;
3695
3696                 if (!device->writeable) {
3697                         WARN(1, KERN_ERR
3698                                "btrfs: read-only device in alloc_list\n");
3699                         continue;
3700                 }
3701
3702                 if (!device->in_fs_metadata ||
3703                     device->is_tgtdev_for_dev_replace)
3704                         continue;
3705
3706                 if (device->total_bytes > device->bytes_used)
3707                         total_avail = device->total_bytes - device->bytes_used;
3708                 else
3709                         total_avail = 0;
3710
3711                 /* If there is no space on this device, skip it. */
3712                 if (total_avail == 0)
3713                         continue;
3714
3715                 ret = find_free_dev_extent(device,
3716                                            max_stripe_size * dev_stripes,
3717                                            &dev_offset, &max_avail);
3718                 if (ret && ret != -ENOSPC)
3719                         goto error;
3720
3721                 if (ret == 0)
3722                         max_avail = max_stripe_size * dev_stripes;
3723
3724                 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
3725                         continue;
3726
3727                 devices_info[ndevs].dev_offset = dev_offset;
3728                 devices_info[ndevs].max_avail = max_avail;
3729                 devices_info[ndevs].total_avail = total_avail;
3730                 devices_info[ndevs].dev = device;
3731                 ++ndevs;
3732                 WARN_ON(ndevs > fs_devices->rw_devices);
3733         }
3734
3735         /*
3736          * now sort the devices by hole size / available space
3737          */
3738         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
3739              btrfs_cmp_device_info, NULL);
3740
3741         /* round down to number of usable stripes */
3742         ndevs -= ndevs % devs_increment;
3743
3744         if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
3745                 ret = -ENOSPC;
3746                 goto error;
3747         }
3748
3749         if (devs_max && ndevs > devs_max)
3750                 ndevs = devs_max;
3751         /*
3752          * the primary goal is to maximize the number of stripes, so use as many
3753          * devices as possible, even if the stripes are not maximum sized.
3754          */
3755         stripe_size = devices_info[ndevs-1].max_avail;
3756         num_stripes = ndevs * dev_stripes;
3757
3758         if (stripe_size * ndevs > max_chunk_size * ncopies) {
3759                 stripe_size = max_chunk_size * ncopies;
3760                 do_div(stripe_size, ndevs);
3761         }
3762
3763         do_div(stripe_size, dev_stripes);
3764
3765         /* align to BTRFS_STRIPE_LEN */
3766         do_div(stripe_size, BTRFS_STRIPE_LEN);
3767         stripe_size *= BTRFS_STRIPE_LEN;
3768
3769         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3770         if (!map) {
3771                 ret = -ENOMEM;
3772                 goto error;
3773         }
3774         map->num_stripes = num_stripes;
3775
3776         for (i = 0; i < ndevs; ++i) {
3777                 for (j = 0; j < dev_stripes; ++j) {
3778                         int s = i * dev_stripes + j;
3779                         map->stripes[s].dev = devices_info[i].dev;
3780                         map->stripes[s].physical = devices_info[i].dev_offset +
3781                                                    j * stripe_size;
3782                 }
3783         }
3784         map->sector_size = extent_root->sectorsize;
3785         map->stripe_len = BTRFS_STRIPE_LEN;
3786         map->io_align = BTRFS_STRIPE_LEN;
3787         map->io_width = BTRFS_STRIPE_LEN;
3788         map->type = type;
3789         map->sub_stripes = sub_stripes;
3790
3791         *map_ret = map;
3792         num_bytes = stripe_size * (num_stripes / ncopies);
3793
3794         *stripe_size_out = stripe_size;
3795         *num_bytes_out = num_bytes;
3796
3797         trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3798
3799         em = alloc_extent_map();
3800         if (!em) {
3801                 ret = -ENOMEM;
3802                 goto error;
3803         }
3804         em->bdev = (struct block_device *)map;
3805         em->start = start;
3806         em->len = num_bytes;
3807         em->block_start = 0;
3808         em->block_len = em->len;
3809
3810         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3811         write_lock(&em_tree->lock);
3812         ret = add_extent_mapping(em_tree, em);
3813         write_unlock(&em_tree->lock);
3814         free_extent_map(em);
3815         if (ret)
3816                 goto error;
3817
3818         ret = btrfs_make_block_group(trans, extent_root, 0, type,
3819                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3820                                      start, num_bytes);
3821         if (ret)
3822                 goto error;
3823
3824         for (i = 0; i < map->num_stripes; ++i) {
3825                 struct btrfs_device *device;
3826                 u64 dev_offset;
3827
3828                 device = map->stripes[i].dev;
3829                 dev_offset = map->stripes[i].physical;
3830
3831                 ret = btrfs_alloc_dev_extent(trans, device,
3832                                 info->chunk_root->root_key.objectid,
3833                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3834                                 start, dev_offset, stripe_size);
3835                 if (ret) {
3836                         btrfs_abort_transaction(trans, extent_root, ret);
3837                         goto error;
3838                 }
3839         }
3840
3841         kfree(devices_info);
3842         return 0;
3843
3844 error:
3845         kfree(map);
3846         kfree(devices_info);
3847         return ret;
3848 }
3849
3850 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
3851                                 struct btrfs_root *extent_root,
3852                                 struct map_lookup *map, u64 chunk_offset,
3853                                 u64 chunk_size, u64 stripe_size)
3854 {
3855         u64 dev_offset;
3856         struct btrfs_key key;
3857         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3858         struct btrfs_device *device;
3859         struct btrfs_chunk *chunk;
3860         struct btrfs_stripe *stripe;
3861         size_t item_size = btrfs_chunk_item_size(map->num_stripes);
3862         int index = 0;
3863         int ret;
3864
3865         chunk = kzalloc(item_size, GFP_NOFS);
3866         if (!chunk)
3867                 return -ENOMEM;
3868
3869         index = 0;
3870         while (index < map->num_stripes) {
3871                 device = map->stripes[index].dev;
3872                 device->bytes_used += stripe_size;
3873                 ret = btrfs_update_device(trans, device);
3874                 if (ret)
3875                         goto out_free;
3876                 index++;
3877         }
3878
3879         spin_lock(&extent_root->fs_info->free_chunk_lock);
3880         extent_root->fs_info->free_chunk_space -= (stripe_size *
3881                                                    map->num_stripes);
3882         spin_unlock(&extent_root->fs_info->free_chunk_lock);
3883
3884         index = 0;
3885         stripe = &chunk->stripe;
3886         while (index < map->num_stripes) {
3887                 device = map->stripes[index].dev;
3888                 dev_offset = map->stripes[index].physical;
3889
3890                 btrfs_set_stack_stripe_devid(stripe, device->devid);
3891                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
3892                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
3893                 stripe++;
3894                 index++;
3895         }
3896
3897         btrfs_set_stack_chunk_length(chunk, chunk_size);
3898         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
3899         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
3900         btrfs_set_stack_chunk_type(chunk, map->type);
3901         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
3902         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
3903         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
3904         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
3905         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
3906
3907         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3908         key.type = BTRFS_CHUNK_ITEM_KEY;
3909         key.offset = chunk_offset;
3910
3911         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
3912
3913         if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3914                 /*
3915                  * TODO: Cleanup of inserted chunk root in case of
3916                  * failure.
3917                  */
3918                 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
3919                                              item_size);
3920         }
3921
3922 out_free:
3923         kfree(chunk);
3924         return ret;
3925 }
3926
3927 /*
3928  * Chunk allocation falls into two parts. The first part does works
3929  * that make the new allocated chunk useable, but not do any operation
3930  * that modifies the chunk tree. The second part does the works that
3931  * require modifying the chunk tree. This division is important for the
3932  * bootstrap process of adding storage to a seed btrfs.
3933  */
3934 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3935                       struct btrfs_root *extent_root, u64 type)
3936 {
3937         u64 chunk_offset;
3938         u64 chunk_size;
3939         u64 stripe_size;
3940         struct map_lookup *map;
3941         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3942         int ret;
3943
3944         ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3945                               &chunk_offset);
3946         if (ret)
3947                 return ret;
3948
3949         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3950                                   &stripe_size, chunk_offset, type);
3951         if (ret)
3952                 return ret;
3953
3954         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3955                                    chunk_size, stripe_size);
3956         if (ret)
3957                 return ret;
3958         return 0;
3959 }
3960
3961 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3962                                          struct btrfs_root *root,
3963                                          struct btrfs_device *device)
3964 {
3965         u64 chunk_offset;
3966         u64 sys_chunk_offset;
3967         u64 chunk_size;
3968         u64 sys_chunk_size;
3969         u64 stripe_size;
3970         u64 sys_stripe_size;
3971         u64 alloc_profile;
3972         struct map_lookup *map;
3973         struct map_lookup *sys_map;
3974         struct btrfs_fs_info *fs_info = root->fs_info;
3975         struct btrfs_root *extent_root = fs_info->extent_root;
3976         int ret;
3977
3978         ret = find_next_chunk(fs_info->chunk_root,
3979                               BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
3980         if (ret)
3981                 return ret;
3982
3983         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
3984                                 fs_info->avail_metadata_alloc_bits;
3985         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3986
3987         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3988                                   &stripe_size, chunk_offset, alloc_profile);
3989         if (ret)
3990                 return ret;
3991
3992         sys_chunk_offset = chunk_offset + chunk_size;
3993
3994         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
3995                                 fs_info->avail_system_alloc_bits;
3996         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3997
3998         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3999                                   &sys_chunk_size, &sys_stripe_size,
4000                                   sys_chunk_offset, alloc_profile);
4001         if (ret) {
4002                 btrfs_abort_transaction(trans, root, ret);
4003                 goto out;
4004         }
4005
4006         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
4007         if (ret) {
4008                 btrfs_abort_transaction(trans, root, ret);
4009                 goto out;
4010         }
4011
4012         /*
4013          * Modifying chunk tree needs allocating new blocks from both
4014          * system block group and metadata block group. So we only can
4015          * do operations require modifying the chunk tree after both
4016          * block groups were created.
4017          */
4018         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
4019                                    chunk_size, stripe_size);
4020         if (ret) {
4021                 btrfs_abort_transaction(trans, root, ret);
4022                 goto out;
4023         }
4024
4025         ret = __finish_chunk_alloc(trans, extent_root, sys_map,
4026                                    sys_chunk_offset, sys_chunk_size,
4027                                    sys_stripe_size);
4028         if (ret)
4029                 btrfs_abort_transaction(trans, root, ret);
4030
4031 out:
4032
4033         return ret;
4034 }
4035
4036 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4037 {
4038         struct extent_map *em;
4039         struct map_lookup *map;
4040         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4041         int readonly = 0;
4042         int i;
4043
4044         read_lock(&map_tree->map_tree.lock);
4045         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
4046         read_unlock(&map_tree->map_tree.lock);
4047         if (!em)
4048                 return 1;
4049
4050         if (btrfs_test_opt(root, DEGRADED)) {
4051                 free_extent_map(em);
4052                 return 0;
4053         }
4054
4055         map = (struct map_lookup *)em->bdev;
4056         for (i = 0; i < map->num_stripes; i++) {
4057                 if (!map->stripes[i].dev->writeable) {
4058                         readonly = 1;
4059                         break;
4060                 }
4061         }
4062         free_extent_map(em);
4063         return readonly;
4064 }
4065
4066 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
4067 {
4068         extent_map_tree_init(&tree->map_tree);
4069 }
4070
4071 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
4072 {
4073         struct extent_map *em;
4074
4075         while (1) {
4076                 write_lock(&tree->map_tree.lock);
4077                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
4078                 if (em)
4079                         remove_extent_mapping(&tree->map_tree, em);
4080                 write_unlock(&tree->map_tree.lock);
4081                 if (!em)
4082                         break;
4083                 kfree(em->bdev);
4084                 /* once for us */
4085                 free_extent_map(em);
4086                 /* once for the tree */
4087                 free_extent_map(em);
4088         }
4089 }
4090
4091 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4092 {
4093         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
4094         struct extent_map *em;
4095         struct map_lookup *map;
4096         struct extent_map_tree *em_tree = &map_tree->map_tree;
4097         int ret;
4098
4099         read_lock(&em_tree->lock);
4100         em = lookup_extent_mapping(em_tree, logical, len);
4101         read_unlock(&em_tree->lock);
4102         BUG_ON(!em);
4103
4104         BUG_ON(em->start > logical || em->start + em->len < logical);
4105         map = (struct map_lookup *)em->bdev;
4106         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
4107                 ret = map->num_stripes;
4108         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4109                 ret = map->sub_stripes;
4110         else
4111                 ret = 1;
4112         free_extent_map(em);
4113
4114         btrfs_dev_replace_lock(&fs_info->dev_replace);
4115         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
4116                 ret++;
4117         btrfs_dev_replace_unlock(&fs_info->dev_replace);
4118
4119         return ret;
4120 }
4121
4122 static int find_live_mirror(struct btrfs_fs_info *fs_info,
4123                             struct map_lookup *map, int first, int num,
4124                             int optimal, int dev_replace_is_ongoing)
4125 {
4126         int i;
4127         int tolerance;
4128         struct btrfs_device *srcdev;
4129
4130         if (dev_replace_is_ongoing &&
4131             fs_info->dev_replace.cont_reading_from_srcdev_mode ==
4132              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
4133                 srcdev = fs_info->dev_replace.srcdev;
4134         else
4135                 srcdev = NULL;
4136
4137         /*
4138          * try to avoid the drive that is the source drive for a
4139          * dev-replace procedure, only choose it if no other non-missing
4140          * mirror is available
4141          */
4142         for (tolerance = 0; tolerance < 2; tolerance++) {
4143                 if (map->stripes[optimal].dev->bdev &&
4144                     (tolerance || map->stripes[optimal].dev != srcdev))
4145                         return optimal;
4146                 for (i = first; i < first + num; i++) {
4147                         if (map->stripes[i].dev->bdev &&
4148                             (tolerance || map->stripes[i].dev != srcdev))
4149                                 return i;
4150                 }
4151         }
4152
4153         /* we couldn't find one that doesn't fail.  Just return something
4154          * and the io error handling code will clean up eventually
4155          */
4156         return optimal;
4157 }
4158
/*
 * Map the logical address @logical onto the stripes of the chunk that
 * contains it.
 *
 * @fs_info:    supplies the mapping tree and the dev-replace state
 * @rw:         request flags; REQ_WRITE, REQ_DISCARD and
 *              REQ_GET_READ_MIRRORS are tested here, anything else takes
 *              the plain read paths
 * @logical:    logical start address to map
 * @length:     in: requested length; out: the length this mapping covers.
 *              For REQ_DISCARD it is clamped to the end of the chunk, for
 *              chunks with a profile bit set it is clamped to the end of
 *              the current stripe, otherwise it is set to the remainder of
 *              the chunk
 * @bbio_ret:   if NULL only *@length is updated; otherwise receives a
 *              kzalloc()ed btrfs_bio with the stripes, num_stripes,
 *              max_errors and mirror_num filled in
 * @mirror_num: 0 lets this function pick the mirror for reads; a non-zero
 *              value selects mirror (@mirror_num - 1) on the read paths of
 *              the mirrored profiles.  map->num_stripes + 1 asks for the
 *              dev-replace target drive while a replace is ongoing
 *
 * Returns 0 on success, -ENOMEM if the btrfs_bio cannot be allocated and
 * -EIO if the dev-replace target drive was requested but cannot serve the
 * range.  BUG()s when no chunk mapping covers @logical.
 */
4159 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4160                              u64 logical, u64 *length,
4161                              struct btrfs_bio **bbio_ret,
4162                              int mirror_num)
4163 {
4164         struct extent_map *em;
4165         struct map_lookup *map;
4166         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
4167         struct extent_map_tree *em_tree = &map_tree->map_tree;
4168         u64 offset;
4169         u64 stripe_offset;
4170         u64 stripe_end_offset;
4171         u64 stripe_nr;
4172         u64 stripe_nr_orig;
4173         u64 stripe_nr_end;
4174         int stripe_index;
4175         int i;
4176         int ret = 0;
4177         int num_stripes;
4178         int max_errors = 0;
4179         struct btrfs_bio *bbio = NULL;
4180         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4181         int dev_replace_is_ongoing = 0;
4182         int num_alloc_stripes;
4183         int patch_the_first_stripe_for_dev_replace = 0;
4184         u64 physical_to_patch_in_first_stripe = 0;
4185
4186         read_lock(&em_tree->lock);
4187         em = lookup_extent_mapping(em_tree, logical, *length);
4188         read_unlock(&em_tree->lock);
4189
4190         if (!em) {
4191                 printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n",
4192                        (unsigned long long)logical,
4193                        (unsigned long long)*length);
4194                 BUG();
4195         }
4196
4197         BUG_ON(em->start > logical || em->start + em->len < logical);
4198         map = (struct map_lookup *)em->bdev;
4199         offset = logical - em->start;
4200
4201         stripe_nr = offset;
4202         /*
4203          * stripe_nr counts the total number of stripes we have to stride
4204          * to get to this block
4205          */
4206         do_div(stripe_nr, map->stripe_len);
4207
4208         stripe_offset = stripe_nr * map->stripe_len;
4209         BUG_ON(offset < stripe_offset);
4210
4211         /* stripe_offset is the offset of this block in its stripe*/
4212         stripe_offset = offset - stripe_offset;
4213
4214         if (rw & REQ_DISCARD)
4215                 *length = min_t(u64, em->len - offset, *length);
4216         else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
4217                 /* we limit the length of each bio to what fits in a stripe */
4218                 *length = min_t(u64, em->len - offset,
4219                                 map->stripe_len - stripe_offset);
4220         } else {
4221                 *length = em->len - offset;
4222         }
4223
             /* without a bbio_ret the caller only gets the adjusted *length */
4224         if (!bbio_ret)
4225                 goto out;
4226
             /*
              * The dev-replace lock is dropped right away when no replace is
              * ongoing; otherwise it stays held until the 'out' label.
              */
4227         btrfs_dev_replace_lock(dev_replace);
4228         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
4229         if (!dev_replace_is_ongoing)
4230                 btrfs_dev_replace_unlock(dev_replace);
4231
4232         if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
4233             !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
4234             dev_replace->tgtdev != NULL) {
4235                 /*
4236                  * in dev-replace case, for repair case (that's the only
4237                  * case where the mirror is selected explicitly when
4238                  * calling btrfs_map_block), blocks left of the left cursor
4239                  * can also be read from the target drive.
4240                  * For REQ_GET_READ_MIRRORS, the target drive is added as
4241                  * the last one to the array of stripes. For READ, it also
4242                  * needs to be supported using the same mirror number.
4243                  * If the requested block is not left of the left cursor,
4244                  * EIO is returned. This can happen because btrfs_num_copies()
4245                  * returns one more in the dev-replace case.
4246                  */
4247                 u64 tmp_length = *length;
4248                 struct btrfs_bio *tmp_bbio = NULL;
4249                 int tmp_num_stripes;
4250                 u64 srcdev_devid = dev_replace->srcdev->devid;
4251                 int index_srcdev = 0;
4252                 int found = 0;
4253                 u64 physical_of_found = 0;
4254
                     /*
                      * Recursive call made while the dev-replace lock is held.
                      * NOTE(review): relies on btrfs_dev_replace_lock()
                      * tolerating a nested acquisition by the same task --
                      * confirm in dev-replace.c.
                      */
4255                 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4256                              logical, &tmp_length, &tmp_bbio, 0);
4257                 if (ret) {
4258                         WARN_ON(tmp_bbio != NULL);
4259                         goto out;
4260                 }
4261
4262                 tmp_num_stripes = tmp_bbio->num_stripes;
4263                 if (mirror_num > tmp_num_stripes) {
4264                         /*
4265                          * REQ_GET_READ_MIRRORS does not contain this
4266                          * mirror, that means that the requested area
4267                          * is not left of the left cursor
4268                          */
4269                         ret = -EIO;
4270                         kfree(tmp_bbio);
4271                         goto out;
4272                 }
4273
4274                 /*
4275                  * process the rest of the function using the mirror_num
4276                  * of the source drive. Therefore look it up first.
4277                  * At the end, patch the device pointer to the one of the
4278                  * target drive.
4279                  */
4280                 for (i = 0; i < tmp_num_stripes; i++) {
4281                         if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
4282                                 /*
4283                                  * In case of DUP, in order to keep it
4284                                  * simple, only add the mirror with the
4285                                  * lowest physical address
4286                                  */
4287                                 if (found &&
4288                                     physical_of_found <=
4289                                      tmp_bbio->stripes[i].physical)
4290                                         continue;
4291                                 index_srcdev = i;
4292                                 found = 1;
4293                                 physical_of_found =
4294                                         tmp_bbio->stripes[i].physical;
4295                         }
4296                 }
4297
4298                 if (found) {
4299                         mirror_num = index_srcdev + 1;
4300                         patch_the_first_stripe_for_dev_replace = 1;
4301                         physical_to_patch_in_first_stripe = physical_of_found;
4302                 } else {
4303                         WARN_ON(1);
4304                         ret = -EIO;
4305                         kfree(tmp_bbio);
4306                         goto out;
4307                 }
4308
4309                 kfree(tmp_bbio);
4310         } else if (mirror_num > map->num_stripes) {
                     /* out-of-range mirror request: let the code below choose */
4311                 mirror_num = 0;
4312         }
4313
4314         num_stripes = 1;
4315         stripe_index = 0;
4316         stripe_nr_orig = stripe_nr;
             /*
              * Round the end of the range up to a stripe boundary.
              * NOTE(review): the mask form only rounds correctly when
              * map->stripe_len is a power of two -- confirm that invariant.
              */
4317         stripe_nr_end = (offset + *length + map->stripe_len - 1) &
4318                         (~(map->stripe_len - 1));
4319         do_div(stripe_nr_end, map->stripe_len);
4320         stripe_end_offset = stripe_nr_end * map->stripe_len -
4321                             (offset + *length);
4322         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4323                 if (rw & REQ_DISCARD)
4324                         num_stripes = min_t(u64, map->num_stripes,
4325                                             stripe_nr_end - stripe_nr_orig);
4326                 stripe_index = do_div(stripe_nr, map->num_stripes);
4327         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
4328                 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
4329                         num_stripes = map->num_stripes;
4330                 else if (mirror_num)
4331                         stripe_index = mirror_num - 1;
4332                 else {
4333                         stripe_index = find_live_mirror(fs_info, map, 0,
4334                                             map->num_stripes,
4335                                             current->pid % map->num_stripes,
4336                                             dev_replace_is_ongoing);
4337                         mirror_num = stripe_index + 1;
4338                 }
4339
4340         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
4341                 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
4342                         num_stripes = map->num_stripes;
4343                 } else if (mirror_num) {
4344                         stripe_index = mirror_num - 1;
4345                 } else {
4346                         mirror_num = 1;
4347                 }
4348
4349         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
4350                 int factor = map->num_stripes / map->sub_stripes;
4351
4352                 stripe_index = do_div(stripe_nr, factor);
4353                 stripe_index *= map->sub_stripes;
4354
4355                 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
4356                         num_stripes = map->sub_stripes;
4357                 else if (rw & REQ_DISCARD)
4358                         num_stripes = min_t(u64, map->sub_stripes *
4359                                             (stripe_nr_end - stripe_nr_orig),
4360                                             map->num_stripes);
4361                 else if (mirror_num)
4362                         stripe_index += mirror_num - 1;
4363                 else {
4364                         int old_stripe_index = stripe_index;
4365                         stripe_index = find_live_mirror(fs_info, map,
4366                                               stripe_index,
4367                                               map->sub_stripes, stripe_index +
4368                                               current->pid % map->sub_stripes,
4369                                               dev_replace_is_ongoing);
4370                         mirror_num = stripe_index - old_stripe_index + 1;
4371                 }
4372         } else {
4373                 /*
4374                  * after this do_div call, stripe_nr is the number of stripes
4375                  * on this device we have to walk to find the data, and
4376                  * stripe_index is the number of our device in the stripe array
4377                  */
4378                 stripe_index = do_div(stripe_nr, map->num_stripes);
4379                 mirror_num = stripe_index + 1;
4380         }
4381         BUG_ON(stripe_index >= map->num_stripes);
4382
             /*
              * Size the bbio for the stripes the dev-replace code below may
              * append: writes/discards can add one target-drive stripe per
              * existing stripe, REQ_GET_READ_MIRRORS adds at most one.
              */
4383         num_alloc_stripes = num_stripes;
4384         if (dev_replace_is_ongoing) {
4385                 if (rw & (REQ_WRITE | REQ_DISCARD))
4386                         num_alloc_stripes <<= 1;
4387                 if (rw & REQ_GET_READ_MIRRORS)
4388                         num_alloc_stripes++;
4389         }
4390         bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
4391         if (!bbio) {
4392                 ret = -ENOMEM;
4393                 goto out;
4394         }
4395         atomic_set(&bbio->error, 0);
4396
4397         if (rw & REQ_DISCARD) {
4398                 int factor = 0;
4399                 int sub_stripes = 0;
4400                 u64 stripes_per_dev = 0;
4401                 u32 remaining_stripes = 0;
4402                 u32 last_stripe = 0;
4403
4404                 if (map->type &
4405                     (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
4406                         if (map->type & BTRFS_BLOCK_GROUP_RAID0)
4407                                 sub_stripes = 1;
4408                         else
4409                                 sub_stripes = map->sub_stripes;
4410
4411                         factor = map->num_stripes / sub_stripes;
4412                         stripes_per_dev = div_u64_rem(stripe_nr_end -
4413                                                       stripe_nr_orig,
4414                                                       factor,
4415                                                       &remaining_stripes);
4416                         div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
4417                         last_stripe *= sub_stripes;
4418                 }
4419
4420                 for (i = 0; i < num_stripes; i++) {
4421                         bbio->stripes[i].physical =
4422                                 map->stripes[stripe_index].physical +
4423                                 stripe_offset + stripe_nr * map->stripe_len;
4424                         bbio->stripes[i].dev = map->stripes[stripe_index].dev;
4425
4426                         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
4427                                          BTRFS_BLOCK_GROUP_RAID10)) {
4428                                 bbio->stripes[i].length = stripes_per_dev *
4429                                                           map->stripe_len;
4430
4431                                 if (i / sub_stripes < remaining_stripes)
4432                                         bbio->stripes[i].length +=
4433                                                 map->stripe_len;
4434
4435                                 /*
4436                                  * Special for the first stripe and
4437                                  * the last stripe:
4438                                  *
4439                                  * |-------|...|-------|
4440                                  *     |----------|
4441                                  *    off     end_off
4442                                  */
4443                                 if (i < sub_stripes)
4444                                         bbio->stripes[i].length -=
4445                                                 stripe_offset;
4446
4447                                 if (stripe_index >= last_stripe &&
4448                                     stripe_index <= (last_stripe +
4449                                                      sub_stripes - 1))
4450                                         bbio->stripes[i].length -=
4451                                                 stripe_end_offset;
4452
4453                                 if (i == sub_stripes - 1)
4454                                         stripe_offset = 0;
4455                         } else
4456                                 bbio->stripes[i].length = *length;
4457
4458                         stripe_index++;
4459                         if (stripe_index == map->num_stripes) {
4460                                 /* This could only happen for RAID0/10 */
4461                                 stripe_index = 0;
4462                                 stripe_nr++;
4463                         }
4464                 }
4465         } else {
4466                 for (i = 0; i < num_stripes; i++) {
4467                         bbio->stripes[i].physical =
4468                                 map->stripes[stripe_index].physical +
4469                                 stripe_offset +
4470                                 stripe_nr * map->stripe_len;
4471                         bbio->stripes[i].dev =
4472                                 map->stripes[stripe_index].dev;
4473                         stripe_index++;
4474                 }
4475         }
4476
             /* mirrored profiles tolerate one failed stripe on these requests */
4477         if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4478                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4479                                  BTRFS_BLOCK_GROUP_RAID10 |
4480                                  BTRFS_BLOCK_GROUP_DUP)) {
4481                         max_errors = 1;
4482                 }
4483         }
4484
4485         if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
4486             dev_replace->tgtdev != NULL) {
4487                 int index_where_to_add;
4488                 u64 srcdev_devid = dev_replace->srcdev->devid;
4489
4490                 /*
4491                  * duplicate the write operations while the dev replace
4492                  * procedure is running. Since the copying of the old disk
4493                  * to the new disk takes place at run time while the
4494                  * filesystem is mounted writable, the regular write
4495                  * operations to the old disk have to be duplicated to go
4496                  * to the new disk as well.
4497                  * Note that device->missing is handled by the caller, and
4498                  * that the write to the old disk is already set up in the
4499                  * stripes array.
4500                  */
4501                 index_where_to_add = num_stripes;
4502                 for (i = 0; i < num_stripes; i++) {
4503                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
4504                                 /* write to new disk, too */
4505                                 struct btrfs_bio_stripe *new =
4506                                         bbio->stripes + index_where_to_add;
4507                                 struct btrfs_bio_stripe *old =
4508                                         bbio->stripes + i;
4509
4510                                 new->physical = old->physical;
4511                                 new->length = old->length;
4512                                 new->dev = dev_replace->tgtdev;
4513                                 index_where_to_add++;
4514                                 max_errors++;
4515                         }
4516                 }
4517                 num_stripes = index_where_to_add;
4518         } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
4519                    dev_replace->tgtdev != NULL) {
4520                 u64 srcdev_devid = dev_replace->srcdev->devid;
4521                 int index_srcdev = 0;
4522                 int found = 0;
4523                 u64 physical_of_found = 0;
4524
4525                 /*
4526                  * During the dev-replace procedure, the target drive can
4527                  * also be used to read data in case it is needed to repair
4528                  * a corrupt block elsewhere. This is possible if the
4529                  * requested area is left of the left cursor. In this area,
4530                  * the target drive is a full copy of the source drive.
4531                  */
4532                 for (i = 0; i < num_stripes; i++) {
4533                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
4534                                 /*
4535                                  * In case of DUP, in order to keep it
4536                                  * simple, only add the mirror with the
4537                                  * lowest physical address
4538                                  */
4539                                 if (found &&
4540                                     physical_of_found <=
4541                                      bbio->stripes[i].physical)
4542                                         continue;
4543                                 index_srcdev = i;
4544                                 found = 1;
4545                                 physical_of_found = bbio->stripes[i].physical;
4546                         }
4547                 }
4548                 if (found) {
4549                         u64 length = map->stripe_len;
4550
4551                         if (physical_of_found + length <=
4552                             dev_replace->cursor_left) {
4553                                 struct btrfs_bio_stripe *tgtdev_stripe =
4554                                         bbio->stripes + num_stripes;
4555
4556                                 tgtdev_stripe->physical = physical_of_found;
4557                                 tgtdev_stripe->length =
4558                                         bbio->stripes[index_srcdev].length;
4559                                 tgtdev_stripe->dev = dev_replace->tgtdev;
4560
4561                                 num_stripes++;
4562                         }
4563                 }
4564         }
4565
4566         *bbio_ret = bbio;
4567         bbio->num_stripes = num_stripes;
4568         bbio->max_errors = max_errors;
4569         bbio->mirror_num = mirror_num;
4570
4571         /*
4572          * this is the case that REQ_READ && dev_replace_is_ongoing &&
4573          * mirror_num == num_stripes + 1 && dev_replace target drive is
4574          * available as a mirror
4575          */
4576         if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
4577                 WARN_ON(num_stripes > 1);
4578                 bbio->stripes[0].dev = dev_replace->tgtdev;
4579                 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4580                 bbio->mirror_num = map->num_stripes + 1;
4581         }
4582 out:
             /* common exit: drop the dev-replace lock if it is still held */
4583         if (dev_replace_is_ongoing)
4584                 btrfs_dev_replace_unlock(dev_replace);
4585         free_extent_map(em);
4586         return ret;
4587 }
4588
4589 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4590                       u64 logical, u64 *length,
4591                       struct btrfs_bio **bbio_ret, int mirror_num)
4592 {
4593         return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4594                                  mirror_num);
4595 }
4596
4597 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4598                      u64 chunk_start, u64 physical, u64 devid,
4599                      u64 **logical, int *naddrs, int *stripe_len)
4600 {
4601         struct extent_map_tree *em_tree = &map_tree->map_tree;
4602         struct extent_map *em;
4603         struct map_lookup *map;
4604         u64 *buf;
4605         u64 bytenr;
4606         u64 length;
4607         u64 stripe_nr;
4608         int i, j, nr = 0;
4609
4610         read_lock(&em_tree->lock);
4611         em = lookup_extent_mapping(em_tree, chunk_start, 1);
4612         read_unlock(&em_tree->lock);
4613
4614         BUG_ON(!em || em->start != chunk_start);
4615         map = (struct map_lookup *)em->bdev;
4616
4617         length = em->len;
4618         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4619                 do_div(length, map->num_stripes / map->sub_stripes);
4620         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
4621                 do_div(length, map->num_stripes);
4622
4623         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
4624         BUG_ON(!buf); /* -ENOMEM */
4625
4626         for (i = 0; i < map->num_stripes; i++) {
4627                 if (devid && map->stripes[i].dev->devid != devid)
4628                         continue;
4629                 if (map->stripes[i].physical > physical ||
4630                     map->stripes[i].physical + length <= physical)
4631                         continue;
4632
4633                 stripe_nr = physical - map->stripes[i].physical;
4634                 do_div(stripe_nr, map->stripe_len);
4635
4636                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
4637                         stripe_nr = stripe_nr * map->num_stripes + i;
4638                         do_div(stripe_nr, map->sub_stripes);
4639                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4640                         stripe_nr = stripe_nr * map->num_stripes + i;
4641                 }
4642                 bytenr = chunk_start + stripe_nr * map->stripe_len;
4643                 WARN_ON(nr >= map->num_stripes);
4644                 for (j = 0; j < nr; j++) {
4645                         if (buf[j] == bytenr)
4646                                 break;
4647                 }
4648                 if (j == nr) {
4649                         WARN_ON(nr >= map->num_stripes);
4650                         buf[nr++] = bytenr;
4651                 }
4652         }
4653
4654         *logical = buf;
4655         *naddrs = nr;
4656         *stripe_len = map->stripe_len;
4657
4658         free_extent_map(em);
4659         return 0;
4660 }
4661
/*
 * Pack @stripe_index into the two low bits of the pointer @bi_private and
 * return the combined value.
 *
 * BUG()s if @bi_private already has either low bit set, or if
 * @stripe_index does not fit in two bits.  Stealing pointer bits avoids
 * allocating an intermediate structure that would hold the old private
 * pointer together with the stripe index.
 */
static void *merge_stripe_index_into_bio_private(void *bi_private,
                                                 unsigned int stripe_index)
{
        uintptr_t bits = (uintptr_t)bi_private;

        BUG_ON((bits & 3) != 0);
        BUG_ON(stripe_index > 3);

        bits |= stripe_index;
        return (void *)bits;
}
4676
/*
 * Recover the btrfs_bio pointer from a bi_private value by clearing the
 * two low bits, which carry a stripe index.
 */
static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
{
        const uintptr_t index_bits = 3;
        uintptr_t raw = (uintptr_t)bi_private;

        return (struct btrfs_bio *)(raw & ~index_bits);
}
4681
/*
 * Return the stripe index stored in the two low bits of a bi_private
 * value.
 */
static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
{
        uintptr_t raw = (uintptr_t)bi_private;

        return (unsigned int)(raw & 3);
}
4686
/*
 * Completion callback for a bio whose bi_private holds a btrfs_bio pointer
 * with a stripe index packed into its low bits.
 *
 * @bio: the bio that completed
 * @err: 0 on success, negative errno otherwise
 *
 * A non-zero @err bumps bbio->error.  For -EIO and -EREMOTEIO the
 * write/read (and, for flush writes, flush) error statistics of the
 * stripe's device are incremented too, provided the device has a bdev.
 * When the last pending stripe completes, the original bio gets its saved
 * private pointer and end_io handler back and is ended: with -EIO if more
 * stripes failed than bbio->max_errors allows, with 0 otherwise.  The bbio
 * is freed at that point.  Bios other than the original one are released
 * with bio_put().
 */
4687 static void btrfs_end_bio(struct bio *bio, int err)
4688 {
4689         struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
4690         int is_orig_bio = 0;
4691
4692         if (err) {
4693                 atomic_inc(&bbio->error);
4694                 if (err == -EIO || err == -EREMOTEIO) {
4695                         unsigned int stripe_index =
4696                                 extract_stripe_index_from_bio_private(
4697                                         bio->bi_private);
4698                         struct btrfs_device *dev;
4699
4700                         BUG_ON(stripe_index >= bbio->num_stripes);
4701                         dev = bbio->stripes[stripe_index].dev;
4702                         if (dev->bdev) {
4703                                 if (bio->bi_rw & WRITE)
4704                                         btrfs_dev_stat_inc(dev,
4705                                                 BTRFS_DEV_STAT_WRITE_ERRS);
4706                                 else
4707                                         btrfs_dev_stat_inc(dev,
4708                                                 BTRFS_DEV_STAT_READ_ERRS);
4709                                 if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
4710                                         btrfs_dev_stat_inc(dev,
4711                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
4712                                 btrfs_dev_stat_print_on_error(dev);
4713                         }
4714                 }
4715         }
4716
             /*
              * Compare against orig_bio before dropping our pending count:
              * the completion that brings stripes_pending to zero frees bbio.
              */
4717         if (bio == bbio->orig_bio)
4718                 is_orig_bio = 1;
4719
4720         if (atomic_dec_and_test(&bbio->stripes_pending)) {
4721                 if (!is_orig_bio) {
4722                         bio_put(bio);
4723                         bio = bbio->orig_bio;
4724                 }
4725                 bio->bi_private = bbio->private;
4726                 bio->bi_end_io = bbio->end_io;
                     /*
                      * bi_bdev is reused to carry the mirror number.
                      * NOTE(review): presumably read back by the original
                      * end_io handler -- confirm.
                      */
4727                 bio->bi_bdev = (struct block_device *)
4728                                         (unsigned long)bbio->mirror_num;
4729                 /* only send an error to the higher layers if it is
4730                  * beyond the tolerance of the multi-bio
4731                  */
4732                 if (atomic_read(&bbio->error) > bbio->max_errors) {
4733                         err = -EIO;
4734                 } else {
4735                         /*
4736                          * this bio is actually up to date, we didn't
4737                          * go over the max number of errors
4738                          */
4739                         set_bit(BIO_UPTODATE, &bio->bi_flags);
4740                         err = 0;
4741                 }
4742                 kfree(bbio);
4743
4744                 bio_endio(bio, err);
4745         } else if (!is_orig_bio) {
4746                 bio_put(bio);
4747         }
4748 }
4749
/*
 * Bundles a bio with its rw flags, a filesystem info pointer and a
 * btrfs_work item.
 *
 * NOTE(review): no user of this struct is visible in this part of the file;
 * presumably it was meant for handing a bio to a worker thread -- confirm
 * whether it is still referenced anywhere.
 */
struct async_sched {
        struct bio *bio;                /* bio this entry refers to */
        int rw;                         /* rw flags that go with the bio */
        struct btrfs_fs_info *info;     /* filesystem info pointer */
        struct btrfs_work work;         /* embedded work item */
};
4756
4757 /*
4758  * see run_scheduled_bios for a description of why bios are collected for
4759  * async submit.
4760  *
4761  * This will add one bio to the pending list for a device and make sure
4762  * the work struct is scheduled.
4763  */
4764 static noinline void schedule_bio(struct btrfs_root *root,
4765                                  struct btrfs_device *device,
4766                                  int rw, struct bio *bio)
4767 {
4768         int should_queue = 1;
4769         struct btrfs_pending_bios *pending_bios;
4770
4771         /* don't bother with additional async steps for reads, right now */
4772         if (!(rw & REQ_WRITE)) {
4773                 bio_get(bio);
4774                 btrfsic_submit_bio(rw, bio);
4775                 bio_put(bio);
4776                 return;
4777         }
4778
4779         /*
4780          * nr_async_bios allows us to reliably return congestion to the
4781          * higher layers.  Otherwise, the async bio makes it appear we have
4782          * made progress against dirty pages when we've really just put it
4783          * on a queue for later
4784          */
4785         atomic_inc(&root->fs_info->nr_async_bios);
4786         WARN_ON(bio->bi_next);
4787         bio->bi_next = NULL;
4788         bio->bi_rw |= rw;
4789
4790         spin_lock(&device->io_lock);
4791         if (bio->bi_rw & REQ_SYNC)
4792                 pending_bios = &device->pending_sync_bios;
4793         else
4794                 pending_bios = &device->pending_bios;
4795
4796         if (pending_bios->tail)
4797                 pending_bios->tail->bi_next = bio;
4798
4799         pending_bios->tail = bio;
4800         if (!pending_bios->head)
4801                 pending_bios->head = bio;
4802         if (device->running_pending)
4803                 should_queue = 0;
4804
4805         spin_unlock(&device->io_lock);
4806
4807         if (should_queue)
4808                 btrfs_queue_worker(&root->fs_info->submit_workers,
4809                                    &device->work);
4810 }
4811
4812 static int bio_size_ok(struct block_device *bdev, struct bio *bio,
4813                        sector_t sector)
4814 {
4815         struct bio_vec *prev;
4816         struct request_queue *q = bdev_get_queue(bdev);
4817         unsigned short max_sectors = queue_max_sectors(q);
4818         struct bvec_merge_data bvm = {
4819                 .bi_bdev = bdev,
4820                 .bi_sector = sector,
4821                 .bi_rw = bio->bi_rw,
4822         };
4823
4824         if (bio->bi_vcnt == 0) {
4825                 WARN_ON(1);
4826                 return 1;
4827         }
4828
4829         prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
4830         if ((bio->bi_size >> 9) > max_sectors)
4831                 return 0;
4832
4833         if (!q->merge_bvec_fn)
4834                 return 1;
4835
4836         bvm.bi_size = bio->bi_size - prev->bv_len;
4837         if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
4838                 return 0;
4839         return 1;
4840 }
4841
4842 static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4843                               struct bio *bio, u64 physical, int dev_nr,
4844                               int rw, int async)
4845 {
4846         struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
4847
4848         bio->bi_private = bbio;
4849         bio->bi_private = merge_stripe_index_into_bio_private(
4850                         bio->bi_private, (unsigned int)dev_nr);
4851         bio->bi_end_io = btrfs_end_bio;
4852         bio->bi_sector = physical >> 9;
4853 #ifdef DEBUG
4854         {
4855                 struct rcu_string *name;
4856
4857                 rcu_read_lock();
4858                 name = rcu_dereference(dev->name);
4859                 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4860                          "(%s id %llu), size=%u\n", rw,
4861                          (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4862                          name->str, dev->devid, bio->bi_size);
4863                 rcu_read_unlock();
4864         }
4865 #endif
4866         bio->bi_bdev = dev->bdev;
4867         if (async)
4868                 schedule_bio(root, dev, rw, bio);
4869         else
4870                 btrfsic_submit_bio(rw, bio);
4871 }
4872
4873 static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4874                               struct bio *first_bio, struct btrfs_device *dev,
4875                               int dev_nr, int rw, int async)
4876 {
4877         struct bio_vec *bvec = first_bio->bi_io_vec;
4878         struct bio *bio;
4879         int nr_vecs = bio_get_nr_vecs(dev->bdev);
4880         u64 physical = bbio->stripes[dev_nr].physical;
4881
4882 again:
4883         bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
4884         if (!bio)
4885                 return -ENOMEM;
4886
4887         while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
4888                 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
4889                                  bvec->bv_offset) < bvec->bv_len) {
4890                         u64 len = bio->bi_size;
4891
4892                         atomic_inc(&bbio->stripes_pending);
4893                         submit_stripe_bio(root, bbio, bio, physical, dev_nr,
4894                                           rw, async);
4895                         physical += len;
4896                         goto again;
4897                 }
4898                 bvec++;
4899         }
4900
4901         submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
4902         return 0;
4903 }
4904
4905 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
4906 {
4907         atomic_inc(&bbio->error);
4908         if (atomic_dec_and_test(&bbio->stripes_pending)) {
4909                 bio->bi_private = bbio->private;
4910                 bio->bi_end_io = bbio->end_io;
4911                 bio->bi_bdev = (struct block_device *)
4912                         (unsigned long)bbio->mirror_num;
4913                 bio->bi_sector = logical >> 9;
4914                 kfree(bbio);
4915                 bio_endio(bio, -EIO);
4916         }
4917 }
4918
4919 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4920                   int mirror_num, int async_submit)
4921 {
4922         struct btrfs_device *dev;
4923         struct bio *first_bio = bio;
4924         u64 logical = (u64)bio->bi_sector << 9;
4925         u64 length = 0;
4926         u64 map_length;
4927         int ret;
4928         int dev_nr = 0;
4929         int total_devs = 1;
4930         struct btrfs_bio *bbio = NULL;
4931
4932         length = bio->bi_size;
4933         map_length = length;
4934
4935         ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4936                               mirror_num);
4937         if (ret)
4938                 return ret;
4939
4940         total_devs = bbio->num_stripes;
4941         if (map_length < length) {
4942                 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
4943                        "len %llu\n", (unsigned long long)logical,
4944                        (unsigned long long)length,
4945                        (unsigned long long)map_length);
4946                 BUG();
4947         }
4948
4949         bbio->orig_bio = first_bio;
4950         bbio->private = first_bio->bi_private;
4951         bbio->end_io = first_bio->bi_end_io;
4952         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4953
4954         while (dev_nr < total_devs) {
4955                 dev = bbio->stripes[dev_nr].dev;
4956                 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
4957                         bbio_error(bbio, first_bio, logical);
4958                         dev_nr++;
4959                         continue;
4960                 }
4961
4962                 /*
4963                  * Check and see if we're ok with this bio based on it's size
4964                  * and offset with the given device.
4965                  */
4966                 if (!bio_size_ok(dev->bdev, first_bio,
4967                                  bbio->stripes[dev_nr].physical >> 9)) {
4968                         ret = breakup_stripe_bio(root, bbio, first_bio, dev,
4969                                                  dev_nr, rw, async_submit);
4970                         BUG_ON(ret);
4971                         dev_nr++;
4972                         continue;
4973                 }
4974
4975                 if (dev_nr < total_devs - 1) {
4976                         bio = bio_clone(first_bio, GFP_NOFS);
4977                         BUG_ON(!bio); /* -ENOMEM */
4978                 } else {
4979                         bio = first_bio;
4980                 }
4981
4982                 submit_stripe_bio(root, bbio, bio,
4983                                   bbio->stripes[dev_nr].physical, dev_nr, rw,
4984                                   async_submit);
4985                 dev_nr++;
4986         }
4987         return 0;
4988 }
4989
4990 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
4991                                        u8 *uuid, u8 *fsid)
4992 {
4993         struct btrfs_device *device;
4994         struct btrfs_fs_devices *cur_devices;
4995
4996         cur_devices = fs_info->fs_devices;
4997         while (cur_devices) {
4998                 if (!fsid ||
4999                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
5000                         device = __find_device(&cur_devices->devices,
5001                                                devid, uuid);
5002                         if (device)
5003                                 return device;
5004                 }
5005                 cur_devices = cur_devices->seed;
5006         }
5007         return NULL;
5008 }
5009
5010 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5011                                             u64 devid, u8 *dev_uuid)
5012 {
5013         struct btrfs_device *device;
5014         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
5015
5016         device = kzalloc(sizeof(*device), GFP_NOFS);
5017         if (!device)
5018                 return NULL;
5019         list_add(&device->dev_list,
5020                  &fs_devices->devices);
5021         device->dev_root = root->fs_info->dev_root;
5022         device->devid = devid;
5023         device->work.func = pending_bios_fn;
5024         device->fs_devices = fs_devices;
5025         device->missing = 1;
5026         fs_devices->num_devices++;
5027         fs_devices->missing_devices++;
5028         spin_lock_init(&device->io_lock);
5029         INIT_LIST_HEAD(&device->dev_alloc_list);
5030         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
5031         return device;
5032 }
5033
5034 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
5035                           struct extent_buffer *leaf,
5036                           struct btrfs_chunk *chunk)
5037 {
5038         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
5039         struct map_lookup *map;
5040         struct extent_map *em;
5041         u64 logical;
5042         u64 length;
5043         u64 devid;
5044         u8 uuid[BTRFS_UUID_SIZE];
5045         int num_stripes;
5046         int ret;
5047         int i;
5048
5049         logical = key->offset;
5050         length = btrfs_chunk_length(leaf, chunk);
5051
5052         read_lock(&map_tree->map_tree.lock);
5053         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
5054         read_unlock(&map_tree->map_tree.lock);
5055
5056         /* already mapped? */
5057         if (em && em->start <= logical && em->start + em->len > logical) {
5058                 free_extent_map(em);
5059                 return 0;
5060         } else if (em) {
5061                 free_extent_map(em);
5062         }
5063
5064         em = alloc_extent_map();
5065         if (!em)
5066                 return -ENOMEM;
5067         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
5068         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
5069         if (!map) {
5070                 free_extent_map(em);
5071                 return -ENOMEM;
5072         }
5073
5074         em->bdev = (struct block_device *)map;
5075         em->start = logical;
5076         em->len = length;
5077         em->orig_start = 0;
5078         em->block_start = 0;
5079         em->block_len = em->len;
5080
5081         map->num_stripes = num_stripes;
5082         map->io_width = btrfs_chunk_io_width(leaf, chunk);
5083         map->io_align = btrfs_chunk_io_align(leaf, chunk);
5084         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
5085         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
5086         map->type = btrfs_chunk_type(leaf, chunk);
5087         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
5088         for (i = 0; i < num_stripes; i++) {
5089                 map->stripes[i].physical =
5090                         btrfs_stripe_offset_nr(leaf, chunk, i);
5091                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
5092                 read_extent_buffer(leaf, uuid, (unsigned long)
5093                                    btrfs_stripe_dev_uuid_nr(chunk, i),
5094                                    BTRFS_UUID_SIZE);
5095                 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
5096                                                         uuid, NULL);
5097                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
5098                         kfree(map);
5099                         free_extent_map(em);
5100                         return -EIO;
5101                 }
5102                 if (!map->stripes[i].dev) {
5103                         map->stripes[i].dev =
5104                                 add_missing_dev(root, devid, uuid);
5105                         if (!map->stripes[i].dev) {
5106                                 kfree(map);
5107                                 free_extent_map(em);
5108                                 return -EIO;
5109                         }
5110                 }
5111                 map->stripes[i].dev->in_fs_metadata = 1;
5112         }
5113
5114         write_lock(&map_tree->map_tree.lock);
5115         ret = add_extent_mapping(&map_tree->map_tree, em);
5116         write_unlock(&map_tree->map_tree.lock);
5117         BUG_ON(ret); /* Tree corruption */
5118         free_extent_map(em);
5119
5120         return 0;
5121 }
5122
5123 static void fill_device_from_item(struct extent_buffer *leaf,
5124                                  struct btrfs_dev_item *dev_item,
5125                                  struct btrfs_device *device)
5126 {
5127         unsigned long ptr;
5128
5129         device->devid = btrfs_device_id(leaf, dev_item);
5130         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
5131         device->total_bytes = device->disk_total_bytes;
5132         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
5133         device->type = btrfs_device_type(leaf, dev_item);
5134         device->io_align = btrfs_device_io_align(leaf, dev_item);
5135         device->io_width = btrfs_device_io_width(leaf, dev_item);
5136         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
5137         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
5138         device->is_tgtdev_for_dev_replace = 0;
5139
5140         ptr = (unsigned long)btrfs_device_uuid(dev_item);
5141         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
5142 }
5143
5144 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
5145 {
5146         struct btrfs_fs_devices *fs_devices;
5147         int ret;
5148
5149         BUG_ON(!mutex_is_locked(&uuid_mutex));
5150
5151         fs_devices = root->fs_info->fs_devices->seed;
5152         while (fs_devices) {
5153                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
5154                         ret = 0;
5155                         goto out;
5156                 }
5157                 fs_devices = fs_devices->seed;
5158         }
5159
5160         fs_devices = find_fsid(fsid);
5161         if (!fs_devices) {
5162                 ret = -ENOENT;
5163                 goto out;
5164         }
5165
5166         fs_devices = clone_fs_devices(fs_devices);
5167         if (IS_ERR(fs_devices)) {
5168                 ret = PTR_ERR(fs_devices);
5169                 goto out;
5170         }
5171
5172         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
5173                                    root->fs_info->bdev_holder);
5174         if (ret) {
5175                 free_fs_devices(fs_devices);
5176                 goto out;
5177         }
5178
5179         if (!fs_devices->seeding) {
5180                 __btrfs_close_devices(fs_devices);
5181                 free_fs_devices(fs_devices);
5182                 ret = -EINVAL;
5183                 goto out;
5184         }
5185
5186         fs_devices->seed = root->fs_info->fs_devices->seed;
5187         root->fs_info->fs_devices->seed = fs_devices;
5188 out:
5189         return ret;
5190 }
5191
5192 static int read_one_dev(struct btrfs_root *root,
5193                         struct extent_buffer *leaf,
5194                         struct btrfs_dev_item *dev_item)
5195 {
5196         struct btrfs_device *device;
5197         u64 devid;
5198         int ret;
5199         u8 fs_uuid[BTRFS_UUID_SIZE];
5200         u8 dev_uuid[BTRFS_UUID_SIZE];
5201
5202         devid = btrfs_device_id(leaf, dev_item);
5203         read_extent_buffer(leaf, dev_uuid,
5204                            (unsigned long)btrfs_device_uuid(dev_item),
5205                            BTRFS_UUID_SIZE);
5206         read_extent_buffer(leaf, fs_uuid,
5207                            (unsigned long)btrfs_device_fsid(dev_item),
5208                            BTRFS_UUID_SIZE);
5209
5210         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
5211                 ret = open_seed_devices(root, fs_uuid);
5212                 if (ret && !btrfs_test_opt(root, DEGRADED))
5213                         return ret;
5214         }
5215
5216         device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
5217         if (!device || !device->bdev) {
5218                 if (!btrfs_test_opt(root, DEGRADED))
5219                         return -EIO;
5220
5221                 if (!device) {
5222                         printk(KERN_WARNING "warning devid %llu missing\n",
5223                                (unsigned long long)devid);
5224                         device = add_missing_dev(root, devid, dev_uuid);
5225                         if (!device)
5226                                 return -ENOMEM;
5227                 } else if (!device->missing) {
5228                         /*
5229                          * this happens when a device that was properly setup
5230                          * in the device info lists suddenly goes bad.
5231                          * device->bdev is NULL, and so we have to set
5232                          * device->missing to one here
5233                          */
5234                         root->fs_info->fs_devices->missing_devices++;
5235                         device->missing = 1;
5236                 }
5237         }
5238
5239         if (device->fs_devices != root->fs_info->fs_devices) {
5240                 BUG_ON(device->writeable);
5241                 if (device->generation !=
5242                     btrfs_device_generation(leaf, dev_item))
5243                         return -EINVAL;
5244         }
5245
5246         fill_device_from_item(leaf, dev_item, device);
5247         device->dev_root = root->fs_info->dev_root;
5248         device->in_fs_metadata = 1;
5249         if (device->writeable && !device->is_tgtdev_for_dev_replace) {
5250                 device->fs_devices->total_rw_bytes += device->total_bytes;
5251                 spin_lock(&root->fs_info->free_chunk_lock);
5252                 root->fs_info->free_chunk_space += device->total_bytes -
5253                         device->bytes_used;
5254                 spin_unlock(&root->fs_info->free_chunk_lock);
5255         }
5256         ret = 0;
5257         return ret;
5258 }
5259
5260 int btrfs_read_sys_array(struct btrfs_root *root)
5261 {
5262         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
5263         struct extent_buffer *sb;
5264         struct btrfs_disk_key *disk_key;
5265         struct btrfs_chunk *chunk;
5266         u8 *ptr;
5267         unsigned long sb_ptr;
5268         int ret = 0;
5269         u32 num_stripes;
5270         u32 array_size;
5271         u32 len = 0;
5272         u32 cur;
5273         struct btrfs_key key;
5274
5275         sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
5276                                           BTRFS_SUPER_INFO_SIZE);
5277         if (!sb)
5278                 return -ENOMEM;
5279         btrfs_set_buffer_uptodate(sb);
5280         btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
5281         /*
5282          * The sb extent buffer is artifical and just used to read the system array.
5283          * btrfs_set_buffer_uptodate() call does not properly mark all it's
5284          * pages up-to-date when the page is larger: extent does not cover the
5285          * whole page and consequently check_page_uptodate does not find all
5286          * the page's extents up-to-date (the hole beyond sb),
5287          * write_extent_buffer then triggers a WARN_ON.
5288          *
5289          * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
5290          * but sb spans only this function. Add an explicit SetPageUptodate call
5291          * to silence the warning eg. on PowerPC 64.
5292          */
5293         if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
5294                 SetPageUptodate(sb->pages[0]);
5295
5296         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
5297         array_size = btrfs_super_sys_array_size(super_copy);
5298
5299         ptr = super_copy->sys_chunk_array;
5300         sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
5301         cur = 0;
5302
5303         while (cur < array_size) {
5304                 disk_key = (struct btrfs_disk_key *)ptr;
5305                 btrfs_disk_key_to_cpu(&key, disk_key);
5306
5307                 len = sizeof(*disk_key); ptr += len;
5308                 sb_ptr += len;
5309                 cur += len;
5310
5311                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
5312                         chunk = (struct btrfs_chunk *)sb_ptr;
5313                         ret = read_one_chunk(root, &key, sb, chunk);
5314                         if (ret)
5315                                 break;
5316                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
5317                         len = btrfs_chunk_item_size(num_stripes);
5318                 } else {
5319                         ret = -EIO;
5320                         break;
5321                 }
5322                 ptr += len;
5323                 sb_ptr += len;
5324                 cur += len;
5325         }
5326         free_extent_buffer(sb);
5327         return ret;
5328 }
5329
5330 int btrfs_read_chunk_tree(struct btrfs_root *root)
5331 {
5332         struct btrfs_path *path;
5333         struct extent_buffer *leaf;
5334         struct btrfs_key key;
5335         struct btrfs_key found_key;
5336         int ret;
5337         int slot;
5338
5339         root = root->fs_info->chunk_root;
5340
5341         path = btrfs_alloc_path();
5342         if (!path)
5343                 return -ENOMEM;
5344
5345         mutex_lock(&uuid_mutex);
5346         lock_chunks(root);
5347
5348         /* first we search for all of the device items, and then we
5349          * read in all of the chunk items.  This way we can create chunk
5350          * mappings that reference all of the devices that are afound
5351          */
5352         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
5353         key.offset = 0;
5354         key.type = 0;
5355 again:
5356         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5357         if (ret < 0)
5358                 goto error;
5359         while (1) {
5360                 leaf = path->nodes[0];
5361                 slot = path->slots[0];
5362                 if (slot >= btrfs_header_nritems(leaf)) {
5363                         ret = btrfs_next_leaf(root, path);
5364                         if (ret == 0)
5365                                 continue;
5366                         if (ret < 0)
5367                                 goto error;
5368                         break;
5369                 }
5370                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5371                 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
5372                         if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
5373                                 break;
5374                         if (found_key.type == BTRFS_DEV_ITEM_KEY) {
5375                                 struct btrfs_dev_item *dev_item;
5376                                 dev_item = btrfs_item_ptr(leaf, slot,
5377                                                   struct btrfs_dev_item);
5378                                 ret = read_one_dev(root, leaf, dev_item);
5379                                 if (ret)
5380                                         goto error;
5381                         }
5382                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
5383                         struct btrfs_chunk *chunk;
5384                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
5385                         ret = read_one_chunk(root, &found_key, leaf, chunk);
5386                         if (ret)
5387                                 goto error;
5388                 }
5389                 path->slots[0]++;
5390         }
5391         if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
5392                 key.objectid = 0;
5393                 btrfs_release_path(path);
5394                 goto again;
5395         }
5396         ret = 0;
5397 error:
5398         unlock_chunks(root);
5399         mutex_unlock(&uuid_mutex);
5400
5401         btrfs_free_path(path);
5402         return ret;
5403 }
5404
5405 static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
5406 {
5407         int i;
5408
5409         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
5410                 btrfs_dev_stat_reset(dev, i);
5411 }
5412
5413 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
5414 {
5415         struct btrfs_key key;
5416         struct btrfs_key found_key;
5417         struct btrfs_root *dev_root = fs_info->dev_root;
5418         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
5419         struct extent_buffer *eb;
5420         int slot;
5421         int ret = 0;
5422         struct btrfs_device *device;
5423         struct btrfs_path *path = NULL;
5424         int i;
5425
5426         path = btrfs_alloc_path();
5427         if (!path) {
5428                 ret = -ENOMEM;
5429                 goto out;
5430         }
5431
5432         mutex_lock(&fs_devices->device_list_mutex);
5433         list_for_each_entry(device, &fs_devices->devices, dev_list) {
5434                 int item_size;
5435                 struct btrfs_dev_stats_item *ptr;
5436
5437                 key.objectid = 0;
5438                 key.type = BTRFS_DEV_STATS_KEY;
5439                 key.offset = device->devid;
5440                 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
5441                 if (ret) {
5442                         __btrfs_reset_dev_stats(device);
5443                         device->dev_stats_valid = 1;
5444                         btrfs_release_path(path);
5445                         continue;
5446                 }
5447                 slot = path->slots[0];
5448                 eb = path->nodes[0];
5449                 btrfs_item_key_to_cpu(eb, &found_key, slot);
5450                 item_size = btrfs_item_size_nr(eb, slot);
5451
5452                 ptr = btrfs_item_ptr(eb, slot,
5453                                      struct btrfs_dev_stats_item);
5454
5455                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
5456                         if (item_size >= (1 + i) * sizeof(__le64))
5457                                 btrfs_dev_stat_set(device, i,
5458                                         btrfs_dev_stats_value(eb, ptr, i));
5459                         else
5460                                 btrfs_dev_stat_reset(device, i);
5461                 }
5462
5463                 device->dev_stats_valid = 1;
5464                 btrfs_dev_stat_print_on_load(device);
5465                 btrfs_release_path(path);
5466         }
5467         mutex_unlock(&fs_devices->device_list_mutex);
5468
5469 out:
5470         btrfs_free_path(path);
5471         return ret < 0 ? ret : 0;
5472 }
5473
5474 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
5475                                 struct btrfs_root *dev_root,
5476                                 struct btrfs_device *device)
5477 {
5478         struct btrfs_path *path;
5479         struct btrfs_key key;
5480         struct extent_buffer *eb;
5481         struct btrfs_dev_stats_item *ptr;
5482         int ret;
5483         int i;
5484
5485         key.objectid = 0;
5486         key.type = BTRFS_DEV_STATS_KEY;
5487         key.offset = device->devid;
5488
5489         path = btrfs_alloc_path();
5490         BUG_ON(!path);
5491         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
5492         if (ret < 0) {
5493                 printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
5494                               ret, rcu_str_deref(device->name));
5495                 goto out;
5496         }
5497
5498         if (ret == 0 &&
5499             btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
5500                 /* need to delete old one and insert a new one */
5501                 ret = btrfs_del_item(trans, dev_root, path);
5502                 if (ret != 0) {
5503                         printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
5504                                       rcu_str_deref(device->name), ret);
5505                         goto out;
5506                 }
5507                 ret = 1;
5508         }
5509
5510         if (ret == 1) {
5511                 /* need to insert a new item */
5512                 btrfs_release_path(path);
5513                 ret = btrfs_insert_empty_item(trans, dev_root, path,
5514                                               &key, sizeof(*ptr));
5515                 if (ret < 0) {
5516                         printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
5517                                       rcu_str_deref(device->name), ret);
5518                         goto out;
5519                 }
5520         }
5521
5522         eb = path->nodes[0];
5523         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
5524         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
5525                 btrfs_set_dev_stats_value(eb, ptr, i,
5526                                           btrfs_dev_stat_read(device, i));
5527         btrfs_mark_buffer_dirty(eb);
5528
5529 out:
5530         btrfs_free_path(path);
5531         return ret;
5532 }
5533
5534 /*
5535  * called from commit_transaction. Writes all changed device stats to disk.
5536  */
5537 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
5538                         struct btrfs_fs_info *fs_info)
5539 {
5540         struct btrfs_root *dev_root = fs_info->dev_root;
5541         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
5542         struct btrfs_device *device;
5543         int ret = 0;
5544
5545         mutex_lock(&fs_devices->device_list_mutex);
5546         list_for_each_entry(device, &fs_devices->devices, dev_list) {
5547                 if (!device->dev_stats_valid || !device->dev_stats_dirty)
5548                         continue;
5549
5550                 ret = update_dev_stat_item(trans, dev_root, device);
5551                 if (!ret)
5552                         device->dev_stats_dirty = 0;
5553         }
5554         mutex_unlock(&fs_devices->device_list_mutex);
5555
5556         return ret;
5557 }
5558
/*
 * Convenience wrapper: calls btrfs_dev_stat_inc() for the stat @index of
 * @dev and then btrfs_dev_stat_print_on_error() to log the device's
 * counters.
 */
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
        btrfs_dev_stat_inc(dev, index);
        btrfs_dev_stat_print_on_error(dev);
}
5564
5565 void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
5566 {
5567         if (!dev->dev_stats_valid)
5568                 return;
5569         printk_ratelimited_in_rcu(KERN_ERR
5570                            "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
5571                            rcu_str_deref(dev->name),
5572                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
5573                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
5574                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
5575                            btrfs_dev_stat_read(dev,
5576                                                BTRFS_DEV_STAT_CORRUPTION_ERRS),
5577                            btrfs_dev_stat_read(dev,
5578                                                BTRFS_DEV_STAT_GENERATION_ERRS));
5579 }
5580
5581 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
5582 {
5583         int i;
5584
5585         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
5586                 if (btrfs_dev_stat_read(dev, i) != 0)
5587                         break;
5588         if (i == BTRFS_DEV_STAT_VALUES_MAX)
5589                 return; /* all values == 0, suppress message */
5590
5591         printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
5592                rcu_str_deref(dev->name),
5593                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
5594                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
5595                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
5596                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
5597                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
5598 }
5599
5600 int btrfs_get_dev_stats(struct btrfs_root *root,
5601                         struct btrfs_ioctl_get_dev_stats *stats)
5602 {
5603         struct btrfs_device *dev;
5604         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
5605         int i;
5606
5607         mutex_lock(&fs_devices->device_list_mutex);
5608         dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
5609         mutex_unlock(&fs_devices->device_list_mutex);
5610
5611         if (!dev) {
5612                 printk(KERN_WARNING
5613                        "btrfs: get dev_stats failed, device not found\n");
5614                 return -ENODEV;
5615         } else if (!dev->dev_stats_valid) {
5616                 printk(KERN_WARNING
5617                        "btrfs: get dev_stats failed, not yet valid\n");
5618                 return -ENODEV;
5619         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
5620                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
5621                         if (stats->nr_items > i)
5622                                 stats->values[i] =
5623                                         btrfs_dev_stat_read_and_reset(dev, i);
5624                         else
5625                                 btrfs_dev_stat_reset(dev, i);
5626                 }
5627         } else {
5628                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
5629                         if (stats->nr_items > i)
5630                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
5631         }
5632         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
5633                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
5634         return 0;
5635 }
5636
5637 int btrfs_scratch_superblock(struct btrfs_device *device)
5638 {
5639         struct buffer_head *bh;
5640         struct btrfs_super_block *disk_super;
5641
5642         bh = btrfs_read_dev_super(device->bdev);
5643         if (!bh)
5644                 return -EINVAL;
5645         disk_super = (struct btrfs_super_block *)bh->b_data;
5646
5647         memset(&disk_super->magic, 0, sizeof(disk_super->magic));
5648         set_buffer_dirty(bh);
5649         sync_dirty_buffer(bh);
5650         brelse(bh);
5651
5652         return 0;
5653 }