fs/btrfs/volumes.c

   1 /*
   2  * Copyright (C) 2007 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18 #include <linux/sched.h>
  19 #include <linux/bio.h>
  20 #include <linux/slab.h>
  21 #include <linux/buffer_head.h>
  22 #include <linux/blkdev.h>
  23 #include <linux/random.h>
  24 #include <linux/iocontext.h>
  25 #include <linux/capability.h>
  26 #include <linux/ratelimit.h>
  27 #include <linux/kthread.h>
  28 #include <linux/raid/pq.h>
  29 #include <asm/div64.h>
  30 #include "compat.h"
  31 #include "ctree.h"
  32 #include "extent_map.h"
  33 #include "disk-io.h"
  34 #include "transaction.h"
  35 #include "print-tree.h"
  36 #include "volumes.h"
  37 #include "raid56.h"
  38 #include "async-thread.h"
  39 #include "check-integrity.h"
  40 #include "rcu-string.h"
  41 #include "math.h"
  42 #include "dev-replace.h"
  43
     /*
      * Prototypes for static helpers that are used before their definitions
      * (the definitions are not part of the code visible around here).
      */
  44 static int init_first_rw_device(struct btrfs_trans_handle *trans,
  45                                 struct btrfs_root *root,
  46                                 struct btrfs_device *device);
  47 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
  48 static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
  49 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
  50 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
  51
     /*
      * uuid_mutex is taken by btrfs_scan_one_device(), btrfs_open_devices(),
      * btrfs_close_devices() and btrfs_close_extra_devices().
      *
      * fs_uuids is the global list of struct btrfs_fs_devices, linked through
      * their ->list member: device_list_add() inserts into it, find_fsid()
      * searches it and btrfs_cleanup_fs_uuids() empties it.
      */
  52 static DEFINE_MUTEX(uuid_mutex);
  53 static LIST_HEAD(fs_uuids);
  54
  55 static void lock_chunks(struct btrfs_root *root)
  56 {
  57         mutex_lock(&root->fs_info->chunk_mutex);
  58 }
  59
  60 static void unlock_chunks(struct btrfs_root *root)
  61 {
  62         mutex_unlock(&root->fs_info->chunk_mutex);
  63 }
  64
  65 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
  66 {
  67         struct btrfs_device *device;
  68         WARN_ON(fs_devices->opened);
  69         while (!list_empty(&fs_devices->devices)) {
  70                 device = list_entry(fs_devices->devices.next,
  71                                     struct btrfs_device, dev_list);
  72                 list_del(&device->dev_list);
  73                 rcu_string_free(device->name);
  74                 kfree(device);
  75         }
  76         kfree(fs_devices);
  77 }
  78
  79 static void btrfs_kobject_uevent(struct block_device *bdev,
  80                                  enum kobject_action action)
  81 {
  82         int ret;
  83
  84         ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
  85         if (ret)
  86                 pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
  87                         action,
  88                         kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
  89                         &disk_to_dev(bdev->bd_disk)->kobj);
  90 }
  91
  92 void btrfs_cleanup_fs_uuids(void)
  93 {
  94         struct btrfs_fs_devices *fs_devices;
  95
  96         while (!list_empty(&fs_uuids)) {
  97                 fs_devices = list_entry(fs_uuids.next,
  98                                         struct btrfs_fs_devices, list);
  99                 list_del(&fs_devices->list);
 100                 free_fs_devices(fs_devices);
 101         }
 102 }
 103
 104 static noinline struct btrfs_device *__find_device(struct list_head *head,
 105                                                    u64 devid, u8 *uuid)
 106 {
 107         struct btrfs_device *dev;
 108
 109         list_for_each_entry(dev, head, dev_list) {
 110                 if (dev->devid == devid &&
 111                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
 112                         return dev;
 113                 }
 114         }
 115         return NULL;
 116 }
 117
 118 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 119 {
 120         struct btrfs_fs_devices *fs_devices;
 121
 122         list_for_each_entry(fs_devices, &fs_uuids, list) {
 123                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 124                         return fs_devices;
 125         }
 126         return NULL;
 127 }
 128
 129 static int
 130 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 131                       int flush, struct block_device **bdev,
 132                       struct buffer_head **bh)
 133 {
 134         int ret;
 135
 136         *bdev = blkdev_get_by_path(device_path, flags, holder);
 137
 138         if (IS_ERR(*bdev)) {
 139                 ret = PTR_ERR(*bdev);
 140                 printk(KERN_INFO "btrfs: open %s failed\n", device_path);
 141                 goto error;
 142         }
 143
 144         if (flush)
 145                 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
 146         ret = set_blocksize(*bdev, 4096);
 147         if (ret) {
 148                 blkdev_put(*bdev, flags);
 149                 goto error;
 150         }
 151         invalidate_bdev(*bdev);
 152         *bh = btrfs_read_dev_super(*bdev);
 153         if (!*bh) {
 154                 ret = -EINVAL;
 155                 blkdev_put(*bdev, flags);
 156                 goto error;
 157         }
 158
 159         return 0;
 160
 161 error:
 162         *bdev = NULL;
 163         *bh = NULL;
 164         return ret;
 165 }
 166
 167 static void requeue_list(struct btrfs_pending_bios *pending_bios,
 168                         struct bio *head, struct bio *tail)
 169 {
 170
 171         struct bio *old_head;
 172
 173         old_head = pending_bios->head;
 174         pending_bios->head = head;
 175         if (pending_bios->tail)
 176                 tail->bi_next = old_head;
 177         else
 178                 pending_bios->tail = tail;
 179 }
 180
 181 /*
 182  * we try to collect pending bios for a device so we don't get a large
 183  * number of procs sending bios down to the same device.  This greatly
 184  * improves the scheduler's ability to collect and merge the bios.
 185  *
 186  * But, it also turns into a long list of bios to process and that is sure
 187  * to eventually make the worker thread block.  The solution here is to
 188  * make some progress and then put this work struct back at the end of
 189  * the list if the block device is congested.  This way, multiple devices
 190  * can make progress from a single worker thread.
      *
      * @device: the device whose pending_sync_bios and pending_bios lists are
      *          drained.  The lists are detached and re-attached under
      *          device->io_lock; the bios themselves are submitted with the
      *          lock dropped.
 191  */
 192 static noinline void run_scheduled_bios(struct btrfs_device *device)
 193 {
 194         struct bio *pending;
 195         struct backing_dev_info *bdi;
 196         struct btrfs_fs_info *fs_info;
 197         struct btrfs_pending_bios *pending_bios;
 198         struct bio *tail;
 199         struct bio *cur;
 200         int again = 0;
 201         unsigned long num_run;
 202         unsigned long batch_run = 0;
 203         unsigned long limit;
 204         unsigned long last_waited = 0;
 205         int force_reg = 0;
 206         int sync_pending = 0;
 207         struct blk_plug plug;
 208
 209         /*
 210          * this function runs all the bios we've collected for
 211          * a particular device.  We don't want to wander off to
 212          * another device without first sending all of these down.
 213          * So, setup a plug here and finish it off before we return
 214          */
 215         blk_start_plug(&plug);
 216
 217         bdi = blk_get_backing_dev_info(device->bdev);
 218         fs_info = device->dev_root->fs_info;
             /*
              * waiters on async_submit_wait are woken below as soon as
              * nr_async_bios drops under two thirds of the submit limit
              */
 219         limit = btrfs_async_submit_limit(fs_info);
 220         limit = limit * 2 / 3;
 221
 222 loop:
 223         spin_lock(&device->io_lock);
 224
     /* every jump to loop_lock arrives with device->io_lock already held */
 225 loop_lock:
 226         num_run = 0;
 227
 228         /* take all the bios off the list at once and process them
 229          * later on (without the lock held).  But, remember the
 230          * tail and other pointers so the bios can be properly reinserted
 231          * into the list if we hit congestion
 232          */
             /*
              * force_reg makes consecutive passes alternate: right after a
              * pass over the sync list the next pass takes the regular list,
              * even if more sync bios were queued in the meantime
              */
 233         if (!force_reg && device->pending_sync_bios.head) {
 234                 pending_bios = &device->pending_sync_bios;
 235                 force_reg = 1;
 236         } else {
 237                 pending_bios = &device->pending_bios;
 238                 force_reg = 0;
 239         }
 240
 241         pending = pending_bios->head;
 242         tail = pending_bios->tail;
 243         WARN_ON(pending && !tail);
 244
 245         /*
 246          * if pending was null this time around, no bios need processing
 247          * at all and we can stop.  Otherwise it'll loop back up again
 248          * and do an additional check so no bios are missed.
 249          *
 250          * device->running_pending is used to synchronize with the
 251          * schedule_bio code.
 252          */
 253         if (device->pending_sync_bios.head == NULL &&
 254             device->pending_bios.head == NULL) {
 255                 again = 0;
 256                 device->running_pending = 0;
 257         } else {
 258                 again = 1;
 259                 device->running_pending = 1;
 260         }
 261
             /* the chosen list is now owned locally via pending/tail */
 262         pending_bios->head = NULL;
 263         pending_bios->tail = NULL;
 264
 265         spin_unlock(&device->io_lock);
 266
 267         while (pending) {
 268
 269                 rmb();
 270                 /* we want to work on both lists, but do more bios on the
 271                  * sync list than the regular list
 272                  */
 273                 if ((num_run > 32 &&
 274                     pending_bios != &device->pending_sync_bios &&
 275                     device->pending_sync_bios.head) ||
 276                    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
 277                     device->pending_bios.head)) {
                             /*
                              * put the unsubmitted remainder back and restart
                              * at loop_lock; the lock taken here stays held
                              * across the jump
                              */
 278                         spin_lock(&device->io_lock);
 279                         requeue_list(pending_bios, pending, tail);
 280                         goto loop_lock;
 281                 }
 282
                     /* detach the first bio from the local chain */
 283                 cur = pending;
 284                 pending = pending->bi_next;
 285                 cur->bi_next = NULL;
 286
 287                 if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
 288                     waitqueue_active(&fs_info->async_submit_wait))
 289                         wake_up(&fs_info->async_submit_wait);
 290
 291                 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 292
 293                 /*
 294                  * if we're doing the sync list, record that our
 295                  * plug has some sync requests on it
 296                  *
 297                  * If we're doing the regular list and there are
 298                  * sync requests sitting around, unplug before
 299                  * we add more
 300                  */
 301                 if (pending_bios == &device->pending_sync_bios) {
 302                         sync_pending = 1;
 303                 } else if (sync_pending) {
 304                         blk_finish_plug(&plug);
 305                         blk_start_plug(&plug);
 306                         sync_pending = 0;
 307                 }
 308
 309                 btrfsic_submit_bio(cur->bi_rw, cur);
 310                 num_run++;
 311                 batch_run++;
 312                 if (need_resched())
 313                         cond_resched();
 314
 315                 /*
 316                  * we made progress, there is more work to do and the bdi
 317                  * is now congested.  Back off and let other work structs
 318                  * run instead
 319                  */
 320                 if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
 321                     fs_info->fs_devices->open_devices > 1) {
 322                         struct io_context *ioc;
 323
 324                         ioc = current->io_context;
 325
 326                         /*
 327                          * the main goal here is that we don't want to
 328                          * block if we're going to be able to submit
 329                          * more requests without blocking.
 330                          *
 331                          * This code does two great things, it pokes into
 332                          * the elevator code from a filesystem _and_
 333                          * it makes assumptions about how batching works.
 334                          */
 335                         if (ioc && ioc->nr_batch_requests > 0 &&
 336                             time_before(jiffies, ioc->last_waited + HZ/50UL) &&
 337                             (last_waited == 0 ||
 338                              ioc->last_waited == last_waited)) {
 339                                 /*
 340                                  * we want to go through our batch of
 341                                  * requests and stop.  So, we copy out
 342                                  * the ioc->last_waited time and test
 343                                  * against it before looping
 344                                  */
 345                                 last_waited = ioc->last_waited;
 346                                 if (need_resched())
 347                                         cond_resched();
 348                                 continue;
 349                         }
                             /*
                              * congested: hand the rest of the chain back,
                              * flag the device as still having work and
                              * requeue device->work before bailing out
                              */
 350                         spin_lock(&device->io_lock);
 351                         requeue_list(pending_bios, pending, tail);
 352                         device->running_pending = 1;
 353
 354                         spin_unlock(&device->io_lock);
 355                         btrfs_requeue_work(&device->work);
 356                         goto done;
 357                 }
 358                 /* unplug every 64 requests just for good measure */
 359                 if (batch_run % 64 == 0) {
 360                         blk_finish_plug(&plug);
 361                         blk_start_plug(&plug);
 362                         sync_pending = 0;
 363                 }
 364         }
 365
 366         cond_resched();
             /* "again" was recorded under io_lock before the lists were emptied */
 367         if (again)
 368                 goto loop;
 369
             /*
              * final check, under io_lock, for bios queued while we were
              * submitting; loop_lock is entered with the lock still held
              */
 370         spin_lock(&device->io_lock);
 371         if (device->pending_bios.head || device->pending_sync_bios.head)
 372                 goto loop_lock;
 373         spin_unlock(&device->io_lock);
 374
 375 done:
             /* every way out of the function finishes the currently open plug */
 376         blk_finish_plug(&plug);
 377 }
 378
 379 static void pending_bios_fn(struct btrfs_work *work)
 380 {
 381         struct btrfs_device *device;
 382
 383         device = container_of(work, struct btrfs_device, work);
 384         run_scheduled_bios(device);
 385 }
 386
 387 static noinline int device_list_add(const char *path,
 388                            struct btrfs_super_block *disk_super,
 389                            u64 devid, struct btrfs_fs_devices **fs_devices_ret)
 390 {
 391         struct btrfs_device *device;
 392         struct btrfs_fs_devices *fs_devices;
 393         struct rcu_string *name;
 394         u64 found_transid = btrfs_super_generation(disk_super);
 395
 396         fs_devices = find_fsid(disk_super->fsid);
 397         if (!fs_devices) {
 398                 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
 399                 if (!fs_devices)
 400                         return -ENOMEM;
 401                 INIT_LIST_HEAD(&fs_devices->devices);
 402                 INIT_LIST_HEAD(&fs_devices->alloc_list);
 403                 list_add(&fs_devices->list, &fs_uuids);
 404                 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 405                 fs_devices->latest_devid = devid;
 406                 fs_devices->latest_trans = found_transid;
 407                 mutex_init(&fs_devices->device_list_mutex);
 408                 device = NULL;
 409         } else {
 410                 device = __find_device(&fs_devices->devices, devid,
 411                                        disk_super->dev_item.uuid);
 412         }
 413         if (!device) {
 414                 if (fs_devices->opened)
 415                         return -EBUSY;
 416
 417                 device = kzalloc(sizeof(*device), GFP_NOFS);
 418                 if (!device) {
 419                         /* we can safely leave the fs_devices entry around */
 420                         return -ENOMEM;
 421                 }
 422                 device->devid = devid;
 423                 device->dev_stats_valid = 0;
 424                 device->work.func = pending_bios_fn;
 425                 memcpy(device->uuid, disk_super->dev_item.uuid,
 426                        BTRFS_UUID_SIZE);
 427                 spin_lock_init(&device->io_lock);
 428
 429                 name = rcu_string_strdup(path, GFP_NOFS);
 430                 if (!name) {
 431                         kfree(device);
 432                         return -ENOMEM;
 433                 }
 434                 rcu_assign_pointer(device->name, name);
 435                 INIT_LIST_HEAD(&device->dev_alloc_list);
 436
 437                 /* init readahead state */
 438                 spin_lock_init(&device->reada_lock);
 439                 device->reada_curr_zone = NULL;
 440                 atomic_set(&device->reada_in_flight, 0);
 441                 device->reada_next = 0;
 442                 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
 443                 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
 444
 445                 mutex_lock(&fs_devices->device_list_mutex);
 446                 list_add_rcu(&device->dev_list, &fs_devices->devices);
 447                 mutex_unlock(&fs_devices->device_list_mutex);
 448
 449                 device->fs_devices = fs_devices;
 450                 fs_devices->num_devices++;
 451         } else if (!device->name || strcmp(device->name->str, path)) {
 452                 name = rcu_string_strdup(path, GFP_NOFS);
 453                 if (!name)
 454                         return -ENOMEM;
 455                 rcu_string_free(device->name);
 456                 rcu_assign_pointer(device->name, name);
 457                 if (device->missing) {
 458                         fs_devices->missing_devices--;
 459                         device->missing = 0;
 460                 }
 461         }
 462
 463         if (found_transid > fs_devices->latest_trans) {
 464                 fs_devices->latest_devid = devid;
 465                 fs_devices->latest_trans = found_transid;
 466         }
 467         *fs_devices_ret = fs_devices;
 468         return 0;
 469 }
 470
 471 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 472 {
 473         struct btrfs_fs_devices *fs_devices;
 474         struct btrfs_device *device;
 475         struct btrfs_device *orig_dev;
 476
 477         fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
 478         if (!fs_devices)
 479                 return ERR_PTR(-ENOMEM);
 480
 481         INIT_LIST_HEAD(&fs_devices->devices);
 482         INIT_LIST_HEAD(&fs_devices->alloc_list);
 483         INIT_LIST_HEAD(&fs_devices->list);
 484         mutex_init(&fs_devices->device_list_mutex);
 485         fs_devices->latest_devid = orig->latest_devid;
 486         fs_devices->latest_trans = orig->latest_trans;
 487         fs_devices->total_devices = orig->total_devices;
 488         memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
 489
 490         /* We have held the volume lock, it is safe to get the devices. */
 491         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
 492                 struct rcu_string *name;
 493
 494                 device = kzalloc(sizeof(*device), GFP_NOFS);
 495                 if (!device)
 496                         goto error;
 497
 498                 /*
 499                  * This is ok to do without rcu read locked because we hold the
 500                  * uuid mutex so nothing we touch in here is going to disappear.
 501                  */
 502                 name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
 503                 if (!name) {
 504                         kfree(device);
 505                         goto error;
 506                 }
 507                 rcu_assign_pointer(device->name, name);
 508
 509                 device->devid = orig_dev->devid;
 510                 device->work.func = pending_bios_fn;
 511                 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
 512                 spin_lock_init(&device->io_lock);
 513                 INIT_LIST_HEAD(&device->dev_list);
 514                 INIT_LIST_HEAD(&device->dev_alloc_list);
 515
 516                 list_add(&device->dev_list, &fs_devices->devices);
 517                 device->fs_devices = fs_devices;
 518                 fs_devices->num_devices++;
 519         }
 520         return fs_devices;
 521 error:
 522         free_fs_devices(fs_devices);
 523         return ERR_PTR(-ENOMEM);
 524 }
 525
 526 void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
 527                                struct btrfs_fs_devices *fs_devices, int step)
 528 {
 529         struct btrfs_device *device, *next;
 530
 531         struct block_device *latest_bdev = NULL;
 532         u64 latest_devid = 0;
 533         u64 latest_transid = 0;
 534
 535         mutex_lock(&uuid_mutex);
 536 again:
 537         /* This is the initialized path, it is safe to release the devices. */
 538         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 539                 if (device->in_fs_metadata) {
 540                         if (!device->is_tgtdev_for_dev_replace &&
 541                             (!latest_transid ||
 542                              device->generation > latest_transid)) {
 543                                 latest_devid = device->devid;
 544                                 latest_transid = device->generation;
 545                                 latest_bdev = device->bdev;
 546                         }
 547                         continue;
 548                 }
 549
 550                 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
 551                         /*
 552                          * In the first step, keep the device which has
 553                          * the correct fsid and the devid that is used
 554                          * for the dev_replace procedure.
 555                          * In the second step, the dev_replace state is
 556                          * read from the device tree and it is known
 557                          * whether the procedure is really active or
 558                          * not, which means whether this device is
 559                          * used or whether it should be removed.
 560                          */
 561                         if (step == 0 || device->is_tgtdev_for_dev_replace) {
 562                                 continue;
 563                         }
 564                 }
 565                 if (device->bdev) {
 566                         blkdev_put(device->bdev, device->mode);
 567                         device->bdev = NULL;
 568                         fs_devices->open_devices--;
 569                 }
 570                 if (device->writeable) {
 571                         list_del_init(&device->dev_alloc_list);
 572                         device->writeable = 0;
 573                         if (!device->is_tgtdev_for_dev_replace)
 574                                 fs_devices->rw_devices--;
 575                 }
 576                 list_del_init(&device->dev_list);
 577                 fs_devices->num_devices--;
 578                 rcu_string_free(device->name);
 579                 kfree(device);
 580         }
 581
 582         if (fs_devices->seed) {
 583                 fs_devices = fs_devices->seed;
 584                 goto again;
 585         }
 586
 587         fs_devices->latest_bdev = latest_bdev;
 588         fs_devices->latest_devid = latest_devid;
 589         fs_devices->latest_trans = latest_transid;
 590
 591         mutex_unlock(&uuid_mutex);
 592 }
 593
 594 static void __free_device(struct work_struct *work)
 595 {
 596         struct btrfs_device *device;
 597
 598         device = container_of(work, struct btrfs_device, rcu_work);
 599
 600         if (device->bdev)
 601                 blkdev_put(device->bdev, device->mode);
 602
 603         rcu_string_free(device->name);
 604         kfree(device);
 605 }
 606
 607 static void free_device(struct rcu_head *head)
 608 {
 609         struct btrfs_device *device;
 610
 611         device = container_of(head, struct btrfs_device, rcu);
 612
 613         INIT_WORK(&device->rcu_work, __free_device);
 614         schedule_work(&device->rcu_work);
 615 }
 616
 617 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 618 {
 619         struct btrfs_device *device;
 620
 621         if (--fs_devices->opened > 0)
 622                 return 0;
 623
 624         mutex_lock(&fs_devices->device_list_mutex);
 625         list_for_each_entry(device, &fs_devices->devices, dev_list) {
 626                 struct btrfs_device *new_device;
 627                 struct rcu_string *name;
 628
 629                 if (device->bdev)
 630                         fs_devices->open_devices--;
 631
 632                 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
 633                         list_del_init(&device->dev_alloc_list);
 634                         fs_devices->rw_devices--;
 635                 }
 636
 637                 if (device->can_discard)
 638                         fs_devices->num_can_discard--;
 639
 640                 new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
 641                 BUG_ON(!new_device); /* -ENOMEM */
 642                 memcpy(new_device, device, sizeof(*new_device));
 643
 644                 /* Safe because we are under uuid_mutex */
 645                 if (device->name) {
 646                         name = rcu_string_strdup(device->name->str, GFP_NOFS);
 647                         BUG_ON(device->name && !name); /* -ENOMEM */
 648                         rcu_assign_pointer(new_device->name, name);
 649                 }
 650                 new_device->bdev = NULL;
 651                 new_device->writeable = 0;
 652                 new_device->in_fs_metadata = 0;
 653                 new_device->can_discard = 0;
 654                 spin_lock_init(&new_device->io_lock);
 655                 list_replace_rcu(&device->dev_list, &new_device->dev_list);
 656
 657                 call_rcu(&device->rcu, free_device);
 658         }
 659         mutex_unlock(&fs_devices->device_list_mutex);
 660
 661         WARN_ON(fs_devices->open_devices);
 662         WARN_ON(fs_devices->rw_devices);
 663         fs_devices->opened = 0;
 664         fs_devices->seeding = 0;
 665
 666         return 0;
 667 }
 668
 669 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 670 {
 671         struct btrfs_fs_devices *seed_devices = NULL;
 672         int ret;
 673
 674         mutex_lock(&uuid_mutex);
 675         ret = __btrfs_close_devices(fs_devices);
 676         if (!fs_devices->opened) {
 677                 seed_devices = fs_devices->seed;
 678                 fs_devices->seed = NULL;
 679         }
 680         mutex_unlock(&uuid_mutex);
 681
 682         while (seed_devices) {
 683                 fs_devices = seed_devices;
 684                 seed_devices = fs_devices->seed;
 685                 __btrfs_close_devices(fs_devices);
 686                 free_fs_devices(fs_devices);
 687         }
 688         /*
 689          * Wait for rcu kworkers under __btrfs_close_devices
 690          * to finish all blkdev_puts so device is really
 691          * free when umount is done.
 692          */
 693         rcu_barrier();
 694         return ret;
 695 }
 696
 697 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 698                                 fmode_t flags, void *holder)
 699 {
 700         struct request_queue *q;
 701         struct block_device *bdev;
 702         struct list_head *head = &fs_devices->devices;
 703         struct btrfs_device *device;
 704         struct block_device *latest_bdev = NULL;
 705         struct buffer_head *bh;
 706         struct btrfs_super_block *disk_super;
 707         u64 latest_devid = 0;
 708         u64 latest_transid = 0;
 709         u64 devid;
 710         int seeding = 1;
 711         int ret = 0;
 712
 713         flags |= FMODE_EXCL;
 714
 715         list_for_each_entry(device, head, dev_list) {
 716                 if (device->bdev)
 717                         continue;
 718                 if (!device->name)
 719                         continue;
 720
 721                 /* Just open everything we can; ignore failures here */
 722                 if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
 723                                             &bdev, &bh))
 724                         continue;
 725
 726                 disk_super = (struct btrfs_super_block *)bh->b_data;
 727                 devid = btrfs_stack_device_id(&disk_super->dev_item);
 728                 if (devid != device->devid)
 729                         goto error_brelse;
 730
 731                 if (memcmp(device->uuid, disk_super->dev_item.uuid,
 732                            BTRFS_UUID_SIZE))
 733                         goto error_brelse;
 734
 735                 device->generation = btrfs_super_generation(disk_super);
 736                 if (!latest_transid || device->generation > latest_transid) {
 737                         latest_devid = devid;
 738                         latest_transid = device->generation;
 739                         latest_bdev = bdev;
 740                 }
 741
 742                 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 743                         device->writeable = 0;
 744                 } else {
 745                         device->writeable = !bdev_read_only(bdev);
 746                         seeding = 0;
 747                 }
 748
 749                 q = bdev_get_queue(bdev);
 750                 if (blk_queue_discard(q)) {
 751                         device->can_discard = 1;
 752                         fs_devices->num_can_discard++;
 753                 }
 754
 755                 device->bdev = bdev;
 756                 device->in_fs_metadata = 0;
 757                 device->mode = flags;
 758
 759                 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
 760                         fs_devices->rotating = 1;
 761
 762                 fs_devices->open_devices++;
 763                 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
 764                         fs_devices->rw_devices++;
 765                         list_add(&device->dev_alloc_list,
 766                                  &fs_devices->alloc_list);
 767                 }
 768                 brelse(bh);
 769                 continue;
 770
 771 error_brelse:
 772                 brelse(bh);
 773                 blkdev_put(bdev, flags);
 774                 continue;
 775         }
 776         if (fs_devices->open_devices == 0) {
 777                 ret = -EINVAL;
 778                 goto out;
 779         }
 780         fs_devices->seeding = seeding;
 781         fs_devices->opened = 1;
 782         fs_devices->latest_bdev = latest_bdev;
 783         fs_devices->latest_devid = latest_devid;
 784         fs_devices->latest_trans = latest_transid;
 785         fs_devices->total_rw_bytes = 0;
 786 out:
 787         return ret;
 788 }
 789
 790 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 791                        fmode_t flags, void *holder)
 792 {
 793         int ret;
 794
 795         mutex_lock(&uuid_mutex);
 796         if (fs_devices->opened) {
 797                 fs_devices->opened++;
 798                 ret = 0;
 799         } else {
 800                 ret = __btrfs_open_devices(fs_devices, flags, holder);
 801         }
 802         mutex_unlock(&uuid_mutex);
 803         return ret;
 804 }
 805
 806 /*
 807  * Look for a btrfs signature on a device. This may be called out of the mount path
 808  * and we are not allowed to call set_blocksize during the scan. The superblock
 809  * is read via pagecache
 810  */
 811 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 812                           struct btrfs_fs_devices **fs_devices_ret)
 813 {
 814         struct btrfs_super_block *disk_super;
 815         struct block_device *bdev;
 816         struct page *page;
 817         void *p;
 818         int ret = -EINVAL;
 819         u64 devid;
 820         u64 transid;
 821         u64 total_devices;
 822         u64 bytenr;
 823         pgoff_t index;
 824
 825         /*
 826          * we would like to check all the supers, but that would make
 827          * a btrfs mount succeed after a mkfs from a different FS.
 828          * So, we need to add a special mount option to scan for
 829          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
 830          */
 831         bytenr = btrfs_sb_offset(0);
 832         flags |= FMODE_EXCL;
 833         mutex_lock(&uuid_mutex);
 834
 835         bdev = blkdev_get_by_path(path, flags, holder);
 836
 837         if (IS_ERR(bdev)) {
 838                 ret = PTR_ERR(bdev);
 839                 goto error;
 840         }
 841
 842         /* make sure our super fits in the device */
 843         if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
 844                 goto error_bdev_put;
 845
 846         /* make sure our super fits in the page */
 847         if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
 848                 goto error_bdev_put;
 849
 850         /* make sure our super doesn't straddle pages on disk */
 851         index = bytenr >> PAGE_CACHE_SHIFT;
 852         if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
 853                 goto error_bdev_put;
 854
 855         /* pull in the page with our super */
 856         page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
 857                                    index, GFP_NOFS);
 858
 859         if (IS_ERR_OR_NULL(page))
 860                 goto error_bdev_put;
 861
 862         p = kmap(page);
 863
 864         /* align our pointer to the offset of the super block */
 865         disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
 866
 867         if (btrfs_super_bytenr(disk_super) != bytenr ||
 868             disk_super->magic != cpu_to_le64(BTRFS_MAGIC))
 869                 goto error_unmap;
 870
 871         devid = btrfs_stack_device_id(&disk_super->dev_item);
 872         transid = btrfs_super_generation(disk_super);
 873         total_devices = btrfs_super_num_devices(disk_super);
 874
 875         if (disk_super->label[0]) {
 876                 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
 877                         disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
 878                 printk(KERN_INFO "device label %s ", disk_super->label);
 879         } else {
 880                 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
 881         }
 882
 883         printk(KERN_CONT "devid %llu transid %llu %s\n",
 884                (unsigned long long)devid, (unsigned long long)transid, path);
 885
 886         ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 887         if (!ret && fs_devices_ret)
 888                 (*fs_devices_ret)->total_devices = total_devices;
 889
 890 error_unmap:
 891         kunmap(page);
 892         page_cache_release(page);
 893
 894 error_bdev_put:
 895         blkdev_put(bdev, flags);
 896 error:
 897         mutex_unlock(&uuid_mutex);
 898         return ret;
 899 }
 900
 901 /* helper to account the used device space in the range */
 902 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 903                                    u64 end, u64 *length)
 904 {
 905         struct btrfs_key key;
 906         struct btrfs_root *root = device->dev_root;
 907         struct btrfs_dev_extent *dev_extent;
 908         struct btrfs_path *path;
 909         u64 extent_end;
 910         int ret;
 911         int slot;
 912         struct extent_buffer *l;
 913
 914         *length = 0;
 915
 916         if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
 917                 return 0;
 918
 919         path = btrfs_alloc_path();
 920         if (!path)
 921                 return -ENOMEM;
 922         path->reada = 2;
 923
 924         key.objectid = device->devid;
 925         key.offset = start;
 926         key.type = BTRFS_DEV_EXTENT_KEY;
 927
 928         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 929         if (ret < 0)
 930                 goto out;
 931         if (ret > 0) {
 932                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
 933                 if (ret < 0)
 934                         goto out;
 935         }
 936
 937         while (1) {
 938                 l = path->nodes[0];
 939                 slot = path->slots[0];
 940                 if (slot >= btrfs_header_nritems(l)) {
 941                         ret = btrfs_next_leaf(root, path);
 942                         if (ret == 0)
 943                                 continue;
 944                         if (ret < 0)
 945                                 goto out;
 946
 947                         break;
 948                 }
 949                 btrfs_item_key_to_cpu(l, &key, slot);
 950
 951                 if (key.objectid < device->devid)
 952                         goto next;
 953
 954                 if (key.objectid > device->devid)
 955                         break;
 956
 957                 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
 958                         goto next;
 959
 960                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
 961                 extent_end = key.offset + btrfs_dev_extent_length(l,
 962                                                                   dev_extent);
 963                 if (key.offset <= start && extent_end > end) {
 964                         *length = end - start + 1;
 965                         break;
 966                 } else if (key.offset <= start && extent_end > start)
 967                         *length += extent_end - start;
 968                 else if (key.offset > start && extent_end <= end)
 969                         *length += extent_end - key.offset;
 970                 else if (key.offset > start && key.offset <= end) {
 971                         *length += end - key.offset + 1;
 972                         break;
 973                 } else if (key.offset > end)
 974                         break;
 975
 976 next:
 977                 path->slots[0]++;
 978         }
 979         ret = 0;
 980 out:
 981         btrfs_free_path(path);
 982         return ret;
 983 }
 984
 985 static int contains_pending_extent(struct btrfs_trans_handle *trans,
 986                                    struct btrfs_device *device,
 987                                    u64 *start, u64 len)
 988 {
 989         struct extent_map *em;
 990         int ret = 0;
 991
 992         list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
 993                 struct map_lookup *map;
 994                 int i;
 995
 996                 map = (struct map_lookup *)em->bdev;
 997                 for (i = 0; i < map->num_stripes; i++) {
 998                         if (map->stripes[i].dev != device)
 999                                 continue;
1000                         if (map->stripes[i].physical >= *start + len ||
1001                             map->stripes[i].physical + em->orig_block_len <=
1002                             *start)
1003                                 continue;
1004                         *start = map->stripes[i].physical +
1005                                 em->orig_block_len;
1006                         ret = 1;
1007                 }
1008         }
1009
1010         return ret;
1011 }
1012
1013
1014 /*
1015  * find_free_dev_extent - find free space in the specified device
1016  * @device:     the device which we search the free space in
1017  * @num_bytes:  the size of the free space that we need
1018  * @start:      store the start of the free space.
1019  * @len:        the size of the free space. that we find, or the size of the max
1020  *              free space if we don't find suitable free space
1021  *
1022  * this uses a pretty simple search, the expectation is that it is
1023  * called very infrequently and that a given device has a small number
1024  * of extents
1025  *
1026  * @start is used to store the start of the free space if we find. But if we
1027  * don't find suitable free space, it will be used to store the start position
1028  * of the max free space.
1029  *
1030  * @len is used to store the size of the free space that we find.
1031  * But if we don't find suitable free space, it is used to store the size of
1032  * the max free space.
1033  */
1034 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1035                          struct btrfs_device *device, u64 num_bytes,
1036                          u64 *start, u64 *len)
1037 {
1038         struct btrfs_key key;
1039         struct btrfs_root *root = device->dev_root;
1040         struct btrfs_dev_extent *dev_extent;
1041         struct btrfs_path *path;
1042         u64 hole_size;
1043         u64 max_hole_start;
1044         u64 max_hole_size;
1045         u64 extent_end;
1046         u64 search_start;
1047         u64 search_end = device->total_bytes;
1048         int ret;
1049         int slot;
1050         struct extent_buffer *l;
1051
1052         /* FIXME use last free of some kind */
1053
1054         /* we don't want to overwrite the superblock on the drive,
1055          * so we make sure to start at an offset of at least 1MB
1056          */
1057         search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
1058
1059         path = btrfs_alloc_path();
1060         if (!path)
1061                 return -ENOMEM;
1062 again:
1063         max_hole_start = search_start;
1064         max_hole_size = 0;
1065         hole_size = 0;
1066
1067         if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
1068                 ret = -ENOSPC;
1069                 goto out;
1070         }
1071
1072         path->reada = 2;
1073         path->search_commit_root = 1;
1074         path->skip_locking = 1;
1075
1076         key.objectid = device->devid;
1077         key.offset = search_start;
1078         key.type = BTRFS_DEV_EXTENT_KEY;
1079
1080         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1081         if (ret < 0)
1082                 goto out;
1083         if (ret > 0) {
1084                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1085                 if (ret < 0)
1086                         goto out;
1087         }
1088
1089         while (1) {
1090                 l = path->nodes[0];
1091                 slot = path->slots[0];
1092                 if (slot >= btrfs_header_nritems(l)) {
1093                         ret = btrfs_next_leaf(root, path);
1094                         if (ret == 0)
1095                                 continue;
1096                         if (ret < 0)
1097                                 goto out;
1098
1099                         break;
1100                 }
1101                 btrfs_item_key_to_cpu(l, &key, slot);
1102
1103                 if (key.objectid < device->devid)
1104                         goto next;
1105
1106                 if (key.objectid > device->devid)
1107                         break;
1108
1109                 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
1110                         goto next;
1111
1112                 if (key.offset > search_start) {
1113                         hole_size = key.offset - search_start;
1114
1115                         /*
1116                          * Have to check before we set max_hole_start, otherwise
1117                          * we could end up sending back this offset anyway.
1118                          */
1119                         if (contains_pending_extent(trans, device,
1120                                                     &search_start,
1121                                                     hole_size))
1122                                 hole_size = 0;
1123
1124                         if (hole_size > max_hole_size) {
1125                                 max_hole_start = search_start;
1126                                 max_hole_size = hole_size;
1127                         }
1128
1129                         /*
1130                          * If this free space is greater than which we need,
1131                          * it must be the max free space that we have found
1132                          * until now, so max_hole_start must point to the start
1133                          * of this free space and the length of this free space
1134                          * is stored in max_hole_size. Thus, we return
1135                          * max_hole_start and max_hole_size and go back to the
1136                          * caller.
1137                          */
1138                         if (hole_size >= num_bytes) {
1139                                 ret = 0;
1140                                 goto out;
1141                         }
1142                 }
1143
1144                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1145                 extent_end = key.offset + btrfs_dev_extent_length(l,
1146                                                                   dev_extent);
1147                 if (extent_end > search_start)
1148                         search_start = extent_end;
1149 next:
1150                 path->slots[0]++;
1151                 cond_resched();
1152         }
1153
1154         /*
1155          * At this point, search_start should be the end of
1156          * allocated dev extents, and when shrinking the device,
1157          * search_end may be smaller than search_start.
1158          */
1159         if (search_end > search_start)
1160                 hole_size = search_end - search_start;
1161
1162         if (hole_size > max_hole_size) {
1163                 max_hole_start = search_start;
1164                 max_hole_size = hole_size;
1165         }
1166
1167         if (contains_pending_extent(trans, device, &search_start, hole_size)) {
1168                 btrfs_release_path(path);
1169                 goto again;
1170         }
1171
1172         /* See above. */
1173         if (hole_size < num_bytes)
1174                 ret = -ENOSPC;
1175         else
1176                 ret = 0;
1177
1178 out:
1179         btrfs_free_path(path);
1180         *start = max_hole_start;
1181         if (len)
1182                 *len = max_hole_size;
1183         return ret;
1184 }
1185
1186 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1187                           struct btrfs_device *device,
1188                           u64 start)
1189 {
1190         int ret;
1191         struct btrfs_path *path;
1192         struct btrfs_root *root = device->dev_root;
1193         struct btrfs_key key;
1194         struct btrfs_key found_key;
1195         struct extent_buffer *leaf = NULL;
1196         struct btrfs_dev_extent *extent = NULL;
1197
1198         path = btrfs_alloc_path();
1199         if (!path)
1200                 return -ENOMEM;
1201
1202         key.objectid = device->devid;
1203         key.offset = start;
1204         key.type = BTRFS_DEV_EXTENT_KEY;
1205 again:
1206         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1207         if (ret > 0) {
1208                 ret = btrfs_previous_item(root, path, key.objectid,
1209                                           BTRFS_DEV_EXTENT_KEY);
1210                 if (ret)
1211                         goto out;
1212                 leaf = path->nodes[0];
1213                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1214                 extent = btrfs_item_ptr(leaf, path->slots[0],
1215                                         struct btrfs_dev_extent);
1216                 BUG_ON(found_key.offset > start || found_key.offset +
1217                        btrfs_dev_extent_length(leaf, extent) < start);
1218                 key = found_key;
1219                 btrfs_release_path(path);
1220                 goto again;
1221         } else if (ret == 0) {
1222                 leaf = path->nodes[0];
1223                 extent = btrfs_item_ptr(leaf, path->slots[0],
1224                                         struct btrfs_dev_extent);
1225         } else {
1226                 btrfs_error(root->fs_info, ret, "Slot search failed");
1227                 goto out;
1228         }
1229
1230         if (device->bytes_used > 0) {
1231                 u64 len = btrfs_dev_extent_length(leaf, extent);
1232                 device->bytes_used -= len;
1233                 spin_lock(&root->fs_info->free_chunk_lock);
1234                 root->fs_info->free_chunk_space += len;
1235                 spin_unlock(&root->fs_info->free_chunk_lock);
1236         }
1237         ret = btrfs_del_item(trans, root, path);
1238         if (ret) {
1239                 btrfs_error(root->fs_info, ret,
1240                             "Failed to remove dev extent item");
1241         }
1242 out:
1243         btrfs_free_path(path);
1244         return ret;
1245 }
1246
1247 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1248                                   struct btrfs_device *device,
1249                                   u64 chunk_tree, u64 chunk_objectid,
1250                                   u64 chunk_offset, u64 start, u64 num_bytes)
1251 {
1252         int ret;
1253         struct btrfs_path *path;
1254         struct btrfs_root *root = device->dev_root;
1255         struct btrfs_dev_extent *extent;
1256         struct extent_buffer *leaf;
1257         struct btrfs_key key;
1258
1259         WARN_ON(!device->in_fs_metadata);
1260         WARN_ON(device->is_tgtdev_for_dev_replace);
1261         path = btrfs_alloc_path();
1262         if (!path)
1263                 return -ENOMEM;
1264
1265         key.objectid = device->devid;
1266         key.offset = start;
1267         key.type = BTRFS_DEV_EXTENT_KEY;
1268         ret = btrfs_insert_empty_item(trans, root, path, &key,
1269                                       sizeof(*extent));
1270         if (ret)
1271                 goto out;
1272
1273         leaf = path->nodes[0];
1274         extent = btrfs_item_ptr(leaf, path->slots[0],
1275                                 struct btrfs_dev_extent);
1276         btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
1277         btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
1278         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1279
1280         write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
1281                     (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
1282                     BTRFS_UUID_SIZE);
1283
1284         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1285         btrfs_mark_buffer_dirty(leaf);
1286 out:
1287         btrfs_free_path(path);
1288         return ret;
1289 }
1290
1291 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1292 {
1293         struct extent_map_tree *em_tree;
1294         struct extent_map *em;
1295         struct rb_node *n;
1296         u64 ret = 0;
1297
1298         em_tree = &fs_info->mapping_tree.map_tree;
1299         read_lock(&em_tree->lock);
1300         n = rb_last(&em_tree->map);
1301         if (n) {
1302                 em = rb_entry(n, struct extent_map, rb_node);
1303                 ret = em->start + em->len;
1304         }
1305         read_unlock(&em_tree->lock);
1306
1307         return ret;
1308 }
1309
1310 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
1311 {
1312         int ret;
1313         struct btrfs_key key;
1314         struct btrfs_key found_key;
1315         struct btrfs_path *path;
1316
1317         root = root->fs_info->chunk_root;
1318
1319         path = btrfs_alloc_path();
1320         if (!path)
1321                 return -ENOMEM;
1322
1323         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1324         key.type = BTRFS_DEV_ITEM_KEY;
1325         key.offset = (u64)-1;
1326
1327         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1328         if (ret < 0)
1329                 goto error;
1330
1331         BUG_ON(ret == 0); /* Corruption */
1332
1333         ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
1334                                   BTRFS_DEV_ITEM_KEY);
1335         if (ret) {
1336                 *objectid = 1;
1337         } else {
1338                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1339                                       path->slots[0]);
1340                 *objectid = found_key.offset + 1;
1341         }
1342         ret = 0;
1343 error:
1344         btrfs_free_path(path);
1345         return ret;
1346 }
1347
1348 /*
1349  * the device information is stored in the chunk root
1350  * the btrfs_device struct should be fully filled in
1351  */
1352 static int btrfs_add_device(struct btrfs_trans_handle *trans,
1353                             struct btrfs_root *root,
1354                             struct btrfs_device *device)
1355 {
1356         int ret;
1357         struct btrfs_path *path;
1358         struct btrfs_dev_item *dev_item;
1359         struct extent_buffer *leaf;
1360         struct btrfs_key key;
1361         unsigned long ptr;
1362
1363         root = root->fs_info->chunk_root;
1364
1365         path = btrfs_alloc_path();
1366         if (!path)
1367                 return -ENOMEM;
1368
1369         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1370         key.type = BTRFS_DEV_ITEM_KEY;
1371         key.offset = device->devid;
1372
1373         ret = btrfs_insert_empty_item(trans, root, path, &key,
1374                                       sizeof(*dev_item));
1375         if (ret)
1376                 goto out;
1377
1378         leaf = path->nodes[0];
1379         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1380
1381         btrfs_set_device_id(leaf, dev_item, device->devid);
1382         btrfs_set_device_generation(leaf, dev_item, 0);
1383         btrfs_set_device_type(leaf, dev_item, device->type);
1384         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1385         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1386         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1387         btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1388         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1389         btrfs_set_device_group(leaf, dev_item, 0);
1390         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1391         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1392         btrfs_set_device_start_offset(leaf, dev_item, 0);
1393
1394         ptr = (unsigned long)btrfs_device_uuid(dev_item);
1395         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1396         ptr = (unsigned long)btrfs_device_fsid(dev_item);
1397         write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
1398         btrfs_mark_buffer_dirty(leaf);
1399
1400         ret = 0;
1401 out:
1402         btrfs_free_path(path);
1403         return ret;
1404 }
1405
1406 static int btrfs_rm_dev_item(struct btrfs_root *root,
1407                              struct btrfs_device *device)
1408 {
1409         int ret;
1410         struct btrfs_path *path;
1411         struct btrfs_key key;
1412         struct btrfs_trans_handle *trans;
1413
1414         root = root->fs_info->chunk_root;
1415
1416         path = btrfs_alloc_path();
1417         if (!path)
1418                 return -ENOMEM;
1419
1420         trans = btrfs_start_transaction(root, 0);
1421         if (IS_ERR(trans)) {
1422                 btrfs_free_path(path);
1423                 return PTR_ERR(trans);
1424         }
1425         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1426         key.type = BTRFS_DEV_ITEM_KEY;
1427         key.offset = device->devid;
1428         lock_chunks(root);
1429
1430         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1431         if (ret < 0)
1432                 goto out;
1433
1434         if (ret > 0) {
1435                 ret = -ENOENT;
1436                 goto out;
1437         }
1438
1439         ret = btrfs_del_item(trans, root, path);
1440         if (ret)
1441                 goto out;
1442 out:
1443         btrfs_free_path(path);
1444         unlock_chunks(root);
1445         btrfs_commit_transaction(trans, root);
1446         return ret;
1447 }
1448
1449 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1450 {
1451         struct btrfs_device *device;
1452         struct btrfs_device *next_device;
1453         struct block_device *bdev;
1454         struct buffer_head *bh = NULL;
1455         struct btrfs_super_block *disk_super;
1456         struct btrfs_fs_devices *cur_devices;
1457         u64 all_avail;
1458         u64 devid;
1459         u64 num_devices;
1460         u8 *dev_uuid;
1461         unsigned seq;
1462         int ret = 0;
1463         bool clear_super = false;
1464
1465         mutex_lock(&uuid_mutex);
1466
1467         do {
1468                 seq = read_seqbegin(&root->fs_info->profiles_lock);
1469
1470                 all_avail = root->fs_info->avail_data_alloc_bits |
1471                             root->fs_info->avail_system_alloc_bits |
1472                             root->fs_info->avail_metadata_alloc_bits;
1473         } while (read_seqretry(&root->fs_info->profiles_lock, seq));
1474
1475         num_devices = root->fs_info->fs_devices->num_devices;
1476         btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1477         if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1478                 WARN_ON(num_devices < 1);
1479                 num_devices--;
1480         }
1481         btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1482
1483         if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1484                 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
1485                 goto out;
1486         }
1487
1488         if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1489                 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
1490                 goto out;
1491         }
1492
1493         if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1494             root->fs_info->fs_devices->rw_devices <= 2) {
1495                 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
1496                 goto out;
1497         }
1498         if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1499             root->fs_info->fs_devices->rw_devices <= 3) {
1500                 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
1501                 goto out;
1502         }
1503
1504         if (strcmp(device_path, "missing") == 0) {
1505                 struct list_head *devices;
1506                 struct btrfs_device *tmp;
1507
1508                 device = NULL;
1509                 devices = &root->fs_info->fs_devices->devices;
1510                 /*
1511                  * It is safe to read the devices since the volume_mutex
1512                  * is held.
1513                  */
1514                 list_for_each_entry(tmp, devices, dev_list) {
1515                         if (tmp->in_fs_metadata &&
1516                             !tmp->is_tgtdev_for_dev_replace &&
1517                             !tmp->bdev) {
1518                                 device = tmp;
1519                                 break;
1520                         }
1521                 }
1522                 bdev = NULL;
1523                 bh = NULL;
1524                 disk_super = NULL;
1525                 if (!device) {
1526                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1527                         goto out;
1528                 }
1529         } else {
1530                 ret = btrfs_get_bdev_and_sb(device_path,
1531                                             FMODE_WRITE | FMODE_EXCL,
1532                                             root->fs_info->bdev_holder, 0,
1533                                             &bdev, &bh);
1534                 if (ret)
1535                         goto out;
1536                 disk_super = (struct btrfs_super_block *)bh->b_data;
1537                 devid = btrfs_stack_device_id(&disk_super->dev_item);
1538                 dev_uuid = disk_super->dev_item.uuid;
1539                 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1540                                            disk_super->fsid);
1541                 if (!device) {
1542                         ret = -ENOENT;
1543                         goto error_brelse;
1544                 }
1545         }
1546
1547         if (device->is_tgtdev_for_dev_replace) {
1548                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1549                 goto error_brelse;
1550         }
1551
1552         if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1553                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1554                 goto error_brelse;
1555         }
1556
1557         if (device->writeable) {
1558                 lock_chunks(root);
1559                 list_del_init(&device->dev_alloc_list);
1560                 unlock_chunks(root);
1561                 root->fs_info->fs_devices->rw_devices--;
1562                 clear_super = true;
1563         }
1564
1565         mutex_unlock(&uuid_mutex);
1566         ret = btrfs_shrink_device(device, 0);
1567         mutex_lock(&uuid_mutex);
1568         if (ret)
1569                 goto error_undo;
1570
1571         /*
1572          * TODO: the superblock still includes this device in its num_devices
1573          * counter although write_all_supers() is not locked out. This
1574          * could give a filesystem state which requires a degraded mount.
1575          */
1576         ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1577         if (ret)
1578                 goto error_undo;
1579
1580         spin_lock(&root->fs_info->free_chunk_lock);
1581         root->fs_info->free_chunk_space = device->total_bytes -
1582                 device->bytes_used;
1583         spin_unlock(&root->fs_info->free_chunk_lock);
1584
1585         device->in_fs_metadata = 0;
1586         btrfs_scrub_cancel_dev(root->fs_info, device);
1587
1588         /*
1589          * the device list mutex makes sure that we don't change
1590          * the device list while someone else is writing out all
1591          * the device supers.
1592          */
1593
1594         cur_devices = device->fs_devices;
1595         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1596         list_del_rcu(&device->dev_list);
1597
1598         device->fs_devices->num_devices--;
1599         device->fs_devices->total_devices--;
1600
1601         if (device->missing)
1602                 root->fs_info->fs_devices->missing_devices--;
1603
1604         next_device = list_entry(root->fs_info->fs_devices->devices.next,
1605                                  struct btrfs_device, dev_list);
1606         if (device->bdev == root->fs_info->sb->s_bdev)
1607                 root->fs_info->sb->s_bdev = next_device->bdev;
1608         if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1609                 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1610
1611         if (device->bdev)
1612                 device->fs_devices->open_devices--;
1613
1614         call_rcu(&device->rcu, free_device);
1615         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1616
1617         num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1618         btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1619
1620         if (cur_devices->open_devices == 0) {
1621                 struct btrfs_fs_devices *fs_devices;
1622                 fs_devices = root->fs_info->fs_devices;
1623                 while (fs_devices) {
1624                         if (fs_devices->seed == cur_devices)
1625                                 break;
1626                         fs_devices = fs_devices->seed;
1627                 }
1628                 fs_devices->seed = cur_devices->seed;
1629                 cur_devices->seed = NULL;
1630                 lock_chunks(root);
1631                 __btrfs_close_devices(cur_devices);
1632                 unlock_chunks(root);
1633                 free_fs_devices(cur_devices);
1634         }
1635
1636         root->fs_info->num_tolerated_disk_barrier_failures =
1637                 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1638
1639         /*
1640          * at this point, the device is zero sized.  We want to
1641          * remove it from the devices list and zero out the old super
1642          */
1643         if (clear_super && disk_super) {
1644                 /* make sure this device isn't detected as part of
1645                  * the FS anymore
1646                  */
1647                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1648                 set_buffer_dirty(bh);
1649                 sync_dirty_buffer(bh);
1650         }
1651
1652         ret = 0;
1653
1654         /* Notify udev that device has changed */
1655         if (bdev)
1656                 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1657
1658 error_brelse:
1659         brelse(bh);
1660         if (bdev)
1661                 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1662 out:
1663         mutex_unlock(&uuid_mutex);
1664         return ret;
1665 error_undo:
1666         if (device->writeable) {
1667                 lock_chunks(root);
1668                 list_add(&device->dev_alloc_list,
1669                          &root->fs_info->fs_devices->alloc_list);
1670                 unlock_chunks(root);
1671                 root->fs_info->fs_devices->rw_devices++;
1672         }
1673         goto error_brelse;
1674 }
1675
1676 void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1677                                  struct btrfs_device *srcdev)
1678 {
1679         WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1680         list_del_rcu(&srcdev->dev_list);
1681         list_del_rcu(&srcdev->dev_alloc_list);
1682         fs_info->fs_devices->num_devices--;
1683         if (srcdev->missing) {
1684                 fs_info->fs_devices->missing_devices--;
1685                 fs_info->fs_devices->rw_devices++;
1686         }
1687         if (srcdev->can_discard)
1688                 fs_info->fs_devices->num_can_discard--;
1689         if (srcdev->bdev)
1690                 fs_info->fs_devices->open_devices--;
1691
1692         call_rcu(&srcdev->rcu, free_device);
1693 }
1694
1695 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1696                                       struct btrfs_device *tgtdev)
1697 {
1698         struct btrfs_device *next_device;
1699
1700         WARN_ON(!tgtdev);
1701         mutex_lock(&fs_info->fs_devices->device_list_mutex);
1702         if (tgtdev->bdev) {
1703                 btrfs_scratch_superblock(tgtdev);
1704                 fs_info->fs_devices->open_devices--;
1705         }
1706         fs_info->fs_devices->num_devices--;
1707         if (tgtdev->can_discard)
1708                 fs_info->fs_devices->num_can_discard++;
1709
1710         next_device = list_entry(fs_info->fs_devices->devices.next,
1711                                  struct btrfs_device, dev_list);
1712         if (tgtdev->bdev == fs_info->sb->s_bdev)
1713                 fs_info->sb->s_bdev = next_device->bdev;
1714         if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1715                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1716         list_del_rcu(&tgtdev->dev_list);
1717
1718         call_rcu(&tgtdev->rcu, free_device);
1719
1720         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1721 }
1722
1723 static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1724                                      struct btrfs_device **device)
1725 {
1726         int ret = 0;
1727         struct btrfs_super_block *disk_super;
1728         u64 devid;
1729         u8 *dev_uuid;
1730         struct block_device *bdev;
1731         struct buffer_head *bh;
1732
1733         *device = NULL;
1734         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1735                                     root->fs_info->bdev_holder, 0, &bdev, &bh);
1736         if (ret)
1737                 return ret;
1738         disk_super = (struct btrfs_super_block *)bh->b_data;
1739         devid = btrfs_stack_device_id(&disk_super->dev_item);
1740         dev_uuid = disk_super->dev_item.uuid;
1741         *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1742                                     disk_super->fsid);
1743         brelse(bh);
1744         if (!*device)
1745                 ret = -ENOENT;
1746         blkdev_put(bdev, FMODE_READ);
1747         return ret;
1748 }
1749
1750 int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1751                                          char *device_path,
1752                                          struct btrfs_device **device)
1753 {
1754         *device = NULL;
1755         if (strcmp(device_path, "missing") == 0) {
1756                 struct list_head *devices;
1757                 struct btrfs_device *tmp;
1758
1759                 devices = &root->fs_info->fs_devices->devices;
1760                 /*
1761                  * It is safe to read the devices since the volume_mutex
1762                  * is held by the caller.
1763                  */
1764                 list_for_each_entry(tmp, devices, dev_list) {
1765                         if (tmp->in_fs_metadata && !tmp->bdev) {
1766                                 *device = tmp;
1767                                 break;
1768                         }
1769                 }
1770
1771                 if (!*device) {
1772                         pr_err("btrfs: no missing device found\n");
1773                         return -ENOENT;
1774                 }
1775
1776                 return 0;
1777         } else {
1778                 return btrfs_find_device_by_path(root, device_path, device);
1779         }
1780 }
1781
1782 /*
1783  * does all the dirty work required for changing file system's UUID.
1784  */
1785 static int btrfs_prepare_sprout(struct btrfs_root *root)
1786 {
1787         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1788         struct btrfs_fs_devices *old_devices;
1789         struct btrfs_fs_devices *seed_devices;
1790         struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1791         struct btrfs_device *device;
1792         u64 super_flags;
1793
1794         BUG_ON(!mutex_is_locked(&uuid_mutex));
1795         if (!fs_devices->seeding)
1796                 return -EINVAL;
1797
1798         seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1799         if (!seed_devices)
1800                 return -ENOMEM;
1801
1802         old_devices = clone_fs_devices(fs_devices);
1803         if (IS_ERR(old_devices)) {
1804                 kfree(seed_devices);
1805                 return PTR_ERR(old_devices);
1806         }
1807
1808         list_add(&old_devices->list, &fs_uuids);
1809
1810         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1811         seed_devices->opened = 1;
1812         INIT_LIST_HEAD(&seed_devices->devices);
1813         INIT_LIST_HEAD(&seed_devices->alloc_list);
1814         mutex_init(&seed_devices->device_list_mutex);
1815
1816         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1817         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1818                               synchronize_rcu);
1819         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1820
1821         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1822         list_for_each_entry(device, &seed_devices->devices, dev_list) {
1823                 device->fs_devices = seed_devices;
1824         }
1825
1826         fs_devices->seeding = 0;
1827         fs_devices->num_devices = 0;
1828         fs_devices->open_devices = 0;
1829         fs_devices->total_devices = 0;
1830         fs_devices->seed = seed_devices;
1831
1832         generate_random_uuid(fs_devices->fsid);
1833         memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1834         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1835         super_flags = btrfs_super_flags(disk_super) &
1836                       ~BTRFS_SUPER_FLAG_SEEDING;
1837         btrfs_set_super_flags(disk_super, super_flags);
1838
1839         return 0;
1840 }
1841
1842 /*
1843  * strore the expected generation for seed devices in device items.
1844  */
1845 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1846                                struct btrfs_root *root)
1847 {
1848         struct btrfs_path *path;
1849         struct extent_buffer *leaf;
1850         struct btrfs_dev_item *dev_item;
1851         struct btrfs_device *device;
1852         struct btrfs_key key;
1853         u8 fs_uuid[BTRFS_UUID_SIZE];
1854         u8 dev_uuid[BTRFS_UUID_SIZE];
1855         u64 devid;
1856         int ret;
1857
1858         path = btrfs_alloc_path();
1859         if (!path)
1860                 return -ENOMEM;
1861
1862         root = root->fs_info->chunk_root;
1863         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1864         key.offset = 0;
1865         key.type = BTRFS_DEV_ITEM_KEY;
1866
1867         while (1) {
1868                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1869                 if (ret < 0)
1870                         goto error;
1871
1872                 leaf = path->nodes[0];
1873 next_slot:
1874                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1875                         ret = btrfs_next_leaf(root, path);
1876                         if (ret > 0)
1877                                 break;
1878                         if (ret < 0)
1879                                 goto error;
1880                         leaf = path->nodes[0];
1881                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1882                         btrfs_release_path(path);
1883                         continue;
1884                 }
1885
1886                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1887                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1888                     key.type != BTRFS_DEV_ITEM_KEY)
1889                         break;
1890
1891                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1892                                           struct btrfs_dev_item);
1893                 devid = btrfs_device_id(leaf, dev_item);
1894                 read_extent_buffer(leaf, dev_uuid,
1895                                    (unsigned long)btrfs_device_uuid(dev_item),
1896                                    BTRFS_UUID_SIZE);
1897                 read_extent_buffer(leaf, fs_uuid,
1898                                    (unsigned long)btrfs_device_fsid(dev_item),
1899                                    BTRFS_UUID_SIZE);
1900                 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1901                                            fs_uuid);
1902                 BUG_ON(!device); /* Logic error */
1903
1904                 if (device->fs_devices->seeding) {
1905                         btrfs_set_device_generation(leaf, dev_item,
1906                                                     device->generation);
1907                         btrfs_mark_buffer_dirty(leaf);
1908                 }
1909
1910                 path->slots[0]++;
1911                 goto next_slot;
1912         }
1913         ret = 0;
1914 error:
1915         btrfs_free_path(path);
1916         return ret;
1917 }
1918
/*
 * Add the block device at @device_path to the filesystem as a new writeable
 * device.
 *
 * The device is opened exclusively, rejected with -EEXIST if its bdev is
 * already on the fs_devices list, and then initialised and linked into the
 * device and allocation lists inside a transaction.  total_bytes and
 * num_devices in the super block copy are increased accordingly.
 *
 * If the filesystem is currently seeding, s_umount and uuid_mutex are taken,
 * MS_RDONLY is cleared and the filesystem is sprouted through
 * btrfs_prepare_sprout(), init_first_rw_device() and btrfs_finish_sprout();
 * after the commit the system chunks are relocated and a further transaction
 * is committed if one can be attached.  Otherwise the device is recorded
 * with btrfs_add_device().
 *
 * Returns 0 on success or a negative errno: -EROFS when the super block is
 * read-only and the filesystem is not seeding, -EEXIST, -ENOMEM, or whatever
 * the called helpers and transaction functions return.
 */
1919 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1920 {
1921         struct request_queue *q;
1922         struct btrfs_trans_handle *trans;
1923         struct btrfs_device *device;
1924         struct block_device *bdev;
1925         struct list_head *devices;
1926         struct super_block *sb = root->fs_info->sb;
1927         struct rcu_string *name;
1928         u64 total_bytes;
1929         int seeding_dev = 0;
1930         int ret = 0;
1931
             /* a read-only mount may only gain a device when it is a seed */
1932         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1933                 return -EROFS;
1934
1935         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1936                                   root->fs_info->bdev_holder);
1937         if (IS_ERR(bdev))
1938                 return PTR_ERR(bdev);
1939
             /*
              * Sprouting a seed filesystem: s_umount and uuid_mutex stay held
              * until after the transaction commit (or the error path).
              */
1940         if (root->fs_info->fs_devices->seeding) {
1941                 seeding_dev = 1;
1942                 down_write(&sb->s_umount);
1943                 mutex_lock(&uuid_mutex);
1944         }
1945
1946         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1947
1948         devices = &root->fs_info->fs_devices->devices;
1949
             /* refuse a block device that is already on the device list */
1950         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1951         list_for_each_entry(device, devices, dev_list) {
1952                 if (device->bdev == bdev) {
1953                         ret = -EEXIST;
1954                         mutex_unlock(
1955                                 &root->fs_info->fs_devices->device_list_mutex);
1956                         goto error;
1957                 }
1958         }
1959         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1960
1961         device = kzalloc(sizeof(*device), GFP_NOFS);
1962         if (!device) {
1963                 /* we can safely leave the fs_devices entry around */
1964                 ret = -ENOMEM;
1965                 goto error;
1966         }
1967
1968         name = rcu_string_strdup(device_path, GFP_NOFS);
1969         if (!name) {
1970                 kfree(device);
1971                 ret = -ENOMEM;
1972                 goto error;
1973         }
1974         rcu_assign_pointer(device->name, name);
1975
1976         ret = find_next_devid(root, &device->devid);
1977         if (ret) {
1978                 rcu_string_free(device->name);
1979                 kfree(device);
1980                 goto error;
1981         }
1982
1983         trans = btrfs_start_transaction(root, 0);
1984         if (IS_ERR(trans)) {
1985                 rcu_string_free(device->name);
1986                 kfree(device);
1987                 ret = PTR_ERR(trans);
1988                 goto error;
1989         }
1990
             /*
              * Chunks stay locked from here until just before the commit
              * below, or until the error_trans path unlocks them.
              */
1991         lock_chunks(root);
1992
1993         q = bdev_get_queue(bdev);
1994         if (blk_queue_discard(q))
1995                 device->can_discard = 1;
1996         device->writeable = 1;
1997         device->work.func = pending_bios_fn;
1998         generate_random_uuid(device->uuid);
1999         spin_lock_init(&device->io_lock);
2000         device->generation = trans->transid;
2001         device->io_width = root->sectorsize;
2002         device->io_align = root->sectorsize;
2003         device->sector_size = root->sectorsize;
2004         device->total_bytes = i_size_read(bdev->bd_inode);
2005         device->disk_total_bytes = device->total_bytes;
2006         device->dev_root = root->fs_info->dev_root;
2007         device->bdev = bdev;
2008         device->in_fs_metadata = 1;
2009         device->is_tgtdev_for_dev_replace = 0;
2010         device->mode = FMODE_EXCL;
2011         set_blocksize(device->bdev, 4096);
2012
             /*
              * The sprout becomes writeable; a failure of
              * btrfs_prepare_sprout() is treated as fatal here.
              */
2013         if (seeding_dev) {
2014                 sb->s_flags &= ~MS_RDONLY;
2015                 ret = btrfs_prepare_sprout(root);
2016                 BUG_ON(ret); /* -ENOMEM */
2017         }
2018
2019         device->fs_devices = root->fs_info->fs_devices;
2020
             /* publish the device and account for it in all counters */
2021         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2022         list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
2023         list_add(&device->dev_alloc_list,
2024                  &root->fs_info->fs_devices->alloc_list);
2025         root->fs_info->fs_devices->num_devices++;
2026         root->fs_info->fs_devices->open_devices++;
2027         root->fs_info->fs_devices->rw_devices++;
2028         root->fs_info->fs_devices->total_devices++;
2029         if (device->can_discard)
2030                 root->fs_info->fs_devices->num_can_discard++;
2031         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2032
2033         spin_lock(&root->fs_info->free_chunk_lock);
2034         root->fs_info->free_chunk_space += device->total_bytes;
2035         spin_unlock(&root->fs_info->free_chunk_lock);
2036
2037         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
2038                 root->fs_info->fs_devices->rotating = 1;
2039
2040         total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
2041         btrfs_set_super_total_bytes(root->fs_info->super_copy,
2042                                     total_bytes + device->total_bytes);
2043
             /* total_bytes is reused as scratch for the device count here */
2044         total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
2045         btrfs_set_super_num_devices(root->fs_info->super_copy,
2046                                     total_bytes + 1);
2047         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2048
2049         if (seeding_dev) {
2050                 ret = init_first_rw_device(trans, root, device);
2051                 if (ret) {
2052                         btrfs_abort_transaction(trans, root, ret);
2053                         goto error_trans;
2054                 }
2055                 ret = btrfs_finish_sprout(trans, root);
2056                 if (ret) {
2057                         btrfs_abort_transaction(trans, root, ret);
2058                         goto error_trans;
2059                 }
2060         } else {
2061                 ret = btrfs_add_device(trans, root, device);
2062                 if (ret) {
2063                         btrfs_abort_transaction(trans, root, ret);
2064                         goto error_trans;
2065                 }
2066         }
2067
2068         /*
2069          * we've got more storage, clear any full flags on the space
2070          * infos
2071          */
2072         btrfs_clear_space_info_full(root->fs_info);
2073
2074         unlock_chunks(root);
2075         root->fs_info->num_tolerated_disk_barrier_failures =
2076                 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
2077         ret = btrfs_commit_transaction(trans, root);
2078
2079         if (seeding_dev) {
2080                 mutex_unlock(&uuid_mutex);
2081                 up_write(&sb->s_umount);
2082
2083                 if (ret) /* transaction commit */
2084                         return ret;
2085
                     /* a relocation failure is reported but does not stop here */
2086                 ret = btrfs_relocate_sys_chunks(root);
2087                 if (ret < 0)
2088                         btrfs_error(root->fs_info, ret,
2089                                     "Failed to relocate sys chunks after "
2090                                     "device initialization. This can be fixed "
2091                                     "using the \"btrfs balance\" command.");
                     /* -ENOENT from the attach is mapped to success */
2092                 trans = btrfs_attach_transaction(root);
2093                 if (IS_ERR(trans)) {
2094                         if (PTR_ERR(trans) == -ENOENT)
2095                                 return 0;
2096                         return PTR_ERR(trans);
2097                 }
2098                 ret = btrfs_commit_transaction(trans, root);
2099         }
2100
2101         return ret;
2102
             /*
              * NOTE(review): on this path the device has already been linked
              * into the device and allocation lists and counted above, yet it
              * is freed here without being unlinked -- confirm this is safe
              * after the transaction abort.
              */
2103 error_trans:
2104         unlock_chunks(root);
2105         btrfs_end_transaction(trans, root);
2106         rcu_string_free(device->name);
2107         kfree(device);
2108 error:
2109         blkdev_put(bdev, FMODE_EXCL);
2110         if (seeding_dev) {
2111                 mutex_unlock(&uuid_mutex);
2112                 up_write(&sb->s_umount);
2113         }
2114         return ret;
2115 }
2116
2117 int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2118                                   struct btrfs_device **device_out)
2119 {
2120         struct request_queue *q;
2121         struct btrfs_device *device;
2122         struct block_device *bdev;
2123         struct btrfs_fs_info *fs_info = root->fs_info;
2124         struct list_head *devices;
2125         struct rcu_string *name;
2126         int ret = 0;
2127
2128         *device_out = NULL;
2129         if (fs_info->fs_devices->seeding)
2130                 return -EINVAL;
2131
2132         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2133                                   fs_info->bdev_holder);
2134         if (IS_ERR(bdev))
2135                 return PTR_ERR(bdev);
2136
2137         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2138
2139         devices = &fs_info->fs_devices->devices;
2140         list_for_each_entry(device, devices, dev_list) {
2141                 if (device->bdev == bdev) {
2142                         ret = -EEXIST;
2143                         goto error;
2144                 }
2145         }
2146
2147         device = kzalloc(sizeof(*device), GFP_NOFS);
2148         if (!device) {
2149                 ret = -ENOMEM;
2150                 goto error;
2151         }
2152
2153         name = rcu_string_strdup(device_path, GFP_NOFS);
2154         if (!name) {
2155                 kfree(device);
2156                 ret = -ENOMEM;
2157                 goto error;
2158         }
2159         rcu_assign_pointer(device->name, name);
2160
2161         q = bdev_get_queue(bdev);
2162         if (blk_queue_discard(q))
2163                 device->can_discard = 1;
2164         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2165         device->writeable = 1;
2166         device->work.func = pending_bios_fn;
2167         generate_random_uuid(device->uuid);
2168         device->devid = BTRFS_DEV_REPLACE_DEVID;
2169         spin_lock_init(&device->io_lock);
2170         device->generation = 0;
2171         device->io_width = root->sectorsize;
2172         device->io_align = root->sectorsize;
2173         device->sector_size = root->sectorsize;
2174         device->total_bytes = i_size_read(bdev->bd_inode);
2175         device->disk_total_bytes = device->total_bytes;
2176         device->dev_root = fs_info->dev_root;
2177         device->bdev = bdev;
2178         device->in_fs_metadata = 1;
2179         device->is_tgtdev_for_dev_replace = 1;
2180         device->mode = FMODE_EXCL;
2181         set_blocksize(device->bdev, 4096);
2182         device->fs_devices = fs_info->fs_devices;
2183         list_add(&device->dev_list, &fs_info->fs_devices->devices);
2184         fs_info->fs_devices->num_devices++;
2185         fs_info->fs_devices->open_devices++;
2186         if (device->can_discard)
2187                 fs_info->fs_devices->num_can_discard++;
2188         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2189
2190         *device_out = device;
2191         return ret;
2192
2193 error:
2194         blkdev_put(bdev, FMODE_EXCL);
2195         return ret;
2196 }
2197
2198 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2199                                               struct btrfs_device *tgtdev)
2200 {
2201         WARN_ON(fs_info->fs_devices->rw_devices == 0);
2202         tgtdev->io_width = fs_info->dev_root->sectorsize;
2203         tgtdev->io_align = fs_info->dev_root->sectorsize;
2204         tgtdev->sector_size = fs_info->dev_root->sectorsize;
2205         tgtdev->dev_root = fs_info->dev_root;
2206         tgtdev->in_fs_metadata = 1;
2207 }
2208
2209 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2210                                         struct btrfs_device *device)
2211 {
2212         int ret;
2213         struct btrfs_path *path;
2214         struct btrfs_root *root;
2215         struct btrfs_dev_item *dev_item;
2216         struct extent_buffer *leaf;
2217         struct btrfs_key key;
2218
2219         root = device->dev_root->fs_info->chunk_root;
2220
2221         path = btrfs_alloc_path();
2222         if (!path)
2223                 return -ENOMEM;
2224
2225         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2226         key.type = BTRFS_DEV_ITEM_KEY;
2227         key.offset = device->devid;
2228
2229         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2230         if (ret < 0)
2231                 goto out;
2232
2233         if (ret > 0) {
2234                 ret = -ENOENT;
2235                 goto out;
2236         }
2237
2238         leaf = path->nodes[0];
2239         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2240
2241         btrfs_set_device_id(leaf, dev_item, device->devid);
2242         btrfs_set_device_type(leaf, dev_item, device->type);
2243         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2244         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2245         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2246         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
2247         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
2248         btrfs_mark_buffer_dirty(leaf);
2249
2250 out:
2251         btrfs_free_path(path);
2252         return ret;
2253 }
2254
2255 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
2256                       struct btrfs_device *device, u64 new_size)
2257 {
2258         struct btrfs_super_block *super_copy =
2259                 device->dev_root->fs_info->super_copy;
2260         u64 old_total = btrfs_super_total_bytes(super_copy);
2261         u64 diff = new_size - device->total_bytes;
2262
2263         if (!device->writeable)
2264                 return -EACCES;
2265         if (new_size <= device->total_bytes ||
2266             device->is_tgtdev_for_dev_replace)
2267                 return -EINVAL;
2268
2269         btrfs_set_super_total_bytes(super_copy, old_total + diff);
2270         device->fs_devices->total_rw_bytes += diff;
2271
2272         device->total_bytes = new_size;
2273         device->disk_total_bytes = new_size;
2274         btrfs_clear_space_info_full(device->dev_root->fs_info);
2275
2276         return btrfs_update_device(trans, device);
2277 }
2278
2279 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2280                       struct btrfs_device *device, u64 new_size)
2281 {
2282         int ret;
2283         lock_chunks(device->dev_root);
2284         ret = __btrfs_grow_device(trans, device, new_size);
2285         unlock_chunks(device->dev_root);
2286         return ret;
2287 }
2288
2289 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2290                             struct btrfs_root *root,
2291                             u64 chunk_tree, u64 chunk_objectid,
2292                             u64 chunk_offset)
2293 {
2294         int ret;
2295         struct btrfs_path *path;
2296         struct btrfs_key key;
2297
2298         root = root->fs_info->chunk_root;
2299         path = btrfs_alloc_path();
2300         if (!path)
2301                 return -ENOMEM;
2302
2303         key.objectid = chunk_objectid;
2304         key.offset = chunk_offset;
2305         key.type = BTRFS_CHUNK_ITEM_KEY;
2306
2307         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2308         if (ret < 0)
2309                 goto out;
2310         else if (ret > 0) { /* Logic error or corruption */
2311                 btrfs_error(root->fs_info, -ENOENT,
2312                             "Failed lookup while freeing chunk.");
2313                 ret = -ENOENT;
2314                 goto out;
2315         }
2316
2317         ret = btrfs_del_item(trans, root, path);
2318         if (ret < 0)
2319                 btrfs_error(root->fs_info, ret,
2320                             "Failed to delete chunk item.");
2321 out:
2322         btrfs_free_path(path);
2323         return ret;
2324 }
2325
2326 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2327                         chunk_offset)
2328 {
2329         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2330         struct btrfs_disk_key *disk_key;
2331         struct btrfs_chunk *chunk;
2332         u8 *ptr;
2333         int ret = 0;
2334         u32 num_stripes;
2335         u32 array_size;
2336         u32 len = 0;
2337         u32 cur;
2338         struct btrfs_key key;
2339
2340         array_size = btrfs_super_sys_array_size(super_copy);
2341
2342         ptr = super_copy->sys_chunk_array;
2343         cur = 0;
2344
2345         while (cur < array_size) {
2346                 disk_key = (struct btrfs_disk_key *)ptr;
2347                 btrfs_disk_key_to_cpu(&key, disk_key);
2348
2349                 len = sizeof(*disk_key);
2350
2351                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2352                         chunk = (struct btrfs_chunk *)(ptr + len);
2353                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2354                         len += btrfs_chunk_item_size(num_stripes);
2355                 } else {
2356                         ret = -EIO;
2357                         break;
2358                 }
2359                 if (key.objectid == chunk_objectid &&
2360                     key.offset == chunk_offset) {
2361                         memmove(ptr, ptr + len, array_size - (cur + len));
2362                         array_size -= len;
2363                         btrfs_set_super_sys_array_size(super_copy, array_size);
2364                 } else {
2365                         ptr += len;
2366                         cur += len;
2367                 }
2368         }
2369         return ret;
2370 }
2371
2372 static int btrfs_relocate_chunk(struct btrfs_root *root,
2373                          u64 chunk_tree, u64 chunk_objectid,
2374                          u64 chunk_offset)
2375 {
2376         struct extent_map_tree *em_tree;
2377         struct btrfs_root *extent_root;
2378         struct btrfs_trans_handle *trans;
2379         struct extent_map *em;
2380         struct map_lookup *map;
2381         int ret;
2382         int i;
2383
2384         root = root->fs_info->chunk_root;
2385         extent_root = root->fs_info->extent_root;
2386         em_tree = &root->fs_info->mapping_tree.map_tree;
2387
2388         ret = btrfs_can_relocate(extent_root, chunk_offset);
2389         if (ret)
2390                 return -ENOSPC;
2391
2392         /* step one, relocate all the extents inside this chunk */
2393         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2394         if (ret)
2395                 return ret;
2396
2397         trans = btrfs_start_transaction(root, 0);
2398         if (IS_ERR(trans)) {
2399                 ret = PTR_ERR(trans);
2400                 btrfs_std_error(root->fs_info, ret);
2401                 return ret;
2402         }
2403
2404         lock_chunks(root);
2405
2406         /*
2407          * step two, delete the device extents and the
2408          * chunk tree entries
2409          */
2410         read_lock(&em_tree->lock);
2411         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2412         read_unlock(&em_tree->lock);
2413
2414         BUG_ON(!em || em->start > chunk_offset ||
2415                em->start + em->len < chunk_offset);
2416         map = (struct map_lookup *)em->bdev;
2417
2418         for (i = 0; i < map->num_stripes; i++) {
2419                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
2420                                             map->stripes[i].physical);
2421                 BUG_ON(ret);
2422
2423                 if (map->stripes[i].dev) {
2424                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2425                         BUG_ON(ret);
2426                 }
2427         }
2428         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2429                                chunk_offset);
2430
2431         BUG_ON(ret);
2432
2433         trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2434
2435         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2436                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2437                 BUG_ON(ret);
2438         }
2439
2440         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
2441         BUG_ON(ret);
2442
2443         write_lock(&em_tree->lock);
2444         remove_extent_mapping(em_tree, em);
2445         write_unlock(&em_tree->lock);
2446
2447         kfree(map);
2448         em->bdev = NULL;
2449
2450         /* once for the tree */
2451         free_extent_map(em);
2452         /* once for us */
2453         free_extent_map(em);
2454
2455         unlock_chunks(root);
2456         btrfs_end_transaction(trans, root);
2457         return 0;
2458 }
2459
2460 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2461 {
2462         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2463         struct btrfs_path *path;
2464         struct extent_buffer *leaf;
2465         struct btrfs_chunk *chunk;
2466         struct btrfs_key key;
2467         struct btrfs_key found_key;
2468         u64 chunk_tree = chunk_root->root_key.objectid;
2469         u64 chunk_type;
2470         bool retried = false;
2471         int failed = 0;
2472         int ret;
2473
2474         path = btrfs_alloc_path();
2475         if (!path)
2476                 return -ENOMEM;
2477
2478 again:
2479         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2480         key.offset = (u64)-1;
2481         key.type = BTRFS_CHUNK_ITEM_KEY;
2482
2483         while (1) {
2484                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2485                 if (ret < 0)
2486                         goto error;
2487                 BUG_ON(ret == 0); /* Corruption */
2488
2489                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2490                                           key.type);
2491                 if (ret < 0)
2492                         goto error;
2493                 if (ret > 0)
2494                         break;
2495
2496                 leaf = path->nodes[0];
2497                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2498
2499                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2500                                        struct btrfs_chunk);
2501                 chunk_type = btrfs_chunk_type(leaf, chunk);
2502                 btrfs_release_path(path);
2503
2504                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2505                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2506                                                    found_key.objectid,
2507                                                    found_key.offset);
2508                         if (ret == -ENOSPC)
2509                                 failed++;
2510                         else if (ret)
2511                                 BUG();
2512                 }
2513
2514                 if (found_key.offset == 0)
2515                         break;
2516                 key.offset = found_key.offset - 1;
2517         }
2518         ret = 0;
2519         if (failed && !retried) {
2520                 failed = 0;
2521                 retried = true;
2522                 goto again;
2523         } else if (failed && retried) {
2524                 WARN_ON(1);
2525                 ret = -ENOSPC;
2526         }
2527 error:
2528         btrfs_free_path(path);
2529         return ret;
2530 }
2531
2532 static int insert_balance_item(struct btrfs_root *root,
2533                                struct btrfs_balance_control *bctl)
2534 {
2535         struct btrfs_trans_handle *trans;
2536         struct btrfs_balance_item *item;
2537         struct btrfs_disk_balance_args disk_bargs;
2538         struct btrfs_path *path;
2539         struct extent_buffer *leaf;
2540         struct btrfs_key key;
2541         int ret, err;
2542
2543         path = btrfs_alloc_path();
2544         if (!path)
2545                 return -ENOMEM;
2546
2547         trans = btrfs_start_transaction(root, 0);
2548         if (IS_ERR(trans)) {
2549                 btrfs_free_path(path);
2550                 return PTR_ERR(trans);
2551         }
2552
2553         key.objectid = BTRFS_BALANCE_OBJECTID;
2554         key.type = BTRFS_BALANCE_ITEM_KEY;
2555         key.offset = 0;
2556
2557         ret = btrfs_insert_empty_item(trans, root, path, &key,
2558                                       sizeof(*item));
2559         if (ret)
2560                 goto out;
2561
2562         leaf = path->nodes[0];
2563         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2564
2565         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2566
2567         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2568         btrfs_set_balance_data(leaf, item, &disk_bargs);
2569         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2570         btrfs_set_balance_meta(leaf, item, &disk_bargs);
2571         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2572         btrfs_set_balance_sys(leaf, item, &disk_bargs);
2573
2574         btrfs_set_balance_flags(leaf, item, bctl->flags);
2575
2576         btrfs_mark_buffer_dirty(leaf);
2577 out:
2578         btrfs_free_path(path);
2579         err = btrfs_commit_transaction(trans, root);
2580         if (err && !ret)
2581                 ret = err;
2582         return ret;
2583 }
2584
2585 static int del_balance_item(struct btrfs_root *root)
2586 {
2587         struct btrfs_trans_handle *trans;
2588         struct btrfs_path *path;
2589         struct btrfs_key key;
2590         int ret, err;
2591
2592         path = btrfs_alloc_path();
2593         if (!path)
2594                 return -ENOMEM;
2595
2596         trans = btrfs_start_transaction(root, 0);
2597         if (IS_ERR(trans)) {
2598                 btrfs_free_path(path);
2599                 return PTR_ERR(trans);
2600         }
2601
2602         key.objectid = BTRFS_BALANCE_OBJECTID;
2603         key.type = BTRFS_BALANCE_ITEM_KEY;
2604         key.offset = 0;
2605
2606         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2607         if (ret < 0)
2608                 goto out;
2609         if (ret > 0) {
2610                 ret = -ENOENT;
2611                 goto out;
2612         }
2613
2614         ret = btrfs_del_item(trans, root, path);
2615 out:
2616         btrfs_free_path(path);
2617         err = btrfs_commit_transaction(trans, root);
2618         if (err && !ret)
2619                 ret = err;
2620         return ret;
2621 }
2622
2623 /*
2624  * This is a heuristic used to reduce the number of chunks balanced on
2625  * resume after balance was interrupted.
2626  */
2627 static void update_balance_args(struct btrfs_balance_control *bctl)
2628 {
2629         /*
2630          * Turn on soft mode for chunk types that were being converted.
2631          */
2632         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2633                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2634         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2635                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2636         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2637                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2638
2639         /*
2640          * Turn on usage filter if is not already used.  The idea is
2641          * that chunks that we have already balanced should be
2642          * reasonably full.  Don't do it for chunks that are being
2643          * converted - that will keep us from relocating unconverted
2644          * (albeit full) chunks.
2645          */
2646         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2647             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2648                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2649                 bctl->data.usage = 90;
2650         }
2651         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2652             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2653                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2654                 bctl->sys.usage = 90;
2655         }
2656         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2657             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2658                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2659                 bctl->meta.usage = 90;
2660         }
2661 }
2662
2663 /*
2664  * Should be called with both balance and volume mutexes held to
2665  * serialize other volume operations (add_dev/rm_dev/resize) with
2666  * restriper.  Same goes for unset_balance_control.
2667  */
2668 static void set_balance_control(struct btrfs_balance_control *bctl)
2669 {
2670         struct btrfs_fs_info *fs_info = bctl->fs_info;
2671
2672         BUG_ON(fs_info->balance_ctl);
2673
2674         spin_lock(&fs_info->balance_lock);
2675         fs_info->balance_ctl = bctl;
2676         spin_unlock(&fs_info->balance_lock);
2677 }
2678
2679 static void unset_balance_control(struct btrfs_fs_info *fs_info)
2680 {
2681         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2682
2683         BUG_ON(!fs_info->balance_ctl);
2684
2685         spin_lock(&fs_info->balance_lock);
2686         fs_info->balance_ctl = NULL;
2687         spin_unlock(&fs_info->balance_lock);
2688
2689         kfree(bctl);
2690 }
2691
2692 /*
2693  * Balance filters.  Return 1 if chunk should be filtered out
2694  * (should not be balanced).
2695  */
2696 static int chunk_profiles_filter(u64 chunk_type,
2697                                  struct btrfs_balance_args *bargs)
2698 {
2699         chunk_type = chunk_to_extended(chunk_type) &
2700                                 BTRFS_EXTENDED_PROFILE_MASK;
2701
2702         if (bargs->profiles & chunk_type)
2703                 return 0;
2704
2705         return 1;
2706 }
2707
2708 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2709                               struct btrfs_balance_args *bargs)
2710 {
2711         struct btrfs_block_group_cache *cache;
2712         u64 chunk_used, user_thresh;
2713         int ret = 1;
2714
2715         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2716         chunk_used = btrfs_block_group_used(&cache->item);
2717
2718         if (bargs->usage == 0)
2719                 user_thresh = 1;
2720         else if (bargs->usage > 100)
2721                 user_thresh = cache->key.offset;
2722         else
2723                 user_thresh = div_factor_fine(cache->key.offset,
2724                                               bargs->usage);
2725
2726         if (chunk_used < user_thresh)
2727                 ret = 0;
2728
2729         btrfs_put_block_group(cache);
2730         return ret;
2731 }
2732
2733 static int chunk_devid_filter(struct extent_buffer *leaf,
2734                               struct btrfs_chunk *chunk,
2735                               struct btrfs_balance_args *bargs)
2736 {
2737         struct btrfs_stripe *stripe;
2738         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2739         int i;
2740
2741         for (i = 0; i < num_stripes; i++) {
2742                 stripe = btrfs_stripe_nr(chunk, i);
2743                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2744                         return 0;
2745         }
2746
2747         return 1;
2748 }
2749
2750 /* [pstart, pend) */
2751 static int chunk_drange_filter(struct extent_buffer *leaf,
2752                                struct btrfs_chunk *chunk,
2753                                u64 chunk_offset,
2754                                struct btrfs_balance_args *bargs)
2755 {
2756         struct btrfs_stripe *stripe;
2757         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2758         u64 stripe_offset;
2759         u64 stripe_length;
2760         int factor;
2761         int i;
2762
2763         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2764                 return 0;
2765
2766         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2767              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
2768                 factor = num_stripes / 2;
2769         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
2770                 factor = num_stripes - 1;
2771         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
2772                 factor = num_stripes - 2;
2773         } else {
2774                 factor = num_stripes;
2775         }
2776
2777         for (i = 0; i < num_stripes; i++) {
2778                 stripe = btrfs_stripe_nr(chunk, i);
2779                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2780                         continue;
2781
2782                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2783                 stripe_length = btrfs_chunk_length(leaf, chunk);
2784                 do_div(stripe_length, factor);
2785
2786                 if (stripe_offset < bargs->pend &&
2787                     stripe_offset + stripe_length > bargs->pstart)
2788                         return 0;
2789         }
2790
2791         return 1;
2792 }
2793
2794 /* [vstart, vend) */
2795 static int chunk_vrange_filter(struct extent_buffer *leaf,
2796                                struct btrfs_chunk *chunk,
2797                                u64 chunk_offset,
2798                                struct btrfs_balance_args *bargs)
2799 {
2800         if (chunk_offset < bargs->vend &&
2801             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2802                 /* at least part of the chunk is inside this vrange */
2803                 return 0;
2804
2805         return 1;
2806 }
2807
2808 static int chunk_soft_convert_filter(u64 chunk_type,
2809                                      struct btrfs_balance_args *bargs)
2810 {
2811         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2812                 return 0;
2813
2814         chunk_type = chunk_to_extended(chunk_type) &
2815                                 BTRFS_EXTENDED_PROFILE_MASK;
2816
2817         if (bargs->target == chunk_type)
2818                 return 1;
2819
2820         return 0;
2821 }
2822
2823 static int should_balance_chunk(struct btrfs_root *root,
2824                                 struct extent_buffer *leaf,
2825                                 struct btrfs_chunk *chunk, u64 chunk_offset)
2826 {
2827         struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2828         struct btrfs_balance_args *bargs = NULL;
2829         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2830
2831         /* type filter */
2832         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2833               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2834                 return 0;
2835         }
2836
2837         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2838                 bargs = &bctl->data;
2839         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2840                 bargs = &bctl->sys;
2841         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2842                 bargs = &bctl->meta;
2843
2844         /* profiles filter */
2845         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2846             chunk_profiles_filter(chunk_type, bargs)) {
2847                 return 0;
2848         }
2849
2850         /* usage filter */
2851         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2852             chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2853                 return 0;
2854         }
2855
2856         /* devid filter */
2857         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2858             chunk_devid_filter(leaf, chunk, bargs)) {
2859                 return 0;
2860         }
2861
2862         /* drange filter, makes sense only with devid filter */
2863         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2864             chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2865                 return 0;
2866         }
2867
2868         /* vrange filter */
2869         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2870             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2871                 return 0;
2872         }
2873
2874         /* soft profile changing mode */
2875         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2876             chunk_soft_convert_filter(chunk_type, bargs)) {
2877                 return 0;
2878         }
2879
2880         return 1;
2881 }
2882
2883 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2884 {
2885         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2886         struct btrfs_root *chunk_root = fs_info->chunk_root;
2887         struct btrfs_root *dev_root = fs_info->dev_root;
2888         struct list_head *devices;
2889         struct btrfs_device *device;
2890         u64 old_size;
2891         u64 size_to_free;
2892         struct btrfs_chunk *chunk;
2893         struct btrfs_path *path;
2894         struct btrfs_key key;
2895         struct btrfs_key found_key;
2896         struct btrfs_trans_handle *trans;
2897         struct extent_buffer *leaf;
2898         int slot;
2899         int ret;
2900         int enospc_errors = 0;
2901         bool counting = true;
2902
2903         /* step one make some room on all the devices */
2904         devices = &fs_info->fs_devices->devices;
2905         list_for_each_entry(device, devices, dev_list) {
2906                 old_size = device->total_bytes;
2907                 size_to_free = div_factor(old_size, 1);
2908                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2909                 if (!device->writeable ||
2910                     device->total_bytes - device->bytes_used > size_to_free ||
2911                     device->is_tgtdev_for_dev_replace)
2912                         continue;
2913
2914                 ret = btrfs_shrink_device(device, old_size - size_to_free);
2915                 if (ret == -ENOSPC)
2916                         break;
2917                 BUG_ON(ret);
2918
2919                 trans = btrfs_start_transaction(dev_root, 0);
2920                 BUG_ON(IS_ERR(trans));
2921
2922                 ret = btrfs_grow_device(trans, device, old_size);
2923                 BUG_ON(ret);
2924
2925                 btrfs_end_transaction(trans, dev_root);
2926         }
2927
2928         /* step two, relocate all the chunks */
2929         path = btrfs_alloc_path();
2930         if (!path) {
2931                 ret = -ENOMEM;
2932                 goto error;
2933         }
2934
2935         /* zero out stat counters */
2936         spin_lock(&fs_info->balance_lock);
2937         memset(&bctl->stat, 0, sizeof(bctl->stat));
2938         spin_unlock(&fs_info->balance_lock);
2939 again:
2940         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2941         key.offset = (u64)-1;
2942         key.type = BTRFS_CHUNK_ITEM_KEY;
2943
2944         while (1) {
2945                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2946                     atomic_read(&fs_info->balance_cancel_req)) {
2947                         ret = -ECANCELED;
2948                         goto error;
2949                 }
2950
2951                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2952                 if (ret < 0)
2953                         goto error;
2954
2955                 /*
2956                  * this shouldn't happen, it means the last relocate
2957                  * failed
2958                  */
2959                 if (ret == 0)
2960                         BUG(); /* FIXME break ? */
2961
2962                 ret = btrfs_previous_item(chunk_root, path, 0,
2963                                           BTRFS_CHUNK_ITEM_KEY);
2964                 if (ret) {
2965                         ret = 0;
2966                         break;
2967                 }
2968
2969                 leaf = path->nodes[0];
2970                 slot = path->slots[0];
2971                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2972
2973                 if (found_key.objectid != key.objectid)
2974                         break;
2975
2976                 /* chunk zero is special */
2977                 if (found_key.offset == 0)
2978                         break;
2979
2980                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2981
2982                 if (!counting) {
2983                         spin_lock(&fs_info->balance_lock);
2984                         bctl->stat.considered++;
2985                         spin_unlock(&fs_info->balance_lock);
2986                 }
2987
2988                 ret = should_balance_chunk(chunk_root, leaf, chunk,
2989                                            found_key.offset);
2990                 btrfs_release_path(path);
2991                 if (!ret)
2992                         goto loop;
2993
2994                 if (counting) {
2995                         spin_lock(&fs_info->balance_lock);
2996                         bctl->stat.expected++;
2997                         spin_unlock(&fs_info->balance_lock);
2998                         goto loop;
2999                 }
3000
3001                 ret = btrfs_relocate_chunk(chunk_root,
3002                                            chunk_root->root_key.objectid,
3003                                            found_key.objectid,
3004                                            found_key.offset);
3005                 if (ret && ret != -ENOSPC)
3006                         goto error;
3007                 if (ret == -ENOSPC) {
3008                         enospc_errors++;
3009                 } else {
3010                         spin_lock(&fs_info->balance_lock);
3011                         bctl->stat.completed++;
3012                         spin_unlock(&fs_info->balance_lock);
3013                 }
3014 loop:
3015                 key.offset = found_key.offset - 1;
3016         }
3017
3018         if (counting) {
3019                 btrfs_release_path(path);
3020                 counting = false;
3021                 goto again;
3022         }
3023 error:
3024         btrfs_free_path(path);
3025         if (enospc_errors) {
3026                 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
3027                        enospc_errors);
3028                 if (!ret)
3029                         ret = -ENOSPC;
3030         }
3031
3032         return ret;
3033 }
3034
3035 /**
3036  * alloc_profile_is_valid - see if a given profile is valid and reduced
3037  * @flags: profile to validate
3038  * @extended: if true @flags is treated as an extended profile
3039  */
3040 static int alloc_profile_is_valid(u64 flags, int extended)
3041 {
3042         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3043                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
3044
3045         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3046
3047         /* 1) check that all other bits are zeroed */
3048         if (flags & ~mask)
3049                 return 0;
3050
3051         /* 2) see if profile is reduced */
3052         if (flags == 0)
3053                 return !extended; /* "0" is valid for usual profiles */
3054
3055         /* true if exactly one bit set */
3056         return (flags & (flags - 1)) == 0;
3057 }
3058
3059 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3060 {
3061         /* cancel requested || normal exit path */
3062         return atomic_read(&fs_info->balance_cancel_req) ||
3063                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3064                  atomic_read(&fs_info->balance_cancel_req) == 0);
3065 }
3066
3067 static void __cancel_balance(struct btrfs_fs_info *fs_info)
3068 {
3069         int ret;
3070
3071         unset_balance_control(fs_info);
3072         ret = del_balance_item(fs_info->tree_root);
3073         if (ret)
3074                 btrfs_std_error(fs_info, ret);
3075
3076         atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3077 }
3078
3079 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
3080                                struct btrfs_ioctl_balance_args *bargs);
3081
3082 /*
3083  * Should be called with both balance and volume mutexes held
3084  */
3085 int btrfs_balance(struct btrfs_balance_control *bctl,
3086                   struct btrfs_ioctl_balance_args *bargs)
3087 {
3088         struct btrfs_fs_info *fs_info = bctl->fs_info;
3089         u64 allowed;
3090         int mixed = 0;
3091         int ret;
3092         u64 num_devices;
3093         unsigned seq;
3094
3095         if (btrfs_fs_closing(fs_info) ||
3096             atomic_read(&fs_info->balance_pause_req) ||
3097             atomic_read(&fs_info->balance_cancel_req)) {
3098                 ret = -EINVAL;
3099                 goto out;
3100         }
3101
3102         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
3103         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
3104                 mixed = 1;
3105
3106         /*
3107          * In case of mixed groups both data and meta should be picked,
3108          * and identical options should be given for both of them.
3109          */
3110         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
3111         if (mixed && (bctl->flags & allowed)) {
3112                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
3113                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
3114                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
3115                         printk(KERN_ERR "btrfs: with mixed groups data and "
3116                                "metadata balance options must be the same\n");
3117                         ret = -EINVAL;
3118                         goto out;
3119                 }
3120         }
3121
3122         num_devices = fs_info->fs_devices->num_devices;
3123         btrfs_dev_replace_lock(&fs_info->dev_replace);
3124         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3125                 BUG_ON(num_devices < 1);
3126                 num_devices--;
3127         }
3128         btrfs_dev_replace_unlock(&fs_info->dev_replace);
3129         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3130         if (num_devices == 1)
3131                 allowed |= BTRFS_BLOCK_GROUP_DUP;
3132         else if (num_devices > 1)
3133                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3134         if (num_devices > 2)
3135                 allowed |= BTRFS_BLOCK_GROUP_RAID5;
3136         if (num_devices > 3)
3137                 allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
3138                             BTRFS_BLOCK_GROUP_RAID6);
3139         if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3140             (!alloc_profile_is_valid(bctl->data.target, 1) ||
3141              (bctl->data.target & ~allowed))) {
3142                 printk(KERN_ERR "btrfs: unable to start balance with target "
3143                        "data profile %llu\n",
3144                        (unsigned long long)bctl->data.target);
3145                 ret = -EINVAL;
3146                 goto out;
3147         }
3148         if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3149             (!alloc_profile_is_valid(bctl->meta.target, 1) ||
3150              (bctl->meta.target & ~allowed))) {
3151                 printk(KERN_ERR "btrfs: unable to start balance with target "
3152                        "metadata profile %llu\n",
3153                        (unsigned long long)bctl->meta.target);
3154                 ret = -EINVAL;
3155                 goto out;
3156         }
3157         if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3158             (!alloc_profile_is_valid(bctl->sys.target, 1) ||
3159              (bctl->sys.target & ~allowed))) {
3160                 printk(KERN_ERR "btrfs: unable to start balance with target "
3161                        "system profile %llu\n",
3162                        (unsigned long long)bctl->sys.target);
3163                 ret = -EINVAL;
3164                 goto out;
3165         }
3166
3167         /* allow dup'ed data chunks only in mixed mode */
3168         if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3169             (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
3170                 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
3171                 ret = -EINVAL;
3172                 goto out;
3173         }
3174
3175         /* allow to reduce meta or sys integrity only if force set */
3176         allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3177                         BTRFS_BLOCK_GROUP_RAID10 |
3178                         BTRFS_BLOCK_GROUP_RAID5 |
3179                         BTRFS_BLOCK_GROUP_RAID6;
3180         do {
3181                 seq = read_seqbegin(&fs_info->profiles_lock);
3182
3183                 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3184                      (fs_info->avail_system_alloc_bits & allowed) &&
3185                      !(bctl->sys.target & allowed)) ||
3186                     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3187                      (fs_info->avail_metadata_alloc_bits & allowed) &&
3188                      !(bctl->meta.target & allowed))) {
3189                         if (bctl->flags & BTRFS_BALANCE_FORCE) {
3190                                 printk(KERN_INFO "btrfs: force reducing metadata "
3191                                        "integrity\n");
3192                         } else {
3193                                 printk(KERN_ERR "btrfs: balance will reduce metadata "
3194                                        "integrity, use force if you want this\n");
3195                                 ret = -EINVAL;
3196                                 goto out;
3197                         }
3198                 }
3199         } while (read_seqretry(&fs_info->profiles_lock, seq));
3200
3201         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3202                 int num_tolerated_disk_barrier_failures;
3203                 u64 target = bctl->sys.target;
3204
3205                 num_tolerated_disk_barrier_failures =
3206                         btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3207                 if (num_tolerated_disk_barrier_failures > 0 &&
3208                     (target &
3209                      (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3210                       BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
3211                         num_tolerated_disk_barrier_failures = 0;
3212                 else if (num_tolerated_disk_barrier_failures > 1 &&
3213                          (target &
3214                           (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
3215                         num_tolerated_disk_barrier_failures = 1;
3216
3217                 fs_info->num_tolerated_disk_barrier_failures =
3218                         num_tolerated_disk_barrier_failures;
3219         }
3220
3221         ret = insert_balance_item(fs_info->tree_root, bctl);
3222         if (ret && ret != -EEXIST)
3223                 goto out;
3224
3225         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
3226                 BUG_ON(ret == -EEXIST);
3227                 set_balance_control(bctl);
3228         } else {
3229                 BUG_ON(ret != -EEXIST);
3230                 spin_lock(&fs_info->balance_lock);
3231                 update_balance_args(bctl);
3232                 spin_unlock(&fs_info->balance_lock);
3233         }
3234
3235         atomic_inc(&fs_info->balance_running);
3236         mutex_unlock(&fs_info->balance_mutex);
3237
3238         ret = __btrfs_balance(fs_info);
3239
3240         mutex_lock(&fs_info->balance_mutex);
3241         atomic_dec(&fs_info->balance_running);
3242
3243         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3244                 fs_info->num_tolerated_disk_barrier_failures =
3245                         btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3246         }
3247
3248         if (bargs) {
3249                 memset(bargs, 0, sizeof(*bargs));
3250                 update_ioctl_balance_args(fs_info, 0, bargs);
3251         }
3252
3253         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3254             balance_need_close(fs_info)) {
3255                 __cancel_balance(fs_info);
3256         }
3257
3258         wake_up(&fs_info->balance_wait_q);
3259
3260         return ret;
3261 out:
3262         if (bctl->flags & BTRFS_BALANCE_RESUME)
3263                 __cancel_balance(fs_info);
3264         else {
3265                 kfree(bctl);
3266                 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3267         }
3268         return ret;
3269 }
3270
3271 static int balance_kthread(void *data)
3272 {
3273         struct btrfs_fs_info *fs_info = data;
3274         int ret = 0;
3275
3276         mutex_lock(&fs_info->volume_mutex);
3277         mutex_lock(&fs_info->balance_mutex);
3278
3279         if (fs_info->balance_ctl) {
3280                 printk(KERN_INFO "btrfs: continuing balance\n");
3281                 ret = btrfs_balance(fs_info->balance_ctl, NULL);
3282         }
3283
3284         mutex_unlock(&fs_info->balance_mutex);
3285         mutex_unlock(&fs_info->volume_mutex);
3286
3287         return ret;
3288 }
3289
3290 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3291 {
3292         struct task_struct *tsk;
3293
3294         spin_lock(&fs_info->balance_lock);
3295         if (!fs_info->balance_ctl) {
3296                 spin_unlock(&fs_info->balance_lock);
3297                 return 0;
3298         }
3299         spin_unlock(&fs_info->balance_lock);
3300
3301         if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
3302                 printk(KERN_INFO "btrfs: force skipping balance\n");
3303                 return 0;
3304         }
3305
3306         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3307         return PTR_RET(tsk);
3308 }
3309
3310 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
3311 {
3312         struct btrfs_balance_control *bctl;
3313         struct btrfs_balance_item *item;
3314         struct btrfs_disk_balance_args disk_bargs;
3315         struct btrfs_path *path;
3316         struct extent_buffer *leaf;
3317         struct btrfs_key key;
3318         int ret;
3319
3320         path = btrfs_alloc_path();
3321         if (!path)
3322                 return -ENOMEM;
3323
3324         key.objectid = BTRFS_BALANCE_OBJECTID;
3325         key.type = BTRFS_BALANCE_ITEM_KEY;
3326         key.offset = 0;
3327
3328         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
3329         if (ret < 0)
3330                 goto out;
3331         if (ret > 0) { /* ret = -ENOENT; */
3332                 ret = 0;
3333                 goto out;
3334         }
3335
3336         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3337         if (!bctl) {
3338                 ret = -ENOMEM;
3339                 goto out;
3340         }
3341
3342         leaf = path->nodes[0];
3343         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3344
3345         bctl->fs_info = fs_info;
3346         bctl->flags = btrfs_balance_flags(leaf, item);
3347         bctl->flags |= BTRFS_BALANCE_RESUME;
3348
3349         btrfs_balance_data(leaf, item, &disk_bargs);
3350         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
3351         btrfs_balance_meta(leaf, item, &disk_bargs);
3352         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
3353         btrfs_balance_sys(leaf, item, &disk_bargs);
3354         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
3355
3356         WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
3357
3358         mutex_lock(&fs_info->volume_mutex);
3359         mutex_lock(&fs_info->balance_mutex);
3360
3361         set_balance_control(bctl);
3362
3363         mutex_unlock(&fs_info->balance_mutex);
3364         mutex_unlock(&fs_info->volume_mutex);
3365 out:
3366         btrfs_free_path(path);
3367         return ret;
3368 }
3369
3370 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
3371 {
3372         int ret = 0;
3373
3374         mutex_lock(&fs_info->balance_mutex);
3375         if (!fs_info->balance_ctl) {
3376                 mutex_unlock(&fs_info->balance_mutex);
3377                 return -ENOTCONN;
3378         }
3379
3380         if (atomic_read(&fs_info->balance_running)) {
3381                 atomic_inc(&fs_info->balance_pause_req);
3382                 mutex_unlock(&fs_info->balance_mutex);
3383
3384                 wait_event(fs_info->balance_wait_q,
3385                            atomic_read(&fs_info->balance_running) == 0);
3386
3387                 mutex_lock(&fs_info->balance_mutex);
3388                 /* we are good with balance_ctl ripped off from under us */
3389                 BUG_ON(atomic_read(&fs_info->balance_running));
3390                 atomic_dec(&fs_info->balance_pause_req);
3391         } else {
3392                 ret = -ENOTCONN;
3393         }
3394
3395         mutex_unlock(&fs_info->balance_mutex);
3396         return ret;
3397 }
3398
3399 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
3400 {
3401         mutex_lock(&fs_info->balance_mutex);
3402         if (!fs_info->balance_ctl) {
3403                 mutex_unlock(&fs_info->balance_mutex);
3404                 return -ENOTCONN;
3405         }
3406
3407         atomic_inc(&fs_info->balance_cancel_req);
3408         /*
3409          * if we are running just wait and return, balance item is
3410          * deleted in btrfs_balance in this case
3411          */
3412         if (atomic_read(&fs_info->balance_running)) {
3413                 mutex_unlock(&fs_info->balance_mutex);
3414                 wait_event(fs_info->balance_wait_q,
3415                            atomic_read(&fs_info->balance_running) == 0);
3416                 mutex_lock(&fs_info->balance_mutex);
3417         } else {
3418                 /* __cancel_balance needs volume_mutex */
3419                 mutex_unlock(&fs_info->balance_mutex);
3420                 mutex_lock(&fs_info->volume_mutex);
3421                 mutex_lock(&fs_info->balance_mutex);
3422
3423                 if (fs_info->balance_ctl)
3424                         __cancel_balance(fs_info);
3425
3426                 mutex_unlock(&fs_info->volume_mutex);
3427         }
3428
3429         BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
3430         atomic_dec(&fs_info->balance_cancel_req);
3431         mutex_unlock(&fs_info->balance_mutex);
3432         return 0;
3433 }
3434
3435 /*
3436  * shrinking a device means finding all of the device extents past
3437  * the new size, and then following the back refs to the chunks.
3438  * The chunk relocation code actually frees the device extent
3439  */
3440 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3441 {
3442         struct btrfs_trans_handle *trans;
3443         struct btrfs_root *root = device->dev_root;
3444         struct btrfs_dev_extent *dev_extent = NULL;
3445         struct btrfs_path *path;
3446         u64 length;
3447         u64 chunk_tree;
3448         u64 chunk_objectid;
3449         u64 chunk_offset;
3450         int ret;
3451         int slot;
3452         int failed = 0;
3453         bool retried = false;
3454         struct extent_buffer *l;
3455         struct btrfs_key key;
3456         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3457         u64 old_total = btrfs_super_total_bytes(super_copy);
3458         u64 old_size = device->total_bytes;
3459         u64 diff = device->total_bytes - new_size;
3460
3461         if (device->is_tgtdev_for_dev_replace)
3462                 return -EINVAL;
3463
3464         path = btrfs_alloc_path();
3465         if (!path)
3466                 return -ENOMEM;
3467
3468         path->reada = 2;
3469
3470         lock_chunks(root);
3471
3472         device->total_bytes = new_size;
3473         if (device->writeable) {
3474                 device->fs_devices->total_rw_bytes -= diff;
3475                 spin_lock(&root->fs_info->free_chunk_lock);
3476                 root->fs_info->free_chunk_space -= diff;
3477                 spin_unlock(&root->fs_info->free_chunk_lock);
3478         }
3479         unlock_chunks(root);
3480
3481 again:
3482         key.objectid = device->devid;
3483         key.offset = (u64)-1;
3484         key.type = BTRFS_DEV_EXTENT_KEY;
3485
3486         do {
3487                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3488                 if (ret < 0)
3489                         goto done;
3490
3491                 ret = btrfs_previous_item(root, path, 0, key.type);
3492                 if (ret < 0)
3493                         goto done;
3494                 if (ret) {
3495                         ret = 0;
3496                         btrfs_release_path(path);
3497                         break;
3498                 }
3499
3500                 l = path->nodes[0];
3501                 slot = path->slots[0];
3502                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
3503
3504                 if (key.objectid != device->devid) {
3505                         btrfs_release_path(path);
3506                         break;
3507                 }
3508
3509                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3510                 length = btrfs_dev_extent_length(l, dev_extent);
3511
3512                 if (key.offset + length <= new_size) {
3513                         btrfs_release_path(path);
3514                         break;
3515                 }
3516
3517                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3518                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3519                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3520                 btrfs_release_path(path);
3521
3522                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
3523                                            chunk_offset);
3524                 if (ret && ret != -ENOSPC)
3525                         goto done;
3526                 if (ret == -ENOSPC)
3527                         failed++;
3528         } while (key.offset-- > 0);
3529
3530         if (failed && !retried) {
3531                 failed = 0;
3532                 retried = true;
3533                 goto again;
3534         } else if (failed && retried) {
3535                 ret = -ENOSPC;
3536                 lock_chunks(root);
3537
3538                 device->total_bytes = old_size;
3539                 if (device->writeable)
3540                         device->fs_devices->total_rw_bytes += diff;
3541                 spin_lock(&root->fs_info->free_chunk_lock);
3542                 root->fs_info->free_chunk_space += diff;
3543                 spin_unlock(&root->fs_info->free_chunk_lock);
3544                 unlock_chunks(root);
3545                 goto done;
3546         }
3547
3548         /* Shrinking succeeded, else we would be at "done". */
3549         trans = btrfs_start_transaction(root, 0);
3550         if (IS_ERR(trans)) {
3551                 ret = PTR_ERR(trans);
3552                 goto done;
3553         }
3554
3555         lock_chunks(root);
3556
3557         device->disk_total_bytes = new_size;
3558         /* Now btrfs_update_device() will change the on-disk size. */
3559         ret = btrfs_update_device(trans, device);
3560         if (ret) {
3561                 unlock_chunks(root);
3562                 btrfs_end_transaction(trans, root);
3563                 goto done;
3564         }
3565         WARN_ON(diff > old_total);
3566         btrfs_set_super_total_bytes(super_copy, old_total - diff);
3567         unlock_chunks(root);
3568         btrfs_end_transaction(trans, root);
3569 done:
3570         btrfs_free_path(path);
3571         return ret;
3572 }
3573
3574 static int btrfs_add_system_chunk(struct btrfs_root *root,
3575                            struct btrfs_key *key,
3576                            struct btrfs_chunk *chunk, int item_size)
3577 {
3578         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3579         struct btrfs_disk_key disk_key;
3580         u32 array_size;
3581         u8 *ptr;
3582
3583         array_size = btrfs_super_sys_array_size(super_copy);
3584         if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
3585                 return -EFBIG;
3586
3587         ptr = super_copy->sys_chunk_array + array_size;
3588         btrfs_cpu_key_to_disk(&disk_key, key);
3589         memcpy(ptr, &disk_key, sizeof(disk_key));
3590         ptr += sizeof(disk_key);
3591         memcpy(ptr, chunk, item_size);
3592         item_size += sizeof(disk_key);
3593         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
3594         return 0;
3595 }
3596
3597 /*
3598  * sort the devices in descending order by max_avail, total_avail
3599  */
3600 static int btrfs_cmp_device_info(const void *a, const void *b)
3601 {
3602         const struct btrfs_device_info *di_a = a;
3603         const struct btrfs_device_info *di_b = b;
3604
3605         if (di_a->max_avail > di_b->max_avail)
3606                 return -1;
3607         if (di_a->max_avail < di_b->max_avail)
3608                 return 1;
3609         if (di_a->total_avail > di_b->total_avail)
3610                 return -1;
3611         if (di_a->total_avail < di_b->total_avail)
3612                 return 1;
3613         return 0;
3614 }
3615
3616 static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3617         [BTRFS_RAID_RAID10] = {
3618                 .sub_stripes    = 2,
3619                 .dev_stripes    = 1,
3620                 .devs_max       = 0,    /* 0 == as many as possible */
3621                 .devs_min       = 4,
3622                 .devs_increment = 2,
3623                 .ncopies        = 2,
3624         },
3625         [BTRFS_RAID_RAID1] = {
3626                 .sub_stripes    = 1,
3627                 .dev_stripes    = 1,
3628                 .devs_max       = 2,
3629                 .devs_min       = 2,
3630                 .devs_increment = 2,
3631                 .ncopies        = 2,
3632         },
3633         [BTRFS_RAID_DUP] = {
3634                 .sub_stripes    = 1,
3635                 .dev_stripes    = 2,
3636                 .devs_max       = 1,
3637                 .devs_min       = 1,
3638                 .devs_increment = 1,
3639                 .ncopies        = 2,
3640         },
3641         [BTRFS_RAID_RAID0] = {
3642                 .sub_stripes    = 1,
3643                 .dev_stripes    = 1,
3644                 .devs_max       = 0,
3645                 .devs_min       = 2,
3646                 .devs_increment = 1,
3647                 .ncopies        = 1,
3648         },
3649         [BTRFS_RAID_SINGLE] = {
3650                 .sub_stripes    = 1,
3651                 .dev_stripes    = 1,
3652                 .devs_max       = 1,
3653                 .devs_min       = 1,
3654                 .devs_increment = 1,
3655                 .ncopies        = 1,
3656         },
3657         [BTRFS_RAID_RAID5] = {
3658                 .sub_stripes    = 1,
3659                 .dev_stripes    = 1,
3660                 .devs_max       = 0,
3661                 .devs_min       = 2,
3662                 .devs_increment = 1,
3663                 .ncopies        = 2,
3664         },
3665         [BTRFS_RAID_RAID6] = {
3666                 .sub_stripes    = 1,
3667                 .dev_stripes    = 1,
3668                 .devs_max       = 0,
3669                 .devs_min       = 3,
3670                 .devs_increment = 1,
3671                 .ncopies        = 3,
3672         },
3673 };
3674
3675 static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
3676 {
3677         /* TODO allow them to set a preferred stripe size */
3678         return 64 * 1024;
3679 }
3680
3681 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3682 {
3683         if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
3684                 return;
3685
3686         btrfs_set_fs_incompat(info, RAID56);
3687 }
3688
3689 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3690                                struct btrfs_root *extent_root, u64 start,
3691                                u64 type)
3692 {
3693         struct btrfs_fs_info *info = extent_root->fs_info;
3694         struct btrfs_fs_devices *fs_devices = info->fs_devices;
3695         struct list_head *cur;
3696         struct map_lookup *map = NULL;
3697         struct extent_map_tree *em_tree;
3698         struct extent_map *em;
3699         struct btrfs_device_info *devices_info = NULL;
3700         u64 total_avail;
3701         int num_stripes;        /* total number of stripes to allocate */
3702         int data_stripes;       /* number of stripes that count for
3703                                    block group size */
3704         int sub_stripes;        /* sub_stripes info for map */
3705         int dev_stripes;        /* stripes per dev */
3706         int devs_max;           /* max devs to use */
3707         int devs_min;           /* min devs needed */
3708         int devs_increment;     /* ndevs has to be a multiple of this */
3709         int ncopies;            /* how many copies to data has */
3710         int ret;
3711         u64 max_stripe_size;
3712         u64 max_chunk_size;
3713         u64 stripe_size;
3714         u64 num_bytes;
3715         u64 raid_stripe_len = BTRFS_STRIPE_LEN;
3716         int ndevs;
3717         int i;
3718         int j;
3719         int index;
3720
3721         BUG_ON(!alloc_profile_is_valid(type, 0));
3722
3723         if (list_empty(&fs_devices->alloc_list))
3724                 return -ENOSPC;
3725
3726         index = __get_raid_index(type);
3727
3728         sub_stripes = btrfs_raid_array[index].sub_stripes;
3729         dev_stripes = btrfs_raid_array[index].dev_stripes;
3730         devs_max = btrfs_raid_array[index].devs_max;
3731         devs_min = btrfs_raid_array[index].devs_min;
3732         devs_increment = btrfs_raid_array[index].devs_increment;
3733         ncopies = btrfs_raid_array[index].ncopies;
3734
3735         if (type & BTRFS_BLOCK_GROUP_DATA) {
3736                 max_stripe_size = 1024 * 1024 * 1024;
3737                 max_chunk_size = 10 * max_stripe_size;
3738         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
3739                 /* for larger filesystems, use larger metadata chunks */
3740                 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3741                         max_stripe_size = 1024 * 1024 * 1024;
3742                 else
3743                         max_stripe_size = 256 * 1024 * 1024;
3744                 max_chunk_size = max_stripe_size;
3745         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
3746                 max_stripe_size = 32 * 1024 * 1024;
3747                 max_chunk_size = 2 * max_stripe_size;
3748         } else {
3749                 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
3750                        type);
3751                 BUG_ON(1);
3752         }
3753
3754         /* we don't want a chunk larger than 10% of writeable space */
3755         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
3756                              max_chunk_size);
3757
3758         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
3759                                GFP_NOFS);
3760         if (!devices_info)
3761                 return -ENOMEM;
3762
3763         cur = fs_devices->alloc_list.next;
3764
3765         /*
3766          * in the first pass through the devices list, we gather information
3767          * about the available holes on each device.
3768          */
3769         ndevs = 0;
3770         while (cur != &fs_devices->alloc_list) {
3771                 struct btrfs_device *device;
3772                 u64 max_avail;
3773                 u64 dev_offset;
3774
3775                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
3776
3777                 cur = cur->next;
3778
3779                 if (!device->writeable) {
3780                         WARN(1, KERN_ERR
3781                                "btrfs: read-only device in alloc_list\n");
3782                         continue;
3783                 }
3784
3785                 if (!device->in_fs_metadata ||
3786                     device->is_tgtdev_for_dev_replace)
3787                         continue;
3788
3789                 if (device->total_bytes > device->bytes_used)
3790                         total_avail = device->total_bytes - device->bytes_used;
3791                 else
3792                         total_avail = 0;
3793
3794                 /* If there is no space on this device, skip it. */
3795                 if (total_avail == 0)
3796                         continue;
3797
3798                 ret = find_free_dev_extent(trans, device,
3799                                            max_stripe_size * dev_stripes,
3800                                            &dev_offset, &max_avail);
3801                 if (ret && ret != -ENOSPC)
3802                         goto error;
3803
3804                 if (ret == 0)
3805                         max_avail = max_stripe_size * dev_stripes;
3806
3807                 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
3808                         continue;
3809
3810                 if (ndevs == fs_devices->rw_devices) {
3811                         WARN(1, "%s: found more than %llu devices\n",
3812                              __func__, fs_devices->rw_devices);
3813                         break;
3814                 }
3815                 devices_info[ndevs].dev_offset = dev_offset;
3816                 devices_info[ndevs].max_avail = max_avail;
3817                 devices_info[ndevs].total_avail = total_avail;
3818                 devices_info[ndevs].dev = device;
3819                 ++ndevs;
3820         }
3821
3822         /*
3823          * now sort the devices by hole size / available space
3824          */
3825         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
3826              btrfs_cmp_device_info, NULL);
3827
3828         /* round down to number of usable stripes */
3829         ndevs -= ndevs % devs_increment;
3830
3831         if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
3832                 ret = -ENOSPC;
3833                 goto error;
3834         }
3835
3836         if (devs_max && ndevs > devs_max)
3837                 ndevs = devs_max;
3838         /*
3839          * the primary goal is to maximize the number of stripes, so use as many
3840          * devices as possible, even if the stripes are not maximum sized.
3841          */
3842         stripe_size = devices_info[ndevs-1].max_avail;
3843         num_stripes = ndevs * dev_stripes;
3844
3845         /*
3846          * this will have to be fixed for RAID1 and RAID10 over
3847          * more drives
3848          */
3849         data_stripes = num_stripes / ncopies;
3850
3851         if (type & BTRFS_BLOCK_GROUP_RAID5) {
3852                 raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
3853                                  btrfs_super_stripesize(info->super_copy));
3854                 data_stripes = num_stripes - 1;
3855         }
3856         if (type & BTRFS_BLOCK_GROUP_RAID6) {
3857                 raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
3858                                  btrfs_super_stripesize(info->super_copy));
3859                 data_stripes = num_stripes - 2;
3860         }
3861
3862         /*
3863          * Use the number of data stripes to figure out how big this chunk
3864          * is really going to be in terms of logical address space,
3865          * and compare that answer with the max chunk size
3866          */
3867         if (stripe_size * data_stripes > max_chunk_size) {
3868                 u64 mask = (1ULL << 24) - 1;
3869                 stripe_size = max_chunk_size;
3870                 do_div(stripe_size, data_stripes);
3871
3872                 /* bump the answer up to a 16MB boundary */
3873                 stripe_size = (stripe_size + mask) & ~mask;
3874
3875                 /* but don't go higher than the limits we found
3876                  * while searching for free extents
3877                  */
3878                 if (stripe_size > devices_info[ndevs-1].max_avail)
3879                         stripe_size = devices_info[ndevs-1].max_avail;
3880         }
3881
3882         do_div(stripe_size, dev_stripes);
3883
3884         /* align to BTRFS_STRIPE_LEN */
3885         do_div(stripe_size, raid_stripe_len);
3886         stripe_size *= raid_stripe_len;
3887
3888         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3889         if (!map) {
3890                 ret = -ENOMEM;
3891                 goto error;
3892         }
3893         map->num_stripes = num_stripes;
3894
3895         for (i = 0; i < ndevs; ++i) {
3896                 for (j = 0; j < dev_stripes; ++j) {
3897                         int s = i * dev_stripes + j;
3898                         map->stripes[s].dev = devices_info[i].dev;
3899                         map->stripes[s].physical = devices_info[i].dev_offset +
3900                                                    j * stripe_size;
3901                 }
3902         }
3903         map->sector_size = extent_root->sectorsize;
3904         map->stripe_len = raid_stripe_len;
3905         map->io_align = raid_stripe_len;
3906         map->io_width = raid_stripe_len;
3907         map->type = type;
3908         map->sub_stripes = sub_stripes;
3909
3910         num_bytes = stripe_size * data_stripes;
3911
3912         trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3913
3914         em = alloc_extent_map();
3915         if (!em) {
3916                 ret = -ENOMEM;
3917                 goto error;
3918         }
3919         em->bdev = (struct block_device *)map;
3920         em->start = start;
3921         em->len = num_bytes;
3922         em->block_start = 0;
3923         em->block_len = em->len;
3924         em->orig_block_len = stripe_size;
3925
3926         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3927         write_lock(&em_tree->lock);
3928         ret = add_extent_mapping(em_tree, em, 0);
3929         if (!ret) {
3930                 list_add_tail(&em->list, &trans->transaction->pending_chunks);
3931                 atomic_inc(&em->refs);
3932         }
3933         write_unlock(&em_tree->lock);
3934         if (ret) {
3935                 free_extent_map(em);
3936                 goto error;
3937         }
3938
3939         ret = btrfs_make_block_group(trans, extent_root, 0, type,
3940                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3941                                      start, num_bytes);
3942         if (ret)
3943                 goto error_del_extent;
3944
3945         free_extent_map(em);
3946         check_raid56_incompat_flag(extent_root->fs_info, type);
3947
3948         kfree(devices_info);
3949         return 0;
3950
3951 error_del_extent:
3952         write_lock(&em_tree->lock);
3953         remove_extent_mapping(em_tree, em);
3954         write_unlock(&em_tree->lock);
3955
3956         /* One for our allocation */
3957         free_extent_map(em);
3958         /* One for the tree reference */
3959         free_extent_map(em);
3960 error:
3961         kfree(map);
3962         kfree(devices_info);
3963         return ret;
3964 }
3965
3966 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
3967                                 struct btrfs_root *extent_root,
3968                                 u64 chunk_offset, u64 chunk_size)
3969 {
3970         struct btrfs_key key;
3971         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3972         struct btrfs_device *device;
3973         struct btrfs_chunk *chunk;
3974         struct btrfs_stripe *stripe;
3975         struct extent_map_tree *em_tree;
3976         struct extent_map *em;
3977         struct map_lookup *map;
3978         size_t item_size;
3979         u64 dev_offset;
3980         u64 stripe_size;
3981         int i = 0;
3982         int ret;
3983
3984         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3985         read_lock(&em_tree->lock);
3986         em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
3987         read_unlock(&em_tree->lock);
3988
3989         if (!em) {
3990                 btrfs_crit(extent_root->fs_info, "unable to find logical "
3991                            "%Lu len %Lu", chunk_offset, chunk_size);
3992                 return -EINVAL;
3993         }
3994
3995         if (em->start != chunk_offset || em->len != chunk_size) {
3996                 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
3997                           " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
3998                           chunk_size, em->start, em->len);
3999                 free_extent_map(em);
4000                 return -EINVAL;
4001         }
4002
4003         map = (struct map_lookup *)em->bdev;
4004         item_size = btrfs_chunk_item_size(map->num_stripes);
4005         stripe_size = em->orig_block_len;
4006
4007         chunk = kzalloc(item_size, GFP_NOFS);
4008         if (!chunk) {
4009                 ret = -ENOMEM;
4010                 goto out;
4011         }
4012
4013         for (i = 0; i < map->num_stripes; i++) {
4014                 device = map->stripes[i].dev;
4015                 dev_offset = map->stripes[i].physical;
4016
4017                 device->bytes_used += stripe_size;
4018                 ret = btrfs_update_device(trans, device);
4019                 if (ret)
4020                         goto out;
4021                 ret = btrfs_alloc_dev_extent(trans, device,
4022                                              chunk_root->root_key.objectid,
4023                                              BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4024                                              chunk_offset, dev_offset,
4025                                              stripe_size);
4026                 if (ret)
4027                         goto out;
4028         }
4029
4030         spin_lock(&extent_root->fs_info->free_chunk_lock);
4031         extent_root->fs_info->free_chunk_space -= (stripe_size *
4032                                                    map->num_stripes);
4033         spin_unlock(&extent_root->fs_info->free_chunk_lock);
4034
4035         stripe = &chunk->stripe;
4036         for (i = 0; i < map->num_stripes; i++) {
4037                 device = map->stripes[i].dev;
4038                 dev_offset = map->stripes[i].physical;
4039
4040                 btrfs_set_stack_stripe_devid(stripe, device->devid);
4041                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
4042                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
4043                 stripe++;
4044         }
4045
4046         btrfs_set_stack_chunk_length(chunk, chunk_size);
4047         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
4048         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
4049         btrfs_set_stack_chunk_type(chunk, map->type);
4050         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
4051         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
4052         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
4053         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
4054         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
4055
4056         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
4057         key.type = BTRFS_CHUNK_ITEM_KEY;
4058         key.offset = chunk_offset;
4059
4060         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4061         if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4062                 /*
4063                  * TODO: Cleanup of inserted chunk root in case of
4064                  * failure.
4065                  */
4066                 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
4067                                              item_size);
4068         }
4069
4070 out:
4071         kfree(chunk);
4072         free_extent_map(em);
4073         return ret;
4074 }
4075
4076 /*
4077  * Chunk allocation falls into two parts. The first part does works
4078  * that make the new allocated chunk useable, but not do any operation
4079  * that modifies the chunk tree. The second part does the works that
4080  * require modifying the chunk tree. This division is important for the
4081  * bootstrap process of adding storage to a seed btrfs.
4082  */
4083 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4084                       struct btrfs_root *extent_root, u64 type)
4085 {
4086         u64 chunk_offset;
4087
4088         chunk_offset = find_next_chunk(extent_root->fs_info);
4089         return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
4090 }
4091
4092 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4093                                          struct btrfs_root *root,
4094                                          struct btrfs_device *device)
4095 {
4096         u64 chunk_offset;
4097         u64 sys_chunk_offset;
4098         u64 alloc_profile;
4099         struct btrfs_fs_info *fs_info = root->fs_info;
4100         struct btrfs_root *extent_root = fs_info->extent_root;
4101         int ret;
4102
4103         chunk_offset = find_next_chunk(fs_info);
4104         alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
4105         ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
4106                                   alloc_profile);
4107         if (ret)
4108                 return ret;
4109
4110         sys_chunk_offset = find_next_chunk(root->fs_info);
4111         alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4112         ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4113                                   alloc_profile);
4114         if (ret) {
4115                 btrfs_abort_transaction(trans, root, ret);
4116                 goto out;
4117         }
4118
4119         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
4120         if (ret)
4121                 btrfs_abort_transaction(trans, root, ret);
4122 out:
4123         return ret;
4124 }
4125
4126 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4127 {
4128         struct extent_map *em;
4129         struct map_lookup *map;
4130         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4131         int readonly = 0;
4132         int i;
4133
4134         read_lock(&map_tree->map_tree.lock);
4135         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
4136         read_unlock(&map_tree->map_tree.lock);
4137         if (!em)
4138                 return 1;
4139
4140         if (btrfs_test_opt(root, DEGRADED)) {
4141                 free_extent_map(em);
4142                 return 0;
4143         }
4144
4145         map = (struct map_lookup *)em->bdev;
4146         for (i = 0; i < map->num_stripes; i++) {
4147                 if (!map->stripes[i].dev->writeable) {
4148                         readonly = 1;
4149                         break;
4150                 }
4151         }
4152         free_extent_map(em);
4153         return readonly;
4154 }
4155
4156 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
4157 {
4158         extent_map_tree_init(&tree->map_tree);
4159 }
4160
4161 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
4162 {
4163         struct extent_map *em;
4164
4165         while (1) {
4166                 write_lock(&tree->map_tree.lock);
4167                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
4168                 if (em)
4169                         remove_extent_mapping(&tree->map_tree, em);
4170                 write_unlock(&tree->map_tree.lock);
4171                 if (!em)
4172                         break;
4173                 kfree(em->bdev);
4174                 /* once for us */
4175                 free_extent_map(em);
4176                 /* once for the tree */
4177                 free_extent_map(em);
4178         }
4179 }
4180
4181 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4182 {
4183         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
4184         struct extent_map *em;
4185         struct map_lookup *map;
4186         struct extent_map_tree *em_tree = &map_tree->map_tree;
4187         int ret;
4188
4189         read_lock(&em_tree->lock);
4190         em = lookup_extent_mapping(em_tree, logical, len);
4191         read_unlock(&em_tree->lock);
4192
4193         /*
4194          * We could return errors for these cases, but that could get ugly and
4195          * we'd probably do the same thing which is just not do anything else
4196          * and exit, so return 1 so the callers don't try to use other copies.
4197          */
4198         if (!em) {
4199                 btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical,
4200                             logical+len);
4201                 return 1;
4202         }
4203
4204         if (em->start > logical || em->start + em->len < logical) {
4205                 btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
4206                             "%Lu-%Lu\n", logical, logical+len, em->start,
4207                             em->start + em->len);
4208                 return 1;
4209         }
4210
4211         map = (struct map_lookup *)em->bdev;
4212         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
4213                 ret = map->num_stripes;
4214         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4215                 ret = map->sub_stripes;
4216         else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
4217                 ret = 2;
4218         else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4219                 ret = 3;
4220         else
4221                 ret = 1;
4222         free_extent_map(em);
4223
4224         btrfs_dev_replace_lock(&fs_info->dev_replace);
4225         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
4226                 ret++;
4227         btrfs_dev_replace_unlock(&fs_info->dev_replace);
4228
4229         return ret;
4230 }
4231
4232 unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4233                                     struct btrfs_mapping_tree *map_tree,
4234                                     u64 logical)
4235 {
4236         struct extent_map *em;
4237         struct map_lookup *map;
4238         struct extent_map_tree *em_tree = &map_tree->map_tree;
4239         unsigned long len = root->sectorsize;
4240
4241         read_lock(&em_tree->lock);
4242         em = lookup_extent_mapping(em_tree, logical, len);
4243         read_unlock(&em_tree->lock);
4244         BUG_ON(!em);
4245
4246         BUG_ON(em->start > logical || em->start + em->len < logical);
4247         map = (struct map_lookup *)em->bdev;
4248         if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4249                          BTRFS_BLOCK_GROUP_RAID6)) {
4250                 len = map->stripe_len * nr_data_stripes(map);
4251         }
4252         free_extent_map(em);
4253         return len;
4254 }
4255
4256 int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4257                            u64 logical, u64 len, int mirror_num)
4258 {
4259         struct extent_map *em;
4260         struct map_lookup *map;
4261         struct extent_map_tree *em_tree = &map_tree->map_tree;
4262         int ret = 0;
4263
4264         read_lock(&em_tree->lock);
4265         em = lookup_extent_mapping(em_tree, logical, len);
4266         read_unlock(&em_tree->lock);
4267         BUG_ON(!em);
4268
4269         BUG_ON(em->start > logical || em->start + em->len < logical);
4270         map = (struct map_lookup *)em->bdev;
4271         if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4272                          BTRFS_BLOCK_GROUP_RAID6))
4273                 ret = 1;
4274         free_extent_map(em);
4275         return ret;
4276 }
4277
4278 static int find_live_mirror(struct btrfs_fs_info *fs_info,
4279                             struct map_lookup *map, int first, int num,
4280                             int optimal, int dev_replace_is_ongoing)
4281 {
4282         int i;
4283         int tolerance;
4284         struct btrfs_device *srcdev;
4285
4286         if (dev_replace_is_ongoing &&
4287             fs_info->dev_replace.cont_reading_from_srcdev_mode ==
4288              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
4289                 srcdev = fs_info->dev_replace.srcdev;
4290         else
4291                 srcdev = NULL;
4292
4293         /*
4294          * try to avoid the drive that is the source drive for a
4295          * dev-replace procedure, only choose it if no other non-missing
4296          * mirror is available
4297          */
4298         for (tolerance = 0; tolerance < 2; tolerance++) {
4299                 if (map->stripes[optimal].dev->bdev &&
4300                     (tolerance || map->stripes[optimal].dev != srcdev))
4301                         return optimal;
4302                 for (i = first; i < first + num; i++) {
4303                         if (map->stripes[i].dev->bdev &&
4304                             (tolerance || map->stripes[i].dev != srcdev))
4305                                 return i;
4306                 }
4307         }
4308
4309         /* we couldn't find one that doesn't fail.  Just return something
4310          * and the io error handling code will clean up eventually
4311          */
4312         return optimal;
4313 }
4314
/* Ordering predicate for the stripe sort: 1 when @a is greater than @b, else 0. */
static inline int parity_smaller(u64 a, u64 b)
{
        return (b < a) ? 1 : 0;
}
4319
4320 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4321 static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4322 {
4323         struct btrfs_bio_stripe s;
4324         int i;
4325         u64 l;
4326         int again = 1;
4327
4328         while (again) {
4329                 again = 0;
4330                 for (i = 0; i < bbio->num_stripes - 1; i++) {
4331                         if (parity_smaller(raid_map[i], raid_map[i+1])) {
4332                                 s = bbio->stripes[i];
4333                                 l = raid_map[i];
4334                                 bbio->stripes[i] = bbio->stripes[i+1];
4335                                 raid_map[i] = raid_map[i+1];
4336                                 bbio->stripes[i+1] = s;
4337                                 raid_map[i+1] = l;
4338                                 again = 1;
4339                         }
4340                 }
4341         }
4342 }
4343
4344 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4345                              u64 logical, u64 *length,
4346                              struct btrfs_bio **bbio_ret,
4347                              int mirror_num, u64 **raid_map_ret)
4348 {
4349         struct extent_map *em;
4350         struct map_lookup *map;
4351         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
4352         struct extent_map_tree *em_tree = &map_tree->map_tree;
4353         u64 offset;
4354         u64 stripe_offset;
4355         u64 stripe_end_offset;
4356         u64 stripe_nr;
4357         u64 stripe_nr_orig;
4358         u64 stripe_nr_end;
4359         u64 stripe_len;
4360         u64 *raid_map = NULL;
4361         int stripe_index;
4362         int i;
4363         int ret = 0;
4364         int num_stripes;
4365         int max_errors = 0;
4366         struct btrfs_bio *bbio = NULL;
4367         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4368         int dev_replace_is_ongoing = 0;
4369         int num_alloc_stripes;
4370         int patch_the_first_stripe_for_dev_replace = 0;
4371         u64 physical_to_patch_in_first_stripe = 0;
4372         u64 raid56_full_stripe_start = (u64)-1;
4373
4374         read_lock(&em_tree->lock);
4375         em = lookup_extent_mapping(em_tree, logical, *length);
4376         read_unlock(&em_tree->lock);
4377
4378         if (!em) {
4379                 btrfs_crit(fs_info, "unable to find logical %llu len %llu",
4380                         (unsigned long long)logical,
4381                         (unsigned long long)*length);
4382                 return -EINVAL;
4383         }
4384
4385         if (em->start > logical || em->start + em->len < logical) {
4386                 btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
4387                            "found %Lu-%Lu\n", logical, em->start,
4388                            em->start + em->len);
4389                 return -EINVAL;
4390         }
4391
4392         map = (struct map_lookup *)em->bdev;
4393         offset = logical - em->start;
4394
4395         stripe_len = map->stripe_len;
4396         stripe_nr = offset;
4397         /*
4398          * stripe_nr counts the total number of stripes we have to stride
4399          * to get to this block
4400          */
4401         do_div(stripe_nr, stripe_len);
4402
4403         stripe_offset = stripe_nr * stripe_len;
4404         BUG_ON(offset < stripe_offset);
4405
4406         /* stripe_offset is the offset of this block in its stripe*/
4407         stripe_offset = offset - stripe_offset;
4408
4409         /* if we're here for raid56, we need to know the stripe aligned start */
4410         if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4411                 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
4412                 raid56_full_stripe_start = offset;
4413
4414                 /* allow a write of a full stripe, but make sure we don't
4415                  * allow straddling of stripes
4416                  */
4417                 do_div(raid56_full_stripe_start, full_stripe_len);
4418                 raid56_full_stripe_start *= full_stripe_len;
4419         }
4420
4421         if (rw & REQ_DISCARD) {
4422                 /* we don't discard raid56 yet */
4423                 if (map->type &
4424                     (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4425                         ret = -EOPNOTSUPP;
4426                         goto out;
4427                 }
4428                 *length = min_t(u64, em->len - offset, *length);
4429         } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
4430                 u64 max_len;
4431                 /* For writes to RAID[56], allow a full stripeset across all disks.
4432                    For other RAID types and for RAID[56] reads, just allow a single
4433                    stripe (on a single disk). */
4434                 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
4435                     (rw & REQ_WRITE)) {
4436                         max_len = stripe_len * nr_data_stripes(map) -
4437                                 (offset - raid56_full_stripe_start);
4438                 } else {
4439                         /* we limit the length of each bio to what fits in a stripe */
4440                         max_len = stripe_len - stripe_offset;
4441                 }
4442                 *length = min_t(u64, em->len - offset, max_len);
4443         } else {
4444                 *length = em->len - offset;
4445         }
4446
4447         /* This is for when we're called from btrfs_merge_bio_hook() and all
4448            it cares about is the length */
4449         if (!bbio_ret)
4450                 goto out;
4451
4452         btrfs_dev_replace_lock(dev_replace);
4453         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
4454         if (!dev_replace_is_ongoing)
4455                 btrfs_dev_replace_unlock(dev_replace);
4456
4457         if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
4458             !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
4459             dev_replace->tgtdev != NULL) {
4460                 /*
4461                  * in dev-replace case, for repair case (that's the only
4462                  * case where the mirror is selected explicitly when
4463                  * calling btrfs_map_block), blocks left of the left cursor
4464                  * can also be read from the target drive.
4465                  * For REQ_GET_READ_MIRRORS, the target drive is added as
4466                  * the last one to the array of stripes. For READ, it also
4467                  * needs to be supported using the same mirror number.
4468                  * If the requested block is not left of the left cursor,
4469                  * EIO is returned. This can happen because btrfs_num_copies()
4470                  * returns one more in the dev-replace case.
4471                  */
4472                 u64 tmp_length = *length;
4473                 struct btrfs_bio *tmp_bbio = NULL;
4474                 int tmp_num_stripes;
4475                 u64 srcdev_devid = dev_replace->srcdev->devid;
4476                 int index_srcdev = 0;
4477                 int found = 0;
4478                 u64 physical_of_found = 0;
4479
4480                 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4481                              logical, &tmp_length, &tmp_bbio, 0, NULL);
4482                 if (ret) {
4483                         WARN_ON(tmp_bbio != NULL);
4484                         goto out;
4485                 }
4486
4487                 tmp_num_stripes = tmp_bbio->num_stripes;
4488                 if (mirror_num > tmp_num_stripes) {
4489                         /*
4490                          * REQ_GET_READ_MIRRORS does not contain this
4491                          * mirror, that means that the requested area
4492                          * is not left of the left cursor
4493                          */
4494                         ret = -EIO;
4495                         kfree(tmp_bbio);
4496                         goto out;
4497                 }
4498
4499                 /*
4500                  * process the rest of the function using the mirror_num
4501                  * of the source drive. Therefore look it up first.
4502                  * At the end, patch the device pointer to the one of the
4503                  * target drive.
4504                  */
4505                 for (i = 0; i < tmp_num_stripes; i++) {
4506                         if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
4507                                 /*
4508                                  * In case of DUP, in order to keep it
4509                                  * simple, only add the mirror with the
4510                                  * lowest physical address
4511                                  */
4512                                 if (found &&
4513                                     physical_of_found <=
4514                                      tmp_bbio->stripes[i].physical)
4515                                         continue;
4516                                 index_srcdev = i;
4517                                 found = 1;
4518                                 physical_of_found =
4519                                         tmp_bbio->stripes[i].physical;
4520                         }
4521                 }
4522
4523                 if (found) {
4524                         mirror_num = index_srcdev + 1;
4525                         patch_the_first_stripe_for_dev_replace = 1;
4526                         physical_to_patch_in_first_stripe = physical_of_found;
4527                 } else {
4528                         WARN_ON(1);
4529                         ret = -EIO;
4530                         kfree(tmp_bbio);
4531                         goto out;
4532                 }
4533
4534                 kfree(tmp_bbio);
4535         } else if (mirror_num > map->num_stripes) {
4536                 mirror_num = 0;
4537         }
4538
4539         num_stripes = 1;
4540         stripe_index = 0;
4541         stripe_nr_orig = stripe_nr;
4542         stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
4543         do_div(stripe_nr_end, map->stripe_len);
4544         stripe_end_offset = stripe_nr_end * map->stripe_len -
4545                             (offset + *length);
4546
4547         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4548                 if (rw & REQ_DISCARD)
4549                         num_stripes = min_t(u64, map->num_stripes,
4550                                             stripe_nr_end - stripe_nr_orig);
4551                 stripe_index = do_div(stripe_nr, map->num_stripes);
4552         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
4553                 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
4554                         num_stripes = map->num_stripes;
4555                 else if (mirror_num)
4556                         stripe_index = mirror_num - 1;
4557                 else {
4558                         stripe_index = find_live_mirror(fs_info, map, 0,
4559                                             map->num_stripes,
4560                                             current->pid % map->num_stripes,
4561                                             dev_replace_is_ongoing);
4562                         mirror_num = stripe_index + 1;
4563                 }
4564
4565         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
4566                 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
4567                         num_stripes = map->num_stripes;
4568                 } else if (mirror_num) {
4569                         stripe_index = mirror_num - 1;
4570                 } else {
4571                         mirror_num = 1;
4572                 }
4573
4574         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
4575                 int factor = map->num_stripes / map->sub_stripes;
4576
4577                 stripe_index = do_div(stripe_nr, factor);
4578                 stripe_index *= map->sub_stripes;
4579
4580                 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
4581                         num_stripes = map->sub_stripes;
4582                 else if (rw & REQ_DISCARD)
4583                         num_stripes = min_t(u64, map->sub_stripes *
4584                                             (stripe_nr_end - stripe_nr_orig),
4585                                             map->num_stripes);
4586                 else if (mirror_num)
4587                         stripe_index += mirror_num - 1;
4588                 else {
4589                         int old_stripe_index = stripe_index;
4590                         stripe_index = find_live_mirror(fs_info, map,
4591                                               stripe_index,
4592                                               map->sub_stripes, stripe_index +
4593                                               current->pid % map->sub_stripes,
4594                                               dev_replace_is_ongoing);
4595                         mirror_num = stripe_index - old_stripe_index + 1;
4596                 }
4597
4598         } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4599                                 BTRFS_BLOCK_GROUP_RAID6)) {
4600                 u64 tmp;
4601
4602                 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
4603                     && raid_map_ret) {
4604                         int i, rot;
4605
4606                         /* push stripe_nr back to the start of the full stripe */
4607                         stripe_nr = raid56_full_stripe_start;
4608                         do_div(stripe_nr, stripe_len);
4609
4610                         stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4611
4612                         /* RAID[56] write or recovery. Return all stripes */
4613                         num_stripes = map->num_stripes;
4614                         max_errors = nr_parity_stripes(map);
4615
4616                         raid_map = kmalloc(sizeof(u64) * num_stripes,
4617                                            GFP_NOFS);
4618                         if (!raid_map) {
4619                                 ret = -ENOMEM;
4620                                 goto out;
4621                         }
4622
4623                         /* Work out the disk rotation on this stripe-set */
4624                         tmp = stripe_nr;
4625                         rot = do_div(tmp, num_stripes);
4626
4627                         /* Fill in the logical address of each stripe */
4628                         tmp = stripe_nr * nr_data_stripes(map);
4629                         for (i = 0; i < nr_data_stripes(map); i++)
4630                                 raid_map[(i+rot) % num_stripes] =
4631                                         em->start + (tmp + i) * map->stripe_len;
4632
4633                         raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
4634                         if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4635                                 raid_map[(i+rot+1) % num_stripes] =
4636                                         RAID6_Q_STRIPE;
4637
4638                         *length = map->stripe_len;
4639                         stripe_index = 0;
4640                         stripe_offset = 0;
4641                 } else {
4642                         /*
4643                          * Mirror #0 or #1 means the original data block.
4644                          * Mirror #2 is RAID5 parity block.
4645                          * Mirror #3 is RAID6 Q block.
4646                          */
4647                         stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4648                         if (mirror_num > 1)
4649                                 stripe_index = nr_data_stripes(map) +
4650                                                 mirror_num - 2;
4651
4652                         /* We distribute the parity blocks across stripes */
4653                         tmp = stripe_nr + stripe_index;
4654                         stripe_index = do_div(tmp, map->num_stripes);
4655                 }
4656         } else {
4657                 /*
4658                  * after this do_div call, stripe_nr is the number of stripes
4659                  * on this device we have to walk to find the data, and
4660                  * stripe_index is the number of our device in the stripe array
4661                  */
4662                 stripe_index = do_div(stripe_nr, map->num_stripes);
4663                 mirror_num = stripe_index + 1;
4664         }
4665         BUG_ON(stripe_index >= map->num_stripes);
4666
4667         num_alloc_stripes = num_stripes;
4668         if (dev_replace_is_ongoing) {
4669                 if (rw & (REQ_WRITE | REQ_DISCARD))
4670                         num_alloc_stripes <<= 1;
4671                 if (rw & REQ_GET_READ_MIRRORS)
4672                         num_alloc_stripes++;
4673         }
4674         bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
4675         if (!bbio) {
4676                 ret = -ENOMEM;
4677                 goto out;
4678         }
4679         atomic_set(&bbio->error, 0);
4680
4681         if (rw & REQ_DISCARD) {
4682                 int factor = 0;
4683                 int sub_stripes = 0;
4684                 u64 stripes_per_dev = 0;
4685                 u32 remaining_stripes = 0;
4686                 u32 last_stripe = 0;
4687
4688                 if (map->type &
4689                     (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
4690                         if (map->type & BTRFS_BLOCK_GROUP_RAID0)
4691                                 sub_stripes = 1;
4692                         else
4693                                 sub_stripes = map->sub_stripes;
4694
4695                         factor = map->num_stripes / sub_stripes;
4696                         stripes_per_dev = div_u64_rem(stripe_nr_end -
4697                                                       stripe_nr_orig,
4698                                                       factor,
4699                                                       &remaining_stripes);
4700                         div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
4701                         last_stripe *= sub_stripes;
4702                 }
4703
4704                 for (i = 0; i < num_stripes; i++) {
4705                         bbio->stripes[i].physical =
4706                                 map->stripes[stripe_index].physical +
4707                                 stripe_offset + stripe_nr * map->stripe_len;
4708                         bbio->stripes[i].dev = map->stripes[stripe_index].dev;
4709
4710                         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
4711                                          BTRFS_BLOCK_GROUP_RAID10)) {
4712                                 bbio->stripes[i].length = stripes_per_dev *
4713                                                           map->stripe_len;
4714
4715                                 if (i / sub_stripes < remaining_stripes)
4716                                         bbio->stripes[i].length +=
4717                                                 map->stripe_len;
4718
4719                                 /*
4720                                  * Special for the first stripe and
4721                                  * the last stripe:
4722                                  *
4723                                  * |-------|...|-------|
4724                                  *     |----------|
4725                                  *    off     end_off
4726                                  */
4727                                 if (i < sub_stripes)
4728                                         bbio->stripes[i].length -=
4729                                                 stripe_offset;
4730
4731                                 if (stripe_index >= last_stripe &&
4732                                     stripe_index <= (last_stripe +
4733                                                      sub_stripes - 1))
4734                                         bbio->stripes[i].length -=
4735                                                 stripe_end_offset;
4736
4737                                 if (i == sub_stripes - 1)
4738                                         stripe_offset = 0;
4739                         } else
4740                                 bbio->stripes[i].length = *length;
4741
4742                         stripe_index++;
4743                         if (stripe_index == map->num_stripes) {
4744                                 /* This could only happen for RAID0/10 */
4745                                 stripe_index = 0;
4746                                 stripe_nr++;
4747                         }
4748                 }
4749         } else {
4750                 for (i = 0; i < num_stripes; i++) {
4751                         bbio->stripes[i].physical =
4752                                 map->stripes[stripe_index].physical +
4753                                 stripe_offset +
4754                                 stripe_nr * map->stripe_len;
4755                         bbio->stripes[i].dev =
4756                                 map->stripes[stripe_index].dev;
4757                         stripe_index++;
4758                 }
4759         }
4760
4761         if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4762                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4763                                  BTRFS_BLOCK_GROUP_RAID10 |
4764                                  BTRFS_BLOCK_GROUP_RAID5 |
4765                                  BTRFS_BLOCK_GROUP_DUP)) {
4766                         max_errors = 1;
4767                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4768                         max_errors = 2;
4769                 }
4770         }
4771
4772         if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
4773             dev_replace->tgtdev != NULL) {
4774                 int index_where_to_add;
4775                 u64 srcdev_devid = dev_replace->srcdev->devid;
4776
4777                 /*
4778                  * duplicate the write operations while the dev replace
4779                  * procedure is running. Since the copying of the old disk
4780                  * to the new disk takes place at run time while the
4781                  * filesystem is mounted writable, the regular write
4782                  * operations to the old disk have to be duplicated to go
4783                  * to the new disk as well.
4784                  * Note that device->missing is handled by the caller, and
4785                  * that the write to the old disk is already set up in the
4786                  * stripes array.
4787                  */
4788                 index_where_to_add = num_stripes;
4789                 for (i = 0; i < num_stripes; i++) {
4790                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
4791                                 /* write to new disk, too */
4792                                 struct btrfs_bio_stripe *new =
4793                                         bbio->stripes + index_where_to_add;
4794                                 struct btrfs_bio_stripe *old =
4795                                         bbio->stripes + i;
4796
4797                                 new->physical = old->physical;
4798                                 new->length = old->length;
4799                                 new->dev = dev_replace->tgtdev;
4800                                 index_where_to_add++;
4801                                 max_errors++;
4802                         }
4803                 }
4804                 num_stripes = index_where_to_add;
4805         } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
4806                    dev_replace->tgtdev != NULL) {
4807                 u64 srcdev_devid = dev_replace->srcdev->devid;
4808                 int index_srcdev = 0;
4809                 int found = 0;
4810                 u64 physical_of_found = 0;
4811
4812                 /*
4813                  * During the dev-replace procedure, the target drive can
4814                  * also be used to read data in case it is needed to repair
4815                  * a corrupt block elsewhere. This is possible if the
4816                  * requested area is left of the left cursor. In this area,
4817                  * the target drive is a full copy of the source drive.
4818                  */
4819                 for (i = 0; i < num_stripes; i++) {
4820                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
4821                                 /*
4822                                  * In case of DUP, in order to keep it
4823                                  * simple, only add the mirror with the
4824                                  * lowest physical address
4825                                  */
4826                                 if (found &&
4827                                     physical_of_found <=
4828                                      bbio->stripes[i].physical)
4829                                         continue;
4830                                 index_srcdev = i;
4831                                 found = 1;
4832                                 physical_of_found = bbio->stripes[i].physical;
4833                         }
4834                 }
4835                 if (found) {
4836                         u64 length = map->stripe_len;
4837
4838                         if (physical_of_found + length <=
4839                             dev_replace->cursor_left) {
4840                                 struct btrfs_bio_stripe *tgtdev_stripe =
4841                                         bbio->stripes + num_stripes;
4842
4843                                 tgtdev_stripe->physical = physical_of_found;
4844                                 tgtdev_stripe->length =
4845                                         bbio->stripes[index_srcdev].length;
4846                                 tgtdev_stripe->dev = dev_replace->tgtdev;
4847
4848                                 num_stripes++;
4849                         }
4850                 }
4851         }
4852
4853         *bbio_ret = bbio;
4854         bbio->num_stripes = num_stripes;
4855         bbio->max_errors = max_errors;
4856         bbio->mirror_num = mirror_num;
4857
4858         /*
4859          * this is the case that REQ_READ && dev_replace_is_ongoing &&
4860          * mirror_num == num_stripes + 1 && dev_replace target drive is
4861          * available as a mirror
4862          */
4863         if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
4864                 WARN_ON(num_stripes > 1);
4865                 bbio->stripes[0].dev = dev_replace->tgtdev;
4866                 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4867                 bbio->mirror_num = map->num_stripes + 1;
4868         }
4869         if (raid_map) {
4870                 sort_parity_stripes(bbio, raid_map);
4871                 *raid_map_ret = raid_map;
4872         }
4873 out:
4874         if (dev_replace_is_ongoing)
4875                 btrfs_dev_replace_unlock(dev_replace);
4876         free_extent_map(em);
4877         return ret;
4878 }
4879
4880 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4881                       u64 logical, u64 *length,
4882                       struct btrfs_bio **bbio_ret, int mirror_num)
4883 {
4884         return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4885                                  mirror_num, NULL);
4886 }
4887
4888 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4889                      u64 chunk_start, u64 physical, u64 devid,
4890                      u64 **logical, int *naddrs, int *stripe_len)
4891 {
4892         struct extent_map_tree *em_tree = &map_tree->map_tree;
4893         struct extent_map *em;
4894         struct map_lookup *map;
4895         u64 *buf;
4896         u64 bytenr;
4897         u64 length;
4898         u64 stripe_nr;
4899         u64 rmap_len;
4900         int i, j, nr = 0;
4901
4902         read_lock(&em_tree->lock);
4903         em = lookup_extent_mapping(em_tree, chunk_start, 1);
4904         read_unlock(&em_tree->lock);
4905
4906         if (!em) {
4907                 printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n",
4908                        chunk_start);
4909                 return -EIO;
4910         }
4911
4912         if (em->start != chunk_start) {
4913                 printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n",
4914                        em->start, chunk_start);
4915                 free_extent_map(em);
4916                 return -EIO;
4917         }
4918         map = (struct map_lookup *)em->bdev;
4919
4920         length = em->len;
4921         rmap_len = map->stripe_len;
4922
4923         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4924                 do_div(length, map->num_stripes / map->sub_stripes);
4925         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
4926                 do_div(length, map->num_stripes);
4927         else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4928                               BTRFS_BLOCK_GROUP_RAID6)) {
4929                 do_div(length, nr_data_stripes(map));
4930                 rmap_len = map->stripe_len * nr_data_stripes(map);
4931         }
4932
4933         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
4934         BUG_ON(!buf); /* -ENOMEM */
4935
4936         for (i = 0; i < map->num_stripes; i++) {
4937                 if (devid && map->stripes[i].dev->devid != devid)
4938                         continue;
4939                 if (map->stripes[i].physical > physical ||
4940                     map->stripes[i].physical + length <= physical)
4941                         continue;
4942
4943                 stripe_nr = physical - map->stripes[i].physical;
4944                 do_div(stripe_nr, map->stripe_len);
4945
4946                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
4947                         stripe_nr = stripe_nr * map->num_stripes + i;
4948                         do_div(stripe_nr, map->sub_stripes);
4949                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4950                         stripe_nr = stripe_nr * map->num_stripes + i;
4951                 } /* else if RAID[56], multiply by nr_data_stripes().
4952                    * Alternatively, just use rmap_len below instead of
4953                    * map->stripe_len */
4954
4955                 bytenr = chunk_start + stripe_nr * rmap_len;
4956                 WARN_ON(nr >= map->num_stripes);
4957                 for (j = 0; j < nr; j++) {
4958                         if (buf[j] == bytenr)
4959                                 break;
4960                 }
4961                 if (j == nr) {
4962                         WARN_ON(nr >= map->num_stripes);
4963                         buf[nr++] = bytenr;
4964                 }
4965         }
4966
4967         *logical = buf;
4968         *naddrs = nr;
4969         *stripe_len = rmap_len;
4970
4971         free_extent_map(em);
4972         return 0;
4973 }
4974
4975 static void btrfs_end_bio(struct bio *bio, int err)
4976 {
4977         struct btrfs_bio *bbio = bio->bi_private;
4978         int is_orig_bio = 0;
4979
4980         if (err) {
4981                 atomic_inc(&bbio->error);
4982                 if (err == -EIO || err == -EREMOTEIO) {
4983                         unsigned int stripe_index =
4984                                 btrfs_io_bio(bio)->stripe_index;
4985                         struct btrfs_device *dev;
4986
4987                         BUG_ON(stripe_index >= bbio->num_stripes);
4988                         dev = bbio->stripes[stripe_index].dev;
4989                         if (dev->bdev) {
4990                                 if (bio->bi_rw & WRITE)
4991                                         btrfs_dev_stat_inc(dev,
4992                                                 BTRFS_DEV_STAT_WRITE_ERRS);
4993                                 else
4994                                         btrfs_dev_stat_inc(dev,
4995                                                 BTRFS_DEV_STAT_READ_ERRS);
4996                                 if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
4997                                         btrfs_dev_stat_inc(dev,
4998                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
4999                                 btrfs_dev_stat_print_on_error(dev);
5000                         }
5001                 }
5002         }
5003
5004         if (bio == bbio->orig_bio)
5005                 is_orig_bio = 1;
5006
5007         if (atomic_dec_and_test(&bbio->stripes_pending)) {
5008                 if (!is_orig_bio) {
5009                         bio_put(bio);
5010                         bio = bbio->orig_bio;
5011                 }
5012                 bio->bi_private = bbio->private;
5013                 bio->bi_end_io = bbio->end_io;
5014                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5015                 /* only send an error to the higher layers if it is
5016                  * beyond the tolerance of the btrfs bio
5017                  */
5018                 if (atomic_read(&bbio->error) > bbio->max_errors) {
5019                         err = -EIO;
5020                 } else {
5021                         /*
5022                          * this bio is actually up to date, we didn't
5023                          * go over the max number of errors
5024                          */
5025                         set_bit(BIO_UPTODATE, &bio->bi_flags);
5026                         err = 0;
5027                 }
5028                 kfree(bbio);
5029
5030                 bio_endio(bio, err);
5031         } else if (!is_orig_bio) {
5032                 bio_put(bio);
5033         }
5034 }
5035
/*
 * Bundles a bio with its rw flags, an fs_info pointer and a btrfs_work
 * item.
 * NOTE(review): no user of this struct is visible in this part of the
 * file; confirm how (and whether) it is still used.
 */
struct async_sched {
        struct bio *bio;
        int rw;
        struct btrfs_fs_info *info;
        struct btrfs_work work;
};
5042
5043 /*
5044  * see run_scheduled_bios for a description of why bios are collected for
5045  * async submit.
5046  *
5047  * This will add one bio to the pending list for a device and make sure
5048  * the work struct is scheduled.
5049  */
5050 static noinline void btrfs_schedule_bio(struct btrfs_root *root,
5051                                         struct btrfs_device *device,
5052                                         int rw, struct bio *bio)
5053 {
5054         int should_queue = 1;
5055         struct btrfs_pending_bios *pending_bios;
5056
5057         if (device->missing || !device->bdev) {
5058                 bio_endio(bio, -EIO);
5059                 return;
5060         }
5061
5062         /* don't bother with additional async steps for reads, right now */
5063         if (!(rw & REQ_WRITE)) {
5064                 bio_get(bio);
5065                 btrfsic_submit_bio(rw, bio);
5066                 bio_put(bio);
5067                 return;
5068         }
5069
5070         /*
5071          * nr_async_bios allows us to reliably return congestion to the
5072          * higher layers.  Otherwise, the async bio makes it appear we have
5073          * made progress against dirty pages when we've really just put it
5074          * on a queue for later
5075          */
5076         atomic_inc(&root->fs_info->nr_async_bios);
5077         WARN_ON(bio->bi_next);
5078         bio->bi_next = NULL;
5079         bio->bi_rw |= rw;
5080
5081         spin_lock(&device->io_lock);
5082         if (bio->bi_rw & REQ_SYNC)
5083                 pending_bios = &device->pending_sync_bios;
5084         else
5085                 pending_bios = &device->pending_bios;
5086
5087         if (pending_bios->tail)
5088                 pending_bios->tail->bi_next = bio;
5089
5090         pending_bios->tail = bio;
5091         if (!pending_bios->head)
5092                 pending_bios->head = bio;
5093         if (device->running_pending)
5094                 should_queue = 0;
5095
5096         spin_unlock(&device->io_lock);
5097
5098         if (should_queue)
5099                 btrfs_queue_worker(&root->fs_info->submit_workers,
5100                                    &device->work);
5101 }
5102
5103 static int bio_size_ok(struct block_device *bdev, struct bio *bio,
5104                        sector_t sector)
5105 {
5106         struct bio_vec *prev;
5107         struct request_queue *q = bdev_get_queue(bdev);
5108         unsigned short max_sectors = queue_max_sectors(q);
5109         struct bvec_merge_data bvm = {
5110                 .bi_bdev = bdev,
5111                 .bi_sector = sector,
5112                 .bi_rw = bio->bi_rw,
5113         };
5114
5115         if (bio->bi_vcnt == 0) {
5116                 WARN_ON(1);
5117                 return 1;
5118         }
5119
5120         prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
5121         if (bio_sectors(bio) > max_sectors)
5122                 return 0;
5123
5124         if (!q->merge_bvec_fn)
5125                 return 1;
5126
5127         bvm.bi_size = bio->bi_size - prev->bv_len;
5128         if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
5129                 return 0;
5130         return 1;
5131 }
5132
5133 static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5134                               struct bio *bio, u64 physical, int dev_nr,
5135                               int rw, int async)
5136 {
5137         struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
5138
5139         bio->bi_private = bbio;
5140         btrfs_io_bio(bio)->stripe_index = dev_nr;
5141         bio->bi_end_io = btrfs_end_bio;
5142         bio->bi_sector = physical >> 9;
5143 #ifdef DEBUG
5144         {
5145                 struct rcu_string *name;
5146
5147                 rcu_read_lock();
5148                 name = rcu_dereference(dev->name);
5149                 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
5150                          "(%s id %llu), size=%u\n", rw,
5151                          (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
5152                          name->str, dev->devid, bio->bi_size);
5153                 rcu_read_unlock();
5154         }
5155 #endif
5156         bio->bi_bdev = dev->bdev;
5157         if (async)
5158                 btrfs_schedule_bio(root, dev, rw, bio);
5159         else
5160                 btrfsic_submit_bio(rw, bio);
5161 }
5162
5163 static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5164                               struct bio *first_bio, struct btrfs_device *dev,
5165                               int dev_nr, int rw, int async)
5166 {
5167         struct bio_vec *bvec = first_bio->bi_io_vec;
5168         struct bio *bio;
5169         int nr_vecs = bio_get_nr_vecs(dev->bdev);
5170         u64 physical = bbio->stripes[dev_nr].physical;
5171
5172 again:
5173         bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
5174         if (!bio)
5175                 return -ENOMEM;
5176
5177         while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
5178                 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5179                                  bvec->bv_offset) < bvec->bv_len) {
5180                         u64 len = bio->bi_size;
5181
5182                         atomic_inc(&bbio->stripes_pending);
5183                         submit_stripe_bio(root, bbio, bio, physical, dev_nr,
5184                                           rw, async);
5185                         physical += len;
5186                         goto again;
5187                 }
5188                 bvec++;
5189         }
5190
5191         submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
5192         return 0;
5193 }
5194
5195 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
5196 {
5197         atomic_inc(&bbio->error);
5198         if (atomic_dec_and_test(&bbio->stripes_pending)) {
5199                 bio->bi_private = bbio->private;
5200                 bio->bi_end_io = bbio->end_io;
5201                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5202                 bio->bi_sector = logical >> 9;
5203                 kfree(bbio);
5204                 bio_endio(bio, -EIO);
5205         }
5206 }
5207
5208 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5209                   int mirror_num, int async_submit)
5210 {
5211         struct btrfs_device *dev;
5212         struct bio *first_bio = bio;
5213         u64 logical = (u64)bio->bi_sector << 9;
5214         u64 length = 0;
5215         u64 map_length;
5216         u64 *raid_map = NULL;
5217         int ret;
5218         int dev_nr = 0;
5219         int total_devs = 1;
5220         struct btrfs_bio *bbio = NULL;
5221
5222         length = bio->bi_size;
5223         map_length = length;
5224
5225         ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
5226                               mirror_num, &raid_map);
5227         if (ret) /* -ENOMEM */
5228                 return ret;
5229
5230         total_devs = bbio->num_stripes;
5231         bbio->orig_bio = first_bio;
5232         bbio->private = first_bio->bi_private;
5233         bbio->end_io = first_bio->bi_end_io;
5234         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5235
5236         if (raid_map) {
5237                 /* In this case, map_length has been set to the length of
5238                    a single stripe; not the whole write */
5239                 if (rw & WRITE) {
5240                         return raid56_parity_write(root, bio, bbio,
5241                                                    raid_map, map_length);
5242                 } else {
5243                         return raid56_parity_recover(root, bio, bbio,
5244                                                      raid_map, map_length,
5245                                                      mirror_num);
5246                 }
5247         }
5248
5249         if (map_length < length) {
5250                 btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu",
5251                         (unsigned long long)logical,
5252                         (unsigned long long)length,
5253                         (unsigned long long)map_length);
5254                 BUG();
5255         }
5256
5257         while (dev_nr < total_devs) {
5258                 dev = bbio->stripes[dev_nr].dev;
5259                 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
5260                         bbio_error(bbio, first_bio, logical);
5261                         dev_nr++;
5262                         continue;
5263                 }
5264
5265                 /*
5266                  * Check and see if we're ok with this bio based on it's size
5267                  * and offset with the given device.
5268                  */
5269                 if (!bio_size_ok(dev->bdev, first_bio,
5270                                  bbio->stripes[dev_nr].physical >> 9)) {
5271                         ret = breakup_stripe_bio(root, bbio, first_bio, dev,
5272                                                  dev_nr, rw, async_submit);
5273                         BUG_ON(ret);
5274                         dev_nr++;
5275                         continue;
5276                 }
5277
5278                 if (dev_nr < total_devs - 1) {
5279                         bio = btrfs_bio_clone(first_bio, GFP_NOFS);
5280                         BUG_ON(!bio); /* -ENOMEM */
5281                 } else {
5282                         bio = first_bio;
5283                 }
5284
5285                 submit_stripe_bio(root, bbio, bio,
5286                                   bbio->stripes[dev_nr].physical, dev_nr, rw,
5287                                   async_submit);
5288                 dev_nr++;
5289         }
5290         return 0;
5291 }
5292
5293 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
5294                                        u8 *uuid, u8 *fsid)
5295 {
5296         struct btrfs_device *device;
5297         struct btrfs_fs_devices *cur_devices;
5298
5299         cur_devices = fs_info->fs_devices;
5300         while (cur_devices) {
5301                 if (!fsid ||
5302                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
5303                         device = __find_device(&cur_devices->devices,
5304                                                devid, uuid);
5305                         if (device)
5306                                 return device;
5307                 }
5308                 cur_devices = cur_devices->seed;
5309         }
5310         return NULL;
5311 }
5312
5313 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5314                                             u64 devid, u8 *dev_uuid)
5315 {
5316         struct btrfs_device *device;
5317         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
5318
5319         device = kzalloc(sizeof(*device), GFP_NOFS);
5320         if (!device)
5321                 return NULL;
5322         list_add(&device->dev_list,
5323                  &fs_devices->devices);
5324         device->devid = devid;
5325         device->work.func = pending_bios_fn;
5326         device->fs_devices = fs_devices;
5327         device->missing = 1;
5328         fs_devices->num_devices++;
5329         fs_devices->missing_devices++;
5330         spin_lock_init(&device->io_lock);
5331         INIT_LIST_HEAD(&device->dev_alloc_list);
5332         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
5333         return device;
5334 }
5335
5336 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
5337                           struct extent_buffer *leaf,
5338                           struct btrfs_chunk *chunk)
5339 {
5340         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
5341         struct map_lookup *map;
5342         struct extent_map *em;
5343         u64 logical;
5344         u64 length;
5345         u64 devid;
5346         u8 uuid[BTRFS_UUID_SIZE];
5347         int num_stripes;
5348         int ret;
5349         int i;
5350
5351         logical = key->offset;
5352         length = btrfs_chunk_length(leaf, chunk);
5353
5354         read_lock(&map_tree->map_tree.lock);
5355         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
5356         read_unlock(&map_tree->map_tree.lock);
5357
5358         /* already mapped? */
5359         if (em && em->start <= logical && em->start + em->len > logical) {
5360                 free_extent_map(em);
5361                 return 0;
5362         } else if (em) {
5363                 free_extent_map(em);
5364         }
5365
5366         em = alloc_extent_map();
5367         if (!em)
5368                 return -ENOMEM;
5369         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
5370         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
5371         if (!map) {
5372                 free_extent_map(em);
5373                 return -ENOMEM;
5374         }
5375
5376         em->bdev = (struct block_device *)map;
5377         em->start = logical;
5378         em->len = length;
5379         em->orig_start = 0;
5380         em->block_start = 0;
5381         em->block_len = em->len;
5382
5383         map->num_stripes = num_stripes;
5384         map->io_width = btrfs_chunk_io_width(leaf, chunk);
5385         map->io_align = btrfs_chunk_io_align(leaf, chunk);
5386         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
5387         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
5388         map->type = btrfs_chunk_type(leaf, chunk);
5389         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
5390         for (i = 0; i < num_stripes; i++) {
5391                 map->stripes[i].physical =
5392                         btrfs_stripe_offset_nr(leaf, chunk, i);
5393                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
5394                 read_extent_buffer(leaf, uuid, (unsigned long)
5395                                    btrfs_stripe_dev_uuid_nr(chunk, i),
5396                                    BTRFS_UUID_SIZE);
5397                 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
5398                                                         uuid, NULL);
5399                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
5400                         kfree(map);
5401                         free_extent_map(em);
5402                         return -EIO;
5403                 }
5404                 if (!map->stripes[i].dev) {
5405                         map->stripes[i].dev =
5406                                 add_missing_dev(root, devid, uuid);
5407                         if (!map->stripes[i].dev) {
5408                                 kfree(map);
5409                                 free_extent_map(em);
5410                                 return -EIO;
5411                         }
5412                 }
5413                 map->stripes[i].dev->in_fs_metadata = 1;
5414         }
5415
5416         write_lock(&map_tree->map_tree.lock);
5417         ret = add_extent_mapping(&map_tree->map_tree, em, 0);
5418         write_unlock(&map_tree->map_tree.lock);
5419         BUG_ON(ret); /* Tree corruption */
5420         free_extent_map(em);
5421
5422         return 0;
5423 }
5424
5425 static void fill_device_from_item(struct extent_buffer *leaf,
5426                                  struct btrfs_dev_item *dev_item,
5427                                  struct btrfs_device *device)
5428 {
5429         unsigned long ptr;
5430
5431         device->devid = btrfs_device_id(leaf, dev_item);
5432         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
5433         device->total_bytes = device->disk_total_bytes;
5434         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
5435         device->type = btrfs_device_type(leaf, dev_item);
5436         device->io_align = btrfs_device_io_align(leaf, dev_item);
5437         device->io_width = btrfs_device_io_width(leaf, dev_item);
5438         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
5439         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
5440         device->is_tgtdev_for_dev_replace = 0;
5441
5442         ptr = (unsigned long)btrfs_device_uuid(dev_item);
5443         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
5444 }
5445
5446 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
5447 {
5448         struct btrfs_fs_devices *fs_devices;
5449         int ret;
5450
5451         BUG_ON(!mutex_is_locked(&uuid_mutex));
5452
5453         fs_devices = root->fs_info->fs_devices->seed;
5454         while (fs_devices) {
5455                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
5456                         ret = 0;
5457                         goto out;
5458                 }
5459                 fs_devices = fs_devices->seed;
5460         }
5461
5462         fs_devices = find_fsid(fsid);
5463         if (!fs_devices) {
5464                 ret = -ENOENT;
5465                 goto out;
5466         }
5467
5468         fs_devices = clone_fs_devices(fs_devices);
5469         if (IS_ERR(fs_devices)) {
5470                 ret = PTR_ERR(fs_devices);
5471                 goto out;
5472         }
5473
5474         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
5475                                    root->fs_info->bdev_holder);
5476         if (ret) {
5477                 free_fs_devices(fs_devices);
5478                 goto out;
5479         }
5480
5481         if (!fs_devices->seeding) {
5482                 __btrfs_close_devices(fs_devices);
5483                 free_fs_devices(fs_devices);
5484                 ret = -EINVAL;
5485                 goto out;
5486         }
5487
5488         fs_devices->seed = root->fs_info->fs_devices->seed;
5489         root->fs_info->fs_devices->seed = fs_devices;
5490 out:
5491         return ret;
5492 }
5493
5494 static int read_one_dev(struct btrfs_root *root,
5495                         struct extent_buffer *leaf,
5496                         struct btrfs_dev_item *dev_item)
5497 {
5498         struct btrfs_device *device;
5499         u64 devid;
5500         int ret;
5501         u8 fs_uuid[BTRFS_UUID_SIZE];
5502         u8 dev_uuid[BTRFS_UUID_SIZE];
5503
5504         devid = btrfs_device_id(leaf, dev_item);
5505         read_extent_buffer(leaf, dev_uuid,
5506                            (unsigned long)btrfs_device_uuid(dev_item),
5507                            BTRFS_UUID_SIZE);
5508         read_extent_buffer(leaf, fs_uuid,
5509                            (unsigned long)btrfs_device_fsid(dev_item),
5510                            BTRFS_UUID_SIZE);
5511
5512         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
5513                 ret = open_seed_devices(root, fs_uuid);
5514                 if (ret && !btrfs_test_opt(root, DEGRADED))
5515                         return ret;
5516         }
5517
5518         device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
5519         if (!device || !device->bdev) {
5520                 if (!btrfs_test_opt(root, DEGRADED))
5521                         return -EIO;
5522
5523                 if (!device) {
5524                         btrfs_warn(root->fs_info, "devid %llu missing",
5525                                 (unsigned long long)devid);
5526                         device = add_missing_dev(root, devid, dev_uuid);
5527                         if (!device)
5528                                 return -ENOMEM;
5529                 } else if (!device->missing) {
5530                         /*
5531                          * this happens when a device that was properly setup
5532                          * in the device info lists suddenly goes bad.
5533                          * device->bdev is NULL, and so we have to set
5534                          * device->missing to one here
5535                          */
5536                         root->fs_info->fs_devices->missing_devices++;
5537                         device->missing = 1;
5538                 }
5539         }
5540
5541         if (device->fs_devices != root->fs_info->fs_devices) {
5542                 BUG_ON(device->writeable);
5543                 if (device->generation !=
5544                     btrfs_device_generation(leaf, dev_item))
5545                         return -EINVAL;
5546         }
5547
5548         fill_device_from_item(leaf, dev_item, device);
5549         device->in_fs_metadata = 1;
5550         if (device->writeable && !device->is_tgtdev_for_dev_replace) {
5551                 device->fs_devices->total_rw_bytes += device->total_bytes;
5552                 spin_lock(&root->fs_info->free_chunk_lock);
5553                 root->fs_info->free_chunk_space += device->total_bytes -
5554                         device->bytes_used;
5555                 spin_unlock(&root->fs_info->free_chunk_lock);
5556         }
5557         ret = 0;
5558         return ret;
5559 }
5560
5561 int btrfs_read_sys_array(struct btrfs_root *root)
5562 {
5563         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
5564         struct extent_buffer *sb;
5565         struct btrfs_disk_key *disk_key;
5566         struct btrfs_chunk *chunk;
5567         u8 *ptr;
5568         unsigned long sb_ptr;
5569         int ret = 0;
5570         u32 num_stripes;
5571         u32 array_size;
5572         u32 len = 0;
5573         u32 cur;
5574         struct btrfs_key key;
5575
5576         sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
5577                                           BTRFS_SUPER_INFO_SIZE);
5578         if (!sb)
5579                 return -ENOMEM;
5580         btrfs_set_buffer_uptodate(sb);
5581         btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
5582         /*
5583          * The sb extent buffer is artifical and just used to read the system array.
5584          * btrfs_set_buffer_uptodate() call does not properly mark all it's
5585          * pages up-to-date when the page is larger: extent does not cover the
5586          * whole page and consequently check_page_uptodate does not find all
5587          * the page's extents up-to-date (the hole beyond sb),
5588          * write_extent_buffer then triggers a WARN_ON.
5589          *
5590          * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
5591          * but sb spans only this function. Add an explicit SetPageUptodate call
5592          * to silence the warning eg. on PowerPC 64.
5593          */
5594         if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
5595                 SetPageUptodate(sb->pages[0]);
5596
5597         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
5598         array_size = btrfs_super_sys_array_size(super_copy);
5599
5600         ptr = super_copy->sys_chunk_array;
5601         sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
5602         cur = 0;
5603
5604         while (cur < array_size) {
5605                 disk_key = (struct btrfs_disk_key *)ptr;
5606                 btrfs_disk_key_to_cpu(&key, disk_key);
5607
5608                 len = sizeof(*disk_key); ptr += len;
5609                 sb_ptr += len;
5610                 cur += len;
5611
5612                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
5613                         chunk = (struct btrfs_chunk *)sb_ptr;
5614                         ret = read_one_chunk(root, &key, sb, chunk);
5615                         if (ret)
5616                                 break;
5617                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
5618                         len = btrfs_chunk_item_size(num_stripes);
5619                 } else {
5620                         ret = -EIO;
5621                         break;
5622                 }
5623                 ptr += len;
5624                 sb_ptr += len;
5625                 cur += len;
5626         }
5627         free_extent_buffer(sb);
5628         return ret;
5629 }
5630
5631 int btrfs_read_chunk_tree(struct btrfs_root *root)
5632 {
5633         struct btrfs_path *path;
5634         struct extent_buffer *leaf;
5635         struct btrfs_key key;
5636         struct btrfs_key found_key;
5637         int ret;
5638         int slot;
5639
5640         root = root->fs_info->chunk_root;
5641
5642         path = btrfs_alloc_path();
5643         if (!path)
5644                 return -ENOMEM;
5645
5646         mutex_lock(&uuid_mutex);
5647         lock_chunks(root);
5648
5649         /* first we search for all of the device items, and then we
5650          * read in all of the chunk items.  This way we can create chunk
5651          * mappings that reference all of the devices that are afound
5652          */
5653         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
5654         key.offset = 0;
5655         key.type = 0;
5656 again:
5657         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5658         if (ret < 0)
5659                 goto error;
5660         while (1) {
5661                 leaf = path->nodes[0];
5662                 slot = path->slots[0];
5663                 if (slot >= btrfs_header_nritems(leaf)) {
5664                         ret = btrfs_next_leaf(root, path);
5665                         if (ret == 0)
5666                                 continue;
5667                         if (ret < 0)
5668                                 goto error;
5669                         break;
5670                 }
5671                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5672                 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
5673                         if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
5674                                 break;
5675                         if (found_key.type == BTRFS_DEV_ITEM_KEY) {
5676                                 struct btrfs_dev_item *dev_item;
5677                                 dev_item = btrfs_item_ptr(leaf, slot,
5678                                                   struct btrfs_dev_item);
5679                                 ret = read_one_dev(root, leaf, dev_item);
5680                                 if (ret)
5681                                         goto error;
5682                         }
5683                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
5684                         struct btrfs_chunk *chunk;
5685                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
5686                         ret = read_one_chunk(root, &found_key, leaf, chunk);
5687                         if (ret)
5688                                 goto error;
5689                 }
5690                 path->slots[0]++;
5691         }
5692         if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
5693                 key.objectid = 0;
5694                 btrfs_release_path(path);
5695                 goto again;
5696         }
5697         ret = 0;
5698 error:
5699         unlock_chunks(root);
5700         mutex_unlock(&uuid_mutex);
5701
5702         btrfs_free_path(path);
5703         return ret;
5704 }
5705
5706 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
5707 {
5708         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
5709         struct btrfs_device *device;
5710
5711         mutex_lock(&fs_devices->device_list_mutex);
5712         list_for_each_entry(device, &fs_devices->devices, dev_list)
5713                 device->dev_root = fs_info->dev_root;
5714         mutex_unlock(&fs_devices->device_list_mutex);
5715 }
5716
5717 static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
5718 {
5719         int i;
5720
5721         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
5722                 btrfs_dev_stat_reset(dev, i);
5723 }
5724
5725 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
5726 {
5727         struct btrfs_key key;
5728         struct btrfs_key found_key;
5729         struct btrfs_root *dev_root = fs_info->dev_root;
5730         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
5731         struct extent_buffer *eb;
5732         int slot;
5733         int ret = 0;
5734         struct btrfs_device *device;
5735         struct btrfs_path *path = NULL;
5736         int i;
5737
5738         path = btrfs_alloc_path();
5739         if (!path) {
5740                 ret = -ENOMEM;
5741                 goto out;
5742         }
5743
5744         mutex_lock(&fs_devices->device_list_mutex);
5745         list_for_each_entry(device, &fs_devices->devices, dev_list) {
5746                 int item_size;
5747                 struct btrfs_dev_stats_item *ptr;
5748
5749                 key.objectid = 0;
5750                 key.type = BTRFS_DEV_STATS_KEY;
5751                 key.offset = device->devid;
5752                 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
5753                 if (ret) {
5754                         __btrfs_reset_dev_stats(device);
5755                         device->dev_stats_valid = 1;
5756                         btrfs_release_path(path);
5757                         continue;
5758                 }
5759                 slot = path->slots[0];
5760                 eb = path->nodes[0];
5761                 btrfs_item_key_to_cpu(eb, &found_key, slot);
5762                 item_size = btrfs_item_size_nr(eb, slot);
5763
5764                 ptr = btrfs_item_ptr(eb, slot,
5765                                      struct btrfs_dev_stats_item);
5766
5767                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
5768                         if (item_size >= (1 + i) * sizeof(__le64))
5769                                 btrfs_dev_stat_set(device, i,
5770                                         btrfs_dev_stats_value(eb, ptr, i));
5771                         else
5772                                 btrfs_dev_stat_reset(device, i);
5773                 }
5774
5775                 device->dev_stats_valid = 1;
5776                 btrfs_dev_stat_print_on_load(device);
5777                 btrfs_release_path(path);
5778         }
5779         mutex_unlock(&fs_devices->device_list_mutex);
5780
5781 out:
5782         btrfs_free_path(path);
5783         return ret < 0 ? ret : 0;
5784 }
5785
5786 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
5787                                 struct btrfs_root *dev_root,
5788                                 struct btrfs_device *device)
5789 {
5790         struct btrfs_path *path;
5791         struct btrfs_key key;
5792         struct extent_buffer *eb;
5793         struct btrfs_dev_stats_item *ptr;
5794         int ret;
5795         int i;
5796
5797         key.objectid = 0;
5798         key.type = BTRFS_DEV_STATS_KEY;
5799         key.offset = device->devid;
5800
5801         path = btrfs_alloc_path();
5802         BUG_ON(!path);
5803         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
5804         if (ret < 0) {
5805                 printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
5806                               ret, rcu_str_deref(device->name));
5807                 goto out;
5808         }
5809
5810         if (ret == 0 &&
5811             btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
5812                 /* need to delete old one and insert a new one */
5813                 ret = btrfs_del_item(trans, dev_root, path);
5814                 if (ret != 0) {
5815                         printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
5816                                       rcu_str_deref(device->name), ret);
5817                         goto out;
5818                 }
5819                 ret = 1;
5820         }
5821
5822         if (ret == 1) {
5823                 /* need to insert a new item */
5824                 btrfs_release_path(path);
5825                 ret = btrfs_insert_empty_item(trans, dev_root, path,
5826                                               &key, sizeof(*ptr));
5827                 if (ret < 0) {
5828                         printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
5829                                       rcu_str_deref(device->name), ret);
5830                         goto out;
5831                 }
5832         }
5833
5834         eb = path->nodes[0];
5835         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
5836         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
5837                 btrfs_set_dev_stats_value(eb, ptr, i,
5838                                           btrfs_dev_stat_read(device, i));
5839         btrfs_mark_buffer_dirty(eb);
5840
5841 out:
5842         btrfs_free_path(path);
5843         return ret;
5844 }
5845
5846 /*
5847  * called from commit_transaction. Writes all changed device stats to disk.
5848  */
5849 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
5850                         struct btrfs_fs_info *fs_info)
5851 {
5852         struct btrfs_root *dev_root = fs_info->dev_root;
5853         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
5854         struct btrfs_device *device;
5855         int ret = 0;
5856
5857         mutex_lock(&fs_devices->device_list_mutex);
5858         list_for_each_entry(device, &fs_devices->devices, dev_list) {
5859                 if (!device->dev_stats_valid || !device->dev_stats_dirty)
5860                         continue;
5861
5862                 ret = update_dev_stat_item(trans, dev_root, device);
5863                 if (!ret)
5864                         device->dev_stats_dirty = 0;
5865         }
5866         mutex_unlock(&fs_devices->device_list_mutex);
5867
5868         return ret;
5869 }
5870
/*
 * Increment the statistics counter @index of @dev, then report the
 * device's error counters via btrfs_dev_stat_print_on_error().
 */
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
        /* bump the counter first so the report includes the new error */
        btrfs_dev_stat_inc(dev, index);

        btrfs_dev_stat_print_on_error(dev);
}
5876
5877 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
5878 {
5879         if (!dev->dev_stats_valid)
5880                 return;
5881         printk_ratelimited_in_rcu(KERN_ERR
5882                            "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
5883                            rcu_str_deref(dev->name),
5884                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
5885                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
5886                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
5887                            btrfs_dev_stat_read(dev,
5888                                                BTRFS_DEV_STAT_CORRUPTION_ERRS),
5889                            btrfs_dev_stat_read(dev,
5890                                                BTRFS_DEV_STAT_GENERATION_ERRS));
5891 }
5892
5893 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
5894 {
5895         int i;
5896
5897         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
5898                 if (btrfs_dev_stat_read(dev, i) != 0)
5899                         break;
5900         if (i == BTRFS_DEV_STAT_VALUES_MAX)
5901                 return; /* all values == 0, suppress message */
5902
5903         printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
5904                rcu_str_deref(dev->name),
5905                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
5906                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
5907                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
5908                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
5909                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
5910 }
5911
5912 int btrfs_get_dev_stats(struct btrfs_root *root,
5913                         struct btrfs_ioctl_get_dev_stats *stats)
5914 {
5915         struct btrfs_device *dev;
5916         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
5917         int i;
5918
5919         mutex_lock(&fs_devices->device_list_mutex);
5920         dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
5921         mutex_unlock(&fs_devices->device_list_mutex);
5922
5923         if (!dev) {
5924                 printk(KERN_WARNING
5925                        "btrfs: get dev_stats failed, device not found\n");
5926                 return -ENODEV;
5927         } else if (!dev->dev_stats_valid) {
5928                 printk(KERN_WARNING
5929                        "btrfs: get dev_stats failed, not yet valid\n");
5930                 return -ENODEV;
5931         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
5932                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
5933                         if (stats->nr_items > i)
5934                                 stats->values[i] =
5935                                         btrfs_dev_stat_read_and_reset(dev, i);
5936                         else
5937                                 btrfs_dev_stat_reset(dev, i);
5938                 }
5939         } else {
5940                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
5941                         if (stats->nr_items > i)
5942                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
5943         }
5944         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
5945                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
5946         return 0;
5947 }
5948
5949 int btrfs_scratch_superblock(struct btrfs_device *device)
5950 {
5951         struct buffer_head *bh;
5952         struct btrfs_super_block *disk_super;
5953
5954         bh = btrfs_read_dev_super(device->bdev);
5955         if (!bh)
5956                 return -EINVAL;
5957         disk_super = (struct btrfs_super_block *)bh->b_data;
5958
5959         memset(&disk_super->magic, 0, sizeof(disk_super->magic));
5960         set_buffer_dirty(bh);
5961         sync_dirty_buffer(bh);
5962         brelse(bh);
5963
5964         return 0;
5965 }