drivers/block/rbd.c

   1 /*
   2    rbd.c -- Export ceph rados objects as a Linux block device
   3
   4
   5    based on drivers/block/osdblk.c:
   6
   7    Copyright 2009 Red Hat, Inc.
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation.
  12
  13    This program is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; see the file COPYING.  If not, write to
  20    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  21
  22
  23
  24    For usage instructions, please refer to:
  25
  26                  Documentation/ABI/testing/sysfs-bus-rbd
  27
  28  */
  29
  30 #include <linux/ceph/libceph.h>
  31 #include <linux/ceph/osd_client.h>
  32 #include <linux/ceph/mon_client.h>
  33 #include <linux/ceph/decode.h>
  34 #include <linux/parser.h>
  35
  36 #include <linux/kernel.h>
  37 #include <linux/device.h>
  38 #include <linux/module.h>
  39 #include <linux/fs.h>
  40 #include <linux/blkdev.h>
  41
  42 #include "rbd_types.h"
  43
#define RBD_DEBUG       /* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT    9
#define SECTOR_SIZE     (1ULL << SECTOR_SHIFT)

/* It might be useful to have this defined elsewhere too */

/* Largest value a u64 can hold (all bits set) */
#define U64_MAX ((u64) (~0ULL))

/* Short driver name (used as the prefix of warning messages) and long form */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR    256             /* max minors per blkdev */

/*
 * The maximum snapshot name length is what remains of NAME_MAX once
 * the "snap_" prefix (without its terminating NUL) is accounted for.
 */
#define RBD_SNAP_DEV_NAME_PREFIX        "snap_"
#define RBD_MAX_SNAP_NAME_LEN   \
                        (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT      510     /* allows max snapc to fit in 4KB */
#define RBD_MAX_OPT_LEN         1024

/* Snapshot name standing for the image head, i.e. "no snapshot" */
#define RBD_SNAP_HEAD_NAME      "-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX  (PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX    64

#define RBD_OBJ_PREFIX_LEN_MAX  64

/* Feature bits */

#define RBD_FEATURE_LAYERING      1

/*
 * Features supported by this (client software) implementation.
 * No feature bit is currently included in the mask.
 */

#define RBD_FEATURES_ALL          (0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN            32
#define MAX_INT_FORMAT_WIDTH    ((5 * sizeof (int)) / 2 + 1)

/* NOTE(review): presumably the default for the read_only mapping option */
#define RBD_READ_ONLY_DEFAULT           false
  97
/*
 * block device image metadata (in-memory version)
 *
 * For format 1 images this is filled in from the on-disk header by
 * rbd_header_from_disk() and torn down by rbd_header_free().
 */
struct rbd_image_header {
        /* These fields never change for a given rbd image */
        char *object_prefix;    /* NUL-terminated prefix of object names */
        u64 features;           /* feature bits; always 0 for v1 images */
        __u8 obj_order;         /* log2 of the object (segment) size */
        __u8 crypt_type;
        __u8 comp_type;

        /* The remaining fields need to be updated occasionally */
        u64 image_size;         /* image size taken from the header */
        struct ceph_snap_context *snapc;        /* snapshot ids and seq */
        char *snap_names;       /* raw copy of the snapshot name data */
        u64 *snap_sizes;        /* image size of each snapshot */

        u64 obj_version;
};
 117
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
        /* Pool identity: id and its associated name */
        u64             pool_id;
        char            *pool_name;

        /* Image identity; image_name may be a null pointer */
        char            *image_id;
        char            *image_name;

        /* Snapshot identity; snap_id is CEPH_NOSNAP when mapping the head */
        u64             snap_id;
        char            *snap_name;

        /* NOTE(review): presumably counts the rbd_devs sharing this spec */
        struct kref     kref;
};
 155
/*
 * rbd-specific options, filled in by parse_rbd_opts_token().
 */
struct rbd_options {
        bool    read_only;      /* set by "read_only"/"ro", cleared by "read_write"/"rw" */
};
 159
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
        struct ceph_client      *client;        /* underlying ceph client */
        struct kref             kref;           /* one reference per user */
        struct list_head        node;           /* link in rbd_client_list */
};
 168
/*
 * a request completion status
 *
 * One of these is kept for each request in an rbd_req_coll.
 */
struct rbd_req_status {
        int done;       /* NOTE(review): presumably non-zero once completed */
        s32 rc;         /* NOTE(review): presumably the request's result code */
        u64 bytes;      /* NOTE(review): presumably bytes transferred */
};
 177
 178 /*
 179  * a collection of requests
 180  */
 181 struct rbd_req_coll {
 182         int                     total;
 183         int                     num_done;
 184         struct kref             kref;
 185         struct rbd_req_status   status[0];
 186 };
 187
/*
 * a single io request
 */
struct rbd_request {
        struct request          *rq;            /* blk layer request */
        struct bio              *bio;           /* cloned bio */
        struct page             **pages;        /* list of used pages */
        u64                     len;            /* NOTE(review): presumably length in bytes */
        int                     coll_index;     /* NOTE(review): presumably index into coll->status[] */
        struct rbd_req_coll     *coll;          /* collection this request is part of, if any */
};
 199
/*
 * a single snapshot of an image, kept on the owning rbd_device's
 * "snaps" list
 */
struct rbd_snap {
        struct  device          dev;            /* NOTE(review): presumably the snapshot's sysfs device */
        const char              *name;          /* snapshot name */
        u64                     size;           /* image size for this snapshot */
        struct list_head        node;           /* link in rbd_device->snaps */
        u64                     id;             /* snapshot id */
        u64                     features;       /* feature bits for this snapshot */
};
 208
/*
 * Properties of what is actually mapped: either the image head or one
 * of its snapshots.  Filled in by rbd_dev_set_mapping().
 */
struct rbd_mapping {
        u64                     size;           /* size of the mapped head/snapshot */
        u64                     features;       /* features of the mapped head/snapshot */
        bool                    read_only;      /* forced to true for snapshot mappings */
};
 214
/*
 * a single device
 */
struct rbd_device {
        int                     dev_id;         /* blkdev unique id */

        int                     major;          /* blkdev assigned major */
        struct gendisk          *disk;          /* blkdev's gendisk and rq */

        u32                     image_format;   /* Either 1 or 2 */
        struct rbd_client       *rbd_client;

        char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

        spinlock_t              lock;           /* queue lock */

        struct rbd_image_header header;         /* in-memory image metadata */
        atomic_t                exists;         /* set to 1 by rbd_dev_set_mapping() */
        struct rbd_spec         *spec;          /* identity of the mapped image */

        char                    *header_name;   /* NOTE(review): presumably the header object's name */

        struct ceph_osd_event   *watch_event;
        struct ceph_osd_request *watch_request;

        struct rbd_spec         *parent_spec;   /* non-null for a layered child */
        u64                     parent_overlap;

        /* protects updating the header */
        struct rw_semaphore     header_rwsem;

        struct rbd_mapping      mapping;        /* what is mapped: head or snapshot */

        struct list_head        node;           /* NOTE(review): presumably link in rbd_dev_list */

        /* list of snapshots */
        struct list_head        snaps;

        /* sysfs related */
        struct device           dev;
        unsigned long           open_count;     /* changed under ctl_mutex by rbd_open()/rbd_release() */
};
 257
static DEFINE_MUTEX(ctl_mutex);   /* Serialize open/close/setup/teardown */

/* All rbd devices; NOTE(review): presumably guarded by rbd_dev_list_lock */
static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* All shareable rbd clients; guarded by rbd_client_list_lock */
static LIST_HEAD(rbd_client_list);              /* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations of routines defined elsewhere in this file */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void rbd_remove_snap_dev(struct rbd_snap *snap);

/* Store handlers for the "add" and "remove" bus attributes */
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
                       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
                          size_t count);
 276
/*
 * Bus attributes "add" and "remove".  Both are writable by the owner
 * only (S_IWUSR) and have no show method, so they cannot be read;
 * writes are handled by rbd_add() and rbd_remove() respectively.
 */
static struct bus_attribute rbd_bus_attrs[] = {
        __ATTR(add, S_IWUSR, NULL, rbd_add),
        __ATTR(remove, S_IWUSR, NULL, rbd_remove),
        __ATTR_NULL
};

/* The "rbd" bus, carrying the attributes above */
static struct bus_type rbd_bus_type = {
        .name           = "rbd",
        .bus_attrs      = rbd_bus_attrs,
};
 287
/*
 * Release callback for rbd_root_dev.  Intentionally empty: the root
 * device below is a static object, so there is nothing to free.
 */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Statically allocated device named "rbd" */
static struct device rbd_root_dev = {
        .init_name =    "rbd",
        .release =      rbd_root_dev_release,
};
 296
 297 static __printf(2, 3)
 298 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
 299 {
 300         struct va_format vaf;
 301         va_list args;
 302
 303         va_start(args, fmt);
 304         vaf.fmt = fmt;
 305         vaf.va = &args;
 306
 307         if (!rbd_dev)
 308                 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
 309         else if (rbd_dev->disk)
 310                 printk(KERN_WARNING "%s: %s: %pV\n",
 311                         RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
 312         else if (rbd_dev->spec && rbd_dev->spec->image_name)
 313                 printk(KERN_WARNING "%s: image %s: %pV\n",
 314                         RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
 315         else if (rbd_dev->spec && rbd_dev->spec->image_id)
 316                 printk(KERN_WARNING "%s: id %s: %pV\n",
 317                         RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
 318         else    /* punt */
 319                 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
 320                         RBD_DRV_NAME, rbd_dev, &vaf);
 321         va_end(args);
 322 }
 323
 324 #ifdef RBD_DEBUG
 325 #define rbd_assert(expr)                                                \
 326                 if (unlikely(!(expr))) {                                \
 327                         printk(KERN_ERR "\nAssertion failure in %s() "  \
 328                                                 "at line %d:\n\n"       \
 329                                         "\trbd_assert(%s);\n\n",        \
 330                                         __func__, __LINE__, #expr);     \
 331                         BUG();                                          \
 332                 }
 333 #else /* !RBD_DEBUG */
 334 #  define rbd_assert(expr)      ((void) 0)
 335 #endif /* !RBD_DEBUG */
 336
/* Forward declarations of the refresh routines defined elsewhere in this file */
static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
 339
 340 static int rbd_open(struct block_device *bdev, fmode_t mode)
 341 {
 342         struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
 343
 344         if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
 345                 return -EROFS;
 346
 347         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 348         (void) get_device(&rbd_dev->dev);
 349         set_device_ro(bdev, rbd_dev->mapping.read_only);
 350         rbd_dev->open_count++;
 351         mutex_unlock(&ctl_mutex);
 352
 353         return 0;
 354 }
 355
 356 static int rbd_release(struct gendisk *disk, fmode_t mode)
 357 {
 358         struct rbd_device *rbd_dev = disk->private_data;
 359
 360         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 361         rbd_assert(rbd_dev->open_count > 0);
 362         rbd_dev->open_count--;
 363         put_device(&rbd_dev->dev);
 364         mutex_unlock(&ctl_mutex);
 365
 366         return 0;
 367 }
 368
/* Block device operations: only open and release are provided */
static const struct block_device_operations rbd_bd_ops = {
        .owner                  = THIS_MODULE,
        .open                   = rbd_open,
        .release                = rbd_release,
};
 374
 375 /*
 376  * Initialize an rbd client instance.
 377  * We own *ceph_opts.
 378  */
 379 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
 380 {
 381         struct rbd_client *rbdc;
 382         int ret = -ENOMEM;
 383
 384         dout("rbd_client_create\n");
 385         rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
 386         if (!rbdc)
 387                 goto out_opt;
 388
 389         kref_init(&rbdc->kref);
 390         INIT_LIST_HEAD(&rbdc->node);
 391
 392         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 393
 394         rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
 395         if (IS_ERR(rbdc->client))
 396                 goto out_mutex;
 397         ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
 398
 399         ret = ceph_open_session(rbdc->client);
 400         if (ret < 0)
 401                 goto out_err;
 402
 403         spin_lock(&rbd_client_list_lock);
 404         list_add_tail(&rbdc->node, &rbd_client_list);
 405         spin_unlock(&rbd_client_list_lock);
 406
 407         mutex_unlock(&ctl_mutex);
 408
 409         dout("rbd_client_create created %p\n", rbdc);
 410         return rbdc;
 411
 412 out_err:
 413         ceph_destroy_client(rbdc->client);
 414 out_mutex:
 415         mutex_unlock(&ctl_mutex);
 416         kfree(rbdc);
 417 out_opt:
 418         if (ceph_opts)
 419                 ceph_destroy_options(ceph_opts);
 420         return ERR_PTR(ret);
 421 }
 422
 423 /*
 424  * Find a ceph client with specific addr and configuration.  If
 425  * found, bump its reference count.
 426  */
 427 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
 428 {
 429         struct rbd_client *client_node;
 430         bool found = false;
 431
 432         if (ceph_opts->flags & CEPH_OPT_NOSHARE)
 433                 return NULL;
 434
 435         spin_lock(&rbd_client_list_lock);
 436         list_for_each_entry(client_node, &rbd_client_list, node) {
 437                 if (!ceph_compare_options(ceph_opts, client_node->client)) {
 438                         kref_get(&client_node->kref);
 439                         found = true;
 440                         break;
 441                 }
 442         }
 443         spin_unlock(&rbd_client_list_lock);
 444
 445         return found ? client_node : NULL;
 446 }
 447
/*
 * mount options
 *
 * The Opt_last_* values are range markers, not real options:
 * parse_rbd_opts_token() classifies a token as int, string or
 * Boolean according to which markers it falls between.  There are
 * currently no int or string options.
 */
enum {
        Opt_last_int,
        /* int args above */
        Opt_last_string,
        /* string args above */
        Opt_read_only,
        Opt_read_write,
        /* Boolean args above */
        Opt_last_bool,
};

/* Option spellings accepted by match_token(); {-1, NULL} ends the table */
static match_table_t rbd_opts_tokens = {
        /* int args above */
        /* string args above */
        {Opt_read_only, "read_only"},
        {Opt_read_only, "ro"},          /* Alternate spelling */
        {Opt_read_write, "read_write"},
        {Opt_read_write, "rw"},         /* Alternate spelling */
        /* Boolean args above */
        {-1, NULL}
};
 472
 473 static int parse_rbd_opts_token(char *c, void *private)
 474 {
 475         struct rbd_options *rbd_opts = private;
 476         substring_t argstr[MAX_OPT_ARGS];
 477         int token, intval, ret;
 478
 479         token = match_token(c, rbd_opts_tokens, argstr);
 480         if (token < 0)
 481                 return -EINVAL;
 482
 483         if (token < Opt_last_int) {
 484                 ret = match_int(&argstr[0], &intval);
 485                 if (ret < 0) {
 486                         pr_err("bad mount option arg (not int) "
 487                                "at '%s'\n", c);
 488                         return ret;
 489                 }
 490                 dout("got int token %d val %d\n", token, intval);
 491         } else if (token > Opt_last_int && token < Opt_last_string) {
 492                 dout("got string token %d val %s\n", token,
 493                      argstr[0].from);
 494         } else if (token > Opt_last_string && token < Opt_last_bool) {
 495                 dout("got Boolean token %d\n", token);
 496         } else {
 497                 dout("got token %d\n", token);
 498         }
 499
 500         switch (token) {
 501         case Opt_read_only:
 502                 rbd_opts->read_only = true;
 503                 break;
 504         case Opt_read_write:
 505                 rbd_opts->read_only = false;
 506                 break;
 507         default:
 508                 rbd_assert(false);
 509                 break;
 510         }
 511         return 0;
 512 }
 513
 514 /*
 515  * Get a ceph client with specific addr and configuration, if one does
 516  * not exist create it.
 517  */
 518 static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
 519 {
 520         struct rbd_client *rbdc;
 521
 522         rbdc = rbd_client_find(ceph_opts);
 523         if (rbdc)       /* using an existing client */
 524                 ceph_destroy_options(ceph_opts);
 525         else
 526                 rbdc = rbd_client_create(ceph_opts);
 527
 528         return rbdc;
 529 }
 530
/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock itself to unlink the client, so the
 * caller must not hold it.
 */
 536 static void rbd_client_release(struct kref *kref)
 537 {
 538         struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
 539
 540         dout("rbd_release_client %p\n", rbdc);
 541         spin_lock(&rbd_client_list_lock);
 542         list_del(&rbdc->node);
 543         spin_unlock(&rbd_client_list_lock);
 544
 545         ceph_destroy_client(rbdc->client);
 546         kfree(rbdc);
 547 }
 548
 549 /*
 550  * Drop reference to ceph client node. If it's not referenced anymore, release
 551  * it.
 552  */
 553 static void rbd_put_client(struct rbd_client *rbdc)
 554 {
 555         if (rbdc)
 556                 kref_put(&rbdc->kref, rbd_client_release);
 557 }
 558
 559 /*
 560  * Destroy requests collection
 561  */
 562 static void rbd_coll_release(struct kref *kref)
 563 {
 564         struct rbd_req_coll *coll =
 565                 container_of(kref, struct rbd_req_coll, kref);
 566
 567         dout("rbd_coll_release %p\n", coll);
 568         kfree(coll);
 569 }
 570
 571 static bool rbd_image_format_valid(u32 image_format)
 572 {
 573         return image_format == 1 || image_format == 2;
 574 }
 575
 576 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
 577 {
 578         size_t size;
 579         u32 snap_count;
 580
 581         /* The header has to start with the magic rbd header text */
 582         if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
 583                 return false;
 584
 585         /* The bio layer requires at least sector-sized I/O */
 586
 587         if (ondisk->options.order < SECTOR_SHIFT)
 588                 return false;
 589
 590         /* If we use u64 in a few spots we may be able to loosen this */
 591
 592         if (ondisk->options.order > 8 * sizeof (int) - 1)
 593                 return false;
 594
 595         /*
 596          * The size of a snapshot header has to fit in a size_t, and
 597          * that limits the number of snapshots.
 598          */
 599         snap_count = le32_to_cpu(ondisk->snap_count);
 600         size = SIZE_MAX - sizeof (struct ceph_snap_context);
 601         if (snap_count > size / sizeof (__le64))
 602                 return false;
 603
 604         /*
 605          * Not only that, but the size of the entire the snapshot
 606          * header must also be representable in a size_t.
 607          */
 608         size -= snap_count * sizeof (__le64);
 609         if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
 610                 return false;
 611
 612         return true;
 613 }
 614
 615 /*
 616  * Create a new header structure, translate header format from the on-disk
 617  * header.
 618  */
 619 static int rbd_header_from_disk(struct rbd_image_header *header,
 620                                  struct rbd_image_header_ondisk *ondisk)
 621 {
 622         u32 snap_count;
 623         size_t len;
 624         size_t size;
 625         u32 i;
 626
 627         memset(header, 0, sizeof (*header));
 628
 629         snap_count = le32_to_cpu(ondisk->snap_count);
 630
 631         len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
 632         header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
 633         if (!header->object_prefix)
 634                 return -ENOMEM;
 635         memcpy(header->object_prefix, ondisk->object_prefix, len);
 636         header->object_prefix[len] = '\0';
 637
 638         if (snap_count) {
 639                 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
 640
 641                 /* Save a copy of the snapshot names */
 642
 643                 if (snap_names_len > (u64) SIZE_MAX)
 644                         return -EIO;
 645                 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
 646                 if (!header->snap_names)
 647                         goto out_err;
 648                 /*
 649                  * Note that rbd_dev_v1_header_read() guarantees
 650                  * the ondisk buffer we're working with has
 651                  * snap_names_len bytes beyond the end of the
 652                  * snapshot id array, this memcpy() is safe.
 653                  */
 654                 memcpy(header->snap_names, &ondisk->snaps[snap_count],
 655                         snap_names_len);
 656
 657                 /* Record each snapshot's size */
 658
 659                 size = snap_count * sizeof (*header->snap_sizes);
 660                 header->snap_sizes = kmalloc(size, GFP_KERNEL);
 661                 if (!header->snap_sizes)
 662                         goto out_err;
 663                 for (i = 0; i < snap_count; i++)
 664                         header->snap_sizes[i] =
 665                                 le64_to_cpu(ondisk->snaps[i].image_size);
 666         } else {
 667                 WARN_ON(ondisk->snap_names_len);
 668                 header->snap_names = NULL;
 669                 header->snap_sizes = NULL;
 670         }
 671
 672         header->features = 0;   /* No features support in v1 images */
 673         header->obj_order = ondisk->options.order;
 674         header->crypt_type = ondisk->options.crypt_type;
 675         header->comp_type = ondisk->options.comp_type;
 676
 677         /* Allocate and fill in the snapshot context */
 678
 679         header->image_size = le64_to_cpu(ondisk->image_size);
 680         size = sizeof (struct ceph_snap_context);
 681         size += snap_count * sizeof (header->snapc->snaps[0]);
 682         header->snapc = kzalloc(size, GFP_KERNEL);
 683         if (!header->snapc)
 684                 goto out_err;
 685
 686         atomic_set(&header->snapc->nref, 1);
 687         header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
 688         header->snapc->num_snaps = snap_count;
 689         for (i = 0; i < snap_count; i++)
 690                 header->snapc->snaps[i] =
 691                         le64_to_cpu(ondisk->snaps[i].id);
 692
 693         return 0;
 694
 695 out_err:
 696         kfree(header->snap_sizes);
 697         header->snap_sizes = NULL;
 698         kfree(header->snap_names);
 699         header->snap_names = NULL;
 700         kfree(header->object_prefix);
 701         header->object_prefix = NULL;
 702
 703         return -ENOMEM;
 704 }
 705
 706 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
 707 {
 708         struct rbd_snap *snap;
 709
 710         if (snap_id == CEPH_NOSNAP)
 711                 return RBD_SNAP_HEAD_NAME;
 712
 713         list_for_each_entry(snap, &rbd_dev->snaps, node)
 714                 if (snap_id == snap->id)
 715                         return snap->name;
 716
 717         return NULL;
 718 }
 719
 720 static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
 721 {
 722
 723         struct rbd_snap *snap;
 724
 725         list_for_each_entry(snap, &rbd_dev->snaps, node) {
 726                 if (!strcmp(snap_name, snap->name)) {
 727                         rbd_dev->spec->snap_id = snap->id;
 728                         rbd_dev->mapping.size = snap->size;
 729                         rbd_dev->mapping.features = snap->features;
 730
 731                         return 0;
 732                 }
 733         }
 734
 735         return -ENOENT;
 736 }
 737
 738 static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
 739 {
 740         int ret;
 741
 742         if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
 743                     sizeof (RBD_SNAP_HEAD_NAME))) {
 744                 rbd_dev->spec->snap_id = CEPH_NOSNAP;
 745                 rbd_dev->mapping.size = rbd_dev->header.image_size;
 746                 rbd_dev->mapping.features = rbd_dev->header.features;
 747                 ret = 0;
 748         } else {
 749                 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
 750                 if (ret < 0)
 751                         goto done;
 752                 rbd_dev->mapping.read_only = true;
 753         }
 754         atomic_set(&rbd_dev->exists, 1);
 755 done:
 756         return ret;
 757 }
 758
 759 static void rbd_header_free(struct rbd_image_header *header)
 760 {
 761         kfree(header->object_prefix);
 762         header->object_prefix = NULL;
 763         kfree(header->snap_sizes);
 764         header->snap_sizes = NULL;
 765         kfree(header->snap_names);
 766         header->snap_names = NULL;
 767         ceph_put_snap_context(header->snapc);
 768         header->snapc = NULL;
 769 }
 770
 771 static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
 772 {
 773         char *name;
 774         u64 segment;
 775         int ret;
 776
 777         name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
 778         if (!name)
 779                 return NULL;
 780         segment = offset >> rbd_dev->header.obj_order;
 781         ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
 782                         rbd_dev->header.object_prefix, segment);
 783         if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
 784                 pr_err("error formatting segment name for #%llu (%d)\n",
 785                         segment, ret);
 786                 kfree(name);
 787                 name = NULL;
 788         }
 789
 790         return name;
 791 }
 792
 793 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
 794 {
 795         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
 796
 797         return offset & (segment_size - 1);
 798 }
 799
 800 static u64 rbd_segment_length(struct rbd_device *rbd_dev,
 801                                 u64 offset, u64 length)
 802 {
 803         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
 804
 805         offset &= segment_size - 1;
 806
 807         rbd_assert(length <= U64_MAX - offset);
 808         if (offset + length > segment_size)
 809                 length = segment_size - offset;
 810
 811         return length;
 812 }
 813
 814 static int rbd_get_num_segments(struct rbd_image_header *header,
 815                                 u64 ofs, u64 len)
 816 {
 817         u64 start_seg;
 818         u64 end_seg;
 819
 820         if (!len)
 821                 return 0;
 822         if (len - 1 > U64_MAX - ofs)
 823                 return -ERANGE;
 824
 825         start_seg = ofs >> header->obj_order;
 826         end_seg = (ofs + len - 1) >> header->obj_order;
 827
 828         return end_seg - start_seg + 1;
 829 }
 830
 831 /*
 832  * returns the size of an object in the image
 833  */
 834 static u64 rbd_obj_bytes(struct rbd_image_header *header)
 835 {
 836         return 1 << header->obj_order;
 837 }
 838
 839 /*
 840  * bio helpers
 841  */
 842
 843 static void bio_chain_put(struct bio *chain)
 844 {
 845         struct bio *tmp;
 846
 847         while (chain) {
 848                 tmp = chain;
 849                 chain = chain->bi_next;
 850                 bio_put(tmp);
 851         }
 852 }
 853
 854 /*
 855  * zeros a bio chain, starting at specific offset
 856  */
 857 static void zero_bio_chain(struct bio *chain, int start_ofs)
 858 {
 859         struct bio_vec *bv;
 860         unsigned long flags;
 861         void *buf;
 862         int i;
 863         int pos = 0;
 864
 865         while (chain) {
 866                 bio_for_each_segment(bv, chain, i) {
 867                         if (pos + bv->bv_len > start_ofs) {
 868                                 int remainder = max(start_ofs - pos, 0);
 869                                 buf = bvec_kmap_irq(bv, &flags);
 870                                 memset(buf + remainder, 0,
 871                                        bv->bv_len - remainder);
 872                                 bvec_kunmap_irq(buf, &flags);
 873                         }
 874                         pos += bv->bv_len;
 875                 }
 876
 877                 chain = chain->bi_next;
 878         }
 879 }
 880
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * @bio_src: bio to clone from
 * @offset:  byte offset into bio_src's data where the clone begins
 * @len:     number of bytes to clone; must be non-zero
 * @gfpmask: allocation flags passed to bio_clone() / bio_alloc()
 *
 * Returns the new bio, or NULL if the requested range is empty or
 * does not lie within bio_src (a one-time warning is issued) or if
 * allocation fails.  The clone copies bio_src's bio_vec entries, so
 * it refers to the same pages as the source.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
                                        unsigned int offset,
                                        unsigned int len,
                                        gfp_t gfpmask)
{
        struct bio_vec *bv;
        unsigned int resid;     /* bytes still to be accounted for */
        unsigned short idx;     /* index of the first affected segment */
        unsigned int voff;      /* offset of the range in that segment */
        unsigned short end_idx; /* index of the last affected segment */
        unsigned short vcnt;    /* number of segments in the clone */
        struct bio *bio;

        /* Handle the easy case for the caller */

        if (!offset && len == bio_src->bi_size)
                return bio_clone(bio_src, gfpmask);

        if (WARN_ON_ONCE(!len))
                return NULL;
        if (WARN_ON_ONCE(len > bio_src->bi_size))
                return NULL;
        /* Written as a subtraction so that offset + len cannot wrap */
        if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
                return NULL;

        /* Find first affected segment... */

        /* NOTE(review): the scan starts at vector index 0, not bi_idx */
        resid = offset;
        __bio_for_each_segment(bv, bio_src, idx, 0) {
                if (resid < bv->bv_len)
                        break;
                resid -= bv->bv_len;
        }
        voff = resid;

        /* ...and the last affected segment */

        /*
         * resid now counts from the start of segment idx; when the
         * loop ends it holds the number of bytes, measured from the
         * start of segment end_idx, up to the end of the range.
         */
        resid += len;
        __bio_for_each_segment(bv, bio_src, end_idx, idx) {
                if (resid <= bv->bv_len)
                        break;
                resid -= bv->bv_len;
        }
        vcnt = end_idx - idx + 1;

        /* Build the clone */

        bio = bio_alloc(gfpmask, (unsigned int) vcnt);
        if (!bio)
                return NULL;    /* ENOMEM */

        bio->bi_bdev = bio_src->bi_bdev;
        bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
        bio->bi_rw = bio_src->bi_rw;
        bio->bi_flags |= 1 << BIO_CLONED;

        /*
         * Copy over our part of the bio_vec, then update the first
         * and last (or only) entries.
         */
        memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
                        vcnt * sizeof (struct bio_vec));
        bio->bi_io_vec[0].bv_offset += voff;
        if (vcnt > 1) {
                /* First entry loses its head, last entry its tail */
                bio->bi_io_vec[0].bv_len -= voff;
                bio->bi_io_vec[vcnt - 1].bv_len = resid;
        } else {
                /* The whole range lies within a single segment */
                bio->bi_io_vec[0].bv_len = len;
        }

        bio->bi_vcnt = vcnt;
        bio->bi_size = len;
        bio->bi_idx = 0;

        return bio;
}
 961
 962 /*
 963  * Clone a portion of a bio chain, starting at the given byte offset
 964  * into the first bio in the source chain and continuing for the
 965  * number of bytes indicated.  The result is another bio chain of
 966  * exactly the given length, or a null pointer on error.
 967  *
 968  * The bio_src and offset parameters are both in-out.  On entry they
 969  * refer to the first source bio and the offset into that bio where
 970  * the start of data to be cloned is located.
 971  *
 972  * On return, bio_src is updated to refer to the bio in the source
 973  * chain that contains first un-cloned byte, and *offset will
 974  * contain the offset of that byte within that bio.
 975  */
 976 static struct bio *bio_chain_clone_range(struct bio **bio_src,
 977                                         unsigned int *offset,
 978                                         unsigned int len,
 979                                         gfp_t gfpmask)
 980 {
 981         struct bio *bi = *bio_src;
 982         unsigned int off = *offset;
 983         struct bio *chain = NULL;
 984         struct bio **end;
 985
 986         /* Build up a chain of clone bios up to the limit */
 987
 988         if (!bi || off >= bi->bi_size || !len)
 989                 return NULL;            /* Nothing to clone */
 990
 991         end = &chain;
 992         while (len) {
 993                 unsigned int bi_size;
 994                 struct bio *bio;
 995
 996                 if (!bi) {
 997                         rbd_warn(NULL, "bio_chain exhausted with %u left", len);
 998                         goto out_err;   /* EINVAL; ran out of bio's */
 999                 }
1000                 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1001                 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1002                 if (!bio)
1003                         goto out_err;   /* ENOMEM */
1004
1005                 *end = bio;
1006                 end = &bio->bi_next;
1007
1008                 off += bi_size;
1009                 if (off == bi->bi_size) {
1010                         bi = bi->bi_next;
1011                         off = 0;
1012                 }
1013                 len -= bi_size;
1014         }
1015         *bio_src = bi;
1016         *offset = off;
1017
1018         return chain;
1019 out_err:
1020         bio_chain_put(chain);
1021
1022         return NULL;
1023 }
1024
1025 /*
1026  * helpers for osd request op vectors.
1027  */
1028 static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
1029                                         int opcode, u32 payload_len)
1030 {
1031         struct ceph_osd_req_op *ops;
1032
1033         ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
1034         if (!ops)
1035                 return NULL;
1036
1037         ops[0].op = opcode;
1038
1039         /*
1040          * op extent offset and length will be set later on
1041          * in calc_raw_layout()
1042          */
1043         ops[0].payload_len = payload_len;
1044
1045         return ops;
1046 }
1047
/*
 * Free an op vector, such as one returned by rbd_create_rw_ops().
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
        kfree(ops);
}
1052
1053 static void rbd_coll_end_req_index(struct request *rq,
1054                                    struct rbd_req_coll *coll,
1055                                    int index,
1056                                    s32 ret, u64 len)
1057 {
1058         struct request_queue *q;
1059         int min, max, i;
1060
1061         dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
1062              coll, index, (int)ret, (unsigned long long)len);
1063
1064         if (!rq)
1065                 return;
1066
1067         if (!coll) {
1068                 blk_end_request(rq, ret, len);
1069                 return;
1070         }
1071
1072         q = rq->q;
1073
1074         spin_lock_irq(q->queue_lock);
1075         coll->status[index].done = 1;
1076         coll->status[index].rc = ret;
1077         coll->status[index].bytes = len;
1078         max = min = coll->num_done;
1079         while (max < coll->total && coll->status[max].done)
1080                 max++;
1081
1082         for (i = min; i<max; i++) {
1083                 __blk_end_request(rq, (int)coll->status[i].rc,
1084                                   coll->status[i].bytes);
1085                 coll->num_done++;
1086                 kref_put(&coll->kref, rbd_coll_release);
1087         }
1088         spin_unlock_irq(q->queue_lock);
1089 }
1090
1091 static void rbd_coll_end_req(struct rbd_request *rbd_req,
1092                              s32 ret, u64 len)
1093 {
1094         rbd_coll_end_req_index(rbd_req->rq,
1095                                 rbd_req->coll, rbd_req->coll_index,
1096                                 ret, len);
1097 }
1098
1099 /*
1100  * Send ceph osd request
1101  */
1102 static int rbd_do_request(struct request *rq,
1103                           struct rbd_device *rbd_dev,
1104                           struct ceph_snap_context *snapc,
1105                           u64 snapid,
1106                           const char *object_name, u64 ofs, u64 len,
1107                           struct bio *bio,
1108                           struct page **pages,
1109                           int num_pages,
1110                           int flags,
1111                           struct ceph_osd_req_op *ops,
1112                           struct rbd_req_coll *coll,
1113                           int coll_index,
1114                           void (*rbd_cb)(struct ceph_osd_request *,
1115                                          struct ceph_msg *),
1116                           struct ceph_osd_request **linger_req,
1117                           u64 *ver)
1118 {
1119         struct ceph_osd_request *osd_req;
1120         struct ceph_file_layout *layout;
1121         int ret;
1122         u64 bno;
1123         struct timespec mtime = CURRENT_TIME;
1124         struct rbd_request *rbd_req;
1125         struct ceph_osd_request_head *reqhead;
1126         struct ceph_osd_client *osdc;
1127
1128         rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO);
1129         if (!rbd_req)
1130                 return -ENOMEM;
1131
1132         if (coll) {
1133                 rbd_req->coll = coll;
1134                 rbd_req->coll_index = coll_index;
1135         }
1136
1137         dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1138                 object_name, (unsigned long long) ofs,
1139                 (unsigned long long) len, coll, coll_index);
1140
1141         osdc = &rbd_dev->rbd_client->client->osdc;
1142         osd_req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
1143                                         false, GFP_NOIO, pages, bio);
1144         if (!osd_req) {
1145                 ret = -ENOMEM;
1146                 goto done_pages;
1147         }
1148
1149         osd_req->r_callback = rbd_cb;
1150
1151         rbd_req->rq = rq;
1152         rbd_req->bio = bio;
1153         rbd_req->pages = pages;
1154         rbd_req->len = len;
1155
1156         osd_req->r_priv = rbd_req;
1157
1158         reqhead = osd_req->r_request->front.iov_base;
1159         reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1160
1161         strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
1162         osd_req->r_oid_len = strlen(osd_req->r_oid);
1163
1164         layout = &osd_req->r_file_layout;
1165         memset(layout, 0, sizeof(*layout));
1166         layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1167         layout->fl_stripe_count = cpu_to_le32(1);
1168         layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1169         layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
1170         ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1171                                    osd_req, ops);
1172         rbd_assert(ret == 0);
1173
1174         ceph_osdc_build_request(osd_req, ofs, &len,
1175                                 ops,
1176                                 snapc,
1177                                 &mtime,
1178                                 osd_req->r_oid, osd_req->r_oid_len);
1179
1180         if (linger_req) {
1181                 ceph_osdc_set_request_linger(osdc, osd_req);
1182                 *linger_req = osd_req;
1183         }
1184
1185         ret = ceph_osdc_start_request(osdc, osd_req, false);
1186         if (ret < 0)
1187                 goto done_err;
1188
1189         if (!rbd_cb) {
1190                 u64 version;
1191
1192                 ret = ceph_osdc_wait_request(osdc, osd_req);
1193                 version = le64_to_cpu(osd_req->r_reassert_version.version);
1194                 if (ver)
1195                         *ver = version;
1196                 dout("reassert_ver=%llu\n", (unsigned long long) version);
1197                 ceph_osdc_put_request(osd_req);
1198         }
1199         return ret;
1200
1201 done_err:
1202         bio_chain_put(rbd_req->bio);
1203         ceph_osdc_put_request(osd_req);
1204 done_pages:
1205         kfree(rbd_req);
1206         return ret;
1207 }
1208
1209 /*
1210  * Ceph osd op callback
1211  */
1212 static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
1213 {
1214         struct rbd_request *rbd_req = osd_req->r_priv;
1215         struct ceph_osd_reply_head *replyhead;
1216         struct ceph_osd_op *op;
1217         s32 rc;
1218         u64 bytes;
1219         int read_op;
1220
1221         /* parse reply */
1222         replyhead = msg->front.iov_base;
1223         WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1224         op = (void *)(replyhead + 1);
1225         rc = (s32)le32_to_cpu(replyhead->result);
1226         bytes = le64_to_cpu(op->extent.length);
1227         read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
1228
1229         dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1230                 (unsigned long long) bytes, read_op, (int) rc);
1231
1232         if (rc == (s32)-ENOENT && read_op) {
1233                 zero_bio_chain(rbd_req->bio, 0);
1234                 rc = 0;
1235         } else if (rc == 0 && read_op && bytes < rbd_req->len) {
1236                 zero_bio_chain(rbd_req->bio, bytes);
1237                 bytes = rbd_req->len;
1238         }
1239
1240         rbd_coll_end_req(rbd_req, rc, bytes);
1241
1242         if (rbd_req->bio)
1243                 bio_chain_put(rbd_req->bio);
1244
1245         ceph_osdc_put_request(osd_req);
1246         kfree(rbd_req);
1247 }
1248
1249 static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
1250                                 struct ceph_msg *msg)
1251 {
1252         ceph_osdc_put_request(osd_req);
1253 }
1254
1255 /*
1256  * Do a synchronous ceph osd operation
1257  */
1258 static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1259                            struct ceph_snap_context *snapc,
1260                            u64 snapid,
1261                            int flags,
1262                            struct ceph_osd_req_op *ops,
1263                            const char *object_name,
1264                            u64 ofs, u64 inbound_size,
1265                            char *inbound,
1266                            struct ceph_osd_request **linger_req,
1267                            u64 *ver)
1268 {
1269         int ret;
1270         struct page **pages;
1271         int num_pages;
1272
1273         rbd_assert(ops != NULL);
1274
1275         num_pages = calc_pages_for(ofs, inbound_size);
1276         pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1277         if (IS_ERR(pages))
1278                 return PTR_ERR(pages);
1279
1280         ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
1281                           object_name, ofs, inbound_size, NULL,
1282                           pages, num_pages,
1283                           flags,
1284                           ops,
1285                           NULL, 0,
1286                           NULL,
1287                           linger_req, ver);
1288         if (ret < 0)
1289                 goto done;
1290
1291         if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1292                 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
1293
1294 done:
1295         ceph_release_page_vector(pages, num_pages);
1296         return ret;
1297 }
1298
1299 /*
1300  * Do an asynchronous ceph osd operation
1301  */
1302 static int rbd_do_op(struct request *rq,
1303                      struct rbd_device *rbd_dev,
1304                      struct ceph_snap_context *snapc,
1305                      u64 ofs, u64 len,
1306                      struct bio *bio,
1307                      struct rbd_req_coll *coll,
1308                      int coll_index)
1309 {
1310         char *seg_name;
1311         u64 seg_ofs;
1312         u64 seg_len;
1313         int ret;
1314         struct ceph_osd_req_op *ops;
1315         u32 payload_len;
1316         int opcode;
1317         int flags;
1318         u64 snapid;
1319
1320         seg_name = rbd_segment_name(rbd_dev, ofs);
1321         if (!seg_name)
1322                 return -ENOMEM;
1323         seg_len = rbd_segment_length(rbd_dev, ofs, len);
1324         seg_ofs = rbd_segment_offset(rbd_dev, ofs);
1325
1326         if (rq_data_dir(rq) == WRITE) {
1327                 opcode = CEPH_OSD_OP_WRITE;
1328                 flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
1329                 snapid = CEPH_NOSNAP;
1330                 payload_len = seg_len;
1331         } else {
1332                 opcode = CEPH_OSD_OP_READ;
1333                 flags = CEPH_OSD_FLAG_READ;
1334                 snapc = NULL;
1335                 snapid = rbd_dev->spec->snap_id;
1336                 payload_len = 0;
1337         }
1338
1339         ret = -ENOMEM;
1340         ops = rbd_create_rw_ops(1, opcode, payload_len);
1341         if (!ops)
1342                 goto done;
1343
1344         /* we've taken care of segment sizes earlier when we
1345            cloned the bios. We should never have a segment
1346            truncated at this point */
1347         rbd_assert(seg_len == len);
1348
1349         ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1350                              seg_name, seg_ofs, seg_len,
1351                              bio,
1352                              NULL, 0,
1353                              flags,
1354                              ops,
1355                              coll, coll_index,
1356                              rbd_req_cb, 0, NULL);
1357         if (ret < 0)
1358                 rbd_coll_end_req_index(rq, coll, coll_index,
1359                                         (s32)ret, seg_len);
1360         rbd_destroy_ops(ops);
1361 done:
1362         kfree(seg_name);
1363         return ret;
1364 }
1365
1366 /*
1367  * Request sync osd read
1368  */
1369 static int rbd_req_sync_read(struct rbd_device *rbd_dev,
1370                           u64 snapid,
1371                           const char *object_name,
1372                           u64 ofs, u64 len,
1373                           char *buf,
1374                           u64 *ver)
1375 {
1376         struct ceph_osd_req_op *ops;
1377         int ret;
1378
1379         ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1380         if (!ops)
1381                 return -ENOMEM;
1382
1383         ret = rbd_req_sync_op(rbd_dev, NULL,
1384                                snapid,
1385                                CEPH_OSD_FLAG_READ,
1386                                ops, object_name, ofs, len, buf, NULL, ver);
1387         rbd_destroy_ops(ops);
1388
1389         return ret;
1390 }
1391
1392 /*
1393  * Request sync osd watch
1394  */
1395 static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
1396                                    u64 ver,
1397                                    u64 notify_id)
1398 {
1399         struct ceph_osd_req_op *ops;
1400         int ret;
1401
1402         ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1403         if (!ops)
1404                 return -ENOMEM;
1405
1406         ops[0].watch.ver = cpu_to_le64(ver);
1407         ops[0].watch.cookie = notify_id;
1408         ops[0].watch.flag = 0;
1409
1410         ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
1411                           rbd_dev->header_name, 0, 0, NULL,
1412                           NULL, 0,
1413                           CEPH_OSD_FLAG_READ,
1414                           ops,
1415                           NULL, 0,
1416                           rbd_simple_req_cb, 0, NULL);
1417
1418         rbd_destroy_ops(ops);
1419         return ret;
1420 }
1421
1422 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1423 {
1424         struct rbd_device *rbd_dev = (struct rbd_device *)data;
1425         u64 hver;
1426         int rc;
1427
1428         if (!rbd_dev)
1429                 return;
1430
1431         dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1432                 rbd_dev->header_name, (unsigned long long) notify_id,
1433                 (unsigned int) opcode);
1434         rc = rbd_dev_refresh(rbd_dev, &hver);
1435         if (rc)
1436                 rbd_warn(rbd_dev, "got notification but failed to "
1437                            " update snaps: %d\n", rc);
1438
1439         rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
1440 }
1441
1442 /*
1443  * Request sync osd watch
1444  */
1445 static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
1446 {
1447         struct ceph_osd_req_op *ops;
1448         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1449         int ret;
1450
1451         ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1452         if (!ops)
1453                 return -ENOMEM;
1454
1455         ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
1456                                      (void *)rbd_dev, &rbd_dev->watch_event);
1457         if (ret < 0)
1458                 goto fail;
1459
1460         ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
1461         ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
1462         ops[0].watch.flag = 1;
1463
1464         ret = rbd_req_sync_op(rbd_dev, NULL,
1465                               CEPH_NOSNAP,
1466                               CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1467                               ops,
1468                               rbd_dev->header_name,
1469                               0, 0, NULL,
1470                               &rbd_dev->watch_request, NULL);
1471
1472         if (ret < 0)
1473                 goto fail_event;
1474
1475         rbd_destroy_ops(ops);
1476         return 0;
1477
1478 fail_event:
1479         ceph_osdc_cancel_event(rbd_dev->watch_event);
1480         rbd_dev->watch_event = NULL;
1481 fail:
1482         rbd_destroy_ops(ops);
1483         return ret;
1484 }
1485
1486 /*
1487  * Request sync osd unwatch
1488  */
1489 static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
1490 {
1491         struct ceph_osd_req_op *ops;
1492         int ret;
1493
1494         ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1495         if (!ops)
1496                 return -ENOMEM;
1497
1498         ops[0].watch.ver = 0;
1499         ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
1500         ops[0].watch.flag = 0;
1501
1502         ret = rbd_req_sync_op(rbd_dev, NULL,
1503                               CEPH_NOSNAP,
1504                               CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1505                               ops,
1506                               rbd_dev->header_name,
1507                               0, 0, NULL, NULL, NULL);
1508
1509
1510         rbd_destroy_ops(ops);
1511         ceph_osdc_cancel_event(rbd_dev->watch_event);
1512         rbd_dev->watch_event = NULL;
1513         return ret;
1514 }
1515
1516 /*
1517  * Synchronous osd object method call
1518  */
1519 static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1520                              const char *object_name,
1521                              const char *class_name,
1522                              const char *method_name,
1523                              const char *outbound,
1524                              size_t outbound_size,
1525                              char *inbound,
1526                              size_t inbound_size,
1527                              int flags,
1528                              u64 *ver)
1529 {
1530         struct ceph_osd_req_op *ops;
1531         int class_name_len = strlen(class_name);
1532         int method_name_len = strlen(method_name);
1533         int payload_size;
1534         int ret;
1535
1536         /*
1537          * Any input parameters required by the method we're calling
1538          * will be sent along with the class and method names as
1539          * part of the message payload.  That data and its size are
1540          * supplied via the indata and indata_len fields (named from
1541          * the perspective of the server side) in the OSD request
1542          * operation.
1543          */
1544         payload_size = class_name_len + method_name_len + outbound_size;
1545         ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
1546         if (!ops)
1547                 return -ENOMEM;
1548
1549         ops[0].cls.class_name = class_name;
1550         ops[0].cls.class_len = (__u8) class_name_len;
1551         ops[0].cls.method_name = method_name;
1552         ops[0].cls.method_len = (__u8) method_name_len;
1553         ops[0].cls.argc = 0;
1554         ops[0].cls.indata = outbound;
1555         ops[0].cls.indata_len = outbound_size;
1556
1557         ret = rbd_req_sync_op(rbd_dev, NULL,
1558                                CEPH_NOSNAP,
1559                                flags, ops,
1560                                object_name, 0, inbound_size, inbound,
1561                                NULL, ver);
1562
1563         rbd_destroy_ops(ops);
1564
1565         dout("cls_exec returned %d\n", ret);
1566         return ret;
1567 }
1568
1569 static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1570 {
1571         struct rbd_req_coll *coll =
1572                         kzalloc(sizeof(struct rbd_req_coll) +
1573                                 sizeof(struct rbd_req_status) * num_reqs,
1574                                 GFP_ATOMIC);
1575
1576         if (!coll)
1577                 return NULL;
1578         coll->total = num_reqs;
1579         kref_init(&coll->kref);
1580         return coll;
1581 }
1582
1583 static int rbd_dev_do_request(struct request *rq,
1584                                 struct rbd_device *rbd_dev,
1585                                 struct ceph_snap_context *snapc,
1586                                 u64 ofs, unsigned int size,
1587                                 struct bio *bio_chain)
1588 {
1589         int num_segs;
1590         struct rbd_req_coll *coll;
1591         unsigned int bio_offset;
1592         int cur_seg = 0;
1593
1594         dout("%s 0x%x bytes at 0x%llx\n",
1595                 rq_data_dir(rq) == WRITE ? "write" : "read",
1596                 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
1597
1598         num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1599         if (num_segs <= 0)
1600                 return num_segs;
1601
1602         coll = rbd_alloc_coll(num_segs);
1603         if (!coll)
1604                 return -ENOMEM;
1605
1606         bio_offset = 0;
1607         do {
1608                 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1609                 unsigned int clone_size;
1610                 struct bio *bio_clone;
1611
1612                 BUG_ON(limit > (u64)UINT_MAX);
1613                 clone_size = (unsigned int)limit;
1614                 dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt);
1615
1616                 kref_get(&coll->kref);
1617
1618                 /* Pass a cloned bio chain via an osd request */
1619
1620                 bio_clone = bio_chain_clone_range(&bio_chain,
1621                                         &bio_offset, clone_size,
1622                                         GFP_ATOMIC);
1623                 if (bio_clone)
1624                         (void)rbd_do_op(rq, rbd_dev, snapc,
1625                                         ofs, clone_size,
1626                                         bio_clone, coll, cur_seg);
1627                 else
1628                         rbd_coll_end_req_index(rq, coll, cur_seg,
1629                                                 (s32)-ENOMEM,
1630                                                 clone_size);
1631                 size -= clone_size;
1632                 ofs += clone_size;
1633
1634                 cur_seg++;
1635         } while (size > 0);
1636         kref_put(&coll->kref, rbd_coll_release);
1637
1638         return 0;
1639 }
1640
1641 /*
1642  * block device queue callback
1643  */
1644 static void rbd_rq_fn(struct request_queue *q)
1645 {
1646         struct rbd_device *rbd_dev = q->queuedata;
1647         bool read_only = rbd_dev->mapping.read_only;
1648         struct request *rq;
1649
1650         while ((rq = blk_fetch_request(q))) {
1651                 struct ceph_snap_context *snapc = NULL;
1652                 unsigned int size = 0;
1653                 int result;
1654
1655                 dout("fetched request\n");
1656
1657                 /* Filter out block requests we don't understand */
1658
1659                 if ((rq->cmd_type != REQ_TYPE_FS)) {
1660                         __blk_end_request_all(rq, 0);
1661                         continue;
1662                 }
1663                 spin_unlock_irq(q->queue_lock);
1664
1665                 /* Stop writes to a read-only device */
1666
1667                 result = -EROFS;
1668                 if (read_only && rq_data_dir(rq) == WRITE)
1669                         goto out_end_request;
1670
1671                 /* Grab a reference to the snapshot context */
1672
1673                 down_read(&rbd_dev->header_rwsem);
1674                 if (atomic_read(&rbd_dev->exists)) {
1675                         snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1676                         rbd_assert(snapc != NULL);
1677                 }
1678                 up_read(&rbd_dev->header_rwsem);
1679
1680                 if (!snapc) {
1681                         rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
1682                         dout("request for non-existent snapshot");
1683                         result = -ENXIO;
1684                         goto out_end_request;
1685                 }
1686
1687                 size = blk_rq_bytes(rq);
1688                 result = rbd_dev_do_request(rq, rbd_dev, snapc,
1689                                 blk_rq_pos(rq) * SECTOR_SIZE,
1690                                 size, rq->bio);
1691 out_end_request:
1692                 ceph_put_snap_context(snapc);
1693                 spin_lock_irq(q->queue_lock);
1694                 if (!size || result < 0)
1695                         __blk_end_request_all(rq, result);
1696         }
1697 }
1698
1699 /*
1700  * a queue callback. Makes sure that we don't create a bio that spans across
1701  * multiple osd objects. One exception would be with a single page bios,
1702  * which we handle later at bio_chain_clone_range()
1703  */
1704 static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1705                           struct bio_vec *bvec)
1706 {
1707         struct rbd_device *rbd_dev = q->queuedata;
1708         sector_t sector_offset;
1709         sector_t sectors_per_obj;
1710         sector_t obj_sector_offset;
1711         int ret;
1712
1713         /*
1714          * Find how far into its rbd object the partition-relative
1715          * bio start sector is to offset relative to the enclosing
1716          * device.
1717          */
1718         sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1719         sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1720         obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1721
1722         /*
1723          * Compute the number of bytes from that offset to the end
1724          * of the object.  Account for what's already used by the bio.
1725          */
1726         ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1727         if (ret > bmd->bi_size)
1728                 ret -= bmd->bi_size;
1729         else
1730                 ret = 0;
1731
1732         /*
1733          * Don't send back more than was asked for.  And if the bio
1734          * was empty, let the whole thing through because:  "Note
1735          * that a block device *must* allow a single page to be
1736          * added to an empty bio."
1737          */
1738         rbd_assert(bvec->bv_len <= PAGE_SIZE);
1739         if (ret > (int) bvec->bv_len || !bmd->bi_size)
1740                 ret = (int) bvec->bv_len;
1741
1742         return ret;
1743 }
1744
1745 static void rbd_free_disk(struct rbd_device *rbd_dev)
1746 {
1747         struct gendisk *disk = rbd_dev->disk;
1748
1749         if (!disk)
1750                 return;
1751
1752         if (disk->flags & GENHD_FL_UP)
1753                 del_gendisk(disk);
1754         if (disk->queue)
1755                 blk_cleanup_queue(disk->queue);
1756         put_disk(disk);
1757 }
1758
1759 /*
1760  * Read the complete header for the given rbd device.
1761  *
1762  * Returns a pointer to a dynamically-allocated buffer containing
1763  * the complete and validated header.  Caller can pass the address
1764  * of a variable that will be filled in with the version of the
1765  * header object at the time it was read.
1766  *
1767  * Returns a pointer-coded errno if a failure occurs.
1768  */
1769 static struct rbd_image_header_ondisk *
1770 rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1771 {
1772         struct rbd_image_header_ondisk *ondisk = NULL;
1773         u32 snap_count = 0;
1774         u64 names_size = 0;
1775         u32 want_count;
1776         int ret;
1777
1778         /*
1779          * The complete header will include an array of its 64-bit
1780          * snapshot ids, followed by the names of those snapshots as
1781          * a contiguous block of NUL-terminated strings.  Note that
1782          * the number of snapshots could change by the time we read
1783          * it in, in which case we re-read it.
1784          */
1785         do {
1786                 size_t size;
1787
1788                 kfree(ondisk);
1789
1790                 size = sizeof (*ondisk);
1791                 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1792                 size += names_size;
1793                 ondisk = kmalloc(size, GFP_KERNEL);
1794                 if (!ondisk)
1795                         return ERR_PTR(-ENOMEM);
1796
1797                 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1798                                        rbd_dev->header_name,
1799                                        0, size,
1800                                        (char *) ondisk, version);
1801
1802                 if (ret < 0)
1803                         goto out_err;
1804                 if (WARN_ON((size_t) ret < size)) {
1805                         ret = -ENXIO;
1806                         rbd_warn(rbd_dev, "short header read (want %zd got %d)",
1807                                 size, ret);
1808                         goto out_err;
1809                 }
1810                 if (!rbd_dev_ondisk_valid(ondisk)) {
1811                         ret = -ENXIO;
1812                         rbd_warn(rbd_dev, "invalid header");
1813                         goto out_err;
1814                 }
1815
1816                 names_size = le64_to_cpu(ondisk->snap_names_len);
1817                 want_count = snap_count;
1818                 snap_count = le32_to_cpu(ondisk->snap_count);
1819         } while (snap_count != want_count);
1820
1821         return ondisk;
1822
1823 out_err:
1824         kfree(ondisk);
1825
1826         return ERR_PTR(ret);
1827 }
1828
1829 /*
1830  * reload the ondisk the header
1831  */
1832 static int rbd_read_header(struct rbd_device *rbd_dev,
1833                            struct rbd_image_header *header)
1834 {
1835         struct rbd_image_header_ondisk *ondisk;
1836         u64 ver = 0;
1837         int ret;
1838
1839         ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1840         if (IS_ERR(ondisk))
1841                 return PTR_ERR(ondisk);
1842         ret = rbd_header_from_disk(header, ondisk);
1843         if (ret >= 0)
1844                 header->obj_version = ver;
1845         kfree(ondisk);
1846
1847         return ret;
1848 }
1849
1850 static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1851 {
1852         struct rbd_snap *snap;
1853         struct rbd_snap *next;
1854
1855         list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
1856                 rbd_remove_snap_dev(snap);
1857 }
1858
1859 static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1860 {
1861         sector_t size;
1862
1863         if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
1864                 return;
1865
1866         size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1867         dout("setting size to %llu sectors", (unsigned long long) size);
1868         rbd_dev->mapping.size = (u64) size;
1869         set_capacity(rbd_dev->disk, size);
1870 }
1871
1872 /*
1873  * only read the first part of the ondisk header, without the snaps info
1874  */
1875 static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
1876 {
1877         int ret;
1878         struct rbd_image_header h;
1879
1880         ret = rbd_read_header(rbd_dev, &h);
1881         if (ret < 0)
1882                 return ret;
1883
1884         down_write(&rbd_dev->header_rwsem);
1885
1886         /* Update image size, and check for resize of mapped image */
1887         rbd_dev->header.image_size = h.image_size;
1888         rbd_update_mapping_size(rbd_dev);
1889
1890         /* rbd_dev->header.object_prefix shouldn't change */
1891         kfree(rbd_dev->header.snap_sizes);
1892         kfree(rbd_dev->header.snap_names);
1893         /* osd requests may still refer to snapc */
1894         ceph_put_snap_context(rbd_dev->header.snapc);
1895
1896         if (hver)
1897                 *hver = h.obj_version;
1898         rbd_dev->header.obj_version = h.obj_version;
1899         rbd_dev->header.image_size = h.image_size;
1900         rbd_dev->header.snapc = h.snapc;
1901         rbd_dev->header.snap_names = h.snap_names;
1902         rbd_dev->header.snap_sizes = h.snap_sizes;
1903         /* Free the extra copy of the object prefix */
1904         WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1905         kfree(h.object_prefix);
1906
1907         ret = rbd_dev_snaps_update(rbd_dev);
1908         if (!ret)
1909                 ret = rbd_dev_snaps_register(rbd_dev);
1910
1911         up_write(&rbd_dev->header_rwsem);
1912
1913         return ret;
1914 }
1915
1916 static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1917 {
1918         int ret;
1919
1920         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1921         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1922         if (rbd_dev->image_format == 1)
1923                 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1924         else
1925                 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1926         mutex_unlock(&ctl_mutex);
1927
1928         return ret;
1929 }
1930
1931 static int rbd_init_disk(struct rbd_device *rbd_dev)
1932 {
1933         struct gendisk *disk;
1934         struct request_queue *q;
1935         u64 segment_size;
1936
1937         /* create gendisk info */
1938         disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1939         if (!disk)
1940                 return -ENOMEM;
1941
1942         snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1943                  rbd_dev->dev_id);
1944         disk->major = rbd_dev->major;
1945         disk->first_minor = 0;
1946         disk->fops = &rbd_bd_ops;
1947         disk->private_data = rbd_dev;
1948
1949         /* init rq */
1950         q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1951         if (!q)
1952                 goto out_disk;
1953
1954         /* We use the default size, but let's be explicit about it. */
1955         blk_queue_physical_block_size(q, SECTOR_SIZE);
1956
1957         /* set io sizes to object size */
1958         segment_size = rbd_obj_bytes(&rbd_dev->header);
1959         blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1960         blk_queue_max_segment_size(q, segment_size);
1961         blk_queue_io_min(q, segment_size);
1962         blk_queue_io_opt(q, segment_size);
1963
1964         blk_queue_merge_bvec(q, rbd_merge_bvec);
1965         disk->queue = q;
1966
1967         q->queuedata = rbd_dev;
1968
1969         rbd_dev->disk = disk;
1970
1971         set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1972
1973         return 0;
1974 out_disk:
1975         put_disk(disk);
1976
1977         return -ENOMEM;
1978 }
1979
1980 /*
1981   sysfs
1982 */
1983
1984 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1985 {
1986         return container_of(dev, struct rbd_device, dev);
1987 }
1988
1989 static ssize_t rbd_size_show(struct device *dev,
1990                              struct device_attribute *attr, char *buf)
1991 {
1992         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1993         sector_t size;
1994
1995         down_read(&rbd_dev->header_rwsem);
1996         size = get_capacity(rbd_dev->disk);
1997         up_read(&rbd_dev->header_rwsem);
1998
1999         return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2000 }
2001
2002 /*
2003  * Note this shows the features for whatever's mapped, which is not
2004  * necessarily the base image.
2005  */
2006 static ssize_t rbd_features_show(struct device *dev,
2007                              struct device_attribute *attr, char *buf)
2008 {
2009         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2010
2011         return sprintf(buf, "0x%016llx\n",
2012                         (unsigned long long) rbd_dev->mapping.features);
2013 }
2014
2015 static ssize_t rbd_major_show(struct device *dev,
2016                               struct device_attribute *attr, char *buf)
2017 {
2018         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2019
2020         return sprintf(buf, "%d\n", rbd_dev->major);
2021 }
2022
2023 static ssize_t rbd_client_id_show(struct device *dev,
2024                                   struct device_attribute *attr, char *buf)
2025 {
2026         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2027
2028         return sprintf(buf, "client%lld\n",
2029                         ceph_client_id(rbd_dev->rbd_client->client));
2030 }
2031
2032 static ssize_t rbd_pool_show(struct device *dev,
2033                              struct device_attribute *attr, char *buf)
2034 {
2035         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2036
2037         return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2038 }
2039
2040 static ssize_t rbd_pool_id_show(struct device *dev,
2041                              struct device_attribute *attr, char *buf)
2042 {
2043         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2044
2045         return sprintf(buf, "%llu\n",
2046                 (unsigned long long) rbd_dev->spec->pool_id);
2047 }
2048
2049 static ssize_t rbd_name_show(struct device *dev,
2050                              struct device_attribute *attr, char *buf)
2051 {
2052         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2053
2054         if (rbd_dev->spec->image_name)
2055                 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2056
2057         return sprintf(buf, "(unknown)\n");
2058 }
2059
2060 static ssize_t rbd_image_id_show(struct device *dev,
2061                              struct device_attribute *attr, char *buf)
2062 {
2063         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2064
2065         return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2066 }
2067
2068 /*
2069  * Shows the name of the currently-mapped snapshot (or
2070  * RBD_SNAP_HEAD_NAME for the base image).
2071  */
2072 static ssize_t rbd_snap_show(struct device *dev,
2073                              struct device_attribute *attr,
2074                              char *buf)
2075 {
2076         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2077
2078         return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2079 }
2080
2081 /*
2082  * For an rbd v2 image, shows the pool id, image id, and snapshot id
2083  * for the parent image.  If there is no parent, simply shows
2084  * "(no parent image)".
2085  */
2086 static ssize_t rbd_parent_show(struct device *dev,
2087                              struct device_attribute *attr,
2088                              char *buf)
2089 {
2090         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2091         struct rbd_spec *spec = rbd_dev->parent_spec;
2092         int count;
2093         char *bufp = buf;
2094
2095         if (!spec)
2096                 return sprintf(buf, "(no parent image)\n");
2097
2098         count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2099                         (unsigned long long) spec->pool_id, spec->pool_name);
2100         if (count < 0)
2101                 return count;
2102         bufp += count;
2103
2104         count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2105                         spec->image_name ? spec->image_name : "(unknown)");
2106         if (count < 0)
2107                 return count;
2108         bufp += count;
2109
2110         count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2111                         (unsigned long long) spec->snap_id, spec->snap_name);
2112         if (count < 0)
2113                 return count;
2114         bufp += count;
2115
2116         count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2117         if (count < 0)
2118                 return count;
2119         bufp += count;
2120
2121         return (ssize_t) (bufp - buf);
2122 }
2123
2124 static ssize_t rbd_image_refresh(struct device *dev,
2125                                  struct device_attribute *attr,
2126                                  const char *buf,
2127                                  size_t size)
2128 {
2129         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2130         int ret;
2131
2132         ret = rbd_dev_refresh(rbd_dev, NULL);
2133
2134         return ret < 0 ? ret : size;
2135 }
2136
/*
 * Device attributes for an rbd device.  All are read-only
 * (S_IRUGO, show handler only) except "refresh", which is
 * write-only for the owner (S_IWUSR, store handler only).
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

/* NULL-terminated list of every attribute defined above */
static struct attribute *rbd_attrs[] = {
        &dev_attr_size.attr,
        &dev_attr_features.attr,
        &dev_attr_major.attr,
        &dev_attr_client_id.attr,
        &dev_attr_pool.attr,
        &dev_attr_pool_id.attr,
        &dev_attr_name.attr,
        &dev_attr_image_id.attr,
        &dev_attr_current_snap.attr,
        &dev_attr_parent.attr,
        &dev_attr_refresh.attr,
        NULL
};

/* Unnamed group wrapping rbd_attrs */
static struct attribute_group rbd_attr_group = {
        .attrs = rbd_attrs,
};

/* NULL-terminated group list referenced by rbd_device_type */
static const struct attribute_group *rbd_attr_groups[] = {
        &rbd_attr_group,
        NULL
};
2172
/*
 * Release callback for the rbd device type.  It does nothing.
 *
 * NOTE(review): presumably the containing rbd_device is freed by
 * other means (rbd_dev_destroy() kfree()s it) -- confirm that this
 * cannot happen while references to the struct device remain.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
2176
/*
 * Device type for rbd devices:  supplies the attribute groups
 * defined above and the (empty) release callback.
 */
static struct device_type rbd_device_type = {
        .name           = "rbd",
        .groups         = rbd_attr_groups,
        .release        = rbd_sysfs_dev_release,
};
2182
2183
2184 /*
2185   sysfs - snapshots
2186 */
2187
2188 static ssize_t rbd_snap_size_show(struct device *dev,
2189                                   struct device_attribute *attr,
2190                                   char *buf)
2191 {
2192         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2193
2194         return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2195 }
2196
2197 static ssize_t rbd_snap_id_show(struct device *dev,
2198                                 struct device_attribute *attr,
2199                                 char *buf)
2200 {
2201         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2202
2203         return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2204 }
2205
2206 static ssize_t rbd_snap_features_show(struct device *dev,
2207                                 struct device_attribute *attr,
2208                                 char *buf)
2209 {
2210         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2211
2212         return sprintf(buf, "0x%016llx\n",
2213                         (unsigned long long) snap->features);
2214 }
2215
/* Read-only (S_IRUGO) attributes exposed for each snapshot device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

/* NULL-terminated list of the snapshot attributes defined above */
static struct attribute *rbd_snap_attrs[] = {
        &dev_attr_snap_size.attr,
        &dev_attr_snap_id.attr,
        &dev_attr_snap_features.attr,
        NULL,
};

/* Unnamed group wrapping rbd_snap_attrs */
static struct attribute_group rbd_snap_attr_group = {
        .attrs = rbd_snap_attrs,
};
2230
2231 static void rbd_snap_dev_release(struct device *dev)
2232 {
2233         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2234         kfree(snap->name);
2235         kfree(snap);
2236 }
2237
/* NULL-terminated group list referenced by rbd_snap_device_type */
static const struct attribute_group *rbd_snap_attr_groups[] = {
        &rbd_snap_attr_group,
        NULL
};

/*
 * Device type for snapshot devices:  supplies the snapshot
 * attribute groups and a release callback that frees the rbd_snap.
 */
static struct device_type rbd_snap_device_type = {
        .groups         = rbd_snap_attr_groups,
        .release        = rbd_snap_dev_release,
};
2247
2248 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2249 {
2250         kref_get(&spec->kref);
2251
2252         return spec;
2253 }
2254
2255 static void rbd_spec_free(struct kref *kref);
2256 static void rbd_spec_put(struct rbd_spec *spec)
2257 {
2258         if (spec)
2259                 kref_put(&spec->kref, rbd_spec_free);
2260 }
2261
2262 static struct rbd_spec *rbd_spec_alloc(void)
2263 {
2264         struct rbd_spec *spec;
2265
2266         spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2267         if (!spec)
2268                 return NULL;
2269         kref_init(&spec->kref);
2270
2271         rbd_spec_put(rbd_spec_get(spec));       /* TEMPORARY */
2272
2273         return spec;
2274 }
2275
2276 static void rbd_spec_free(struct kref *kref)
2277 {
2278         struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2279
2280         kfree(spec->pool_name);
2281         kfree(spec->image_id);
2282         kfree(spec->image_name);
2283         kfree(spec->snap_name);
2284         kfree(spec);
2285 }
2286
2287 struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2288                                 struct rbd_spec *spec)
2289 {
2290         struct rbd_device *rbd_dev;
2291
2292         rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2293         if (!rbd_dev)
2294                 return NULL;
2295
2296         spin_lock_init(&rbd_dev->lock);
2297         atomic_set(&rbd_dev->exists, 0);
2298         INIT_LIST_HEAD(&rbd_dev->node);
2299         INIT_LIST_HEAD(&rbd_dev->snaps);
2300         init_rwsem(&rbd_dev->header_rwsem);
2301
2302         rbd_dev->spec = spec;
2303         rbd_dev->rbd_client = rbdc;
2304
2305         return rbd_dev;
2306 }
2307
2308 static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2309 {
2310         rbd_spec_put(rbd_dev->parent_spec);
2311         kfree(rbd_dev->header_name);
2312         rbd_put_client(rbd_dev->rbd_client);
2313         rbd_spec_put(rbd_dev->spec);
2314         kfree(rbd_dev);
2315 }
2316
2317 static bool rbd_snap_registered(struct rbd_snap *snap)
2318 {
2319         bool ret = snap->dev.type == &rbd_snap_device_type;
2320         bool reg = device_is_registered(&snap->dev);
2321
2322         rbd_assert(!ret ^ reg);
2323
2324         return ret;
2325 }
2326
2327 static void rbd_remove_snap_dev(struct rbd_snap *snap)
2328 {
2329         list_del(&snap->node);
2330         if (device_is_registered(&snap->dev))
2331                 device_unregister(&snap->dev);
2332 }
2333
2334 static int rbd_register_snap_dev(struct rbd_snap *snap,
2335                                   struct device *parent)
2336 {
2337         struct device *dev = &snap->dev;
2338         int ret;
2339
2340         dev->type = &rbd_snap_device_type;
2341         dev->parent = parent;
2342         dev->release = rbd_snap_dev_release;
2343         dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2344         dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2345
2346         ret = device_register(dev);
2347
2348         return ret;
2349 }
2350
2351 static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2352                                                 const char *snap_name,
2353                                                 u64 snap_id, u64 snap_size,
2354                                                 u64 snap_features)
2355 {
2356         struct rbd_snap *snap;
2357         int ret;
2358
2359         snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2360         if (!snap)
2361                 return ERR_PTR(-ENOMEM);
2362
2363         ret = -ENOMEM;
2364         snap->name = kstrdup(snap_name, GFP_KERNEL);
2365         if (!snap->name)
2366                 goto err;
2367
2368         snap->id = snap_id;
2369         snap->size = snap_size;
2370         snap->features = snap_features;
2371
2372         return snap;
2373
2374 err:
2375         kfree(snap->name);
2376         kfree(snap);
2377
2378         return ERR_PTR(ret);
2379 }
2380
2381 static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2382                 u64 *snap_size, u64 *snap_features)
2383 {
2384         char *snap_name;
2385
2386         rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2387
2388         *snap_size = rbd_dev->header.snap_sizes[which];
2389         *snap_features = 0;     /* No features for v1 */
2390
2391         /* Skip over names until we find the one we are looking for */
2392
2393         snap_name = rbd_dev->header.snap_names;
2394         while (which--)
2395                 snap_name += strlen(snap_name) + 1;
2396
2397         return snap_name;
2398 }
2399
2400 /*
2401  * Get the size and object order for an image snapshot, or if
2402  * snap_id is CEPH_NOSNAP, gets this information for the base
2403  * image.
2404  */
2405 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2406                                 u8 *order, u64 *snap_size)
2407 {
2408         __le64 snapid = cpu_to_le64(snap_id);
2409         int ret;
2410         struct {
2411                 u8 order;
2412                 __le64 size;
2413         } __attribute__ ((packed)) size_buf = { 0 };
2414
2415         ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2416                                 "rbd", "get_size",
2417                                 (char *) &snapid, sizeof (snapid),
2418                                 (char *) &size_buf, sizeof (size_buf),
2419                                 CEPH_OSD_FLAG_READ, NULL);
2420         dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2421         if (ret < 0)
2422                 return ret;
2423
2424         *order = size_buf.order;
2425         *snap_size = le64_to_cpu(size_buf.size);
2426
2427         dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
2428                 (unsigned long long) snap_id, (unsigned int) *order,
2429                 (unsigned long long) *snap_size);
2430
2431         return 0;
2432 }
2433
2434 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2435 {
2436         return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2437                                         &rbd_dev->header.obj_order,
2438                                         &rbd_dev->header.image_size);
2439 }
2440
2441 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2442 {
2443         void *reply_buf;
2444         int ret;
2445         void *p;
2446
2447         reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2448         if (!reply_buf)
2449                 return -ENOMEM;
2450
2451         ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2452                                 "rbd", "get_object_prefix",
2453                                 NULL, 0,
2454                                 reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
2455                                 CEPH_OSD_FLAG_READ, NULL);
2456         dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2457         if (ret < 0)
2458                 goto out;
2459         ret = 0;    /* rbd_req_sync_exec() can return positive */
2460
2461         p = reply_buf;
2462         rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2463                                                 p + RBD_OBJ_PREFIX_LEN_MAX,
2464                                                 NULL, GFP_NOIO);
2465
2466         if (IS_ERR(rbd_dev->header.object_prefix)) {
2467                 ret = PTR_ERR(rbd_dev->header.object_prefix);
2468                 rbd_dev->header.object_prefix = NULL;
2469         } else {
2470                 dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
2471         }
2472
2473 out:
2474         kfree(reply_buf);
2475
2476         return ret;
2477 }
2478
2479 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2480                 u64 *snap_features)
2481 {
2482         __le64 snapid = cpu_to_le64(snap_id);
2483         struct {
2484                 __le64 features;
2485                 __le64 incompat;
2486         } features_buf = { 0 };
2487         u64 incompat;
2488         int ret;
2489
2490         ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2491                                 "rbd", "get_features",
2492                                 (char *) &snapid, sizeof (snapid),
2493                                 (char *) &features_buf, sizeof (features_buf),
2494                                 CEPH_OSD_FLAG_READ, NULL);
2495         dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2496         if (ret < 0)
2497                 return ret;
2498
2499         incompat = le64_to_cpu(features_buf.incompat);
2500         if (incompat & ~RBD_FEATURES_ALL)
2501                 return -ENXIO;
2502
2503         *snap_features = le64_to_cpu(features_buf.features);
2504
2505         dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2506                 (unsigned long long) snap_id,
2507                 (unsigned long long) *snap_features,
2508                 (unsigned long long) le64_to_cpu(features_buf.incompat));
2509
2510         return 0;
2511 }
2512
2513 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2514 {
2515         return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2516                                                 &rbd_dev->header.features);
2517 }
2518
2519 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2520 {
2521         struct rbd_spec *parent_spec;
2522         size_t size;
2523         void *reply_buf = NULL;
2524         __le64 snapid;
2525         void *p;
2526         void *end;
2527         char *image_id;
2528         u64 overlap;
2529         int ret;
2530
2531         parent_spec = rbd_spec_alloc();
2532         if (!parent_spec)
2533                 return -ENOMEM;
2534
2535         size = sizeof (__le64) +                                /* pool_id */
2536                 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +        /* image_id */
2537                 sizeof (__le64) +                               /* snap_id */
2538                 sizeof (__le64);                                /* overlap */
2539         reply_buf = kmalloc(size, GFP_KERNEL);
2540         if (!reply_buf) {
2541                 ret = -ENOMEM;
2542                 goto out_err;
2543         }
2544
2545         snapid = cpu_to_le64(CEPH_NOSNAP);
2546         ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2547                                 "rbd", "get_parent",
2548                                 (char *) &snapid, sizeof (snapid),
2549                                 (char *) reply_buf, size,
2550                                 CEPH_OSD_FLAG_READ, NULL);
2551         dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2552         if (ret < 0)
2553                 goto out_err;
2554
2555         ret = -ERANGE;
2556         p = reply_buf;
2557         end = (char *) reply_buf + size;
2558         ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2559         if (parent_spec->pool_id == CEPH_NOPOOL)
2560                 goto out;       /* No parent?  No problem. */
2561
2562         image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
2563         if (IS_ERR(image_id)) {
2564                 ret = PTR_ERR(image_id);
2565                 goto out_err;
2566         }
2567         parent_spec->image_id = image_id;
2568         ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
2569         ceph_decode_64_safe(&p, end, overlap, out_err);
2570
2571         rbd_dev->parent_overlap = overlap;
2572         rbd_dev->parent_spec = parent_spec;
2573         parent_spec = NULL;     /* rbd_dev now owns this */
2574 out:
2575         ret = 0;
2576 out_err:
2577         kfree(reply_buf);
2578         rbd_spec_put(parent_spec);
2579
2580         return ret;
2581 }
2582
2583 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
2584 {
2585         size_t image_id_size;
2586         char *image_id;
2587         void *p;
2588         void *end;
2589         size_t size;
2590         void *reply_buf = NULL;
2591         size_t len = 0;
2592         char *image_name = NULL;
2593         int ret;
2594
2595         rbd_assert(!rbd_dev->spec->image_name);
2596
2597         len = strlen(rbd_dev->spec->image_id);
2598         image_id_size = sizeof (__le32) + len;
2599         image_id = kmalloc(image_id_size, GFP_KERNEL);
2600         if (!image_id)
2601                 return NULL;
2602
2603         p = image_id;
2604         end = (char *) image_id + image_id_size;
2605         ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
2606
2607         size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
2608         reply_buf = kmalloc(size, GFP_KERNEL);
2609         if (!reply_buf)
2610                 goto out;
2611
2612         ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
2613                                 "rbd", "dir_get_name",
2614                                 image_id, image_id_size,
2615                                 (char *) reply_buf, size,
2616                                 CEPH_OSD_FLAG_READ, NULL);
2617         if (ret < 0)
2618                 goto out;
2619         p = reply_buf;
2620         end = (char *) reply_buf + size;
2621         image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
2622         if (IS_ERR(image_name))
2623                 image_name = NULL;
2624         else
2625                 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
2626 out:
2627         kfree(reply_buf);
2628         kfree(image_id);
2629
2630         return image_name;
2631 }
2632
2633 /*
2634  * When a parent image gets probed, we only have the pool, image,
2635  * and snapshot ids but not the names of any of them.  This call
2636  * is made later to fill in those names.  It has to be done after
2637  * rbd_dev_snaps_update() has completed because some of the
2638  * information (in particular, snapshot name) is not available
2639  * until then.
2640  */
2641 static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2642 {
2643         struct ceph_osd_client *osdc;
2644         const char *name;
2645         void *reply_buf = NULL;
2646         int ret;
2647
2648         if (rbd_dev->spec->pool_name)
2649                 return 0;       /* Already have the names */
2650
2651         /* Look up the pool name */
2652
2653         osdc = &rbd_dev->rbd_client->client->osdc;
2654         name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2655         if (!name) {
2656                 rbd_warn(rbd_dev, "there is no pool with id %llu",
2657                         rbd_dev->spec->pool_id);        /* Really a BUG() */
2658                 return -EIO;
2659         }
2660
2661         rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2662         if (!rbd_dev->spec->pool_name)
2663                 return -ENOMEM;
2664
2665         /* Fetch the image name; tolerate failure here */
2666
2667         name = rbd_dev_image_name(rbd_dev);
2668         if (name)
2669                 rbd_dev->spec->image_name = (char *) name;
2670         else
2671                 rbd_warn(rbd_dev, "unable to get image name");
2672
2673         /* Look up the snapshot name. */
2674
2675         name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2676         if (!name) {
2677                 rbd_warn(rbd_dev, "no snapshot with id %llu",
2678                         rbd_dev->spec->snap_id);        /* Really a BUG() */
2679                 ret = -EIO;
2680                 goto out_err;
2681         }
2682         rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2683         if(!rbd_dev->spec->snap_name)
2684                 goto out_err;
2685
2686         return 0;
2687 out_err:
2688         kfree(reply_buf);
2689         kfree(rbd_dev->spec->pool_name);
2690         rbd_dev->spec->pool_name = NULL;
2691
2692         return ret;
2693 }
2694
2695 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
2696 {
2697         size_t size;
2698         int ret;
2699         void *reply_buf;
2700         void *p;
2701         void *end;
2702         u64 seq;
2703         u32 snap_count;
2704         struct ceph_snap_context *snapc;
2705         u32 i;
2706
2707         /*
2708          * We'll need room for the seq value (maximum snapshot id),
2709          * snapshot count, and array of that many snapshot ids.
2710          * For now we have a fixed upper limit on the number we're
2711          * prepared to receive.
2712          */
2713         size = sizeof (__le64) + sizeof (__le32) +
2714                         RBD_MAX_SNAP_COUNT * sizeof (__le64);
2715         reply_buf = kzalloc(size, GFP_KERNEL);
2716         if (!reply_buf)
2717                 return -ENOMEM;
2718
2719         ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2720                                 "rbd", "get_snapcontext",
2721                                 NULL, 0,
2722                                 reply_buf, size,
2723                                 CEPH_OSD_FLAG_READ, ver);
2724         dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2725         if (ret < 0)
2726                 goto out;
2727
2728         ret = -ERANGE;
2729         p = reply_buf;
2730         end = (char *) reply_buf + size;
2731         ceph_decode_64_safe(&p, end, seq, out);
2732         ceph_decode_32_safe(&p, end, snap_count, out);
2733
2734         /*
2735          * Make sure the reported number of snapshot ids wouldn't go
2736          * beyond the end of our buffer.  But before checking that,
2737          * make sure the computed size of the snapshot context we
2738          * allocate is representable in a size_t.
2739          */
2740         if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2741                                  / sizeof (u64)) {
2742                 ret = -EINVAL;
2743                 goto out;
2744         }
2745         if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2746                 goto out;
2747
2748         size = sizeof (struct ceph_snap_context) +
2749                                 snap_count * sizeof (snapc->snaps[0]);
2750         snapc = kmalloc(size, GFP_KERNEL);
2751         if (!snapc) {
2752                 ret = -ENOMEM;
2753                 goto out;
2754         }
2755
2756         atomic_set(&snapc->nref, 1);
2757         snapc->seq = seq;
2758         snapc->num_snaps = snap_count;
2759         for (i = 0; i < snap_count; i++)
2760                 snapc->snaps[i] = ceph_decode_64(&p);
2761
2762         rbd_dev->header.snapc = snapc;
2763
2764         dout("  snap context seq = %llu, snap_count = %u\n",
2765                 (unsigned long long) seq, (unsigned int) snap_count);
2766
2767 out:
2768         kfree(reply_buf);
2769
2770         return 0;
2771 }
2772
2773 static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2774 {
2775         size_t size;
2776         void *reply_buf;
2777         __le64 snap_id;
2778         int ret;
2779         void *p;
2780         void *end;
2781         char *snap_name;
2782
2783         size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2784         reply_buf = kmalloc(size, GFP_KERNEL);
2785         if (!reply_buf)
2786                 return ERR_PTR(-ENOMEM);
2787
2788         snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2789         ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2790                                 "rbd", "get_snapshot_name",
2791                                 (char *) &snap_id, sizeof (snap_id),
2792                                 reply_buf, size,
2793                                 CEPH_OSD_FLAG_READ, NULL);
2794         dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2795         if (ret < 0)
2796                 goto out;
2797
2798         p = reply_buf;
2799         end = (char *) reply_buf + size;
2800         snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
2801         if (IS_ERR(snap_name)) {
2802                 ret = PTR_ERR(snap_name);
2803                 goto out;
2804         } else {
2805                 dout("  snap_id 0x%016llx snap_name = %s\n",
2806                         (unsigned long long) le64_to_cpu(snap_id), snap_name);
2807         }
2808         kfree(reply_buf);
2809
2810         return snap_name;
2811 out:
2812         kfree(reply_buf);
2813
2814         return ERR_PTR(ret);
2815 }
2816
2817 static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2818                 u64 *snap_size, u64 *snap_features)
2819 {
2820         __le64 snap_id;
2821         u8 order;
2822         int ret;
2823
2824         snap_id = rbd_dev->header.snapc->snaps[which];
2825         ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2826         if (ret)
2827                 return ERR_PTR(ret);
2828         ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2829         if (ret)
2830                 return ERR_PTR(ret);
2831
2832         return rbd_dev_v2_snap_name(rbd_dev, which);
2833 }
2834
2835 static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2836                 u64 *snap_size, u64 *snap_features)
2837 {
2838         if (rbd_dev->image_format == 1)
2839                 return rbd_dev_v1_snap_info(rbd_dev, which,
2840                                         snap_size, snap_features);
2841         if (rbd_dev->image_format == 2)
2842                 return rbd_dev_v2_snap_info(rbd_dev, which,
2843                                         snap_size, snap_features);
2844         return ERR_PTR(-EINVAL);
2845 }
2846
2847 static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2848 {
2849         int ret;
2850         __u8 obj_order;
2851
2852         down_write(&rbd_dev->header_rwsem);
2853
2854         /* Grab old order first, to see if it changes */
2855
2856         obj_order = rbd_dev->header.obj_order,
2857         ret = rbd_dev_v2_image_size(rbd_dev);
2858         if (ret)
2859                 goto out;
2860         if (rbd_dev->header.obj_order != obj_order) {
2861                 ret = -EIO;
2862                 goto out;
2863         }
2864         rbd_update_mapping_size(rbd_dev);
2865
2866         ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2867         dout("rbd_dev_v2_snap_context returned %d\n", ret);
2868         if (ret)
2869                 goto out;
2870         ret = rbd_dev_snaps_update(rbd_dev);
2871         dout("rbd_dev_snaps_update returned %d\n", ret);
2872         if (ret)
2873                 goto out;
2874         ret = rbd_dev_snaps_register(rbd_dev);
2875         dout("rbd_dev_snaps_register returned %d\n", ret);
2876 out:
2877         up_write(&rbd_dev->header_rwsem);
2878
2879         return ret;
2880 }
2881
2882 /*
2883  * Scan the rbd device's current snapshot list and compare it to the
2884  * newly-received snapshot context.  Remove any existing snapshots
2885  * not present in the new snapshot context.  Add a new snapshot for
2886  * any snaphots in the snapshot context not in the current list.
2887  * And verify there are no changes to snapshots we already know
2888  * about.
2889  *
2890  * Assumes the snapshots in the snapshot context are sorted by
2891  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
2892  * are also maintained in that order.)
2893  */
2894 static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
2895 {
2896         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2897         const u32 snap_count = snapc->num_snaps;
2898         struct list_head *head = &rbd_dev->snaps;
2899         struct list_head *links = head->next;
2900         u32 index = 0;
2901
2902         dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
2903         while (index < snap_count || links != head) {
2904                 u64 snap_id;
2905                 struct rbd_snap *snap;
2906                 char *snap_name;
2907                 u64 snap_size = 0;
2908                 u64 snap_features = 0;
2909
2910                 snap_id = index < snap_count ? snapc->snaps[index]
2911                                              : CEPH_NOSNAP;
2912                 snap = links != head ? list_entry(links, struct rbd_snap, node)
2913                                      : NULL;
2914                 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
2915
2916                 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2917                         struct list_head *next = links->next;
2918
2919                         /* Existing snapshot not in the new snap context */
2920
2921                         if (rbd_dev->spec->snap_id == snap->id)
2922                                 atomic_set(&rbd_dev->exists, 0);
2923                         rbd_remove_snap_dev(snap);
2924                         dout("%ssnap id %llu has been removed\n",
2925                                 rbd_dev->spec->snap_id == snap->id ?
2926                                                         "mapped " : "",
2927                                 (unsigned long long) snap->id);
2928
2929                         /* Done with this list entry; advance */
2930
2931                         links = next;
2932                         continue;
2933                 }
2934
2935                 snap_name = rbd_dev_snap_info(rbd_dev, index,
2936                                         &snap_size, &snap_features);
2937                 if (IS_ERR(snap_name))
2938                         return PTR_ERR(snap_name);
2939
2940                 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2941                         (unsigned long long) snap_id);
2942                 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2943                         struct rbd_snap *new_snap;
2944
2945                         /* We haven't seen this snapshot before */
2946
2947                         new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
2948                                         snap_id, snap_size, snap_features);
2949                         if (IS_ERR(new_snap)) {
2950                                 int err = PTR_ERR(new_snap);
2951
2952                                 dout("  failed to add dev, error %d\n", err);
2953
2954                                 return err;
2955                         }
2956
2957                         /* New goes before existing, or at end of list */
2958
2959                         dout("  added dev%s\n", snap ? "" : " at end\n");
2960                         if (snap)
2961                                 list_add_tail(&new_snap->node, &snap->node);
2962                         else
2963                                 list_add_tail(&new_snap->node, head);
2964                 } else {
2965                         /* Already have this one */
2966
2967                         dout("  already present\n");
2968
2969                         rbd_assert(snap->size == snap_size);
2970                         rbd_assert(!strcmp(snap->name, snap_name));
2971                         rbd_assert(snap->features == snap_features);
2972
2973                         /* Done with this list entry; advance */
2974
2975                         links = links->next;
2976                 }
2977
2978                 /* Advance to the next entry in the snapshot context */
2979
2980                 index++;
2981         }
2982         dout("%s: done\n", __func__);
2983
2984         return 0;
2985 }
2986
2987 /*
2988  * Scan the list of snapshots and register the devices for any that
2989  * have not already been registered.
2990  */
2991 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2992 {
2993         struct rbd_snap *snap;
2994         int ret = 0;
2995
2996         dout("%s called\n", __func__);
2997         if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2998                 return -EIO;
2999
3000         list_for_each_entry(snap, &rbd_dev->snaps, node) {
3001                 if (!rbd_snap_registered(snap)) {
3002                         ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3003                         if (ret < 0)
3004                                 break;
3005                 }
3006         }
3007         dout("%s: returning %d\n", __func__, ret);
3008
3009         return ret;
3010 }
3011
3012 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3013 {
3014         struct device *dev;
3015         int ret;
3016
3017         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3018
3019         dev = &rbd_dev->dev;
3020         dev->bus = &rbd_bus_type;
3021         dev->type = &rbd_device_type;
3022         dev->parent = &rbd_root_dev;
3023         dev->release = rbd_dev_release;
3024         dev_set_name(dev, "%d", rbd_dev->dev_id);
3025         ret = device_register(dev);
3026
3027         mutex_unlock(&ctl_mutex);
3028
3029         return ret;
3030 }
3031
3032 static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3033 {
3034         device_unregister(&rbd_dev->dev);
3035 }
3036
3037 static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
3038 {
3039         int ret, rc;
3040
3041         do {
3042                 ret = rbd_req_sync_watch(rbd_dev);
3043                 if (ret == -ERANGE) {
3044                         rc = rbd_dev_refresh(rbd_dev, NULL);
3045                         if (rc < 0)
3046                                 return rc;
3047                 }
3048         } while (ret == -ERANGE);
3049
3050         return ret;
3051 }
3052
3053 static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
3054
3055 /*
3056  * Get a unique rbd identifier for the given new rbd_dev, and add
3057  * the rbd_dev to the global list.  The minimum rbd id is 1.
3058  */
3059 static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3060 {
3061         rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3062
3063         spin_lock(&rbd_dev_list_lock);
3064         list_add_tail(&rbd_dev->node, &rbd_dev_list);
3065         spin_unlock(&rbd_dev_list_lock);
3066         dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3067                 (unsigned long long) rbd_dev->dev_id);
3068 }
3069
3070 /*
3071  * Remove an rbd_dev from the global list, and record that its
3072  * identifier is no longer in use.
3073  */
3074 static void rbd_dev_id_put(struct rbd_device *rbd_dev)
3075 {
3076         struct list_head *tmp;
3077         int rbd_id = rbd_dev->dev_id;
3078         int max_id;
3079
3080         rbd_assert(rbd_id > 0);
3081
3082         dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3083                 (unsigned long long) rbd_dev->dev_id);
3084         spin_lock(&rbd_dev_list_lock);
3085         list_del_init(&rbd_dev->node);
3086
3087         /*
3088          * If the id being "put" is not the current maximum, there
3089          * is nothing special we need to do.
3090          */
3091         if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3092                 spin_unlock(&rbd_dev_list_lock);
3093                 return;
3094         }
3095
3096         /*
3097          * We need to update the current maximum id.  Search the
3098          * list to find out what it is.  We're more likely to find
3099          * the maximum at the end, so search the list backward.
3100          */
3101         max_id = 0;
3102         list_for_each_prev(tmp, &rbd_dev_list) {
3103                 struct rbd_device *rbd_dev;
3104
3105                 rbd_dev = list_entry(tmp, struct rbd_device, node);
3106                 if (rbd_dev->dev_id > max_id)
3107                         max_id = rbd_dev->dev_id;
3108         }
3109         spin_unlock(&rbd_dev_list_lock);
3110
3111         /*
3112          * The max id could have been updated by rbd_dev_id_get(), in
3113          * which case it now accurately reflects the new maximum.
3114          * Be careful not to overwrite the maximum value in that
3115          * case.
3116          */
3117         atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3118         dout("  max dev id has been reset\n");
3119 }
3120
/*
 * Advance *buf past any leading white space so that it points at
 * the start of the next token (or at the terminating '\0' if there
 * is none), and return the number of non-white-space characters in
 * that token.  *buf must be a '\0'-terminated string.
 */
static inline size_t next_token(const char **buf)
{
        /*
         * The characters for which isspace() is nonzero in the "C"
         * and "POSIX" locales.
         */
        static const char whitespace[] = " \f\n\r\t\v";
        const char *start = *buf + strspn(*buf, whitespace);

        *buf = start;                   /* Token begins here */

        return strcspn(start, whitespace);
}
3139
/*
 * Locate the next token in *buf and, provided it fits with room for
 * a terminator, copy it into the supplied token buffer followed by
 * '\0'.  *buf must be '\0'-terminated on entry.
 *
 * Returns the token's length, excluding the '\0'.  Zero means no
 * token was found; a value >= token_size means the token did not
 * fit and nothing was copied.
 *
 * In every case *buf is left pointing just past the token.
 */
static inline size_t copy_token(const char **buf,
                                char *token,
                                size_t token_size)
{
        const size_t tok_len = next_token(buf);
        const char *start = *buf;

        *buf = start + tok_len;
        if (tok_len >= token_size)
                return tok_len;         /* Won't fit; token untouched */

        memcpy(token, start, tok_len);
        token[tok_len] = '\0';

        return tok_len;
}
3169
3170 /*
3171  * Finds the next token in *buf, dynamically allocates a buffer big
3172  * enough to hold a copy of it, and copies the token into the new
3173  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3174  * that a duplicate buffer is created even for a zero-length token.
3175  *
3176  * Returns a pointer to the newly-allocated duplicate, or a null
3177  * pointer if memory for the duplicate was not available.  If
3178  * the lenp argument is a non-null pointer, the length of the token
3179  * (not including the '\0') is returned in *lenp.
3180  *
3181  * If successful, the *buf pointer will be updated to point beyond
3182  * the end of the found token.
3183  *
3184  * Note: uses GFP_KERNEL for allocation.
3185  */
3186 static inline char *dup_token(const char **buf, size_t *lenp)
3187 {
3188         char *dup;
3189         size_t len;
3190
3191         len = next_token(buf);
3192         dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3193         if (!dup)
3194                 return NULL;
3195         *(dup + len) = '\0';
3196         *buf += len;
3197
3198         if (lenp)
3199                 *lenp = len;
3200
3201         return dup;
3202 }
3203
3204 /*
3205  * Parse the options provided for an "rbd add" (i.e., rbd image
3206  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3207  * and the data written is passed here via a NUL-terminated buffer.
3208  * Returns 0 if successful or an error code otherwise.
3209  *
3210  * The information extracted from these options is recorded in
3211  * the other parameters which return dynamically-allocated
3212  * structures:
3213  *  ceph_opts
3214  *      The address of a pointer that will refer to a ceph options
3215  *      structure.  Caller must release the returned pointer using
3216  *      ceph_destroy_options() when it is no longer needed.
3217  *  rbd_opts
3218  *      Address of an rbd options pointer.  Fully initialized by
3219  *      this function; caller must release with kfree().
3220  *  spec
3221  *      Address of an rbd image specification pointer.  Fully
3222  *      initialized by this function based on parsed options.
3223  *      Caller must release with rbd_spec_put().
3224  *
3225  * The options passed take this form:
3226  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3227  * where:
3228  *  <mon_addrs>
3229  *      A comma-separated list of one or more monitor addresses.
3230  *      A monitor address is an ip address, optionally followed
3231  *      by a port number (separated by a colon).
3232  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3233  *  <options>
3234  *      A comma-separated list of ceph and/or rbd options.
3235  *  <pool_name>
3236  *      The name of the rados pool containing the rbd image.
3237  *  <image_name>
3238  *      The name of the image in that pool to map.
3239  *  <snap_id>
3240  *      An optional snapshot id.  If provided, the mapping will
3241  *      present data from the image at the time that snapshot was
3242  *      created.  The image head is used if no snapshot id is
3243  *      provided.  Snapshot mappings are always read-only.
3244  */
3245 static int rbd_add_parse_args(const char *buf,
3246                                 struct ceph_options **ceph_opts,
3247                                 struct rbd_options **opts,
3248                                 struct rbd_spec **rbd_spec)
3249 {
3250         size_t len;
3251         char *options;
3252         const char *mon_addrs;
3253         size_t mon_addrs_size;
3254         struct rbd_spec *spec = NULL;
3255         struct rbd_options *rbd_opts = NULL;
3256         struct ceph_options *copts;
3257         int ret;
3258
3259         /* The first four tokens are required */
3260
3261         len = next_token(&buf);
3262         if (!len) {
3263                 rbd_warn(NULL, "no monitor address(es) provided");
3264                 return -EINVAL;
3265         }
3266         mon_addrs = buf;
3267         mon_addrs_size = len + 1;
3268         buf += len;
3269
3270         ret = -EINVAL;
3271         options = dup_token(&buf, NULL);
3272         if (!options)
3273                 return -ENOMEM;
3274         if (!*options) {
3275                 rbd_warn(NULL, "no options provided");
3276                 goto out_err;
3277         }
3278
3279         spec = rbd_spec_alloc();
3280         if (!spec)
3281                 goto out_mem;
3282
3283         spec->pool_name = dup_token(&buf, NULL);
3284         if (!spec->pool_name)
3285                 goto out_mem;
3286         if (!*spec->pool_name) {
3287                 rbd_warn(NULL, "no pool name provided");
3288                 goto out_err;
3289         }
3290
3291         spec->image_name = dup_token(&buf, NULL);
3292         if (!spec->image_name)
3293                 goto out_mem;
3294         if (!*spec->image_name) {
3295                 rbd_warn(NULL, "no image name provided");
3296                 goto out_err;
3297         }
3298
3299         /*
3300          * Snapshot name is optional; default is to use "-"
3301          * (indicating the head/no snapshot).
3302          */
3303         len = next_token(&buf);
3304         if (!len) {
3305                 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3306                 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3307         } else if (len > RBD_MAX_SNAP_NAME_LEN) {
3308                 ret = -ENAMETOOLONG;
3309                 goto out_err;
3310         }
3311         spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3312         if (!spec->snap_name)
3313                 goto out_mem;
3314         *(spec->snap_name + len) = '\0';
3315
3316         /* Initialize all rbd options to the defaults */
3317
3318         rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3319         if (!rbd_opts)
3320                 goto out_mem;
3321
3322         rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3323
3324         copts = ceph_parse_options(options, mon_addrs,
3325                                         mon_addrs + mon_addrs_size - 1,
3326                                         parse_rbd_opts_token, rbd_opts);
3327         if (IS_ERR(copts)) {
3328                 ret = PTR_ERR(copts);
3329                 goto out_err;
3330         }
3331         kfree(options);
3332
3333         *ceph_opts = copts;
3334         *opts = rbd_opts;
3335         *rbd_spec = spec;
3336
3337         return 0;
3338 out_mem:
3339         ret = -ENOMEM;
3340 out_err:
3341         kfree(rbd_opts);
3342         rbd_spec_put(spec);
3343         kfree(options);
3344
3345         return ret;
3346 }
3347
3348 /*
3349  * An rbd format 2 image has a unique identifier, distinct from the
3350  * name given to it by the user.  Internally, that identifier is
3351  * what's used to specify the names of objects related to the image.
3352  *
3353  * A special "rbd id" object is used to map an rbd image name to its
3354  * id.  If that object doesn't exist, then there is no v2 rbd image
3355  * with the supplied name.
3356  *
3357  * This function will record the given rbd_dev's image_id field if
3358  * it can be determined, and in that case will return 0.  If any
3359  * errors occur a negative errno will be returned and the rbd_dev's
3360  * image_id field will be unchanged (and should be NULL).
3361  */
3362 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3363 {
3364         int ret;
3365         size_t size;
3366         char *object_name;
3367         void *response;
3368         void *p;
3369
3370         /*
3371          * When probing a parent image, the image id is already
3372          * known (and the image name likely is not).  There's no
3373          * need to fetch the image id again in this case.
3374          */
3375         if (rbd_dev->spec->image_id)
3376                 return 0;
3377
3378         /*
3379          * First, see if the format 2 image id file exists, and if
3380          * so, get the image's persistent id from it.
3381          */
3382         size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3383         object_name = kmalloc(size, GFP_NOIO);
3384         if (!object_name)
3385                 return -ENOMEM;
3386         sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3387         dout("rbd id object name is %s\n", object_name);
3388
3389         /* Response will be an encoded string, which includes a length */
3390
3391         size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3392         response = kzalloc(size, GFP_NOIO);
3393         if (!response) {
3394                 ret = -ENOMEM;
3395                 goto out;
3396         }
3397
3398         ret = rbd_req_sync_exec(rbd_dev, object_name,
3399                                 "rbd", "get_id",
3400                                 NULL, 0,
3401                                 response, RBD_IMAGE_ID_LEN_MAX,
3402                                 CEPH_OSD_FLAG_READ, NULL);
3403         dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3404         if (ret < 0)
3405                 goto out;
3406         ret = 0;    /* rbd_req_sync_exec() can return positive */
3407
3408         p = response;
3409         rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3410                                                 p + RBD_IMAGE_ID_LEN_MAX,
3411                                                 NULL, GFP_NOIO);
3412         if (IS_ERR(rbd_dev->spec->image_id)) {
3413                 ret = PTR_ERR(rbd_dev->spec->image_id);
3414                 rbd_dev->spec->image_id = NULL;
3415         } else {
3416                 dout("image_id is %s\n", rbd_dev->spec->image_id);
3417         }
3418 out:
3419         kfree(response);
3420         kfree(object_name);
3421
3422         return ret;
3423 }
3424
3425 static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3426 {
3427         int ret;
3428         size_t size;
3429
3430         /* Version 1 images have no id; empty string is used */
3431
3432         rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3433         if (!rbd_dev->spec->image_id)
3434                 return -ENOMEM;
3435
3436         /* Record the header object name for this rbd image. */
3437
3438         size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3439         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3440         if (!rbd_dev->header_name) {
3441                 ret = -ENOMEM;
3442                 goto out_err;
3443         }
3444         sprintf(rbd_dev->header_name, "%s%s",
3445                 rbd_dev->spec->image_name, RBD_SUFFIX);
3446
3447         /* Populate rbd image metadata */
3448
3449         ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3450         if (ret < 0)
3451                 goto out_err;
3452
3453         /* Version 1 images have no parent (no layering) */
3454
3455         rbd_dev->parent_spec = NULL;
3456         rbd_dev->parent_overlap = 0;
3457
3458         rbd_dev->image_format = 1;
3459
3460         dout("discovered version 1 image, header name is %s\n",
3461                 rbd_dev->header_name);
3462
3463         return 0;
3464
3465 out_err:
3466         kfree(rbd_dev->header_name);
3467         rbd_dev->header_name = NULL;
3468         kfree(rbd_dev->spec->image_id);
3469         rbd_dev->spec->image_id = NULL;
3470
3471         return ret;
3472 }
3473
3474 static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3475 {
3476         size_t size;
3477         int ret;
3478         u64 ver = 0;
3479
3480         /*
3481          * Image id was filled in by the caller.  Record the header
3482          * object name for this rbd image.
3483          */
3484         size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3485         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3486         if (!rbd_dev->header_name)
3487                 return -ENOMEM;
3488         sprintf(rbd_dev->header_name, "%s%s",
3489                         RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
3490
3491         /* Get the size and object order for the image */
3492
3493         ret = rbd_dev_v2_image_size(rbd_dev);
3494         if (ret < 0)
3495                 goto out_err;
3496
3497         /* Get the object prefix (a.k.a. block_name) for the image */
3498
3499         ret = rbd_dev_v2_object_prefix(rbd_dev);
3500         if (ret < 0)
3501                 goto out_err;
3502
3503         /* Get the and check features for the image */
3504
3505         ret = rbd_dev_v2_features(rbd_dev);
3506         if (ret < 0)
3507                 goto out_err;
3508
3509         /* If the image supports layering, get the parent info */
3510
3511         if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
3512                 ret = rbd_dev_v2_parent_info(rbd_dev);
3513                 if (ret < 0)
3514                         goto out_err;
3515         }
3516
3517         /* crypto and compression type aren't (yet) supported for v2 images */
3518
3519         rbd_dev->header.crypt_type = 0;
3520         rbd_dev->header.comp_type = 0;
3521
3522         /* Get the snapshot context, plus the header version */
3523
3524         ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
3525         if (ret)
3526                 goto out_err;
3527         rbd_dev->header.obj_version = ver;
3528
3529         rbd_dev->image_format = 2;
3530
3531         dout("discovered version 2 image, header name is %s\n",
3532                 rbd_dev->header_name);
3533
3534         return 0;
3535 out_err:
3536         rbd_dev->parent_overlap = 0;
3537         rbd_spec_put(rbd_dev->parent_spec);
3538         rbd_dev->parent_spec = NULL;
3539         kfree(rbd_dev->header_name);
3540         rbd_dev->header_name = NULL;
3541         kfree(rbd_dev->header.object_prefix);
3542         rbd_dev->header.object_prefix = NULL;
3543
3544         return ret;
3545 }
3546
3547 static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
3548 {
3549         int ret;
3550
3551         /* no need to lock here, as rbd_dev is not registered yet */
3552         ret = rbd_dev_snaps_update(rbd_dev);
3553         if (ret)
3554                 return ret;
3555
3556         ret = rbd_dev_probe_update_spec(rbd_dev);
3557         if (ret)
3558                 goto err_out_snaps;
3559
3560         ret = rbd_dev_set_mapping(rbd_dev);
3561         if (ret)
3562                 goto err_out_snaps;
3563
3564         /* generate unique id: find highest unique id, add one */
3565         rbd_dev_id_get(rbd_dev);
3566
3567         /* Fill in the device name, now that we have its id. */
3568         BUILD_BUG_ON(DEV_NAME_LEN
3569                         < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3570         sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3571
3572         /* Get our block major device number. */
3573
3574         ret = register_blkdev(0, rbd_dev->name);
3575         if (ret < 0)
3576                 goto err_out_id;
3577         rbd_dev->major = ret;
3578
3579         /* Set up the blkdev mapping. */
3580
3581         ret = rbd_init_disk(rbd_dev);
3582         if (ret)
3583                 goto err_out_blkdev;
3584
3585         ret = rbd_bus_add_dev(rbd_dev);
3586         if (ret)
3587                 goto err_out_disk;
3588
3589         /*
3590          * At this point cleanup in the event of an error is the job
3591          * of the sysfs code (initiated by rbd_bus_del_dev()).
3592          */
3593         down_write(&rbd_dev->header_rwsem);
3594         ret = rbd_dev_snaps_register(rbd_dev);
3595         up_write(&rbd_dev->header_rwsem);
3596         if (ret)
3597                 goto err_out_bus;
3598
3599         ret = rbd_init_watch_dev(rbd_dev);
3600         if (ret)
3601                 goto err_out_bus;
3602
3603         /* Everything's ready.  Announce the disk to the world. */
3604
3605         add_disk(rbd_dev->disk);
3606
3607         pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3608                 (unsigned long long) rbd_dev->mapping.size);
3609
3610         return ret;
3611 err_out_bus:
3612         /* this will also clean up rest of rbd_dev stuff */
3613
3614         rbd_bus_del_dev(rbd_dev);
3615
3616         return ret;
3617 err_out_disk:
3618         rbd_free_disk(rbd_dev);
3619 err_out_blkdev:
3620         unregister_blkdev(rbd_dev->major, rbd_dev->name);
3621 err_out_id:
3622         rbd_dev_id_put(rbd_dev);
3623 err_out_snaps:
3624         rbd_remove_all_snaps(rbd_dev);
3625
3626         return ret;
3627 }
3628
3629 /*
3630  * Probe for the existence of the header object for the given rbd
3631  * device.  For format 2 images this includes determining the image
3632  * id.
3633  */
3634 static int rbd_dev_probe(struct rbd_device *rbd_dev)
3635 {
3636         int ret;
3637
3638         /*
3639          * Get the id from the image id object.  If it's not a
3640          * format 2 image, we'll get ENOENT back, and we'll assume
3641          * it's a format 1 image.
3642          */
3643         ret = rbd_dev_image_id(rbd_dev);
3644         if (ret)
3645                 ret = rbd_dev_v1_probe(rbd_dev);
3646         else
3647                 ret = rbd_dev_v2_probe(rbd_dev);
3648         if (ret) {
3649                 dout("probe failed, returning %d\n", ret);
3650
3651                 return ret;
3652         }
3653
3654         ret = rbd_dev_probe_finish(rbd_dev);
3655         if (ret)
3656                 rbd_header_free(&rbd_dev->header);
3657
3658         return ret;
3659 }
3660
3661 static ssize_t rbd_add(struct bus_type *bus,
3662                        const char *buf,
3663                        size_t count)
3664 {
3665         struct rbd_device *rbd_dev = NULL;
3666         struct ceph_options *ceph_opts = NULL;
3667         struct rbd_options *rbd_opts = NULL;
3668         struct rbd_spec *spec = NULL;
3669         struct rbd_client *rbdc;
3670         struct ceph_osd_client *osdc;
3671         int rc = -ENOMEM;
3672
3673         if (!try_module_get(THIS_MODULE))
3674                 return -ENODEV;
3675
3676         /* parse add command */
3677         rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
3678         if (rc < 0)
3679                 goto err_out_module;
3680
3681         rbdc = rbd_get_client(ceph_opts);
3682         if (IS_ERR(rbdc)) {
3683                 rc = PTR_ERR(rbdc);
3684                 goto err_out_args;
3685         }
3686         ceph_opts = NULL;       /* rbd_dev client now owns this */
3687
3688         /* pick the pool */
3689         osdc = &rbdc->client->osdc;
3690         rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
3691         if (rc < 0)
3692                 goto err_out_client;
3693         spec->pool_id = (u64) rc;
3694
3695         rbd_dev = rbd_dev_create(rbdc, spec);
3696         if (!rbd_dev)
3697                 goto err_out_client;
3698         rbdc = NULL;            /* rbd_dev now owns this */
3699         spec = NULL;            /* rbd_dev now owns this */
3700
3701         rbd_dev->mapping.read_only = rbd_opts->read_only;
3702         kfree(rbd_opts);
3703         rbd_opts = NULL;        /* done with this */
3704
3705         rc = rbd_dev_probe(rbd_dev);
3706         if (rc < 0)
3707                 goto err_out_rbd_dev;
3708
3709         return count;
3710 err_out_rbd_dev:
3711         rbd_dev_destroy(rbd_dev);
3712 err_out_client:
3713         rbd_put_client(rbdc);
3714 err_out_args:
3715         if (ceph_opts)
3716                 ceph_destroy_options(ceph_opts);
3717         kfree(rbd_opts);
3718         rbd_spec_put(spec);
3719 err_out_module:
3720         module_put(THIS_MODULE);
3721
3722         dout("Error adding device %s\n", buf);
3723
3724         return (ssize_t) rc;
3725 }
3726
3727 static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
3728 {
3729         struct list_head *tmp;
3730         struct rbd_device *rbd_dev;
3731
3732         spin_lock(&rbd_dev_list_lock);
3733         list_for_each(tmp, &rbd_dev_list) {
3734                 rbd_dev = list_entry(tmp, struct rbd_device, node);
3735                 if (rbd_dev->dev_id == dev_id) {
3736                         spin_unlock(&rbd_dev_list_lock);
3737                         return rbd_dev;
3738                 }
3739         }
3740         spin_unlock(&rbd_dev_list_lock);
3741         return NULL;
3742 }
3743
3744 static void rbd_dev_release(struct device *dev)
3745 {
3746         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3747
3748         if (rbd_dev->watch_request) {
3749                 struct ceph_client *client = rbd_dev->rbd_client->client;
3750
3751                 ceph_osdc_unregister_linger_request(&client->osdc,
3752                                                     rbd_dev->watch_request);
3753         }
3754         if (rbd_dev->watch_event)
3755                 rbd_req_sync_unwatch(rbd_dev);
3756
3757
3758         /* clean up and free blkdev */
3759         rbd_free_disk(rbd_dev);
3760         unregister_blkdev(rbd_dev->major, rbd_dev->name);
3761
3762         /* release allocated disk header fields */
3763         rbd_header_free(&rbd_dev->header);
3764
3765         /* done with the id, and with the rbd_dev */
3766         rbd_dev_id_put(rbd_dev);
3767         rbd_assert(rbd_dev->rbd_client != NULL);
3768         rbd_dev_destroy(rbd_dev);
3769
3770         /* release module ref */
3771         module_put(THIS_MODULE);
3772 }
3773
3774 static ssize_t rbd_remove(struct bus_type *bus,
3775                           const char *buf,
3776                           size_t count)
3777 {
3778         struct rbd_device *rbd_dev = NULL;
3779         int target_id, rc;
3780         unsigned long ul;
3781         int ret = count;
3782
3783         rc = strict_strtoul(buf, 10, &ul);
3784         if (rc)
3785                 return rc;
3786
3787         /* convert to int; abort if we lost anything in the conversion */
3788         target_id = (int) ul;
3789         if (target_id != ul)
3790                 return -EINVAL;
3791
3792         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3793
3794         rbd_dev = __rbd_get_dev(target_id);
3795         if (!rbd_dev) {
3796                 ret = -ENOENT;
3797                 goto done;
3798         }
3799
3800         if (rbd_dev->open_count) {
3801                 ret = -EBUSY;
3802                 goto done;
3803         }
3804
3805         rbd_remove_all_snaps(rbd_dev);
3806         rbd_bus_del_dev(rbd_dev);
3807
3808 done:
3809         mutex_unlock(&ctl_mutex);
3810
3811         return ret;
3812 }
3813
3814 /*
3815  * create control files in sysfs
3816  * /sys/bus/rbd/...
3817  */
3818 static int rbd_sysfs_init(void)
3819 {
3820         int ret;
3821
3822         ret = device_register(&rbd_root_dev);
3823         if (ret < 0)
3824                 return ret;
3825
3826         ret = bus_register(&rbd_bus_type);
3827         if (ret < 0)
3828                 device_unregister(&rbd_root_dev);
3829
3830         return ret;
3831 }
3832
3833 static void rbd_sysfs_cleanup(void)
3834 {
3835         bus_unregister(&rbd_bus_type);
3836         device_unregister(&rbd_root_dev);
3837 }
3838
3839 int __init rbd_init(void)
3840 {
3841         int rc;
3842
3843         rc = rbd_sysfs_init();
3844         if (rc)
3845                 return rc;
3846         pr_info("loaded " RBD_DRV_NAME_LONG "\n");
3847         return 0;
3848 }
3849
3850 void __exit rbd_exit(void)
3851 {
3852         rbd_sysfs_cleanup();
3853 }
3854
3855 module_init(rbd_init);
3856 module_exit(rbd_exit);
3857
3858 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3859 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3860 MODULE_DESCRIPTION("rados block device");
3861
3862 /* following authorship retained from original osdblk.c */
3863 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3864
3865 MODULE_LICENSE("GPL");