rbd: fix type of snap_id in rbd_dev_v2_snap_info()
[deliverable/linux.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
2647ba38 55/* It might be useful to have these defined elsewhere */
df111be6 56
2647ba38
AE
57#define U8_MAX ((u8) (~0U))
58#define U16_MAX ((u16) (~0U))
59#define U32_MAX ((u32) (~0U))
60#define U64_MAX ((u64) (~0ULL))
df111be6 61
f0f8cef5
AE
62#define RBD_DRV_NAME "rbd"
63#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
64
65#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
66
d4b125e9
AE
67#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
68#define RBD_MAX_SNAP_NAME_LEN \
69 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
70
35d489f9 71#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
72#define RBD_MAX_OPT_LEN 1024
73
74#define RBD_SNAP_HEAD_NAME "-"
75
9e15b77d
AE
76/* This allows a single page to hold an image name sent by OSD */
77#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
1e130199 78#define RBD_IMAGE_ID_LEN_MAX 64
9e15b77d 79
1e130199 80#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 81
d889140c
AE
82/* Feature bits */
83
84#define RBD_FEATURE_LAYERING 1
85
86/* Features supported by this (client software) implementation. */
87
88#define RBD_FEATURES_ALL (0)
89
81a89793
AE
90/*
91 * An RBD device name will be "rbd#", where the "rbd" comes from
92 * RBD_DRV_NAME above, and # is a unique integer identifier.
93 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
94 * enough to hold all possible device names.
95 */
602adf40 96#define DEV_NAME_LEN 32
81a89793 97#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 98
cc0538b6 99#define RBD_READ_ONLY_DEFAULT false
59c2be1e 100
602adf40
YS
101/*
102 * block device image metadata (in-memory version)
103 */
104struct rbd_image_header {
f84344f3 105 /* These four fields never change for a given rbd image */
849b4260 106 char *object_prefix;
34b13184 107 u64 features;
602adf40
YS
108 __u8 obj_order;
109 __u8 crypt_type;
110 __u8 comp_type;
602adf40 111
f84344f3
AE
112 /* The remaining fields need to be updated occasionally */
113 u64 image_size;
114 struct ceph_snap_context *snapc;
602adf40
YS
115 char *snap_names;
116 u64 *snap_sizes;
59c2be1e
YS
117
118 u64 obj_version;
119};
120
0d7dbfce
AE
121/*
122 * An rbd image specification.
123 *
124 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
c66c6e0c
AE
125 * identify an image. Each rbd_dev structure includes a pointer to
126 * an rbd_spec structure that encapsulates this identity.
127 *
128 * Each of the id's in an rbd_spec has an associated name. For a
129 * user-mapped image, the names are supplied and the id's associated
130 * with them are looked up. For a layered image, a parent image is
131 * defined by the tuple, and the names are looked up.
132 *
133 * An rbd_dev structure contains a parent_spec pointer which is
134 * non-null if the image it represents is a child in a layered
135 * image. This pointer will refer to the rbd_spec structure used
136 * by the parent rbd_dev for its own identity (i.e., the structure
137 * is shared between the parent and child).
138 *
139 * Since these structures are populated once, during the discovery
140 * phase of image construction, they are effectively immutable so
141 * we make no effort to synchronize access to them.
142 *
143 * Note that code herein does not assume the image name is known (it
144 * could be a null pointer).
0d7dbfce
AE
145 */
146struct rbd_spec {
147 u64 pool_id;
148 char *pool_name;
149
150 char *image_id;
0d7dbfce 151 char *image_name;
0d7dbfce
AE
152
153 u64 snap_id;
154 char *snap_name;
155
156 struct kref kref;
157};
158
59c2be1e 159struct rbd_options {
cc0538b6 160 bool read_only;
602adf40
YS
161};
162
163/*
f0f8cef5 164 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
165 */
166struct rbd_client {
167 struct ceph_client *client;
168 struct kref kref;
169 struct list_head node;
170};
171
172/*
f0f8cef5 173 * a request completion status
602adf40 174 */
1fec7093
YS
175struct rbd_req_status {
176 int done;
8986cb37 177 s32 rc;
1fec7093
YS
178 u64 bytes;
179};
180
181/*
182 * a collection of requests
183 */
184struct rbd_req_coll {
185 int total;
186 int num_done;
187 struct kref kref;
188 struct rbd_req_status status[0];
602adf40
YS
189};
190
f0f8cef5
AE
191/*
192 * a single io request
193 */
194struct rbd_request {
195 struct request *rq; /* blk layer request */
196 struct bio *bio; /* cloned bio */
197 struct page **pages; /* list of used pages */
198 u64 len;
199 int coll_index;
200 struct rbd_req_coll *coll;
201};
202
dfc5606d
YS
203struct rbd_snap {
204 struct device dev;
205 const char *name;
3591538f 206 u64 size;
dfc5606d
YS
207 struct list_head node;
208 u64 id;
34b13184 209 u64 features;
dfc5606d
YS
210};
211
f84344f3 212struct rbd_mapping {
99c1f08f 213 u64 size;
34b13184 214 u64 features;
f84344f3
AE
215 bool read_only;
216};
217
602adf40
YS
218/*
219 * a single device
220 */
221struct rbd_device {
de71a297 222 int dev_id; /* blkdev unique id */
602adf40
YS
223
224 int major; /* blkdev assigned major */
225 struct gendisk *disk; /* blkdev's gendisk and rq */
602adf40 226
a30b71b9 227 u32 image_format; /* Either 1 or 2 */
602adf40
YS
228 struct rbd_client *rbd_client;
229
230 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
231
232 spinlock_t lock; /* queue lock */
233
234 struct rbd_image_header header;
d78b650a 235 atomic_t exists;
0d7dbfce 236 struct rbd_spec *spec;
602adf40 237
0d7dbfce 238 char *header_name;
971f839a 239
0903e875
AE
240 struct ceph_file_layout layout;
241
59c2be1e
YS
242 struct ceph_osd_event *watch_event;
243 struct ceph_osd_request *watch_request;
244
86b00e0d
AE
245 struct rbd_spec *parent_spec;
246 u64 parent_overlap;
247
c666601a
JD
248 /* protects updating the header */
249 struct rw_semaphore header_rwsem;
f84344f3
AE
250
251 struct rbd_mapping mapping;
602adf40
YS
252
253 struct list_head node;
dfc5606d
YS
254
255 /* list of snapshots */
256 struct list_head snaps;
257
258 /* sysfs related */
259 struct device dev;
42382b70 260 unsigned long open_count;
dfc5606d
YS
261};
262
602adf40 263static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 264
602adf40 265static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
266static DEFINE_SPINLOCK(rbd_dev_list_lock);
267
432b8587
AE
268static LIST_HEAD(rbd_client_list); /* clients */
269static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 270
304f6808
AE
271static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
272static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
273
dfc5606d 274static void rbd_dev_release(struct device *dev);
41f38c2b 275static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 276
f0f8cef5
AE
277static ssize_t rbd_add(struct bus_type *bus, const char *buf,
278 size_t count);
279static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
280 size_t count);
281
282static struct bus_attribute rbd_bus_attrs[] = {
283 __ATTR(add, S_IWUSR, NULL, rbd_add),
284 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
285 __ATTR_NULL
286};
287
288static struct bus_type rbd_bus_type = {
289 .name = "rbd",
290 .bus_attrs = rbd_bus_attrs,
291};
292
/* Release callback for rbd_root_dev; it is static, so nothing to free. */
293static void rbd_root_dev_release(struct device *dev)
294{
295}
296
297static struct device rbd_root_dev = {
298 .init_name = "rbd",
299 .release = rbd_root_dev_release,
300};
301
06ecc6cb
AE
302static __printf(2, 3)
303void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
304{
305 struct va_format vaf;
306 va_list args;
307
308 va_start(args, fmt);
309 vaf.fmt = fmt;
310 vaf.va = &args;
311
312 if (!rbd_dev)
313 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
314 else if (rbd_dev->disk)
315 printk(KERN_WARNING "%s: %s: %pV\n",
316 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
317 else if (rbd_dev->spec && rbd_dev->spec->image_name)
318 printk(KERN_WARNING "%s: image %s: %pV\n",
319 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
320 else if (rbd_dev->spec && rbd_dev->spec->image_id)
321 printk(KERN_WARNING "%s: id %s: %pV\n",
322 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
323 else /* punt */
324 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
325 RBD_DRV_NAME, rbd_dev, &vaf);
326 va_end(args);
327}
328
aafb230e
AE
329#ifdef RBD_DEBUG
330#define rbd_assert(expr) \
331 if (unlikely(!(expr))) { \
332 printk(KERN_ERR "\nAssertion failure in %s() " \
333 "at line %d:\n\n" \
334 "\trbd_assert(%s);\n\n", \
335 __func__, __LINE__, #expr); \
336 BUG(); \
337 }
338#else /* !RBD_DEBUG */
339# define rbd_assert(expr) ((void) 0)
340#endif /* !RBD_DEBUG */
dfc5606d 341
117973fb
AE
342static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
343static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 344
602adf40
YS
345static int rbd_open(struct block_device *bdev, fmode_t mode)
346{
f0f8cef5 347 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
602adf40 348
f84344f3 349 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
602adf40
YS
350 return -EROFS;
351
42382b70 352 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
c3e946ce 353 (void) get_device(&rbd_dev->dev);
f84344f3 354 set_device_ro(bdev, rbd_dev->mapping.read_only);
42382b70
AE
355 rbd_dev->open_count++;
356 mutex_unlock(&ctl_mutex);
340c7a2b 357
602adf40
YS
358 return 0;
359}
360
dfc5606d
YS
361static int rbd_release(struct gendisk *disk, fmode_t mode)
362{
363 struct rbd_device *rbd_dev = disk->private_data;
364
42382b70
AE
365 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
366 rbd_assert(rbd_dev->open_count > 0);
367 rbd_dev->open_count--;
c3e946ce 368 put_device(&rbd_dev->dev);
42382b70 369 mutex_unlock(&ctl_mutex);
dfc5606d
YS
370
371 return 0;
372}
373
602adf40
YS
374static const struct block_device_operations rbd_bd_ops = {
375 .owner = THIS_MODULE,
376 .open = rbd_open,
dfc5606d 377 .release = rbd_release,
602adf40
YS
378};
379
380/*
381 * Initialize an rbd client instance.
43ae4701 382 * We own *ceph_opts.
602adf40 383 */
f8c38929 384static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
385{
386 struct rbd_client *rbdc;
387 int ret = -ENOMEM;
388
389 dout("rbd_client_create\n");
390 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
391 if (!rbdc)
392 goto out_opt;
393
394 kref_init(&rbdc->kref);
395 INIT_LIST_HEAD(&rbdc->node);
396
bc534d86
AE
397 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
398
43ae4701 399 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 400 if (IS_ERR(rbdc->client))
bc534d86 401 goto out_mutex;
43ae4701 402 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
403
404 ret = ceph_open_session(rbdc->client);
405 if (ret < 0)
406 goto out_err;
407
432b8587 408 spin_lock(&rbd_client_list_lock);
602adf40 409 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 410 spin_unlock(&rbd_client_list_lock);
602adf40 411
bc534d86
AE
412 mutex_unlock(&ctl_mutex);
413
602adf40
YS
414 dout("rbd_client_create created %p\n", rbdc);
415 return rbdc;
416
417out_err:
418 ceph_destroy_client(rbdc->client);
bc534d86
AE
419out_mutex:
420 mutex_unlock(&ctl_mutex);
602adf40
YS
421 kfree(rbdc);
422out_opt:
43ae4701
AE
423 if (ceph_opts)
424 ceph_destroy_options(ceph_opts);
28f259b7 425 return ERR_PTR(ret);
602adf40
YS
426}
427
428/*
1f7ba331
AE
429 * Find a ceph client with specific addr and configuration. If
430 * found, bump its reference count.
602adf40 431 */
1f7ba331 432static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
433{
434 struct rbd_client *client_node;
1f7ba331 435 bool found = false;
602adf40 436
43ae4701 437 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
438 return NULL;
439
1f7ba331
AE
440 spin_lock(&rbd_client_list_lock);
441 list_for_each_entry(client_node, &rbd_client_list, node) {
442 if (!ceph_compare_options(ceph_opts, client_node->client)) {
443 kref_get(&client_node->kref);
444 found = true;
445 break;
446 }
447 }
448 spin_unlock(&rbd_client_list_lock);
449
450 return found ? client_node : NULL;
602adf40
YS
451}
452
59c2be1e
YS
453/*
454 * mount options
455 */
456enum {
59c2be1e
YS
457 Opt_last_int,
458 /* int args above */
459 Opt_last_string,
460 /* string args above */
cc0538b6
AE
461 Opt_read_only,
462 Opt_read_write,
463 /* Boolean args above */
464 Opt_last_bool,
59c2be1e
YS
465};
466
43ae4701 467static match_table_t rbd_opts_tokens = {
59c2be1e
YS
468 /* int args above */
469 /* string args above */
be466c1c 470 {Opt_read_only, "read_only"},
cc0538b6
AE
471 {Opt_read_only, "ro"}, /* Alternate spelling */
472 {Opt_read_write, "read_write"},
473 {Opt_read_write, "rw"}, /* Alternate spelling */
474 /* Boolean args above */
59c2be1e
YS
475 {-1, NULL}
476};
477
478static int parse_rbd_opts_token(char *c, void *private)
479{
43ae4701 480 struct rbd_options *rbd_opts = private;
59c2be1e
YS
481 substring_t argstr[MAX_OPT_ARGS];
482 int token, intval, ret;
483
43ae4701 484 token = match_token(c, rbd_opts_tokens, argstr);
59c2be1e
YS
485 if (token < 0)
486 return -EINVAL;
487
488 if (token < Opt_last_int) {
489 ret = match_int(&argstr[0], &intval);
490 if (ret < 0) {
491 pr_err("bad mount option arg (not int) "
492 "at '%s'\n", c);
493 return ret;
494 }
495 dout("got int token %d val %d\n", token, intval);
496 } else if (token > Opt_last_int && token < Opt_last_string) {
497 dout("got string token %d val %s\n", token,
498 argstr[0].from);
cc0538b6
AE
499 } else if (token > Opt_last_string && token < Opt_last_bool) {
500 dout("got Boolean token %d\n", token);
59c2be1e
YS
501 } else {
502 dout("got token %d\n", token);
503 }
504
505 switch (token) {
cc0538b6
AE
506 case Opt_read_only:
507 rbd_opts->read_only = true;
508 break;
509 case Opt_read_write:
510 rbd_opts->read_only = false;
511 break;
59c2be1e 512 default:
aafb230e
AE
513 rbd_assert(false);
514 break;
59c2be1e
YS
515 }
516 return 0;
517}
518
602adf40
YS
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ownership of *ceph_opts is taken
 * care of here.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (rbdc) {
		/* Reusing an existing client; the options aren't needed */
		ceph_destroy_options(ceph_opts);
		return rbdc;
	}

	return rbd_client_create(ceph_opts);
}
535
536/*
537 * Destroy ceph client
d23a4b3f 538 *
432b8587 539 * Caller must hold rbd_client_list_lock.
602adf40
YS
540 */
541static void rbd_client_release(struct kref *kref)
542{
543 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
544
545 dout("rbd_release_client %p\n", rbdc);
cd9d9f5d 546 spin_lock(&rbd_client_list_lock);
602adf40 547 list_del(&rbdc->node);
cd9d9f5d 548 spin_unlock(&rbd_client_list_lock);
602adf40
YS
549
550 ceph_destroy_client(rbdc->client);
551 kfree(rbdc);
552}
553
554/*
555 * Drop reference to ceph client node. If it's not referenced anymore, release
556 * it.
557 */
9d3997fd 558static void rbd_put_client(struct rbd_client *rbdc)
602adf40 559{
c53d5893
AE
560 if (rbdc)
561 kref_put(&rbdc->kref, rbd_client_release);
602adf40
YS
562}
563
1fec7093
YS
564/*
565 * Destroy requests collection
566 */
567static void rbd_coll_release(struct kref *kref)
568{
569 struct rbd_req_coll *coll =
570 container_of(kref, struct rbd_req_coll, kref);
571
572 dout("rbd_coll_release %p\n", coll);
573 kfree(coll);
574}
602adf40 575
a30b71b9
AE
576static bool rbd_image_format_valid(u32 image_format)
577{
578 return image_format == 1 || image_format == 2;
579}
580
8e94af8e
AE
581static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
582{
103a150f
AE
583 size_t size;
584 u32 snap_count;
585
586 /* The header has to start with the magic rbd header text */
587 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
588 return false;
589
db2388b6
AE
590 /* The bio layer requires at least sector-sized I/O */
591
592 if (ondisk->options.order < SECTOR_SHIFT)
593 return false;
594
595 /* If we use u64 in a few spots we may be able to loosen this */
596
597 if (ondisk->options.order > 8 * sizeof (int) - 1)
598 return false;
599
103a150f
AE
600 /*
601 * The size of a snapshot header has to fit in a size_t, and
602 * that limits the number of snapshots.
603 */
604 snap_count = le32_to_cpu(ondisk->snap_count);
605 size = SIZE_MAX - sizeof (struct ceph_snap_context);
606 if (snap_count > size / sizeof (__le64))
607 return false;
608
609 /*
610 * Not only that, but the size of the entire the snapshot
611 * header must also be representable in a size_t.
612 */
613 size -= snap_count * sizeof (__le64);
614 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
615 return false;
616
617 return true;
8e94af8e
AE
618}
619
602adf40
YS
620/*
621 * Create a new header structure, translate header format from the on-disk
622 * header.
623 */
624static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 625 struct rbd_image_header_ondisk *ondisk)
602adf40 626{
ccece235 627 u32 snap_count;
58c17b0e 628 size_t len;
d2bb24e5 629 size_t size;
621901d6 630 u32 i;
602adf40 631
6a52325f
AE
632 memset(header, 0, sizeof (*header));
633
103a150f
AE
634 snap_count = le32_to_cpu(ondisk->snap_count);
635
58c17b0e
AE
636 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
637 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 638 if (!header->object_prefix)
602adf40 639 return -ENOMEM;
58c17b0e
AE
640 memcpy(header->object_prefix, ondisk->object_prefix, len);
641 header->object_prefix[len] = '\0';
00f1f36f 642
602adf40 643 if (snap_count) {
f785cc1d
AE
644 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
645
621901d6
AE
646 /* Save a copy of the snapshot names */
647
f785cc1d
AE
648 if (snap_names_len > (u64) SIZE_MAX)
649 return -EIO;
650 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 651 if (!header->snap_names)
6a52325f 652 goto out_err;
f785cc1d
AE
653 /*
654 * Note that rbd_dev_v1_header_read() guarantees
655 * the ondisk buffer we're working with has
656 * snap_names_len bytes beyond the end of the
657 * snapshot id array, this memcpy() is safe.
658 */
659 memcpy(header->snap_names, &ondisk->snaps[snap_count],
660 snap_names_len);
6a52325f 661
621901d6
AE
662 /* Record each snapshot's size */
663
d2bb24e5
AE
664 size = snap_count * sizeof (*header->snap_sizes);
665 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 666 if (!header->snap_sizes)
6a52325f 667 goto out_err;
621901d6
AE
668 for (i = 0; i < snap_count; i++)
669 header->snap_sizes[i] =
670 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 671 } else {
ccece235 672 WARN_ON(ondisk->snap_names_len);
602adf40
YS
673 header->snap_names = NULL;
674 header->snap_sizes = NULL;
675 }
849b4260 676
34b13184 677 header->features = 0; /* No features support in v1 images */
602adf40
YS
678 header->obj_order = ondisk->options.order;
679 header->crypt_type = ondisk->options.crypt_type;
680 header->comp_type = ondisk->options.comp_type;
6a52325f 681
621901d6
AE
682 /* Allocate and fill in the snapshot context */
683
f84344f3 684 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
685 size = sizeof (struct ceph_snap_context);
686 size += snap_count * sizeof (header->snapc->snaps[0]);
687 header->snapc = kzalloc(size, GFP_KERNEL);
688 if (!header->snapc)
689 goto out_err;
602adf40
YS
690
691 atomic_set(&header->snapc->nref, 1);
505cbb9b 692 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 693 header->snapc->num_snaps = snap_count;
621901d6
AE
694 for (i = 0; i < snap_count; i++)
695 header->snapc->snaps[i] =
696 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
697
698 return 0;
699
6a52325f 700out_err:
849b4260 701 kfree(header->snap_sizes);
ccece235 702 header->snap_sizes = NULL;
602adf40 703 kfree(header->snap_names);
ccece235 704 header->snap_names = NULL;
6a52325f
AE
705 kfree(header->object_prefix);
706 header->object_prefix = NULL;
ccece235 707
00f1f36f 708 return -ENOMEM;
602adf40
YS
709}
710
9e15b77d
AE
711static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
712{
713 struct rbd_snap *snap;
714
715 if (snap_id == CEPH_NOSNAP)
716 return RBD_SNAP_HEAD_NAME;
717
718 list_for_each_entry(snap, &rbd_dev->snaps, node)
719 if (snap_id == snap->id)
720 return snap->name;
721
722 return NULL;
723}
724
8836b995 725static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 726{
602adf40 727
e86924a8 728 struct rbd_snap *snap;
602adf40 729
e86924a8
AE
730 list_for_each_entry(snap, &rbd_dev->snaps, node) {
731 if (!strcmp(snap_name, snap->name)) {
0d7dbfce 732 rbd_dev->spec->snap_id = snap->id;
e86924a8 733 rbd_dev->mapping.size = snap->size;
34b13184 734 rbd_dev->mapping.features = snap->features;
602adf40 735
e86924a8 736 return 0;
00f1f36f 737 }
00f1f36f 738 }
e86924a8 739
00f1f36f 740 return -ENOENT;
602adf40
YS
741}
742
819d52bf 743static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
602adf40 744{
78dc447d 745 int ret;
602adf40 746
0d7dbfce 747 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 748 sizeof (RBD_SNAP_HEAD_NAME))) {
0d7dbfce 749 rbd_dev->spec->snap_id = CEPH_NOSNAP;
99c1f08f 750 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 751 rbd_dev->mapping.features = rbd_dev->header.features;
e86924a8 752 ret = 0;
602adf40 753 } else {
0d7dbfce 754 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
602adf40
YS
755 if (ret < 0)
756 goto done;
f84344f3 757 rbd_dev->mapping.read_only = true;
602adf40 758 }
d78b650a 759 atomic_set(&rbd_dev->exists, 1);
602adf40 760done:
602adf40
YS
761 return ret;
762}
763
764static void rbd_header_free(struct rbd_image_header *header)
765{
849b4260 766 kfree(header->object_prefix);
d78fd7ae 767 header->object_prefix = NULL;
602adf40 768 kfree(header->snap_sizes);
d78fd7ae 769 header->snap_sizes = NULL;
849b4260 770 kfree(header->snap_names);
d78fd7ae 771 header->snap_names = NULL;
d1d25646 772 ceph_put_snap_context(header->snapc);
d78fd7ae 773 header->snapc = NULL;
602adf40
YS
774}
775
65ccfe21 776static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 777{
65ccfe21
AE
778 char *name;
779 u64 segment;
780 int ret;
602adf40 781
2fd82b9e 782 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
65ccfe21
AE
783 if (!name)
784 return NULL;
785 segment = offset >> rbd_dev->header.obj_order;
2fd82b9e 786 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
65ccfe21 787 rbd_dev->header.object_prefix, segment);
2fd82b9e 788 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
65ccfe21
AE
789 pr_err("error formatting segment name for #%llu (%d)\n",
790 segment, ret);
791 kfree(name);
792 name = NULL;
793 }
602adf40 794
65ccfe21
AE
795 return name;
796}
602adf40 797
65ccfe21
AE
798static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
799{
800 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 801
65ccfe21
AE
802 return offset & (segment_size - 1);
803}
804
805static u64 rbd_segment_length(struct rbd_device *rbd_dev,
806 u64 offset, u64 length)
807{
808 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
809
810 offset &= segment_size - 1;
811
aafb230e 812 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
813 if (offset + length > segment_size)
814 length = segment_size - offset;
815
816 return length;
602adf40
YS
817}
818
1fec7093
YS
819static int rbd_get_num_segments(struct rbd_image_header *header,
820 u64 ofs, u64 len)
821{
df111be6
AE
822 u64 start_seg;
823 u64 end_seg;
824
825 if (!len)
826 return 0;
827 if (len - 1 > U64_MAX - ofs)
828 return -ERANGE;
829
830 start_seg = ofs >> header->obj_order;
831 end_seg = (ofs + len - 1) >> header->obj_order;
832
1fec7093
YS
833 return end_seg - start_seg + 1;
834}
835
029bcbd8
JD
836/*
837 * returns the size of an object in the image
838 */
839static u64 rbd_obj_bytes(struct rbd_image_header *header)
840{
841 return 1 << header->obj_order;
842}
843
602adf40
YS
844/*
845 * bio helpers
846 */
847
848static void bio_chain_put(struct bio *chain)
849{
850 struct bio *tmp;
851
852 while (chain) {
853 tmp = chain;
854 chain = chain->bi_next;
855 bio_put(tmp);
856 }
857}
858
859/*
860 * zeros a bio chain, starting at specific offset
861 */
862static void zero_bio_chain(struct bio *chain, int start_ofs)
863{
864 struct bio_vec *bv;
865 unsigned long flags;
866 void *buf;
867 int i;
868 int pos = 0;
869
870 while (chain) {
871 bio_for_each_segment(bv, chain, i) {
872 if (pos + bv->bv_len > start_ofs) {
873 int remainder = max(start_ofs - pos, 0);
874 buf = bvec_kmap_irq(bv, &flags);
875 memset(buf + remainder, 0,
876 bv->bv_len - remainder);
85b5aaa6 877 bvec_kunmap_irq(buf, &flags);
602adf40
YS
878 }
879 pos += bv->bv_len;
880 }
881
882 chain = chain->bi_next;
883 }
884}
885
886/*
f7760dad
AE
887 * Clone a portion of a bio, starting at the given byte offset
888 * and continuing for the number of bytes indicated.
602adf40 889 */
f7760dad
AE
890static struct bio *bio_clone_range(struct bio *bio_src,
891 unsigned int offset,
892 unsigned int len,
893 gfp_t gfpmask)
602adf40 894{
f7760dad
AE
895 struct bio_vec *bv;
896 unsigned int resid;
897 unsigned short idx;
898 unsigned int voff;
899 unsigned short end_idx;
900 unsigned short vcnt;
901 struct bio *bio;
902
903 /* Handle the easy case for the caller */
904
905 if (!offset && len == bio_src->bi_size)
906 return bio_clone(bio_src, gfpmask);
907
908 if (WARN_ON_ONCE(!len))
909 return NULL;
910 if (WARN_ON_ONCE(len > bio_src->bi_size))
911 return NULL;
912 if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
913 return NULL;
914
915 /* Find first affected segment... */
916
917 resid = offset;
918 __bio_for_each_segment(bv, bio_src, idx, 0) {
919 if (resid < bv->bv_len)
920 break;
921 resid -= bv->bv_len;
602adf40 922 }
f7760dad 923 voff = resid;
602adf40 924
f7760dad 925 /* ...and the last affected segment */
602adf40 926
f7760dad
AE
927 resid += len;
928 __bio_for_each_segment(bv, bio_src, end_idx, idx) {
929 if (resid <= bv->bv_len)
930 break;
931 resid -= bv->bv_len;
932 }
933 vcnt = end_idx - idx + 1;
934
935 /* Build the clone */
936
937 bio = bio_alloc(gfpmask, (unsigned int) vcnt);
938 if (!bio)
939 return NULL; /* ENOMEM */
602adf40 940
f7760dad
AE
941 bio->bi_bdev = bio_src->bi_bdev;
942 bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
943 bio->bi_rw = bio_src->bi_rw;
944 bio->bi_flags |= 1 << BIO_CLONED;
945
946 /*
947 * Copy over our part of the bio_vec, then update the first
948 * and last (or only) entries.
949 */
950 memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
951 vcnt * sizeof (struct bio_vec));
952 bio->bi_io_vec[0].bv_offset += voff;
953 if (vcnt > 1) {
954 bio->bi_io_vec[0].bv_len -= voff;
955 bio->bi_io_vec[vcnt - 1].bv_len = resid;
956 } else {
957 bio->bi_io_vec[0].bv_len = len;
602adf40
YS
958 }
959
f7760dad
AE
960 bio->bi_vcnt = vcnt;
961 bio->bi_size = len;
962 bio->bi_idx = 0;
963
964 return bio;
965}
966
967/*
968 * Clone a portion of a bio chain, starting at the given byte offset
969 * into the first bio in the source chain and continuing for the
970 * number of bytes indicated. The result is another bio chain of
971 * exactly the given length, or a null pointer on error.
972 *
973 * The bio_src and offset parameters are both in-out. On entry they
974 * refer to the first source bio and the offset into that bio where
975 * the start of data to be cloned is located.
976 *
977 * On return, bio_src is updated to refer to the bio in the source
978 * chain that contains first un-cloned byte, and *offset will
979 * contain the offset of that byte within that bio.
980 */
981static struct bio *bio_chain_clone_range(struct bio **bio_src,
982 unsigned int *offset,
983 unsigned int len,
984 gfp_t gfpmask)
985{
986 struct bio *bi = *bio_src;
987 unsigned int off = *offset;
988 struct bio *chain = NULL;
989 struct bio **end;
990
991 /* Build up a chain of clone bios up to the limit */
992
993 if (!bi || off >= bi->bi_size || !len)
994 return NULL; /* Nothing to clone */
602adf40 995
f7760dad
AE
996 end = &chain;
997 while (len) {
998 unsigned int bi_size;
999 struct bio *bio;
1000
f5400b7a
AE
1001 if (!bi) {
1002 rbd_warn(NULL, "bio_chain exhausted with %u left", len);
f7760dad 1003 goto out_err; /* EINVAL; ran out of bio's */
f5400b7a 1004 }
f7760dad
AE
1005 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1006 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1007 if (!bio)
1008 goto out_err; /* ENOMEM */
1009
1010 *end = bio;
1011 end = &bio->bi_next;
602adf40 1012
f7760dad
AE
1013 off += bi_size;
1014 if (off == bi->bi_size) {
1015 bi = bi->bi_next;
1016 off = 0;
1017 }
1018 len -= bi_size;
1019 }
1020 *bio_src = bi;
1021 *offset = off;
1022
1023 return chain;
1024out_err:
1025 bio_chain_put(chain);
602adf40 1026
602adf40
YS
1027 return NULL;
1028}
1029
8d23bf29
AE
/*
 * Allocate and initialize an osd request op for the given opcode.
 * The variable arguments supply the op-specific fields; the comment
 * in each case documents the expected calling convention.
 *
 * Returns the new op (free with rbd_osd_req_op_destroy()), or NULL
 * on allocation failure or unsupported opcode.
 */
struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
{
	struct ceph_osd_req_op *op;
	va_list args;
	size_t size;

	op = kzalloc(sizeof (*op), GFP_NOIO);
	if (!op)
		return NULL;
	op->op = opcode;
	va_start(args, opcode);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
		/* rbd_osd_req_op_create(READ, offset, length) */
		/* rbd_osd_req_op_create(WRITE, offset, length) */
		op->extent.offset = va_arg(args, u64);
		op->extent.length = va_arg(args, u64);
		/* Only a write carries outbound data */
		if (opcode == CEPH_OSD_OP_WRITE)
			op->payload_len = op->extent.length;
		break;
	case CEPH_OSD_OP_CALL:
		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
		op->cls.class_name = va_arg(args, char *);
		size = strlen(op->cls.class_name);
		rbd_assert(size <= (size_t) U8_MAX);	/* class_len is a u8 */
		op->cls.class_len = size;
		op->payload_len = size;

		op->cls.method_name = va_arg(args, char *);
		size = strlen(op->cls.method_name);
		rbd_assert(size <= (size_t) U8_MAX);	/* method_len is a u8 */
		op->cls.method_len = size;
		op->payload_len += size;

		op->cls.argc = 0;
		op->cls.indata = va_arg(args, void *);
		size = va_arg(args, size_t);
		rbd_assert(size <= (size_t) U32_MAX);	/* indata_len is a u32 */
		op->cls.indata_len = (u32) size;
		/* payload = class name + method name + input data */
		op->payload_len += size;
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		/* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
		/* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
		op->watch.cookie = va_arg(args, u64);
		op->watch.ver = va_arg(args, u64);
		/* watch.ver is stored little-endian (wire format) */
		op->watch.ver = cpu_to_le64(op->watch.ver);
		if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
			op->watch.flag = (u8) 1;
		break;
	default:
		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
		kfree(op);
		op = NULL;
		break;
	}
	va_end(args);

	return op;
}
1092
/* Free an op allocated by rbd_osd_req_op_create(); NULL is a no-op. */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1097
1fec7093
YS
/*
 * Record completion status for one request in a collection and end
 * as many consecutive completed sub-requests as possible, in order.
 * With no collection (coll == NULL) the whole request is ended at
 * once; with no request (rq == NULL) there is nothing to complete.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   s32 ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, (int)ret, (unsigned long long)len);

	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* status[] and num_done are protected by the queue lock */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* Find the run of finished entries starting at num_done */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, (int)coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* Drop the reference taken when the sub-request was issued */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1135
/* Complete one rbd_request via its collection bookkeeping. */
static void rbd_coll_end_req(struct rbd_request *rbd_req,
			     s32 ret, u64 len)
{
	rbd_coll_end_req_index(rbd_req->rq,
				rbd_req->coll, rbd_req->coll_index,
				ret, len);
}
1143
602adf40
YS
/*
 * Send ceph osd request
 *
 * Builds and submits one OSD request against @object_name.  Data can
 * be carried either as a bio chain (@bio) or a page vector (@pages).
 * If @rbd_cb is NULL the call is synchronous: we wait for completion
 * and optionally return the reassert version through @ver.  With a
 * callback the request completes asynchronously and the callback is
 * responsible for dropping the osd request reference.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *op,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *,
					 struct ceph_msg *),
			  u64 *ver)
{
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct rbd_request *rbd_req = NULL;
	struct timespec mtime = CURRENT_TIME;
	int ret;

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
	if (!osd_req)
		return -ENOMEM;

	osd_req->r_flags = flags;
	osd_req->r_pages = pages;
	if (bio) {
		/* Request holds its own reference on the bio chain */
		osd_req->r_bio = bio;
		bio_get(osd_req->r_bio);
	}

	/* Collection bookkeeping is only needed for block-layer requests */
	if (coll) {
		ret = -ENOMEM;
		rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
		if (!rbd_req)
			goto done_osd_req;

		rbd_req->rq = rq;
		rbd_req->bio = bio;
		rbd_req->pages = pages;
		rbd_req->len = len;
		rbd_req->coll = coll;
		rbd_req->coll_index = coll_index;
	}

	osd_req->r_callback = rbd_cb;
	osd_req->r_priv = rbd_req;

	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
	osd_req->r_oid_len = strlen(osd_req->r_oid);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
	osd_req->r_num_pages = calc_pages_for(ofs, len);
	osd_req->r_page_alignment = ofs & ~PAGE_MASK;

	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
				snapc, snapid, &mtime);

	/* A watch request must persist so the OSD can notify us */
	if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) {
		ceph_osdc_set_request_linger(osdc, osd_req);
		rbd_dev->watch_request = osd_req;
	}

	ret = ceph_osdc_start_request(osdc, osd_req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait and report the reassert version */
		u64 version;

		ret = ceph_osdc_wait_request(osdc, osd_req);
		version = le64_to_cpu(osd_req->r_reassert_version.version);
		if (ver)
			*ver = version;
		dout("reassert_ver=%llu\n", (unsigned long long) version);
		ceph_osdc_put_request(osd_req);
	}
	return ret;

done_err:
	if (bio)
		bio_chain_put(osd_req->r_bio);
	kfree(rbd_req);
done_osd_req:
	ceph_osdc_put_request(osd_req);

	return ret;
}
1242
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous block I/O requests.  Translates
 * the OSD reply into block-layer completion status: a read of a
 * nonexistent object (ENOENT) and a short read are both reported as
 * success with the unread tail of the bio chain zero-filled, matching
 * sparse-object semantics.
 */
static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
{
	struct rbd_request *rbd_req = osd_req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* ops array follows the header */
	rc = (s32)le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == (s32)-ENOENT && read_op) {
		/* Object doesn't exist: read back all zeroes */
		zero_bio_chain(rbd_req->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < rbd_req->len) {
		/* Short read: zero the remainder */
		zero_bio_chain(rbd_req->bio, bytes);
		bytes = rbd_req->len;
	}

	rbd_coll_end_req(rbd_req, rc, bytes);

	if (rbd_req->bio)
		bio_chain_put(rbd_req->bio);

	ceph_osdc_put_request(osd_req);
	kfree(rbd_req);
}
1282
5f29ddd4
AE
/*
 * Minimal completion callback: just drop the request reference.
 * Used for fire-and-forget requests (e.g. notify acks).
 */
static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
			      struct ceph_msg *msg)
{
	ceph_osdc_put_request(osd_req);
}
1288
602adf40
YS
/*
 * Do a synchronous ceph osd operation
 *
 * Issues @op against @object_name and waits for it to complete.
 * For read-type ops, up to @inbound_size bytes of reply data are
 * copied into @inbound (if non-NULL).  Returns the number of bytes
 * transferred (from rbd_do_request) or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   int flags,
			   struct ceph_osd_req_op *op,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(op != NULL);

	/* Reply data lands in a temporary page vector */
	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* rbd_cb == NULL makes rbd_do_request synchronous */
	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  op,
			  NULL, 0,
			  NULL,
			  ver);
	if (ret < 0)
		goto done;

	/* ret is the byte count actually returned by the OSD */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1329
/*
 * Do an asynchronous ceph osd operation
 *
 * Issues the read or write described by block request @rq for the
 * single object segment containing image offset @ofs.  The caller
 * has already split the I/O so that [ofs, ofs+len) never crosses an
 * object boundary.  Completion is reported through @coll/@coll_index
 * by rbd_req_cb().
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *op;
	int opcode;
	int flags;
	u64 snapid;

	/* Map the image offset to an object name and intra-object range */
	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	if (rq_data_dir(rq) == WRITE) {
		opcode = CEPH_OSD_OP_WRITE;
		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
		snapid = CEPH_NOSNAP;	/* writes always go to the head */
	} else {
		opcode = CEPH_OSD_OP_READ;
		flags = CEPH_OSD_FLAG_READ;
		rbd_assert(!snapc);	/* reads carry no snap context */
		snapid = rbd_dev->spec->snap_id;
	}

	ret = -ENOMEM;
	op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len);
	if (!op)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     op,
			     coll, coll_index,
			     rbd_req_cb, NULL);
	/* On submission failure, complete this collection slot with the error */
	if (ret < 0)
		rbd_coll_end_req_index(rq, coll, coll_index,
					(s32)ret, seg_len);
	rbd_osd_req_op_destroy(op);
done:
	kfree(seg_name);
	return ret;
}
1393
602adf40
YS
/*
 * Request sync osd read
 *
 * Synchronously read @len bytes at offset @ofs of @object_name into
 * @buf.  If @ver is non-NULL it receives the object version.
 * Returns the number of bytes read or a negative errno.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			  const char *object_name,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	struct ceph_osd_req_op *op;
	int ret;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
	if (!op)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
			       op, object_name, ofs, len, buf, ver);
	rbd_osd_req_op_destroy(op);

	return ret;
}
1416
1417/*
59c2be1e
YS
1418 * Request sync osd watch
1419 */
/*
 * Acknowledge a watch notification on the header object so the OSD
 * stops re-sending it.  Fire-and-forget: completion is handled by
 * rbd_simple_req_cb(), which just drops the request.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *op;
	int ret;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
	if (!op)
		return -ENOMEM;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  op,
			  NULL, 0,
			  rbd_simple_req_cb, NULL);

	rbd_osd_req_op_destroy(op);

	return ret;
}
1443
1444static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1445{
0ce1a794 1446 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1447 u64 hver;
13143d2d
SW
1448 int rc;
1449
0ce1a794 1450 if (!rbd_dev)
59c2be1e
YS
1451 return;
1452
bd919d45
AE
1453 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1454 rbd_dev->header_name, (unsigned long long) notify_id,
1455 (unsigned int) opcode);
117973fb 1456 rc = rbd_dev_refresh(rbd_dev, &hver);
13143d2d 1457 if (rc)
06ecc6cb
AE
1458 rbd_warn(rbd_dev, "got notification but failed to "
1459 " update snaps: %d\n", rc);
59c2be1e 1460
7f0a24d8 1461 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1462}
1463
1464/*
907703d0
AE
1465 * Request sync osd watch/unwatch. The value of "start" determines
1466 * whether a watch request is being initiated or torn down.
59c2be1e 1467 */
907703d0 1468static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
59c2be1e 1469{
5efea49a
AE
1470 struct ceph_osd_req_op *op;
1471 int ret = 0;
59c2be1e 1472
907703d0
AE
1473 if (start) {
1474 struct ceph_osd_client *osdc;
79e3057c 1475
907703d0
AE
1476 osdc = &rbd_dev->rbd_client->client->osdc;
1477 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
1478 &rbd_dev->watch_event);
1479 if (ret < 0)
5efea49a 1480 return ret;
5efea49a
AE
1481 } else {
1482 rbd_assert(rbd_dev->watch_request != NULL);
907703d0 1483 }
79e3057c 1484
5efea49a
AE
1485 op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
1486 rbd_dev->watch_event->cookie,
1487 rbd_dev->header.obj_version, start);
1488 if (op)
1489 ret = rbd_req_sync_op(rbd_dev,
79e3057c 1490 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
907703d0 1491 op, rbd_dev->header_name,
8b84de79 1492 0, 0, NULL, NULL);
79e3057c 1493
5efea49a
AE
1494 /* Cancel the event if we're tearing down, or on error */
1495
1496 if (!start || !op || ret < 0) {
907703d0
AE
1497 ceph_osdc_cancel_event(rbd_dev->watch_event);
1498 rbd_dev->watch_event = NULL;
1499 }
5efea49a 1500 rbd_osd_req_op_destroy(op);
907703d0 1501
79e3057c
YS
1502 return ret;
1503}
1504
/*
 * Synchronous osd object method call
 *
 * Invokes @class_name.@method_name on @object_name via a CALL op,
 * sending @outbound as the method's input and copying at most
 * @inbound_size bytes of the reply into @inbound.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *ver)
{
	struct ceph_osd_req_op *op;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
					method_name, outbound, outbound_size);
	if (!op)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
			       object_name, 0, inbound_size, inbound,
			       ver);

	rbd_osd_req_op_destroy(op);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1543
1fec7093
YS
/*
 * Allocate a request collection tracking @num_reqs sub-requests.
 * The status array is allocated inline after the struct.  Returns
 * NULL on allocation failure.  GFP_ATOMIC because this is called
 * from the request-queue path.
 */
static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
{
	struct rbd_req_coll *coll =
			kzalloc(sizeof(struct rbd_req_coll) +
			        sizeof(struct rbd_req_status) * num_reqs,
				GFP_ATOMIC);

	if (!coll)
		return NULL;
	coll->total = num_reqs;
	kref_init(&coll->kref);
	return coll;
}
1557
8295cda7
AE
/*
 * Split one block-layer request into per-object sub-requests and
 * issue them.  Each sub-request gets a clone of the relevant slice
 * of the bio chain; completion is gathered through a shared
 * rbd_req_coll so the request is ended in order.
 */
static int rbd_dev_do_request(struct request *rq,
				struct rbd_device *rbd_dev,
				struct ceph_snap_context *snapc,
				u64 ofs, unsigned int size,
				struct bio *bio_chain)
{
	int num_segs;
	struct rbd_req_coll *coll;
	unsigned int bio_offset;
	int cur_seg = 0;

	dout("%s 0x%x bytes at 0x%llx\n",
		rq_data_dir(rq) == WRITE ? "write" : "read",
		size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

	num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
	if (num_segs <= 0)
		return num_segs;	/* 0 or a negative errno */

	coll = rbd_alloc_coll(num_segs);
	if (!coll)
		return -ENOMEM;

	bio_offset = 0;
	do {
		/* Bytes from ofs to the end of its object (capped at size) */
		u64 limit = rbd_segment_length(rbd_dev, ofs, size);
		unsigned int clone_size;
		struct bio *bio_clone;

		BUG_ON(limit > (u64)UINT_MAX);
		clone_size = (unsigned int)limit;
		dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt);

		/* One collection reference per sub-request */
		kref_get(&coll->kref);

		/* Pass a cloned bio chain via an osd request */

		bio_clone = bio_chain_clone_range(&bio_chain,
						  &bio_offset, clone_size,
						  GFP_ATOMIC);
		if (bio_clone)
			(void)rbd_do_op(rq, rbd_dev, snapc,
					ofs, clone_size,
					bio_clone, coll, cur_seg);
		else
			/* Clone failed: fail just this segment */
			rbd_coll_end_req_index(rq, coll, cur_seg,
						(s32)-ENOMEM,
						clone_size);
		size -= clone_size;
		ofs += clone_size;

		cur_seg++;
	} while (size > 0);
	/* Drop the allocation's initial reference */
	kref_put(&coll->kref, rbd_coll_release);

	return 0;
}
1615
602adf40
YS
1616/*
1617 * block device queue callback
1618 */
/*
 * block device queue callback
 *
 * Drains the request queue.  The queue lock is held on entry to the
 * loop body (blk_fetch_request requires it) and is dropped while we
 * build/submit OSD requests, then reacquired before ending requests
 * or fetching the next one.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;

	while ((rq = blk_fetch_request(q))) {
		struct ceph_snap_context *snapc = NULL;
		unsigned int size = 0;
		int result;

		dout("fetched request\n");

		/* Filter out block requests we don't understand */

		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}
		spin_unlock_irq(q->queue_lock);

		/* Write requests need a reference to the snapshot context */

		if (rq_data_dir(rq) == WRITE) {
			result = -EROFS;
			if (read_only) /* Can't write to a read-only device */
				goto out_end_request;

			/*
			 * Note that each osd request will take its
			 * own reference to the snapshot context
			 * supplied.  The reference we take here
			 * just guarantees the one we provide stays
			 * valid.
			 */
			down_read(&rbd_dev->header_rwsem);
			snapc = ceph_get_snap_context(rbd_dev->header.snapc);
			up_read(&rbd_dev->header_rwsem);
			rbd_assert(snapc != NULL);
		} else if (!atomic_read(&rbd_dev->exists)) {
			/* Mapped snapshot has been deleted out from under us */
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			dout("request for non-existent snapshot");
			result = -ENXIO;
			goto out_end_request;
		}

		size = blk_rq_bytes(rq);
		result = rbd_dev_do_request(rq, rbd_dev, snapc,
				blk_rq_pos(rq) * SECTOR_SIZE,
				size, rq->bio);
out_end_request:
		if (snapc)
			ceph_put_snap_context(snapc);
		spin_lock_irq(q->queue_lock);
		/* Empty or failed requests are ended here; others complete
		 * asynchronously via rbd_req_cb() */
		if (!size || result < 0)
			__blk_end_request_all(rq, result);
	}
}
1677
1678/*
1679 * a queue callback. Makes sure that we don't create a bio that spans across
1680 * multiple osd objects. One exception would be with a single page bios,
f7760dad 1681 * which we handle later at bio_chain_clone_range()
602adf40
YS
1682 */
/*
 * Limit how many bytes may be added to a bio so it never spans an
 * rbd object boundary (see the comment above: single-page bios are
 * the exception, handled later by bio_chain_clone_range()).
 * Returns the number of bytes of @bvec that may be merged.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1723
/*
 * Tear down the gendisk and its request queue for this device.
 * Safe to call if the disk was never allocated or registered.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	/* Only unregister if add_disk() was actually called */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1737
1738/*
4156d998
AE
1739 * Read the complete header for the given rbd device.
1740 *
1741 * Returns a pointer to a dynamically-allocated buffer containing
1742 * the complete and validated header. Caller can pass the address
1743 * of a variable that will be filled in with the version of the
1744 * header object at the time it was read.
1745 *
1746 * Returns a pointer-coded errno if a failure occurs.
602adf40 1747 */
4156d998
AE
/*
 * Read the complete (format 1) on-disk header for the given rbd
 * device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);	/* no-op on the first pass (NULL) */

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* A short read means the object shrank underneath us */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out_err;
		}

		/* Re-read if the snapshot count changed since last pass */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1806
/*
 * Re-read the on-disk header and convert it into the in-core
 * rbd_image_header representation, recording the header object
 * version in header->obj_version on success.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *ondisk;
	u64 ver = 0;
	int ret;

	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
	if (IS_ERR(ondisk))
		return PTR_ERR(ondisk);
	ret = rbd_header_from_disk(header, ondisk);
	if (ret >= 0)
		header->obj_version = ver;
	kfree(ondisk);

	return ret;
}
1827
/* Remove every snapshot device from the rbd device's snap list. */
static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	struct rbd_snap *next;

	/* _safe variant: rbd_remove_snap_dev() unlinks the entry */
	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
		rbd_remove_snap_dev(snap);
}
1836
9478554a
AE
/*
 * Propagate a changed image size to the mapping and the gendisk
 * capacity.  Only applies when the base image (not a snapshot) is
 * mapped; a snapshot's size is fixed.
 */
static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		return;

	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
	dout("setting size to %llu sectors", (unsigned long long) size);
	rbd_dev->mapping.size = (u64) size;
	set_capacity(rbd_dev->disk, size);
}
1849
602adf40
YS
1850/*
1851 * only read the first part of the ondisk header, without the snaps info
1852 */
117973fb 1853static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1854{
1855 int ret;
1856 struct rbd_image_header h;
602adf40
YS
1857
1858 ret = rbd_read_header(rbd_dev, &h);
1859 if (ret < 0)
1860 return ret;
1861
a51aa0c0
JD
1862 down_write(&rbd_dev->header_rwsem);
1863
9478554a
AE
1864 /* Update image size, and check for resize of mapped image */
1865 rbd_dev->header.image_size = h.image_size;
1866 rbd_update_mapping_size(rbd_dev);
9db4b3e3 1867
849b4260 1868 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1869 kfree(rbd_dev->header.snap_sizes);
849b4260 1870 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1871 /* osd requests may still refer to snapc */
1872 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1873
b813623a
AE
1874 if (hver)
1875 *hver = h.obj_version;
a71b891b 1876 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1877 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1878 rbd_dev->header.snapc = h.snapc;
1879 rbd_dev->header.snap_names = h.snap_names;
1880 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1881 /* Free the extra copy of the object prefix */
1882 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1883 kfree(h.object_prefix);
1884
304f6808
AE
1885 ret = rbd_dev_snaps_update(rbd_dev);
1886 if (!ret)
1887 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1888
c666601a 1889 up_write(&rbd_dev->header_rwsem);
602adf40 1890
dfc5606d 1891 return ret;
602adf40
YS
1892}
1893
/*
 * Refresh the image header, dispatching on the image format.
 * Serialized against device add/remove via ctl_mutex (nested, since
 * callers may already hold it at the outer level).
 */
static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	if (rbd_dev->image_format == 1)
		ret = rbd_dev_v1_refresh(rbd_dev, hver);
	else
		ret = rbd_dev_v2_refresh(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);

	return ret;
}
1908
602adf40
YS
/*
 * Allocate and configure the gendisk and request queue for this rbd
 * device.  I/O limits are aligned to the rbd object size so requests
 * rarely cross object boundaries.  Returns 0 on success, -ENOMEM on
 * allocation failure.  The disk is not yet registered (no add_disk).
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios within a single object (see rbd_merge_bvec) */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1957
dfc5606d
YS
1958/*
1959 sysfs
1960*/
1961
593a9e7b
AE
/* Map an embedded struct device back to its enclosing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1966
dfc5606d
YS
/* sysfs: mapped image size in bytes (capacity read under header_rwsem) */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
1979
34b13184
AE
/*
 * Note this shows the features for whatever's mapped, which is not
 * necessarily the base image.
 */
static ssize_t rbd_features_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Feature bits as a fixed-width hex mask */
	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) rbd_dev->mapping.features);
}
1992
dfc5606d
YS
1993static ssize_t rbd_major_show(struct device *dev,
1994 struct device_attribute *attr, char *buf)
1995{
593a9e7b 1996 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 1997
dfc5606d
YS
1998 return sprintf(buf, "%d\n", rbd_dev->major);
1999}
2000
/* sysfs: the ceph client id ("client<N>") used for this device */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
			ceph_client_id(rbd_dev->rbd_client->client));
}
2009
dfc5606d
YS
/* sysfs: name of the pool containing the image */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}
2017
9bb2f334
AE
2018static ssize_t rbd_pool_id_show(struct device *dev,
2019 struct device_attribute *attr, char *buf)
2020{
2021 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2022
0d7dbfce
AE
2023 return sprintf(buf, "%llu\n",
2024 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
2025}
2026
dfc5606d
YS
/* sysfs: the image name, or "(unknown)" if it could not be determined */
static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->spec->image_name)
		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);

	return sprintf(buf, "(unknown)\n");
}
2037
589d30e0
AE
/* sysfs: the image's unique id string */
static ssize_t rbd_image_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}
2045
34b13184
AE
/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}
2058
86b00e0d
AE
/*
 * For an rbd v2 image, shows the pool id, image id, and snapshot id
 * for the parent image.  If there is no parent, simply shows
 * "(no parent image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;	/* advances as each field is appended */

	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	/* Total bytes written across all fields */
	return (ssize_t) (bufp - buf);
}
2101
dfc5606d
YS
2102static ssize_t rbd_image_refresh(struct device *dev,
2103 struct device_attribute *attr,
2104 const char *buf,
2105 size_t size)
2106{
593a9e7b 2107 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2108 int ret;
602adf40 2109
117973fb 2110 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2111
2112 return ret < 0 ? ret : size;
dfc5606d 2113}
602adf40 2114
dfc5606d 2115static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 2116static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
2117static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2118static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2119static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2120static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 2121static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 2122static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
2123static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2124static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
86b00e0d 2125static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
dfc5606d
YS
2126
2127static struct attribute *rbd_attrs[] = {
2128 &dev_attr_size.attr,
34b13184 2129 &dev_attr_features.attr,
dfc5606d
YS
2130 &dev_attr_major.attr,
2131 &dev_attr_client_id.attr,
2132 &dev_attr_pool.attr,
9bb2f334 2133 &dev_attr_pool_id.attr,
dfc5606d 2134 &dev_attr_name.attr,
589d30e0 2135 &dev_attr_image_id.attr,
dfc5606d 2136 &dev_attr_current_snap.attr,
86b00e0d 2137 &dev_attr_parent.attr,
dfc5606d 2138 &dev_attr_refresh.attr,
dfc5606d
YS
2139 NULL
2140};
2141
2142static struct attribute_group rbd_attr_group = {
2143 .attrs = rbd_attrs,
2144};
2145
2146static const struct attribute_group *rbd_attr_groups[] = {
2147 &rbd_attr_group,
2148 NULL
2149};
2150
2151static void rbd_sysfs_dev_release(struct device *dev)
2152{
2153}
2154
2155static struct device_type rbd_device_type = {
2156 .name = "rbd",
2157 .groups = rbd_attr_groups,
2158 .release = rbd_sysfs_dev_release,
2159};
2160
2161
2162/*
2163 sysfs - snapshots
2164*/
2165
2166static ssize_t rbd_snap_size_show(struct device *dev,
2167 struct device_attribute *attr,
2168 char *buf)
2169{
2170 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2171
3591538f 2172 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2173}
2174
2175static ssize_t rbd_snap_id_show(struct device *dev,
2176 struct device_attribute *attr,
2177 char *buf)
2178{
2179 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2180
3591538f 2181 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2182}
2183
34b13184
AE
2184static ssize_t rbd_snap_features_show(struct device *dev,
2185 struct device_attribute *attr,
2186 char *buf)
2187{
2188 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2189
2190 return sprintf(buf, "0x%016llx\n",
2191 (unsigned long long) snap->features);
2192}
2193
dfc5606d
YS
2194static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2195static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
34b13184 2196static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
dfc5606d
YS
2197
2198static struct attribute *rbd_snap_attrs[] = {
2199 &dev_attr_snap_size.attr,
2200 &dev_attr_snap_id.attr,
34b13184 2201 &dev_attr_snap_features.attr,
dfc5606d
YS
2202 NULL,
2203};
2204
2205static struct attribute_group rbd_snap_attr_group = {
2206 .attrs = rbd_snap_attrs,
2207};
2208
2209static void rbd_snap_dev_release(struct device *dev)
2210{
2211 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2212 kfree(snap->name);
2213 kfree(snap);
2214}
2215
2216static const struct attribute_group *rbd_snap_attr_groups[] = {
2217 &rbd_snap_attr_group,
2218 NULL
2219};
2220
2221static struct device_type rbd_snap_device_type = {
2222 .groups = rbd_snap_attr_groups,
2223 .release = rbd_snap_dev_release,
2224};
2225
8b8fb99c
AE
2226static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2227{
2228 kref_get(&spec->kref);
2229
2230 return spec;
2231}
2232
2233static void rbd_spec_free(struct kref *kref);
2234static void rbd_spec_put(struct rbd_spec *spec)
2235{
2236 if (spec)
2237 kref_put(&spec->kref, rbd_spec_free);
2238}
2239
2240static struct rbd_spec *rbd_spec_alloc(void)
2241{
2242 struct rbd_spec *spec;
2243
2244 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2245 if (!spec)
2246 return NULL;
2247 kref_init(&spec->kref);
2248
2249 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2250
2251 return spec;
2252}
2253
2254static void rbd_spec_free(struct kref *kref)
2255{
2256 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2257
2258 kfree(spec->pool_name);
2259 kfree(spec->image_id);
2260 kfree(spec->image_name);
2261 kfree(spec->snap_name);
2262 kfree(spec);
2263}
2264
c53d5893
AE
2265struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2266 struct rbd_spec *spec)
2267{
2268 struct rbd_device *rbd_dev;
2269
2270 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2271 if (!rbd_dev)
2272 return NULL;
2273
2274 spin_lock_init(&rbd_dev->lock);
d78b650a 2275 atomic_set(&rbd_dev->exists, 0);
c53d5893
AE
2276 INIT_LIST_HEAD(&rbd_dev->node);
2277 INIT_LIST_HEAD(&rbd_dev->snaps);
2278 init_rwsem(&rbd_dev->header_rwsem);
2279
2280 rbd_dev->spec = spec;
2281 rbd_dev->rbd_client = rbdc;
2282
0903e875
AE
2283 /* Initialize the layout used for all rbd requests */
2284
2285 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2286 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2287 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2288 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2289
c53d5893
AE
2290 return rbd_dev;
2291}
2292
2293static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2294{
86b00e0d 2295 rbd_spec_put(rbd_dev->parent_spec);
c53d5893
AE
2296 kfree(rbd_dev->header_name);
2297 rbd_put_client(rbd_dev->rbd_client);
2298 rbd_spec_put(rbd_dev->spec);
2299 kfree(rbd_dev);
2300}
2301
304f6808
AE
2302static bool rbd_snap_registered(struct rbd_snap *snap)
2303{
2304 bool ret = snap->dev.type == &rbd_snap_device_type;
2305 bool reg = device_is_registered(&snap->dev);
2306
2307 rbd_assert(!ret ^ reg);
2308
2309 return ret;
2310}
2311
41f38c2b 2312static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2313{
2314 list_del(&snap->node);
304f6808
AE
2315 if (device_is_registered(&snap->dev))
2316 device_unregister(&snap->dev);
dfc5606d
YS
2317}
2318
14e7085d 2319static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2320 struct device *parent)
2321{
2322 struct device *dev = &snap->dev;
2323 int ret;
2324
2325 dev->type = &rbd_snap_device_type;
2326 dev->parent = parent;
2327 dev->release = rbd_snap_dev_release;
d4b125e9 2328 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2329 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2330
dfc5606d
YS
2331 ret = device_register(dev);
2332
2333 return ret;
2334}
2335
4e891e0a 2336static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2337 const char *snap_name,
34b13184
AE
2338 u64 snap_id, u64 snap_size,
2339 u64 snap_features)
dfc5606d 2340{
4e891e0a 2341 struct rbd_snap *snap;
dfc5606d 2342 int ret;
4e891e0a
AE
2343
2344 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2345 if (!snap)
4e891e0a
AE
2346 return ERR_PTR(-ENOMEM);
2347
2348 ret = -ENOMEM;
c8d18425 2349 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2350 if (!snap->name)
2351 goto err;
2352
c8d18425
AE
2353 snap->id = snap_id;
2354 snap->size = snap_size;
34b13184 2355 snap->features = snap_features;
4e891e0a
AE
2356
2357 return snap;
2358
dfc5606d
YS
2359err:
2360 kfree(snap->name);
2361 kfree(snap);
4e891e0a
AE
2362
2363 return ERR_PTR(ret);
dfc5606d
YS
2364}
2365
cd892126
AE
2366static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2367 u64 *snap_size, u64 *snap_features)
2368{
2369 char *snap_name;
2370
2371 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2372
2373 *snap_size = rbd_dev->header.snap_sizes[which];
2374 *snap_features = 0; /* No features for v1 */
2375
2376 /* Skip over names until we find the one we are looking for */
2377
2378 snap_name = rbd_dev->header.snap_names;
2379 while (which--)
2380 snap_name += strlen(snap_name) + 1;
2381
2382 return snap_name;
2383}
2384
9d475de5
AE
2385/*
2386 * Get the size and object order for an image snapshot, or if
2387 * snap_id is CEPH_NOSNAP, gets this information for the base
2388 * image.
2389 */
2390static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2391 u8 *order, u64 *snap_size)
2392{
2393 __le64 snapid = cpu_to_le64(snap_id);
2394 int ret;
2395 struct {
2396 u8 order;
2397 __le64 size;
2398 } __attribute__ ((packed)) size_buf = { 0 };
2399
2400 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2401 "rbd", "get_size",
2402 (char *) &snapid, sizeof (snapid),
07b2391f 2403 (char *) &size_buf, sizeof (size_buf), NULL);
9d475de5
AE
2404 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2405 if (ret < 0)
2406 return ret;
2407
2408 *order = size_buf.order;
2409 *snap_size = le64_to_cpu(size_buf.size);
2410
2411 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2412 (unsigned long long) snap_id, (unsigned int) *order,
2413 (unsigned long long) *snap_size);
2414
2415 return 0;
2416}
2417
2418static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2419{
2420 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2421 &rbd_dev->header.obj_order,
2422 &rbd_dev->header.image_size);
2423}
2424
1e130199
AE
2425static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2426{
2427 void *reply_buf;
2428 int ret;
2429 void *p;
2430
2431 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2432 if (!reply_buf)
2433 return -ENOMEM;
2434
2435 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2436 "rbd", "get_object_prefix",
2437 NULL, 0,
07b2391f 2438 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
1e130199
AE
2439 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2440 if (ret < 0)
2441 goto out;
a0ea3a40 2442 ret = 0; /* rbd_req_sync_exec() can return positive */
1e130199
AE
2443
2444 p = reply_buf;
2445 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2446 p + RBD_OBJ_PREFIX_LEN_MAX,
2447 NULL, GFP_NOIO);
2448
2449 if (IS_ERR(rbd_dev->header.object_prefix)) {
2450 ret = PTR_ERR(rbd_dev->header.object_prefix);
2451 rbd_dev->header.object_prefix = NULL;
2452 } else {
2453 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2454 }
2455
2456out:
2457 kfree(reply_buf);
2458
2459 return ret;
2460}
2461
b1b5402a
AE
2462static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2463 u64 *snap_features)
2464{
2465 __le64 snapid = cpu_to_le64(snap_id);
2466 struct {
2467 __le64 features;
2468 __le64 incompat;
2469 } features_buf = { 0 };
d889140c 2470 u64 incompat;
b1b5402a
AE
2471 int ret;
2472
2473 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2474 "rbd", "get_features",
2475 (char *) &snapid, sizeof (snapid),
2476 (char *) &features_buf, sizeof (features_buf),
07b2391f 2477 NULL);
b1b5402a
AE
2478 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2479 if (ret < 0)
2480 return ret;
d889140c
AE
2481
2482 incompat = le64_to_cpu(features_buf.incompat);
2483 if (incompat & ~RBD_FEATURES_ALL)
b8f5c6ed 2484 return -ENXIO;
d889140c 2485
b1b5402a
AE
2486 *snap_features = le64_to_cpu(features_buf.features);
2487
2488 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2489 (unsigned long long) snap_id,
2490 (unsigned long long) *snap_features,
2491 (unsigned long long) le64_to_cpu(features_buf.incompat));
2492
2493 return 0;
2494}
2495
2496static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2497{
2498 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2499 &rbd_dev->header.features);
2500}
2501
86b00e0d
AE
2502static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2503{
2504 struct rbd_spec *parent_spec;
2505 size_t size;
2506 void *reply_buf = NULL;
2507 __le64 snapid;
2508 void *p;
2509 void *end;
2510 char *image_id;
2511 u64 overlap;
86b00e0d
AE
2512 int ret;
2513
2514 parent_spec = rbd_spec_alloc();
2515 if (!parent_spec)
2516 return -ENOMEM;
2517
2518 size = sizeof (__le64) + /* pool_id */
2519 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
2520 sizeof (__le64) + /* snap_id */
2521 sizeof (__le64); /* overlap */
2522 reply_buf = kmalloc(size, GFP_KERNEL);
2523 if (!reply_buf) {
2524 ret = -ENOMEM;
2525 goto out_err;
2526 }
2527
2528 snapid = cpu_to_le64(CEPH_NOSNAP);
2529 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2530 "rbd", "get_parent",
2531 (char *) &snapid, sizeof (snapid),
07b2391f 2532 (char *) reply_buf, size, NULL);
86b00e0d
AE
2533 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2534 if (ret < 0)
2535 goto out_err;
2536
2537 ret = -ERANGE;
2538 p = reply_buf;
2539 end = (char *) reply_buf + size;
2540 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2541 if (parent_spec->pool_id == CEPH_NOPOOL)
2542 goto out; /* No parent? No problem. */
2543
0903e875
AE
2544 /* The ceph file layout needs to fit pool id in 32 bits */
2545
2546 ret = -EIO;
2547 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
2548 goto out;
2549
979ed480 2550 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
86b00e0d
AE
2551 if (IS_ERR(image_id)) {
2552 ret = PTR_ERR(image_id);
2553 goto out_err;
2554 }
2555 parent_spec->image_id = image_id;
2556 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
2557 ceph_decode_64_safe(&p, end, overlap, out_err);
2558
2559 rbd_dev->parent_overlap = overlap;
2560 rbd_dev->parent_spec = parent_spec;
2561 parent_spec = NULL; /* rbd_dev now owns this */
2562out:
2563 ret = 0;
2564out_err:
2565 kfree(reply_buf);
2566 rbd_spec_put(parent_spec);
2567
2568 return ret;
2569}
2570
9e15b77d
AE
2571static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
2572{
2573 size_t image_id_size;
2574 char *image_id;
2575 void *p;
2576 void *end;
2577 size_t size;
2578 void *reply_buf = NULL;
2579 size_t len = 0;
2580 char *image_name = NULL;
2581 int ret;
2582
2583 rbd_assert(!rbd_dev->spec->image_name);
2584
69e7a02f
AE
2585 len = strlen(rbd_dev->spec->image_id);
2586 image_id_size = sizeof (__le32) + len;
9e15b77d
AE
2587 image_id = kmalloc(image_id_size, GFP_KERNEL);
2588 if (!image_id)
2589 return NULL;
2590
2591 p = image_id;
2592 end = (char *) image_id + image_id_size;
69e7a02f 2593 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
9e15b77d
AE
2594
2595 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
2596 reply_buf = kmalloc(size, GFP_KERNEL);
2597 if (!reply_buf)
2598 goto out;
2599
2600 ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
2601 "rbd", "dir_get_name",
2602 image_id, image_id_size,
07b2391f 2603 (char *) reply_buf, size, NULL);
9e15b77d
AE
2604 if (ret < 0)
2605 goto out;
2606 p = reply_buf;
2607 end = (char *) reply_buf + size;
2608 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
2609 if (IS_ERR(image_name))
2610 image_name = NULL;
2611 else
2612 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
2613out:
2614 kfree(reply_buf);
2615 kfree(image_id);
2616
2617 return image_name;
2618}
2619
2620/*
2621 * When a parent image gets probed, we only have the pool, image,
2622 * and snapshot ids but not the names of any of them. This call
2623 * is made later to fill in those names. It has to be done after
2624 * rbd_dev_snaps_update() has completed because some of the
2625 * information (in particular, snapshot name) is not available
2626 * until then.
2627 */
2628static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2629{
2630 struct ceph_osd_client *osdc;
2631 const char *name;
2632 void *reply_buf = NULL;
2633 int ret;
2634
2635 if (rbd_dev->spec->pool_name)
2636 return 0; /* Already have the names */
2637
2638 /* Look up the pool name */
2639
2640 osdc = &rbd_dev->rbd_client->client->osdc;
2641 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
935dc89f
AE
2642 if (!name) {
2643 rbd_warn(rbd_dev, "there is no pool with id %llu",
2644 rbd_dev->spec->pool_id); /* Really a BUG() */
2645 return -EIO;
2646 }
9e15b77d
AE
2647
2648 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2649 if (!rbd_dev->spec->pool_name)
2650 return -ENOMEM;
2651
2652 /* Fetch the image name; tolerate failure here */
2653
2654 name = rbd_dev_image_name(rbd_dev);
69e7a02f 2655 if (name)
9e15b77d 2656 rbd_dev->spec->image_name = (char *) name;
69e7a02f 2657 else
06ecc6cb 2658 rbd_warn(rbd_dev, "unable to get image name");
9e15b77d
AE
2659
2660 /* Look up the snapshot name. */
2661
2662 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2663 if (!name) {
935dc89f
AE
2664 rbd_warn(rbd_dev, "no snapshot with id %llu",
2665 rbd_dev->spec->snap_id); /* Really a BUG() */
9e15b77d
AE
2666 ret = -EIO;
2667 goto out_err;
2668 }
2669 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2670 if(!rbd_dev->spec->snap_name)
2671 goto out_err;
2672
2673 return 0;
2674out_err:
2675 kfree(reply_buf);
2676 kfree(rbd_dev->spec->pool_name);
2677 rbd_dev->spec->pool_name = NULL;
2678
2679 return ret;
2680}
2681
6e14b1a6 2682static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
2683{
2684 size_t size;
2685 int ret;
2686 void *reply_buf;
2687 void *p;
2688 void *end;
2689 u64 seq;
2690 u32 snap_count;
2691 struct ceph_snap_context *snapc;
2692 u32 i;
2693
2694 /*
2695 * We'll need room for the seq value (maximum snapshot id),
2696 * snapshot count, and array of that many snapshot ids.
2697 * For now we have a fixed upper limit on the number we're
2698 * prepared to receive.
2699 */
2700 size = sizeof (__le64) + sizeof (__le32) +
2701 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2702 reply_buf = kzalloc(size, GFP_KERNEL);
2703 if (!reply_buf)
2704 return -ENOMEM;
2705
2706 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2707 "rbd", "get_snapcontext",
2708 NULL, 0,
07b2391f 2709 reply_buf, size, ver);
35d489f9
AE
2710 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2711 if (ret < 0)
2712 goto out;
2713
2714 ret = -ERANGE;
2715 p = reply_buf;
2716 end = (char *) reply_buf + size;
2717 ceph_decode_64_safe(&p, end, seq, out);
2718 ceph_decode_32_safe(&p, end, snap_count, out);
2719
2720 /*
2721 * Make sure the reported number of snapshot ids wouldn't go
2722 * beyond the end of our buffer. But before checking that,
2723 * make sure the computed size of the snapshot context we
2724 * allocate is representable in a size_t.
2725 */
2726 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2727 / sizeof (u64)) {
2728 ret = -EINVAL;
2729 goto out;
2730 }
2731 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2732 goto out;
2733
2734 size = sizeof (struct ceph_snap_context) +
2735 snap_count * sizeof (snapc->snaps[0]);
2736 snapc = kmalloc(size, GFP_KERNEL);
2737 if (!snapc) {
2738 ret = -ENOMEM;
2739 goto out;
2740 }
2741
2742 atomic_set(&snapc->nref, 1);
2743 snapc->seq = seq;
2744 snapc->num_snaps = snap_count;
2745 for (i = 0; i < snap_count; i++)
2746 snapc->snaps[i] = ceph_decode_64(&p);
2747
2748 rbd_dev->header.snapc = snapc;
2749
2750 dout(" snap context seq = %llu, snap_count = %u\n",
2751 (unsigned long long) seq, (unsigned int) snap_count);
2752
2753out:
2754 kfree(reply_buf);
2755
2756 return 0;
2757}
2758
b8b1e2db
AE
2759static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2760{
2761 size_t size;
2762 void *reply_buf;
2763 __le64 snap_id;
2764 int ret;
2765 void *p;
2766 void *end;
b8b1e2db
AE
2767 char *snap_name;
2768
2769 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2770 reply_buf = kmalloc(size, GFP_KERNEL);
2771 if (!reply_buf)
2772 return ERR_PTR(-ENOMEM);
2773
2774 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2775 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2776 "rbd", "get_snapshot_name",
2777 (char *) &snap_id, sizeof (snap_id),
07b2391f 2778 reply_buf, size, NULL);
b8b1e2db
AE
2779 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2780 if (ret < 0)
2781 goto out;
2782
2783 p = reply_buf;
2784 end = (char *) reply_buf + size;
e5c35534 2785 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
b8b1e2db
AE
2786 if (IS_ERR(snap_name)) {
2787 ret = PTR_ERR(snap_name);
2788 goto out;
2789 } else {
2790 dout(" snap_id 0x%016llx snap_name = %s\n",
2791 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2792 }
2793 kfree(reply_buf);
2794
2795 return snap_name;
2796out:
2797 kfree(reply_buf);
2798
2799 return ERR_PTR(ret);
2800}
2801
2802static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2803 u64 *snap_size, u64 *snap_features)
2804{
e0b49868 2805 u64 snap_id;
b8b1e2db
AE
2806 u8 order;
2807 int ret;
2808
2809 snap_id = rbd_dev->header.snapc->snaps[which];
2810 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2811 if (ret)
2812 return ERR_PTR(ret);
2813 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2814 if (ret)
2815 return ERR_PTR(ret);
2816
2817 return rbd_dev_v2_snap_name(rbd_dev, which);
2818}
2819
2820static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2821 u64 *snap_size, u64 *snap_features)
2822{
2823 if (rbd_dev->image_format == 1)
2824 return rbd_dev_v1_snap_info(rbd_dev, which,
2825 snap_size, snap_features);
2826 if (rbd_dev->image_format == 2)
2827 return rbd_dev_v2_snap_info(rbd_dev, which,
2828 snap_size, snap_features);
2829 return ERR_PTR(-EINVAL);
2830}
2831
117973fb
AE
2832static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2833{
2834 int ret;
2835 __u8 obj_order;
2836
2837 down_write(&rbd_dev->header_rwsem);
2838
2839 /* Grab old order first, to see if it changes */
2840
2841 obj_order = rbd_dev->header.obj_order,
2842 ret = rbd_dev_v2_image_size(rbd_dev);
2843 if (ret)
2844 goto out;
2845 if (rbd_dev->header.obj_order != obj_order) {
2846 ret = -EIO;
2847 goto out;
2848 }
2849 rbd_update_mapping_size(rbd_dev);
2850
2851 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2852 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2853 if (ret)
2854 goto out;
2855 ret = rbd_dev_snaps_update(rbd_dev);
2856 dout("rbd_dev_snaps_update returned %d\n", ret);
2857 if (ret)
2858 goto out;
2859 ret = rbd_dev_snaps_register(rbd_dev);
2860 dout("rbd_dev_snaps_register returned %d\n", ret);
2861out:
2862 up_write(&rbd_dev->header_rwsem);
2863
2864 return ret;
2865}
2866
dfc5606d 2867/*
35938150
AE
2868 * Scan the rbd device's current snapshot list and compare it to the
2869 * newly-received snapshot context. Remove any existing snapshots
2870 * not present in the new snapshot context. Add a new snapshot for
2871 * any snaphots in the snapshot context not in the current list.
2872 * And verify there are no changes to snapshots we already know
2873 * about.
2874 *
2875 * Assumes the snapshots in the snapshot context are sorted by
2876 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2877 * are also maintained in that order.)
dfc5606d 2878 */
304f6808 2879static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 2880{
35938150
AE
2881 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2882 const u32 snap_count = snapc->num_snaps;
35938150
AE
2883 struct list_head *head = &rbd_dev->snaps;
2884 struct list_head *links = head->next;
2885 u32 index = 0;
dfc5606d 2886
9fcbb800 2887 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2888 while (index < snap_count || links != head) {
2889 u64 snap_id;
2890 struct rbd_snap *snap;
cd892126
AE
2891 char *snap_name;
2892 u64 snap_size = 0;
2893 u64 snap_features = 0;
dfc5606d 2894
35938150
AE
2895 snap_id = index < snap_count ? snapc->snaps[index]
2896 : CEPH_NOSNAP;
2897 snap = links != head ? list_entry(links, struct rbd_snap, node)
2898 : NULL;
aafb230e 2899 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2900
35938150
AE
2901 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2902 struct list_head *next = links->next;
dfc5606d 2903
35938150 2904 /* Existing snapshot not in the new snap context */
dfc5606d 2905
0d7dbfce 2906 if (rbd_dev->spec->snap_id == snap->id)
d78b650a 2907 atomic_set(&rbd_dev->exists, 0);
41f38c2b 2908 rbd_remove_snap_dev(snap);
9fcbb800 2909 dout("%ssnap id %llu has been removed\n",
0d7dbfce
AE
2910 rbd_dev->spec->snap_id == snap->id ?
2911 "mapped " : "",
9fcbb800 2912 (unsigned long long) snap->id);
35938150
AE
2913
2914 /* Done with this list entry; advance */
2915
2916 links = next;
dfc5606d
YS
2917 continue;
2918 }
35938150 2919
b8b1e2db
AE
2920 snap_name = rbd_dev_snap_info(rbd_dev, index,
2921 &snap_size, &snap_features);
cd892126
AE
2922 if (IS_ERR(snap_name))
2923 return PTR_ERR(snap_name);
2924
9fcbb800
AE
2925 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2926 (unsigned long long) snap_id);
35938150
AE
2927 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2928 struct rbd_snap *new_snap;
2929
2930 /* We haven't seen this snapshot before */
2931
c8d18425 2932 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 2933 snap_id, snap_size, snap_features);
9fcbb800
AE
2934 if (IS_ERR(new_snap)) {
2935 int err = PTR_ERR(new_snap);
2936
2937 dout(" failed to add dev, error %d\n", err);
2938
2939 return err;
2940 }
35938150
AE
2941
2942 /* New goes before existing, or at end of list */
2943
9fcbb800 2944 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2945 if (snap)
2946 list_add_tail(&new_snap->node, &snap->node);
2947 else
523f3258 2948 list_add_tail(&new_snap->node, head);
35938150
AE
2949 } else {
2950 /* Already have this one */
2951
9fcbb800
AE
2952 dout(" already present\n");
2953
cd892126 2954 rbd_assert(snap->size == snap_size);
aafb230e 2955 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 2956 rbd_assert(snap->features == snap_features);
35938150
AE
2957
2958 /* Done with this list entry; advance */
2959
2960 links = links->next;
dfc5606d 2961 }
35938150
AE
2962
2963 /* Advance to the next entry in the snapshot context */
2964
2965 index++;
dfc5606d 2966 }
9fcbb800 2967 dout("%s: done\n", __func__);
dfc5606d
YS
2968
2969 return 0;
2970}
2971
304f6808
AE
2972/*
2973 * Scan the list of snapshots and register the devices for any that
2974 * have not already been registered.
2975 */
2976static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2977{
2978 struct rbd_snap *snap;
2979 int ret = 0;
2980
2981 dout("%s called\n", __func__);
86ff77bb
AE
2982 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2983 return -EIO;
304f6808
AE
2984
2985 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2986 if (!rbd_snap_registered(snap)) {
2987 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2988 if (ret < 0)
2989 break;
2990 }
2991 }
2992 dout("%s: returning %d\n", __func__, ret);
2993
2994 return ret;
2995}
2996
dfc5606d
YS
2997static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2998{
dfc5606d 2999 struct device *dev;
cd789ab9 3000 int ret;
dfc5606d
YS
3001
3002 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 3003
cd789ab9 3004 dev = &rbd_dev->dev;
dfc5606d
YS
3005 dev->bus = &rbd_bus_type;
3006 dev->type = &rbd_device_type;
3007 dev->parent = &rbd_root_dev;
3008 dev->release = rbd_dev_release;
de71a297 3009 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 3010 ret = device_register(dev);
dfc5606d 3011
dfc5606d 3012 mutex_unlock(&ctl_mutex);
cd789ab9 3013
dfc5606d 3014 return ret;
602adf40
YS
3015}
3016
dfc5606d
YS
3017static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3018{
3019 device_unregister(&rbd_dev->dev);
3020}
3021
59c2be1e
YS
3022static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
3023{
3024 int ret, rc;
3025
3026 do {
907703d0 3027 ret = rbd_req_sync_watch(rbd_dev, 1);
59c2be1e 3028 if (ret == -ERANGE) {
117973fb 3029 rc = rbd_dev_refresh(rbd_dev, NULL);
59c2be1e
YS
3030 if (rc < 0)
3031 return rc;
3032 }
3033 } while (ret == -ERANGE);
3034
3035 return ret;
3036}
3037
e2839308 3038static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
3039
3040/*
499afd5b
AE
3041 * Get a unique rbd identifier for the given new rbd_dev, and add
3042 * the rbd_dev to the global list. The minimum rbd id is 1.
1ddbe94e 3043 */
e2839308 3044static void rbd_dev_id_get(struct rbd_device *rbd_dev)
b7f23c36 3045{
e2839308 3046 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
499afd5b
AE
3047
3048 spin_lock(&rbd_dev_list_lock);
3049 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3050 spin_unlock(&rbd_dev_list_lock);
e2839308
AE
3051 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3052 (unsigned long long) rbd_dev->dev_id);
1ddbe94e 3053}
b7f23c36 3054
1ddbe94e 3055/*
499afd5b
AE
3056 * Remove an rbd_dev from the global list, and record that its
3057 * identifier is no longer in use.
1ddbe94e 3058 */
e2839308 3059static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 3060{
d184f6bf 3061 struct list_head *tmp;
de71a297 3062 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
3063 int max_id;
3064
aafb230e 3065 rbd_assert(rbd_id > 0);
499afd5b 3066
e2839308
AE
3067 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3068 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
3069 spin_lock(&rbd_dev_list_lock);
3070 list_del_init(&rbd_dev->node);
d184f6bf
AE
3071
3072 /*
3073 * If the id being "put" is not the current maximum, there
3074 * is nothing special we need to do.
3075 */
e2839308 3076 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
3077 spin_unlock(&rbd_dev_list_lock);
3078 return;
3079 }
3080
3081 /*
3082 * We need to update the current maximum id. Search the
3083 * list to find out what it is. We're more likely to find
3084 * the maximum at the end, so search the list backward.
3085 */
3086 max_id = 0;
3087 list_for_each_prev(tmp, &rbd_dev_list) {
3088 struct rbd_device *rbd_dev;
3089
3090 rbd_dev = list_entry(tmp, struct rbd_device, node);
b213e0b1
AE
3091 if (rbd_dev->dev_id > max_id)
3092 max_id = rbd_dev->dev_id;
d184f6bf 3093 }
499afd5b 3094 spin_unlock(&rbd_dev_list_lock);
b7f23c36 3095
1ddbe94e 3096 /*
e2839308 3097 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
3098 * which case it now accurately reflects the new maximum.
3099 * Be careful not to overwrite the maximum value in that
3100 * case.
1ddbe94e 3101 */
e2839308
AE
3102 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3103 dout(" max dev id has been reset\n");
b7f23c36
AE
3104}
3105
e28fff26
AE
3106/*
3107 * Skips over white space at *buf, and updates *buf to point to the
3108 * first found non-space character (if any). Returns the length of
593a9e7b
AE
3109 * the token (string of non-white space characters) found. Note
3110 * that *buf must be terminated with '\0'.
e28fff26
AE
3111 */
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters isspace() matches in the "C" and "POSIX" locales */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}
3124
3125/*
3126 * Finds the next token in *buf, and if the provided token buffer is
3127 * big enough, copies the found token into it. The result, if
593a9e7b
AE
3128 * copied, is guaranteed to be terminated with '\0'. Note that *buf
3129 * must be terminated with '\0' on entry.
e28fff26
AE
3130 *
3131 * Returns the length of the token found (not including the '\0').
3132 * Return value will be 0 if no token is found, and it will be >=
3133 * token_size if the token would not fit.
3134 *
593a9e7b 3135 * The *buf pointer will be updated to point beyond the end of the
e28fff26
AE
3136 * found token. Note that this occurs even if the token buffer is
3137 * too small to hold it.
3138 */
3139static inline size_t copy_token(const char **buf,
3140 char *token,
3141 size_t token_size)
3142{
3143 size_t len;
3144
3145 len = next_token(buf);
3146 if (len < token_size) {
3147 memcpy(token, *buf, len);
3148 *(token + len) = '\0';
3149 }
3150 *buf += len;
3151
3152 return len;
3153}
3154
ea3352f4
AE
3155/*
3156 * Finds the next token in *buf, dynamically allocates a buffer big
3157 * enough to hold a copy of it, and copies the token into the new
3158 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3159 * that a duplicate buffer is created even for a zero-length token.
3160 *
3161 * Returns a pointer to the newly-allocated duplicate, or a null
3162 * pointer if memory for the duplicate was not available. If
3163 * the lenp argument is a non-null pointer, the length of the token
3164 * (not including the '\0') is returned in *lenp.
3165 *
3166 * If successful, the *buf pointer will be updated to point beyond
3167 * the end of the found token.
3168 *
3169 * Note: uses GFP_KERNEL for allocation.
3170 */
3171static inline char *dup_token(const char **buf, size_t *lenp)
3172{
3173 char *dup;
3174 size_t len;
3175
3176 len = next_token(buf);
4caf35f9 3177 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
ea3352f4
AE
3178 if (!dup)
3179 return NULL;
ea3352f4
AE
3180 *(dup + len) = '\0';
3181 *buf += len;
3182
3183 if (lenp)
3184 *lenp = len;
3185
3186 return dup;
3187}
3188
a725f65e 3189/*
859c31df
AE
3190 * Parse the options provided for an "rbd add" (i.e., rbd image
3191 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3192 * and the data written is passed here via a NUL-terminated buffer.
3193 * Returns 0 if successful or an error code otherwise.
d22f76e7 3194 *
859c31df
AE
3195 * The information extracted from these options is recorded in
3196 * the other parameters which return dynamically-allocated
3197 * structures:
3198 * ceph_opts
3199 * The address of a pointer that will refer to a ceph options
3200 * structure. Caller must release the returned pointer using
3201 * ceph_destroy_options() when it is no longer needed.
3202 * rbd_opts
3203 * Address of an rbd options pointer. Fully initialized by
3204 * this function; caller must release with kfree().
3205 * spec
3206 * Address of an rbd image specification pointer. Fully
3207 * initialized by this function based on parsed options.
3208 * Caller must release with rbd_spec_put().
3209 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot name is
 *      provided.  Snapshot mappings are always read-only.
a725f65e 3229 */
859c31df 3230static int rbd_add_parse_args(const char *buf,
dc79b113 3231 struct ceph_options **ceph_opts,
859c31df
AE
3232 struct rbd_options **opts,
3233 struct rbd_spec **rbd_spec)
e28fff26 3234{
d22f76e7 3235 size_t len;
859c31df 3236 char *options;
0ddebc0c
AE
3237 const char *mon_addrs;
3238 size_t mon_addrs_size;
859c31df 3239 struct rbd_spec *spec = NULL;
4e9afeba 3240 struct rbd_options *rbd_opts = NULL;
859c31df 3241 struct ceph_options *copts;
dc79b113 3242 int ret;
e28fff26
AE
3243
3244 /* The first four tokens are required */
3245
7ef3214a 3246 len = next_token(&buf);
4fb5d671
AE
3247 if (!len) {
3248 rbd_warn(NULL, "no monitor address(es) provided");
3249 return -EINVAL;
3250 }
0ddebc0c 3251 mon_addrs = buf;
f28e565a 3252 mon_addrs_size = len + 1;
7ef3214a 3253 buf += len;
a725f65e 3254
dc79b113 3255 ret = -EINVAL;
f28e565a
AE
3256 options = dup_token(&buf, NULL);
3257 if (!options)
dc79b113 3258 return -ENOMEM;
4fb5d671
AE
3259 if (!*options) {
3260 rbd_warn(NULL, "no options provided");
3261 goto out_err;
3262 }
e28fff26 3263
859c31df
AE
3264 spec = rbd_spec_alloc();
3265 if (!spec)
f28e565a 3266 goto out_mem;
859c31df
AE
3267
3268 spec->pool_name = dup_token(&buf, NULL);
3269 if (!spec->pool_name)
3270 goto out_mem;
4fb5d671
AE
3271 if (!*spec->pool_name) {
3272 rbd_warn(NULL, "no pool name provided");
3273 goto out_err;
3274 }
e28fff26 3275
69e7a02f 3276 spec->image_name = dup_token(&buf, NULL);
859c31df 3277 if (!spec->image_name)
f28e565a 3278 goto out_mem;
4fb5d671
AE
3279 if (!*spec->image_name) {
3280 rbd_warn(NULL, "no image name provided");
3281 goto out_err;
3282 }
d4b125e9 3283
f28e565a
AE
3284 /*
3285 * Snapshot name is optional; default is to use "-"
3286 * (indicating the head/no snapshot).
3287 */
3feeb894 3288 len = next_token(&buf);
820a5f3e 3289 if (!len) {
3feeb894
AE
3290 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3291 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
f28e565a 3292 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
dc79b113 3293 ret = -ENAMETOOLONG;
f28e565a 3294 goto out_err;
849b4260 3295 }
4caf35f9 3296 spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
859c31df 3297 if (!spec->snap_name)
f28e565a 3298 goto out_mem;
859c31df 3299 *(spec->snap_name + len) = '\0';
e5c35534 3300
0ddebc0c 3301 /* Initialize all rbd options to the defaults */
e28fff26 3302
4e9afeba
AE
3303 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3304 if (!rbd_opts)
3305 goto out_mem;
3306
3307 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
d22f76e7 3308
859c31df 3309 copts = ceph_parse_options(options, mon_addrs,
0ddebc0c 3310 mon_addrs + mon_addrs_size - 1,
4e9afeba 3311 parse_rbd_opts_token, rbd_opts);
859c31df
AE
3312 if (IS_ERR(copts)) {
3313 ret = PTR_ERR(copts);
dc79b113
AE
3314 goto out_err;
3315 }
859c31df
AE
3316 kfree(options);
3317
3318 *ceph_opts = copts;
4e9afeba 3319 *opts = rbd_opts;
859c31df 3320 *rbd_spec = spec;
0ddebc0c 3321
dc79b113 3322 return 0;
f28e565a 3323out_mem:
dc79b113 3324 ret = -ENOMEM;
d22f76e7 3325out_err:
859c31df
AE
3326 kfree(rbd_opts);
3327 rbd_spec_put(spec);
f28e565a 3328 kfree(options);
d22f76e7 3329
dc79b113 3330 return ret;
a725f65e
AE
3331}
3332
589d30e0
AE
3333/*
3334 * An rbd format 2 image has a unique identifier, distinct from the
3335 * name given to it by the user. Internally, that identifier is
3336 * what's used to specify the names of objects related to the image.
3337 *
3338 * A special "rbd id" object is used to map an rbd image name to its
3339 * id. If that object doesn't exist, then there is no v2 rbd image
3340 * with the supplied name.
3341 *
3342 * This function will record the given rbd_dev's image_id field if
3343 * it can be determined, and in that case will return 0. If any
3344 * errors occur a negative errno will be returned and the rbd_dev's
3345 * image_id field will be unchanged (and should be NULL).
3346 */
3347static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3348{
3349 int ret;
3350 size_t size;
3351 char *object_name;
3352 void *response;
3353 void *p;
3354
2c0d0a10
AE
3355 /*
3356 * When probing a parent image, the image id is already
3357 * known (and the image name likely is not). There's no
3358 * need to fetch the image id again in this case.
3359 */
3360 if (rbd_dev->spec->image_id)
3361 return 0;
3362
589d30e0
AE
3363 /*
3364 * First, see if the format 2 image id file exists, and if
3365 * so, get the image's persistent id from it.
3366 */
69e7a02f 3367 size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
589d30e0
AE
3368 object_name = kmalloc(size, GFP_NOIO);
3369 if (!object_name)
3370 return -ENOMEM;
0d7dbfce 3371 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
589d30e0
AE
3372 dout("rbd id object name is %s\n", object_name);
3373
3374 /* Response will be an encoded string, which includes a length */
3375
3376 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3377 response = kzalloc(size, GFP_NOIO);
3378 if (!response) {
3379 ret = -ENOMEM;
3380 goto out;
3381 }
3382
3383 ret = rbd_req_sync_exec(rbd_dev, object_name,
3384 "rbd", "get_id",
3385 NULL, 0,
07b2391f 3386 response, RBD_IMAGE_ID_LEN_MAX, NULL);
589d30e0
AE
3387 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3388 if (ret < 0)
3389 goto out;
a0ea3a40 3390 ret = 0; /* rbd_req_sync_exec() can return positive */
589d30e0
AE
3391
3392 p = response;
0d7dbfce 3393 rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
589d30e0 3394 p + RBD_IMAGE_ID_LEN_MAX,
979ed480 3395 NULL, GFP_NOIO);
0d7dbfce
AE
3396 if (IS_ERR(rbd_dev->spec->image_id)) {
3397 ret = PTR_ERR(rbd_dev->spec->image_id);
3398 rbd_dev->spec->image_id = NULL;
589d30e0 3399 } else {
0d7dbfce 3400 dout("image_id is %s\n", rbd_dev->spec->image_id);
589d30e0
AE
3401 }
3402out:
3403 kfree(response);
3404 kfree(object_name);
3405
3406 return ret;
3407}
3408
a30b71b9
AE
3409static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3410{
3411 int ret;
3412 size_t size;
3413
3414 /* Version 1 images have no id; empty string is used */
3415
0d7dbfce
AE
3416 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3417 if (!rbd_dev->spec->image_id)
a30b71b9 3418 return -ENOMEM;
a30b71b9
AE
3419
3420 /* Record the header object name for this rbd image. */
3421
69e7a02f 3422 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
a30b71b9
AE
3423 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3424 if (!rbd_dev->header_name) {
3425 ret = -ENOMEM;
3426 goto out_err;
3427 }
0d7dbfce
AE
3428 sprintf(rbd_dev->header_name, "%s%s",
3429 rbd_dev->spec->image_name, RBD_SUFFIX);
a30b71b9
AE
3430
3431 /* Populate rbd image metadata */
3432
3433 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3434 if (ret < 0)
3435 goto out_err;
86b00e0d
AE
3436
3437 /* Version 1 images have no parent (no layering) */
3438
3439 rbd_dev->parent_spec = NULL;
3440 rbd_dev->parent_overlap = 0;
3441
a30b71b9
AE
3442 rbd_dev->image_format = 1;
3443
3444 dout("discovered version 1 image, header name is %s\n",
3445 rbd_dev->header_name);
3446
3447 return 0;
3448
3449out_err:
3450 kfree(rbd_dev->header_name);
3451 rbd_dev->header_name = NULL;
0d7dbfce
AE
3452 kfree(rbd_dev->spec->image_id);
3453 rbd_dev->spec->image_id = NULL;
a30b71b9
AE
3454
3455 return ret;
3456}
3457
3458static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3459{
3460 size_t size;
9d475de5 3461 int ret;
6e14b1a6 3462 u64 ver = 0;
a30b71b9
AE
3463
3464 /*
3465 * Image id was filled in by the caller. Record the header
3466 * object name for this rbd image.
3467 */
979ed480 3468 size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
a30b71b9
AE
3469 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3470 if (!rbd_dev->header_name)
3471 return -ENOMEM;
3472 sprintf(rbd_dev->header_name, "%s%s",
0d7dbfce 3473 RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
9d475de5
AE
3474
3475 /* Get the size and object order for the image */
3476
3477 ret = rbd_dev_v2_image_size(rbd_dev);
1e130199
AE
3478 if (ret < 0)
3479 goto out_err;
3480
3481 /* Get the object prefix (a.k.a. block_name) for the image */
3482
3483 ret = rbd_dev_v2_object_prefix(rbd_dev);
b1b5402a
AE
3484 if (ret < 0)
3485 goto out_err;
3486
d889140c 3487 /* Get the and check features for the image */
b1b5402a
AE
3488
3489 ret = rbd_dev_v2_features(rbd_dev);
9d475de5
AE
3490 if (ret < 0)
3491 goto out_err;
35d489f9 3492
86b00e0d
AE
3493 /* If the image supports layering, get the parent info */
3494
3495 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
3496 ret = rbd_dev_v2_parent_info(rbd_dev);
3497 if (ret < 0)
3498 goto out_err;
3499 }
3500
6e14b1a6
AE
3501 /* crypto and compression type aren't (yet) supported for v2 images */
3502
3503 rbd_dev->header.crypt_type = 0;
3504 rbd_dev->header.comp_type = 0;
35d489f9 3505
6e14b1a6
AE
3506 /* Get the snapshot context, plus the header version */
3507
3508 ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
35d489f9
AE
3509 if (ret)
3510 goto out_err;
6e14b1a6
AE
3511 rbd_dev->header.obj_version = ver;
3512
a30b71b9
AE
3513 rbd_dev->image_format = 2;
3514
3515 dout("discovered version 2 image, header name is %s\n",
3516 rbd_dev->header_name);
3517
35152979 3518 return 0;
9d475de5 3519out_err:
86b00e0d
AE
3520 rbd_dev->parent_overlap = 0;
3521 rbd_spec_put(rbd_dev->parent_spec);
3522 rbd_dev->parent_spec = NULL;
9d475de5
AE
3523 kfree(rbd_dev->header_name);
3524 rbd_dev->header_name = NULL;
1e130199
AE
3525 kfree(rbd_dev->header.object_prefix);
3526 rbd_dev->header.object_prefix = NULL;
9d475de5
AE
3527
3528 return ret;
a30b71b9
AE
3529}
3530
83a06263
AE
3531static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
3532{
3533 int ret;
3534
3535 /* no need to lock here, as rbd_dev is not registered yet */
3536 ret = rbd_dev_snaps_update(rbd_dev);
3537 if (ret)
3538 return ret;
3539
9e15b77d
AE
3540 ret = rbd_dev_probe_update_spec(rbd_dev);
3541 if (ret)
3542 goto err_out_snaps;
3543
83a06263
AE
3544 ret = rbd_dev_set_mapping(rbd_dev);
3545 if (ret)
3546 goto err_out_snaps;
3547
3548 /* generate unique id: find highest unique id, add one */
3549 rbd_dev_id_get(rbd_dev);
3550
3551 /* Fill in the device name, now that we have its id. */
3552 BUILD_BUG_ON(DEV_NAME_LEN
3553 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3554 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3555
3556 /* Get our block major device number. */
3557
3558 ret = register_blkdev(0, rbd_dev->name);
3559 if (ret < 0)
3560 goto err_out_id;
3561 rbd_dev->major = ret;
3562
3563 /* Set up the blkdev mapping. */
3564
3565 ret = rbd_init_disk(rbd_dev);
3566 if (ret)
3567 goto err_out_blkdev;
3568
3569 ret = rbd_bus_add_dev(rbd_dev);
3570 if (ret)
3571 goto err_out_disk;
3572
3573 /*
3574 * At this point cleanup in the event of an error is the job
3575 * of the sysfs code (initiated by rbd_bus_del_dev()).
3576 */
3577 down_write(&rbd_dev->header_rwsem);
3578 ret = rbd_dev_snaps_register(rbd_dev);
3579 up_write(&rbd_dev->header_rwsem);
3580 if (ret)
3581 goto err_out_bus;
3582
3583 ret = rbd_init_watch_dev(rbd_dev);
3584 if (ret)
3585 goto err_out_bus;
3586
3587 /* Everything's ready. Announce the disk to the world. */
3588
3589 add_disk(rbd_dev->disk);
3590
3591 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3592 (unsigned long long) rbd_dev->mapping.size);
3593
3594 return ret;
3595err_out_bus:
3596 /* this will also clean up rest of rbd_dev stuff */
3597
3598 rbd_bus_del_dev(rbd_dev);
3599
3600 return ret;
3601err_out_disk:
3602 rbd_free_disk(rbd_dev);
3603err_out_blkdev:
3604 unregister_blkdev(rbd_dev->major, rbd_dev->name);
3605err_out_id:
3606 rbd_dev_id_put(rbd_dev);
3607err_out_snaps:
3608 rbd_remove_all_snaps(rbd_dev);
3609
3610 return ret;
3611}
3612
a30b71b9
AE
3613/*
3614 * Probe for the existence of the header object for the given rbd
3615 * device. For format 2 images this includes determining the image
3616 * id.
3617 */
3618static int rbd_dev_probe(struct rbd_device *rbd_dev)
3619{
3620 int ret;
3621
3622 /*
3623 * Get the id from the image id object. If it's not a
3624 * format 2 image, we'll get ENOENT back, and we'll assume
3625 * it's a format 1 image.
3626 */
3627 ret = rbd_dev_image_id(rbd_dev);
3628 if (ret)
3629 ret = rbd_dev_v1_probe(rbd_dev);
3630 else
3631 ret = rbd_dev_v2_probe(rbd_dev);
83a06263 3632 if (ret) {
a30b71b9
AE
3633 dout("probe failed, returning %d\n", ret);
3634
83a06263
AE
3635 return ret;
3636 }
3637
3638 ret = rbd_dev_probe_finish(rbd_dev);
3639 if (ret)
3640 rbd_header_free(&rbd_dev->header);
3641
a30b71b9
AE
3642 return ret;
3643}
3644
59c2be1e
YS
3645static ssize_t rbd_add(struct bus_type *bus,
3646 const char *buf,
3647 size_t count)
602adf40 3648{
cb8627c7 3649 struct rbd_device *rbd_dev = NULL;
dc79b113 3650 struct ceph_options *ceph_opts = NULL;
4e9afeba 3651 struct rbd_options *rbd_opts = NULL;
859c31df 3652 struct rbd_spec *spec = NULL;
9d3997fd 3653 struct rbd_client *rbdc;
27cc2594
AE
3654 struct ceph_osd_client *osdc;
3655 int rc = -ENOMEM;
602adf40
YS
3656
3657 if (!try_module_get(THIS_MODULE))
3658 return -ENODEV;
3659
602adf40 3660 /* parse add command */
859c31df 3661 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
dc79b113 3662 if (rc < 0)
bd4ba655 3663 goto err_out_module;
78cea76e 3664
9d3997fd
AE
3665 rbdc = rbd_get_client(ceph_opts);
3666 if (IS_ERR(rbdc)) {
3667 rc = PTR_ERR(rbdc);
0ddebc0c 3668 goto err_out_args;
9d3997fd 3669 }
c53d5893 3670 ceph_opts = NULL; /* rbd_dev client now owns this */
602adf40 3671
602adf40 3672 /* pick the pool */
9d3997fd 3673 osdc = &rbdc->client->osdc;
859c31df 3674 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
602adf40
YS
3675 if (rc < 0)
3676 goto err_out_client;
859c31df
AE
3677 spec->pool_id = (u64) rc;
3678
0903e875
AE
3679 /* The ceph file layout needs to fit pool id in 32 bits */
3680
3681 if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
3682 rc = -EIO;
3683 goto err_out_client;
3684 }
3685
c53d5893 3686 rbd_dev = rbd_dev_create(rbdc, spec);
bd4ba655
AE
3687 if (!rbd_dev)
3688 goto err_out_client;
c53d5893
AE
3689 rbdc = NULL; /* rbd_dev now owns this */
3690 spec = NULL; /* rbd_dev now owns this */
602adf40 3691
bd4ba655 3692 rbd_dev->mapping.read_only = rbd_opts->read_only;
c53d5893
AE
3693 kfree(rbd_opts);
3694 rbd_opts = NULL; /* done with this */
bd4ba655 3695
a30b71b9
AE
3696 rc = rbd_dev_probe(rbd_dev);
3697 if (rc < 0)
c53d5893 3698 goto err_out_rbd_dev;
05fd6f6f 3699
602adf40 3700 return count;
c53d5893
AE
3701err_out_rbd_dev:
3702 rbd_dev_destroy(rbd_dev);
bd4ba655 3703err_out_client:
9d3997fd 3704 rbd_put_client(rbdc);
0ddebc0c 3705err_out_args:
78cea76e
AE
3706 if (ceph_opts)
3707 ceph_destroy_options(ceph_opts);
4e9afeba 3708 kfree(rbd_opts);
859c31df 3709 rbd_spec_put(spec);
bd4ba655
AE
3710err_out_module:
3711 module_put(THIS_MODULE);
27cc2594 3712
602adf40 3713 dout("Error adding device %s\n", buf);
27cc2594
AE
3714
3715 return (ssize_t) rc;
602adf40
YS
3716}
3717
de71a297 3718static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
3719{
3720 struct list_head *tmp;
3721 struct rbd_device *rbd_dev;
3722
e124a82f 3723 spin_lock(&rbd_dev_list_lock);
602adf40
YS
3724 list_for_each(tmp, &rbd_dev_list) {
3725 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 3726 if (rbd_dev->dev_id == dev_id) {
e124a82f 3727 spin_unlock(&rbd_dev_list_lock);
602adf40 3728 return rbd_dev;
e124a82f 3729 }
602adf40 3730 }
e124a82f 3731 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
3732 return NULL;
3733}
3734
dfc5606d 3735static void rbd_dev_release(struct device *dev)
602adf40 3736{
593a9e7b 3737 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 3738
1dbb4399
AE
3739 if (rbd_dev->watch_request) {
3740 struct ceph_client *client = rbd_dev->rbd_client->client;
3741
3742 ceph_osdc_unregister_linger_request(&client->osdc,
59c2be1e 3743 rbd_dev->watch_request);
1dbb4399 3744 }
59c2be1e 3745 if (rbd_dev->watch_event)
907703d0 3746 rbd_req_sync_watch(rbd_dev, 0);
602adf40
YS
3747
3748 /* clean up and free blkdev */
3749 rbd_free_disk(rbd_dev);
3750 unregister_blkdev(rbd_dev->major, rbd_dev->name);
32eec68d 3751
2ac4e75d
AE
3752 /* release allocated disk header fields */
3753 rbd_header_free(&rbd_dev->header);
3754
32eec68d 3755 /* done with the id, and with the rbd_dev */
e2839308 3756 rbd_dev_id_put(rbd_dev);
c53d5893
AE
3757 rbd_assert(rbd_dev->rbd_client != NULL);
3758 rbd_dev_destroy(rbd_dev);
602adf40
YS
3759
3760 /* release module ref */
3761 module_put(THIS_MODULE);
602adf40
YS
3762}
3763
dfc5606d
YS
3764static ssize_t rbd_remove(struct bus_type *bus,
3765 const char *buf,
3766 size_t count)
602adf40
YS
3767{
3768 struct rbd_device *rbd_dev = NULL;
3769 int target_id, rc;
3770 unsigned long ul;
3771 int ret = count;
3772
3773 rc = strict_strtoul(buf, 10, &ul);
3774 if (rc)
3775 return rc;
3776
3777 /* convert to int; abort if we lost anything in the conversion */
3778 target_id = (int) ul;
3779 if (target_id != ul)
3780 return -EINVAL;
3781
3782 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3783
3784 rbd_dev = __rbd_get_dev(target_id);
3785 if (!rbd_dev) {
3786 ret = -ENOENT;
3787 goto done;
42382b70
AE
3788 }
3789
3790 if (rbd_dev->open_count) {
3791 ret = -EBUSY;
3792 goto done;
602adf40
YS
3793 }
3794
41f38c2b 3795 rbd_remove_all_snaps(rbd_dev);
dfc5606d 3796 rbd_bus_del_dev(rbd_dev);
602adf40
YS
3797
3798done:
3799 mutex_unlock(&ctl_mutex);
aafb230e 3800
602adf40
YS
3801 return ret;
3802}
3803
602adf40
YS
3804/*
3805 * create control files in sysfs
dfc5606d 3806 * /sys/bus/rbd/...
602adf40
YS
3807 */
3808static int rbd_sysfs_init(void)
3809{
dfc5606d 3810 int ret;
602adf40 3811
fed4c143 3812 ret = device_register(&rbd_root_dev);
21079786 3813 if (ret < 0)
dfc5606d 3814 return ret;
602adf40 3815
fed4c143
AE
3816 ret = bus_register(&rbd_bus_type);
3817 if (ret < 0)
3818 device_unregister(&rbd_root_dev);
602adf40 3819
602adf40
YS
3820 return ret;
3821}
3822
3823static void rbd_sysfs_cleanup(void)
3824{
dfc5606d 3825 bus_unregister(&rbd_bus_type);
fed4c143 3826 device_unregister(&rbd_root_dev);
602adf40
YS
3827}
3828
3829int __init rbd_init(void)
3830{
3831 int rc;
3832
3833 rc = rbd_sysfs_init();
3834 if (rc)
3835 return rc;
f0f8cef5 3836 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
3837 return 0;
3838}
3839
3840void __exit rbd_exit(void)
3841{
3842 rbd_sysfs_cleanup();
3843}
3844
3845module_init(rbd_init);
3846module_exit(rbd_exit);
3847
3848MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3849MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3850MODULE_DESCRIPTION("rados block device");
3851
3852/* following authorship retained from original osdblk.c */
3853MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3854
3855MODULE_LICENSE("GPL");
This page took 0.382075 seconds and 5 git commands to generate.