Commit | Line | Data |
---|---|---|
5c83746a | 1 | /* |
d9186c03 | 2 | * Copyright (c) 2014-2016 Christoph Hellwig. |
5c83746a CH |
3 | */ |
4 | #include <linux/sunrpc/svc.h> | |
5 | #include <linux/blkdev.h> | |
6 | #include <linux/nfs4.h> | |
7 | #include <linux/nfs_fs.h> | |
8 | #include <linux/nfs_xdr.h> | |
d9186c03 | 9 | #include <linux/pr.h> |
5c83746a CH |
10 | |
11 | #include "blocklayout.h" | |
12 | ||
13 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | |
14 | ||
15 | static void | |
16 | bl_free_device(struct pnfs_block_dev *dev) | |
17 | { | |
18 | if (dev->nr_children) { | |
19 | int i; | |
20 | ||
21 | for (i = 0; i < dev->nr_children; i++) | |
22 | bl_free_device(&dev->children[i]); | |
23 | kfree(dev->children); | |
24 | } else { | |
d9186c03 CH |
25 | if (dev->pr_registered) { |
26 | const struct pr_ops *ops = | |
27 | dev->bdev->bd_disk->fops->pr_ops; | |
28 | int error; | |
29 | ||
30 | error = ops->pr_register(dev->bdev, dev->pr_key, 0, | |
31 | false); | |
32 | if (error) | |
33 | pr_err("failed to unregister PR key.\n"); | |
34 | } | |
35 | ||
5c83746a | 36 | if (dev->bdev) |
513d6d7a | 37 | blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE); |
5c83746a CH |
38 | } |
39 | } | |
40 | ||
41 | void | |
42 | bl_free_deviceid_node(struct nfs4_deviceid_node *d) | |
43 | { | |
44 | struct pnfs_block_dev *dev = | |
45 | container_of(d, struct pnfs_block_dev, node); | |
46 | ||
47 | bl_free_device(dev); | |
84a80f62 | 48 | kfree_rcu(dev, node.rcu); |
5c83746a CH |
49 | } |
50 | ||
51 | static int | |
52 | nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) | |
53 | { | |
54 | __be32 *p; | |
55 | int i; | |
56 | ||
57 | p = xdr_inline_decode(xdr, 4); | |
58 | if (!p) | |
59 | return -EIO; | |
60 | b->type = be32_to_cpup(p++); | |
61 | ||
62 | switch (b->type) { | |
63 | case PNFS_BLOCK_VOLUME_SIMPLE: | |
64 | p = xdr_inline_decode(xdr, 4); | |
65 | if (!p) | |
66 | return -EIO; | |
67 | b->simple.nr_sigs = be32_to_cpup(p++); | |
c77efc1e KM |
68 | if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) { |
69 | dprintk("Bad signature count: %d\n", b->simple.nr_sigs); | |
5c83746a CH |
70 | return -EIO; |
71 | } | |
72 | ||
73 | b->simple.len = 4 + 4; | |
74 | for (i = 0; i < b->simple.nr_sigs; i++) { | |
75 | p = xdr_inline_decode(xdr, 8 + 4); | |
76 | if (!p) | |
77 | return -EIO; | |
78 | p = xdr_decode_hyper(p, &b->simple.sigs[i].offset); | |
79 | b->simple.sigs[i].sig_len = be32_to_cpup(p++); | |
2bd3c63a CH |
80 | if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) { |
81 | pr_info("signature too long: %d\n", | |
82 | b->simple.sigs[i].sig_len); | |
83 | return -EIO; | |
84 | } | |
5c83746a CH |
85 | |
86 | p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len); | |
87 | if (!p) | |
88 | return -EIO; | |
89 | memcpy(&b->simple.sigs[i].sig, p, | |
90 | b->simple.sigs[i].sig_len); | |
91 | ||
ecc2b88c KM |
92 | b->simple.len += 8 + 4 + \ |
93 | (XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2); | |
5c83746a CH |
94 | } |
95 | break; | |
96 | case PNFS_BLOCK_VOLUME_SLICE: | |
97 | p = xdr_inline_decode(xdr, 8 + 8 + 4); | |
98 | if (!p) | |
99 | return -EIO; | |
100 | p = xdr_decode_hyper(p, &b->slice.start); | |
101 | p = xdr_decode_hyper(p, &b->slice.len); | |
102 | b->slice.volume = be32_to_cpup(p++); | |
103 | break; | |
104 | case PNFS_BLOCK_VOLUME_CONCAT: | |
105 | p = xdr_inline_decode(xdr, 4); | |
106 | if (!p) | |
107 | return -EIO; | |
c77efc1e | 108 | |
5c83746a | 109 | b->concat.volumes_count = be32_to_cpup(p++); |
c77efc1e KM |
110 | if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) { |
111 | dprintk("Too many volumes: %d\n", b->concat.volumes_count); | |
112 | return -EIO; | |
113 | } | |
5c83746a CH |
114 | |
115 | p = xdr_inline_decode(xdr, b->concat.volumes_count * 4); | |
116 | if (!p) | |
117 | return -EIO; | |
118 | for (i = 0; i < b->concat.volumes_count; i++) | |
119 | b->concat.volumes[i] = be32_to_cpup(p++); | |
120 | break; | |
121 | case PNFS_BLOCK_VOLUME_STRIPE: | |
122 | p = xdr_inline_decode(xdr, 8 + 4); | |
123 | if (!p) | |
124 | return -EIO; | |
c77efc1e | 125 | |
5c83746a CH |
126 | p = xdr_decode_hyper(p, &b->stripe.chunk_size); |
127 | b->stripe.volumes_count = be32_to_cpup(p++); | |
c77efc1e KM |
128 | if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) { |
129 | dprintk("Too many volumes: %d\n", b->stripe.volumes_count); | |
130 | return -EIO; | |
131 | } | |
5c83746a CH |
132 | |
133 | p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); | |
134 | if (!p) | |
135 | return -EIO; | |
136 | for (i = 0; i < b->stripe.volumes_count; i++) | |
137 | b->stripe.volumes[i] = be32_to_cpup(p++); | |
138 | break; | |
d9186c03 CH |
139 | case PNFS_BLOCK_VOLUME_SCSI: |
140 | p = xdr_inline_decode(xdr, 4 + 4 + 4); | |
141 | if (!p) | |
142 | return -EIO; | |
143 | b->scsi.code_set = be32_to_cpup(p++); | |
144 | b->scsi.designator_type = be32_to_cpup(p++); | |
145 | b->scsi.designator_len = be32_to_cpup(p++); | |
146 | p = xdr_inline_decode(xdr, b->scsi.designator_len); | |
147 | if (!p) | |
148 | return -EIO; | |
149 | if (b->scsi.designator_len > 256) | |
150 | return -EIO; | |
151 | memcpy(&b->scsi.designator, p, b->scsi.designator_len); | |
152 | p = xdr_inline_decode(xdr, 8); | |
153 | if (!p) | |
154 | return -EIO; | |
155 | p = xdr_decode_hyper(p, &b->scsi.pr_key); | |
156 | break; | |
5c83746a CH |
157 | default: |
158 | dprintk("unknown volume type!\n"); | |
159 | return -EIO; | |
160 | } | |
161 | ||
162 | return 0; | |
163 | } | |
164 | ||
165 | static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset, | |
166 | struct pnfs_block_dev_map *map) | |
167 | { | |
168 | map->start = dev->start; | |
169 | map->len = dev->len; | |
170 | map->disk_offset = dev->disk_offset; | |
171 | map->bdev = dev->bdev; | |
172 | return true; | |
173 | } | |
174 | ||
175 | static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset, | |
176 | struct pnfs_block_dev_map *map) | |
177 | { | |
178 | int i; | |
179 | ||
180 | for (i = 0; i < dev->nr_children; i++) { | |
181 | struct pnfs_block_dev *child = &dev->children[i]; | |
182 | ||
183 | if (child->start > offset || | |
184 | child->start + child->len <= offset) | |
185 | continue; | |
186 | ||
187 | child->map(child, offset - child->start, map); | |
188 | return true; | |
189 | } | |
190 | ||
191 | dprintk("%s: ran off loop!\n", __func__); | |
192 | return false; | |
193 | } | |
194 | ||
195 | static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset, | |
196 | struct pnfs_block_dev_map *map) | |
197 | { | |
198 | struct pnfs_block_dev *child; | |
5466112f TM |
199 | u64 chunk; |
200 | u32 chunk_idx; | |
5c83746a CH |
201 | u64 disk_offset; |
202 | ||
5466112f TM |
203 | chunk = div_u64(offset, dev->chunk_size); |
204 | div_u64_rem(chunk, dev->nr_children, &chunk_idx); | |
205 | ||
5c83746a CH |
206 | if (chunk_idx > dev->nr_children) { |
207 | dprintk("%s: invalid chunk idx %d (%lld/%lld)\n", | |
208 | __func__, chunk_idx, offset, dev->chunk_size); | |
209 | /* error, should not happen */ | |
210 | return false; | |
211 | } | |
212 | ||
213 | /* truncate offset to the beginning of the stripe */ | |
214 | offset = chunk * dev->chunk_size; | |
215 | ||
216 | /* disk offset of the stripe */ | |
5466112f | 217 | disk_offset = div_u64(offset, dev->nr_children); |
5c83746a CH |
218 | |
219 | child = &dev->children[chunk_idx]; | |
220 | child->map(child, disk_offset, map); | |
221 | ||
222 | map->start += offset; | |
223 | map->disk_offset += disk_offset; | |
224 | map->len = dev->chunk_size; | |
225 | return true; | |
226 | } | |
227 | ||
228 | static int | |
229 | bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, | |
230 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask); | |
231 | ||
232 | ||
233 | static int | |
234 | bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, | |
235 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) | |
236 | { | |
237 | struct pnfs_block_volume *v = &volumes[idx]; | |
297fae4d | 238 | struct block_device *bdev; |
5c83746a CH |
239 | dev_t dev; |
240 | ||
241 | dev = bl_resolve_deviceid(server, v, gfp_mask); | |
242 | if (!dev) | |
243 | return -EIO; | |
244 | ||
297fae4d AS |
245 | bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); |
246 | if (IS_ERR(bdev)) { | |
5c83746a | 247 | printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", |
297fae4d AS |
248 | MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); |
249 | return PTR_ERR(bdev); | |
5c83746a | 250 | } |
297fae4d | 251 | d->bdev = bdev; |
5c83746a CH |
252 | |
253 | ||
254 | d->len = i_size_read(d->bdev->bd_inode); | |
255 | d->map = bl_map_simple; | |
256 | ||
257 | printk(KERN_INFO "pNFS: using block device %s\n", | |
258 | d->bdev->bd_disk->disk_name); | |
259 | return 0; | |
260 | } | |
261 | ||
d9186c03 CH |
262 | static bool |
263 | bl_validate_designator(struct pnfs_block_volume *v) | |
264 | { | |
265 | switch (v->scsi.designator_type) { | |
266 | case PS_DESIGNATOR_EUI64: | |
267 | if (v->scsi.code_set != PS_CODE_SET_BINARY) | |
268 | return false; | |
269 | ||
270 | if (v->scsi.designator_len != 8 && | |
271 | v->scsi.designator_len != 10 && | |
272 | v->scsi.designator_len != 16) | |
273 | return false; | |
274 | ||
275 | return true; | |
276 | case PS_DESIGNATOR_NAA: | |
277 | if (v->scsi.code_set != PS_CODE_SET_BINARY) | |
278 | return false; | |
279 | ||
280 | if (v->scsi.designator_len != 8 && | |
281 | v->scsi.designator_len != 16) | |
282 | return false; | |
283 | ||
284 | return true; | |
285 | case PS_DESIGNATOR_T10: | |
286 | case PS_DESIGNATOR_NAME: | |
287 | pr_err("pNFS: unsupported designator " | |
288 | "(code set %d, type %d, len %d.\n", | |
289 | v->scsi.code_set, | |
290 | v->scsi.designator_type, | |
291 | v->scsi.designator_len); | |
292 | return false; | |
293 | default: | |
294 | pr_err("pNFS: invalid designator " | |
295 | "(code set %d, type %d, len %d.\n", | |
296 | v->scsi.code_set, | |
297 | v->scsi.designator_type, | |
298 | v->scsi.designator_len); | |
299 | return false; | |
300 | } | |
301 | } | |
302 | ||
d702d41e CH |
303 | /* |
304 | * Try to open the udev path for the WWN. At least on Debian the udev | |
305 | * by-id path will always point to the dm-multipath device if one exists. | |
306 | */ | |
307 | static struct block_device * | |
308 | bl_open_udev_path(struct pnfs_block_volume *v) | |
309 | { | |
310 | struct block_device *bdev; | |
311 | const char *devname; | |
312 | ||
313 | devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN", | |
314 | v->scsi.designator_len, v->scsi.designator); | |
315 | if (!devname) | |
316 | return ERR_PTR(-ENOMEM); | |
317 | ||
318 | bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); | |
319 | if (IS_ERR(bdev)) { | |
320 | pr_warn("pNFS: failed to open device %s (%ld)\n", | |
321 | devname, PTR_ERR(bdev)); | |
322 | } | |
323 | ||
324 | kfree(devname); | |
325 | return bdev; | |
326 | } | |
327 | ||
11487ddb CH |
328 | /* |
329 | * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the | |
330 | * wwn- links will only point to the first discovered SCSI device there. | |
331 | */ | |
332 | static struct block_device * | |
333 | bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v) | |
334 | { | |
335 | struct block_device *bdev; | |
336 | const char *devname; | |
337 | ||
338 | devname = kasprintf(GFP_KERNEL, | |
339 | "/dev/disk/by-id/dm-uuid-mpath-%d%*phN", | |
340 | v->scsi.designator_type, | |
341 | v->scsi.designator_len, v->scsi.designator); | |
342 | if (!devname) | |
343 | return ERR_PTR(-ENOMEM); | |
344 | ||
345 | bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); | |
346 | kfree(devname); | |
347 | return bdev; | |
348 | } | |
349 | ||
d9186c03 CH |
350 | static int |
351 | bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, | |
352 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) | |
353 | { | |
354 | struct pnfs_block_volume *v = &volumes[idx]; | |
297fae4d | 355 | struct block_device *bdev; |
d9186c03 | 356 | const struct pr_ops *ops; |
d9186c03 CH |
357 | int error; |
358 | ||
359 | if (!bl_validate_designator(v)) | |
360 | return -EINVAL; | |
361 | ||
297fae4d AS |
362 | bdev = bl_open_dm_mpath_udev_path(v); |
363 | if (IS_ERR(bdev)) | |
364 | bdev = bl_open_udev_path(v); | |
365 | if (IS_ERR(bdev)) | |
366 | return PTR_ERR(bdev); | |
367 | d->bdev = bdev; | |
d9186c03 CH |
368 | |
369 | d->len = i_size_read(d->bdev->bd_inode); | |
370 | d->map = bl_map_simple; | |
371 | d->pr_key = v->scsi.pr_key; | |
372 | ||
373 | pr_info("pNFS: using block device %s (reservation key 0x%llx)\n", | |
374 | d->bdev->bd_disk->disk_name, d->pr_key); | |
375 | ||
376 | ops = d->bdev->bd_disk->fops->pr_ops; | |
377 | if (!ops) { | |
378 | pr_err("pNFS: block device %s does not support reservations.", | |
379 | d->bdev->bd_disk->disk_name); | |
380 | error = -EINVAL; | |
381 | goto out_blkdev_put; | |
382 | } | |
383 | ||
384 | error = ops->pr_register(d->bdev, 0, d->pr_key, true); | |
385 | if (error) { | |
386 | pr_err("pNFS: failed to register key for block device %s.", | |
387 | d->bdev->bd_disk->disk_name); | |
388 | goto out_blkdev_put; | |
389 | } | |
390 | ||
391 | d->pr_registered = true; | |
392 | return 0; | |
393 | ||
394 | out_blkdev_put: | |
0173ca05 | 395 | blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE); |
d9186c03 CH |
396 | return error; |
397 | } | |
398 | ||
5c83746a CH |
399 | static int |
400 | bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d, | |
401 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) | |
402 | { | |
403 | struct pnfs_block_volume *v = &volumes[idx]; | |
404 | int ret; | |
405 | ||
406 | ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask); | |
407 | if (ret) | |
408 | return ret; | |
409 | ||
410 | d->disk_offset = v->slice.start; | |
411 | d->len = v->slice.len; | |
412 | return 0; | |
413 | } | |
414 | ||
415 | static int | |
416 | bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d, | |
417 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) | |
418 | { | |
419 | struct pnfs_block_volume *v = &volumes[idx]; | |
420 | u64 len = 0; | |
421 | int ret, i; | |
422 | ||
423 | d->children = kcalloc(v->concat.volumes_count, | |
424 | sizeof(struct pnfs_block_dev), GFP_KERNEL); | |
425 | if (!d->children) | |
426 | return -ENOMEM; | |
427 | ||
428 | for (i = 0; i < v->concat.volumes_count; i++) { | |
429 | ret = bl_parse_deviceid(server, &d->children[i], | |
430 | volumes, v->concat.volumes[i], gfp_mask); | |
431 | if (ret) | |
432 | return ret; | |
433 | ||
434 | d->nr_children++; | |
435 | d->children[i].start += len; | |
436 | len += d->children[i].len; | |
437 | } | |
438 | ||
439 | d->len = len; | |
440 | d->map = bl_map_concat; | |
441 | return 0; | |
442 | } | |
443 | ||
444 | static int | |
445 | bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d, | |
446 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) | |
447 | { | |
448 | struct pnfs_block_volume *v = &volumes[idx]; | |
449 | u64 len = 0; | |
450 | int ret, i; | |
451 | ||
452 | d->children = kcalloc(v->stripe.volumes_count, | |
453 | sizeof(struct pnfs_block_dev), GFP_KERNEL); | |
454 | if (!d->children) | |
455 | return -ENOMEM; | |
456 | ||
457 | for (i = 0; i < v->stripe.volumes_count; i++) { | |
458 | ret = bl_parse_deviceid(server, &d->children[i], | |
459 | volumes, v->stripe.volumes[i], gfp_mask); | |
460 | if (ret) | |
461 | return ret; | |
462 | ||
463 | d->nr_children++; | |
464 | len += d->children[i].len; | |
465 | } | |
466 | ||
467 | d->len = len; | |
468 | d->chunk_size = v->stripe.chunk_size; | |
469 | d->map = bl_map_stripe; | |
470 | return 0; | |
471 | } | |
472 | ||
473 | static int | |
474 | bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, | |
475 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) | |
476 | { | |
477 | switch (volumes[idx].type) { | |
478 | case PNFS_BLOCK_VOLUME_SIMPLE: | |
479 | return bl_parse_simple(server, d, volumes, idx, gfp_mask); | |
480 | case PNFS_BLOCK_VOLUME_SLICE: | |
481 | return bl_parse_slice(server, d, volumes, idx, gfp_mask); | |
482 | case PNFS_BLOCK_VOLUME_CONCAT: | |
483 | return bl_parse_concat(server, d, volumes, idx, gfp_mask); | |
484 | case PNFS_BLOCK_VOLUME_STRIPE: | |
485 | return bl_parse_stripe(server, d, volumes, idx, gfp_mask); | |
d9186c03 CH |
486 | case PNFS_BLOCK_VOLUME_SCSI: |
487 | return bl_parse_scsi(server, d, volumes, idx, gfp_mask); | |
5c83746a CH |
488 | default: |
489 | dprintk("unsupported volume type: %d\n", volumes[idx].type); | |
490 | return -EIO; | |
491 | } | |
492 | } | |
493 | ||
494 | struct nfs4_deviceid_node * | |
495 | bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, | |
496 | gfp_t gfp_mask) | |
497 | { | |
498 | struct nfs4_deviceid_node *node = NULL; | |
499 | struct pnfs_block_volume *volumes; | |
500 | struct pnfs_block_dev *top; | |
501 | struct xdr_stream xdr; | |
502 | struct xdr_buf buf; | |
503 | struct page *scratch; | |
504 | int nr_volumes, ret, i; | |
505 | __be32 *p; | |
506 | ||
507 | scratch = alloc_page(gfp_mask); | |
508 | if (!scratch) | |
509 | goto out; | |
510 | ||
511 | xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); | |
512 | xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); | |
513 | ||
514 | p = xdr_inline_decode(&xdr, sizeof(__be32)); | |
515 | if (!p) | |
516 | goto out_free_scratch; | |
517 | nr_volumes = be32_to_cpup(p++); | |
518 | ||
519 | volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume), | |
520 | gfp_mask); | |
521 | if (!volumes) | |
522 | goto out_free_scratch; | |
523 | ||
524 | for (i = 0; i < nr_volumes; i++) { | |
525 | ret = nfs4_block_decode_volume(&xdr, &volumes[i]); | |
526 | if (ret < 0) | |
527 | goto out_free_volumes; | |
528 | } | |
529 | ||
530 | top = kzalloc(sizeof(*top), gfp_mask); | |
531 | if (!top) | |
532 | goto out_free_volumes; | |
533 | ||
534 | ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask); | |
535 | if (ret) { | |
536 | bl_free_device(top); | |
537 | kfree(top); | |
538 | goto out_free_volumes; | |
539 | } | |
540 | ||
541 | node = &top->node; | |
542 | nfs4_init_deviceid_node(node, server, &pdev->dev_id); | |
543 | ||
544 | out_free_volumes: | |
545 | kfree(volumes); | |
546 | out_free_scratch: | |
547 | __free_page(scratch); | |
548 | out: | |
549 | return node; | |
550 | } |