dm: allow immutable request-based targets to use blk-mq pdu
drivers/md/dm-mpath.c
1 /*
2 * Copyright (C) 2003 Sistina Software Limited.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8 #include <linux/device-mapper.h>
9
10 #include "dm.h"
11 #include "dm-path-selector.h"
12 #include "dm-uevent.h"
13
14 #include <linux/blkdev.h>
15 #include <linux/ctype.h>
16 #include <linux/init.h>
17 #include <linux/mempool.h>
18 #include <linux/module.h>
19 #include <linux/pagemap.h>
20 #include <linux/slab.h>
21 #include <linux/time.h>
22 #include <linux/workqueue.h>
23 #include <linux/delay.h>
24 #include <scsi/scsi_dh.h>
25 #include <linux/atomic.h>
26
27 #define DM_MSG_PREFIX "multipath"
28 #define DM_PG_INIT_DELAY_MSECS 2000
29 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
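/*
 * DM_PG_INIT_DELAY_DEFAULT is an "unset" sentinel: while pg_init_delay_msecs
 * still holds it, __pg_init_all_paths() falls back to DM_PG_INIT_DELAY_MSECS
 * and multipath_status() does not report a pg_init_delay_msecs feature arg.
 */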
30
31 /* Path properties */
32 struct pgpath {
33 struct list_head list;
34
35 struct priority_group *pg; /* Owning PG */
36 unsigned is_active; /* Path status */
37 unsigned fail_count; /* Cumulative failure count */
38
39 struct dm_path path;
40 struct delayed_work activate_path;
41 };
42
43 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
44
45 /*
46 * Paths are grouped into Priority Groups and numbered from 1 upwards.
47 * Each has a path selector which controls which path gets used.
48 */
49 struct priority_group {
50 struct list_head list;
51
52 struct multipath *m; /* Owning multipath instance */
53 struct path_selector ps;
54
55 unsigned pg_num; /* Reference number */
56 unsigned bypassed; /* Temporarily bypass this PG? */
57
58 unsigned nr_pgpaths; /* Number of paths in PG */
59 struct list_head pgpaths;
60 };
61
62 /* Multipath context */
63 struct multipath {
64 struct list_head list;
65 struct dm_target *ti;
66
67 const char *hw_handler_name;
68 char *hw_handler_params;
69
70 spinlock_t lock;
71
72 unsigned nr_priority_groups;
73 struct list_head priority_groups;
74
75 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
76
77 unsigned pg_init_required; /* pg_init needs calling? */
78 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
79 unsigned pg_init_delay_retry; /* Delay pg_init retry? */
80
81 unsigned nr_valid_paths; /* Total number of usable paths */
82 struct pgpath *current_pgpath;
83 struct priority_group *current_pg;
84 struct priority_group *next_pg; /* Switch to this PG if set */
85 unsigned repeat_count; /* I/Os left before calling PS again */
86
87 unsigned queue_io:1; /* Must we queue all I/O? */
88 unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */
89 unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
90 unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
91 unsigned pg_init_disabled:1; /* pg_init is not currently allowed */
92
93 unsigned pg_init_retries; /* Number of times to retry pg_init */
94 unsigned pg_init_count; /* Number of times pg_init called */
95 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
96
97 struct work_struct trigger_event;
98
99 /*
100 * We must use a mempool of dm_mpath_io structs so that we
101 * can resubmit bios on error.
102 */
103 mempool_t *mpio_pool;
104
105 struct mutex work_mutex;
106 };
107
108 /*
109 * Context information attached to each bio we process.
110 */
111 struct dm_mpath_io {
112 struct pgpath *pgpath;
113 size_t nr_bytes;
114 };
115
116 typedef int (*action_fn) (struct pgpath *pgpath);
117
118 static struct kmem_cache *_mpio_cache;
119
120 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
121 static void trigger_event(struct work_struct *work);
122 static void activate_path(struct work_struct *work);
123 static int __pgpath_busy(struct pgpath *pgpath);
124
125
126 /*-----------------------------------------------
127 * Allocation routines
128 *-----------------------------------------------*/
129
130 static struct pgpath *alloc_pgpath(void)
131 {
132 struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
133
134 if (pgpath) {
135 pgpath->is_active = 1;
136 INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
137 }
138
139 return pgpath;
140 }
141
142 static void free_pgpath(struct pgpath *pgpath)
143 {
144 kfree(pgpath);
145 }
146
147 static struct priority_group *alloc_priority_group(void)
148 {
149 struct priority_group *pg;
150
151 pg = kzalloc(sizeof(*pg), GFP_KERNEL);
152
153 if (pg)
154 INIT_LIST_HEAD(&pg->pgpaths);
155
156 return pg;
157 }
158
159 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
160 {
161 struct pgpath *pgpath, *tmp;
162
163 list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
164 list_del(&pgpath->list);
165 dm_put_device(ti, pgpath->path.dev);
166 free_pgpath(pgpath);
167 }
168 }
169
170 static void free_priority_group(struct priority_group *pg,
171 struct dm_target *ti)
172 {
173 struct path_selector *ps = &pg->ps;
174
175 if (ps->type) {
176 ps->type->destroy(ps);
177 dm_put_path_selector(ps->type);
178 }
179
180 free_pgpaths(&pg->pgpaths, ti);
181 kfree(pg);
182 }
183
184 static struct multipath *alloc_multipath(struct dm_target *ti)
185 {
186 struct multipath *m;
187 unsigned min_ios = dm_get_reserved_rq_based_ios();
188
189 m = kzalloc(sizeof(*m), GFP_KERNEL);
190 if (m) {
191 INIT_LIST_HEAD(&m->priority_groups);
192 spin_lock_init(&m->lock);
193 m->queue_io = 1;
194 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
195 INIT_WORK(&m->trigger_event, trigger_event);
196 init_waitqueue_head(&m->pg_init_wait);
197 mutex_init(&m->work_mutex);
198 m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
199 if (!m->mpio_pool) {
200 kfree(m);
201 return NULL;
202 }
203 m->ti = ti;
204 ti->private = m;
205 }
206
207 return m;
208 }
209
210 static void free_multipath(struct multipath *m)
211 {
212 struct priority_group *pg, *tmp;
213
214 list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
215 list_del(&pg->list);
216 free_priority_group(pg, m->ti);
217 }
218
219 kfree(m->hw_handler_name);
220 kfree(m->hw_handler_params);
221 mempool_destroy(m->mpio_pool);
222 kfree(m);
223 }
224
225 static int set_mapinfo(struct multipath *m, union map_info *info)
226 {
227 struct dm_mpath_io *mpio;
228
229 mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
230 if (!mpio)
231 return -ENOMEM;
232
233 memset(mpio, 0, sizeof(*mpio));
234 info->ptr = mpio;
235
236 return 0;
237 }
238
239 static void clear_mapinfo(struct multipath *m, union map_info *info)
240 {
241 struct dm_mpath_io *mpio = info->ptr;
242
243 info->ptr = NULL;
244 mempool_free(mpio, m->mpio_pool);
245 }
246
247 /*-----------------------------------------------
248 * Path selection
249 *-----------------------------------------------*/
250
251 static int __pg_init_all_paths(struct multipath *m)
252 {
253 struct pgpath *pgpath;
254 unsigned long pg_init_delay = 0;
255
256 if (m->pg_init_in_progress || m->pg_init_disabled)
257 return 0;
258
259 m->pg_init_count++;
260 m->pg_init_required = 0;
261
262 /* Check here to reset pg_init_required */
263 if (!m->current_pg)
264 return 0;
265
266 if (m->pg_init_delay_retry)
267 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
268 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
269 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
270 /* Skip failed paths */
271 if (!pgpath->is_active)
272 continue;
273 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
274 pg_init_delay))
275 m->pg_init_in_progress++;
276 }
277 return m->pg_init_in_progress;
278 }
279
280 static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
281 {
282 m->current_pg = pgpath->pg;
283
284 /* Must we initialise the PG first, and queue I/O till it's ready? */
285 if (m->hw_handler_name) {
286 m->pg_init_required = 1;
287 m->queue_io = 1;
288 } else {
289 m->pg_init_required = 0;
290 m->queue_io = 0;
291 }
292
293 m->pg_init_count = 0;
294 }
295
296 static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
297 size_t nr_bytes)
298 {
299 struct dm_path *path;
300
301 path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
302 if (!path)
303 return -ENXIO;
304
305 m->current_pgpath = path_to_pgpath(path);
306
307 if (m->current_pg != pg)
308 __switch_pg(m, m->current_pgpath);
309
310 return 0;
311 }
312
313 static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
314 {
315 struct priority_group *pg;
316 unsigned bypassed = 1;
317
318 if (!m->nr_valid_paths) {
319 m->queue_io = 0;
320 goto failed;
321 }
322
323 /* Were we instructed to switch PG? */
324 if (m->next_pg) {
325 pg = m->next_pg;
326 m->next_pg = NULL;
327 if (!__choose_path_in_pg(m, pg, nr_bytes))
328 return;
329 }
330
331 /* Don't change PG until it has no remaining paths */
332 if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
333 return;
334
335 /*
336 * Loop through priority groups until we find a valid path.
337 * First time we skip PGs marked 'bypassed'.
338 * Second time we only try the ones we skipped, but set
339 * pg_init_delay_retry so we do not hammer controllers.
340 */
341 do {
342 list_for_each_entry(pg, &m->priority_groups, list) {
343 if (pg->bypassed == bypassed)
344 continue;
345 if (!__choose_path_in_pg(m, pg, nr_bytes)) {
346 if (!bypassed)
347 m->pg_init_delay_retry = 1;
348 return;
349 }
350 }
351 } while (bypassed--);
352
353 failed:
354 m->current_pgpath = NULL;
355 m->current_pg = NULL;
356 }
357
358 /*
359 * Check whether bios must be queued in the device-mapper core rather
360 * than here in the target.
361 *
362 * m->lock must be held on entry.
363 *
364 * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
365 * same value then we are not between multipath_presuspend()
366 * and multipath_resume() calls and we have no need to check
367 * for the DMF_NOFLUSH_SUSPENDING flag.
368 */
369 static int __must_push_back(struct multipath *m)
370 {
371 return (m->queue_if_no_path ||
372 (m->queue_if_no_path != m->saved_queue_if_no_path &&
373 dm_noflush_suspending(m->ti)));
374 }
375
376 /*
377 * Map cloned requests
378 */
379 static int __multipath_map(struct dm_target *ti, struct request *clone,
380 union map_info *map_context,
381 struct request *rq, struct request **__clone)
382 {
383 struct multipath *m = (struct multipath *) ti->private;
384 int r = DM_MAPIO_REQUEUE;
385 size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
386 struct pgpath *pgpath;
387 struct block_device *bdev;
388 struct dm_mpath_io *mpio;
389
390 spin_lock_irq(&m->lock);
391
392 /* Do we need to select a new pgpath? */
393 if (!m->current_pgpath ||
394 (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
395 __choose_pgpath(m, nr_bytes);
396
397 pgpath = m->current_pgpath;
398
399 if (!pgpath) {
400 if (!__must_push_back(m))
401 r = -EIO; /* Failed */
402 goto out_unlock;
403 } else if (m->queue_io || m->pg_init_required) {
404 __pg_init_all_paths(m);
405 goto out_unlock;
406 }
407
408 if (set_mapinfo(m, map_context) < 0)
409 /* ENOMEM, requeue */
410 goto out_unlock;
411
412 mpio = map_context->ptr;
413 mpio->pgpath = pgpath;
414 mpio->nr_bytes = nr_bytes;
415
416 bdev = pgpath->path.dev->bdev;
417
418 spin_unlock_irq(&m->lock);
419
420 if (clone) {
421 /*
422 * Old request-based interface: allocated clone is passed in.
423 * Used by: .request_fn stacked on .request_fn path(s).
424 */
425 clone->q = bdev_get_queue(bdev);
426 clone->rq_disk = bdev->bd_disk;
427 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
428 } else {
429 /*
430 * blk-mq request-based interface; used by both:
431 * .request_fn stacked on blk-mq path(s) and
432 * blk-mq stacked on blk-mq path(s).
433 */
434 *__clone = blk_get_request(bdev_get_queue(bdev),
435 rq_data_dir(rq), GFP_ATOMIC);
436 if (IS_ERR(*__clone)) {
437 /* ENOMEM, requeue */
438 clear_mapinfo(m, map_context);
439 return r;
440 }
441 (*__clone)->bio = (*__clone)->biotail = NULL;
442 (*__clone)->rq_disk = bdev->bd_disk;
443 (*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
444 }
445
446 if (pgpath->pg->ps.type->start_io)
447 pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
448 &pgpath->path,
449 nr_bytes);
450 return DM_MAPIO_REMAPPED;
451
452 out_unlock:
453 spin_unlock_irq(&m->lock);
454
455 return r;
456 }
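/*
 * Return-value summary for __multipath_map() above: DM_MAPIO_REMAPPED hands
 * the clone back to dm core for dispatch on the selected path's queue;
 * DM_MAPIO_REQUEUE asks dm core to requeue the original request (mpio or
 * clone allocation failed, or pg_init is still outstanding); -EIO fails the
 * I/O when no path is usable and we must not queue.
 */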
457
458 static int multipath_map(struct dm_target *ti, struct request *clone,
459 union map_info *map_context)
460 {
461 return __multipath_map(ti, clone, map_context, NULL, NULL);
462 }
463
464 static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
465 union map_info *map_context,
466 struct request **clone)
467 {
468 return __multipath_map(ti, NULL, map_context, rq, clone);
469 }
470
471 static void multipath_release_clone(struct request *clone)
472 {
473 blk_put_request(clone);
474 }
475
476 /*
477 * If we run out of usable paths, should we queue I/O or error it?
478 */
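/*
 * Besides the "queue_if_no_path" feature arg parsed at table load, this is
 * also toggled at runtime through the message interface, e.g. (hypothetical
 * device name):
 *
 *   dmsetup message mpatha 0 queue_if_no_path
 *   dmsetup message mpatha 0 fail_if_no_path
 */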
479 static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
480 unsigned save_old_value)
481 {
482 unsigned long flags;
483
484 spin_lock_irqsave(&m->lock, flags);
485
486 if (save_old_value)
487 m->saved_queue_if_no_path = m->queue_if_no_path;
488 else
489 m->saved_queue_if_no_path = queue_if_no_path;
490 m->queue_if_no_path = queue_if_no_path;
491 spin_unlock_irqrestore(&m->lock, flags);
492
493 if (!queue_if_no_path)
494 dm_table_run_md_queue_async(m->ti->table);
495
496 return 0;
497 }
498
499 /*
500 * An event is triggered whenever a path is taken out of use.
501 * Includes path failure and PG bypass.
502 */
503 static void trigger_event(struct work_struct *work)
504 {
505 struct multipath *m =
506 container_of(work, struct multipath, trigger_event);
507
508 dm_table_event(m->ti->table);
509 }
510
511 /*-----------------------------------------------------------------
512 * Constructor/argument parsing:
513 * <#multipath feature args> [<arg>]*
514 * <#hw_handler args> [hw_handler [<arg>]*]
515 * <#priority groups>
516 * <initial priority group>
517 * [<selector> <#selector args> [<arg>]*
518 * <#paths> <#per-path selector args>
519 * [<path> [<arg>]* ]+ ]+
520 *---------------------------------------------------------------*/
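/*
 * An illustrative table line matching the format above (device numbers and
 * sizes are made up): one feature arg, no hardware handler, two priority
 * groups using round-robin with one per-path selector arg (repeat count):
 *
 *   0 10240000 multipath 1 queue_if_no_path 0 2 1 \
 *       round-robin 0 2 1 8:16 1000 8:32 1000 \
 *       round-robin 0 2 1 8:48 1000 8:64 1000
 */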
521 static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
522 struct dm_target *ti)
523 {
524 int r;
525 struct path_selector_type *pst;
526 unsigned ps_argc;
527
528 static struct dm_arg _args[] = {
529 {0, 1024, "invalid number of path selector args"},
530 };
531
532 pst = dm_get_path_selector(dm_shift_arg(as));
533 if (!pst) {
534 ti->error = "unknown path selector type";
535 return -EINVAL;
536 }
537
538 r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
539 if (r) {
540 dm_put_path_selector(pst);
541 return -EINVAL;
542 }
543
544 r = pst->create(&pg->ps, ps_argc, as->argv);
545 if (r) {
546 dm_put_path_selector(pst);
547 ti->error = "path selector constructor failed";
548 return r;
549 }
550
551 pg->ps.type = pst;
552 dm_consume_args(as, ps_argc);
553
554 return 0;
555 }
556
557 static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
558 struct dm_target *ti)
559 {
560 int r;
561 struct pgpath *p;
562 struct multipath *m = ti->private;
563 struct request_queue *q = NULL;
564 const char *attached_handler_name;
565
566 /* we need at least a path arg */
567 if (as->argc < 1) {
568 ti->error = "no device given";
569 return ERR_PTR(-EINVAL);
570 }
571
572 p = alloc_pgpath();
573 if (!p)
574 return ERR_PTR(-ENOMEM);
575
576 r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
577 &p->path.dev);
578 if (r) {
579 ti->error = "error getting device";
580 goto bad;
581 }
582
583 if (m->retain_attached_hw_handler || m->hw_handler_name)
584 q = bdev_get_queue(p->path.dev->bdev);
585
586 if (m->retain_attached_hw_handler) {
587 retain:
588 attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
589 if (attached_handler_name) {
590 /*
591 * Reset hw_handler_name to match the attached handler
592 * and clear any hw_handler_params associated with the
593 * ignored handler.
594 *
595 * NB. This modifies the table line to show the actual
596 * handler instead of the original table passed in.
597 */
598 kfree(m->hw_handler_name);
599 m->hw_handler_name = attached_handler_name;
600
601 kfree(m->hw_handler_params);
602 m->hw_handler_params = NULL;
603 }
604 }
605
606 if (m->hw_handler_name) {
607 r = scsi_dh_attach(q, m->hw_handler_name);
608 if (r == -EBUSY) {
609 char b[BDEVNAME_SIZE];
610
611 printk(KERN_INFO "dm-mpath: retaining handler on device %s\n",
612 bdevname(p->path.dev->bdev, b));
613 goto retain;
614 }
615 if (r < 0) {
616 ti->error = "error attaching hardware handler";
617 dm_put_device(ti, p->path.dev);
618 goto bad;
619 }
620
621 if (m->hw_handler_params) {
622 r = scsi_dh_set_params(q, m->hw_handler_params);
623 if (r < 0) {
624 ti->error = "unable to set hardware "
625 "handler parameters";
626 dm_put_device(ti, p->path.dev);
627 goto bad;
628 }
629 }
630 }
631
632 r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
633 if (r) {
634 dm_put_device(ti, p->path.dev);
635 goto bad;
636 }
637
638 return p;
639
640 bad:
641 free_pgpath(p);
642 return ERR_PTR(r);
643 }
644
645 static struct priority_group *parse_priority_group(struct dm_arg_set *as,
646 struct multipath *m)
647 {
648 static struct dm_arg _args[] = {
649 {1, 1024, "invalid number of paths"},
650 {0, 1024, "invalid number of selector args"}
651 };
652
653 int r;
654 unsigned i, nr_selector_args, nr_args;
655 struct priority_group *pg;
656 struct dm_target *ti = m->ti;
657
658 if (as->argc < 2) {
659 as->argc = 0;
660 ti->error = "not enough priority group arguments";
661 return ERR_PTR(-EINVAL);
662 }
663
664 pg = alloc_priority_group();
665 if (!pg) {
666 ti->error = "couldn't allocate priority group";
667 return ERR_PTR(-ENOMEM);
668 }
669 pg->m = m;
670
671 r = parse_path_selector(as, pg, ti);
672 if (r)
673 goto bad;
674
675 /*
676 * read the paths
677 */
678 r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
679 if (r)
680 goto bad;
681
682 r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
683 if (r)
684 goto bad;
685
686 nr_args = 1 + nr_selector_args;
687 for (i = 0; i < pg->nr_pgpaths; i++) {
688 struct pgpath *pgpath;
689 struct dm_arg_set path_args;
690
691 if (as->argc < nr_args) {
692 ti->error = "not enough path parameters";
693 r = -EINVAL;
694 goto bad;
695 }
696
697 path_args.argc = nr_args;
698 path_args.argv = as->argv;
699
700 pgpath = parse_path(&path_args, &pg->ps, ti);
701 if (IS_ERR(pgpath)) {
702 r = PTR_ERR(pgpath);
703 goto bad;
704 }
705
706 pgpath->pg = pg;
707 list_add_tail(&pgpath->list, &pg->pgpaths);
708 dm_consume_args(as, nr_args);
709 }
710
711 return pg;
712
713 bad:
714 free_priority_group(pg, ti);
715 return ERR_PTR(r);
716 }
717
718 static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
719 {
720 unsigned hw_argc;
721 int ret;
722 struct dm_target *ti = m->ti;
723
724 static struct dm_arg _args[] = {
725 {0, 1024, "invalid number of hardware handler args"},
726 };
727
728 if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
729 return -EINVAL;
730
731 if (!hw_argc)
732 return 0;
733
734 m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
735
736 if (hw_argc > 1) {
737 char *p;
738 int i, j, len = 4;
739
740 for (i = 0; i <= hw_argc - 2; i++)
741 len += strlen(as->argv[i]) + 1;
742 p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
743 if (!p) {
744 ti->error = "memory allocation failed";
745 ret = -ENOMEM;
746 goto fail;
747 }
748 j = sprintf(p, "%d", hw_argc - 1);
749 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
750 j = sprintf(p, "%s", as->argv[i]);
751 }
752 dm_consume_args(as, hw_argc - 1);
753
754 return 0;
755 fail:
756 kfree(m->hw_handler_name);
757 m->hw_handler_name = NULL;
758 return ret;
759 }
760
761 static int parse_features(struct dm_arg_set *as, struct multipath *m)
762 {
763 int r;
764 unsigned argc;
765 struct dm_target *ti = m->ti;
766 const char *arg_name;
767
768 static struct dm_arg _args[] = {
769 {0, 6, "invalid number of feature args"},
770 {1, 50, "pg_init_retries must be between 1 and 50"},
771 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
772 };
773
774 r = dm_read_arg_group(_args, as, &argc, &ti->error);
775 if (r)
776 return -EINVAL;
777
778 if (!argc)
779 return 0;
780
781 do {
782 arg_name = dm_shift_arg(as);
783 argc--;
784
785 if (!strcasecmp(arg_name, "queue_if_no_path")) {
786 r = queue_if_no_path(m, 1, 0);
787 continue;
788 }
789
790 if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
791 m->retain_attached_hw_handler = 1;
792 continue;
793 }
794
795 if (!strcasecmp(arg_name, "pg_init_retries") &&
796 (argc >= 1)) {
797 r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
798 argc--;
799 continue;
800 }
801
802 if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
803 (argc >= 1)) {
804 r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
805 argc--;
806 continue;
807 }
808
809 ti->error = "Unrecognised multipath feature request";
810 r = -EINVAL;
811 } while (argc && !r);
812
813 return r;
814 }
815
816 static int multipath_ctr(struct dm_target *ti, unsigned int argc,
817 char **argv)
818 {
819 /* target arguments */
820 static struct dm_arg _args[] = {
821 {0, 1024, "invalid number of priority groups"},
822 {0, 1024, "invalid initial priority group number"},
823 };
824
825 int r;
826 struct multipath *m;
827 struct dm_arg_set as;
828 unsigned pg_count = 0;
829 unsigned next_pg_num;
830
831 as.argc = argc;
832 as.argv = argv;
833
834 m = alloc_multipath(ti);
835 if (!m) {
836 ti->error = "can't allocate multipath";
837 return -EINVAL;
838 }
839
840 r = parse_features(&as, m);
841 if (r)
842 goto bad;
843
844 r = parse_hw_handler(&as, m);
845 if (r)
846 goto bad;
847
848 r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
849 if (r)
850 goto bad;
851
852 r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
853 if (r)
854 goto bad;
855
856 if ((!m->nr_priority_groups && next_pg_num) ||
857 (m->nr_priority_groups && !next_pg_num)) {
858 ti->error = "invalid initial priority group";
859 r = -EINVAL;
860 goto bad;
861 }
862
863 /* parse the priority groups */
864 while (as.argc) {
865 struct priority_group *pg;
866
867 pg = parse_priority_group(&as, m);
868 if (IS_ERR(pg)) {
869 r = PTR_ERR(pg);
870 goto bad;
871 }
872
873 m->nr_valid_paths += pg->nr_pgpaths;
874 list_add_tail(&pg->list, &m->priority_groups);
875 pg_count++;
876 pg->pg_num = pg_count;
877 if (!--next_pg_num)
878 m->next_pg = pg;
879 }
880
881 if (pg_count != m->nr_priority_groups) {
882 ti->error = "priority group count mismatch";
883 r = -EINVAL;
884 goto bad;
885 }
886
887 ti->num_flush_bios = 1;
888 ti->num_discard_bios = 1;
889 ti->num_write_same_bios = 1;
890
891 return 0;
892
893 bad:
894 free_multipath(m);
895 return r;
896 }
897
898 static void multipath_wait_for_pg_init_completion(struct multipath *m)
899 {
900 DECLARE_WAITQUEUE(wait, current);
901 unsigned long flags;
902
903 add_wait_queue(&m->pg_init_wait, &wait);
904
905 while (1) {
906 set_current_state(TASK_UNINTERRUPTIBLE);
907
908 spin_lock_irqsave(&m->lock, flags);
909 if (!m->pg_init_in_progress) {
910 spin_unlock_irqrestore(&m->lock, flags);
911 break;
912 }
913 spin_unlock_irqrestore(&m->lock, flags);
914
915 io_schedule();
916 }
917 set_current_state(TASK_RUNNING);
918
919 remove_wait_queue(&m->pg_init_wait, &wait);
920 }
921
922 static void flush_multipath_work(struct multipath *m)
923 {
924 unsigned long flags;
925
926 spin_lock_irqsave(&m->lock, flags);
927 m->pg_init_disabled = 1;
928 spin_unlock_irqrestore(&m->lock, flags);
929
930 flush_workqueue(kmpath_handlerd);
931 multipath_wait_for_pg_init_completion(m);
932 flush_workqueue(kmultipathd);
933 flush_work(&m->trigger_event);
934
935 spin_lock_irqsave(&m->lock, flags);
936 m->pg_init_disabled = 0;
937 spin_unlock_irqrestore(&m->lock, flags);
938 }
939
940 static void multipath_dtr(struct dm_target *ti)
941 {
942 struct multipath *m = ti->private;
943
944 flush_multipath_work(m);
945 free_multipath(m);
946 }
947
948 /*
949 * Take a path out of use.
950 */
951 static int fail_path(struct pgpath *pgpath)
952 {
953 unsigned long flags;
954 struct multipath *m = pgpath->pg->m;
955
956 spin_lock_irqsave(&m->lock, flags);
957
958 if (!pgpath->is_active)
959 goto out;
960
961 DMWARN("Failing path %s.", pgpath->path.dev->name);
962
963 pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
964 pgpath->is_active = 0;
965 pgpath->fail_count++;
966
967 m->nr_valid_paths--;
968
969 if (pgpath == m->current_pgpath)
970 m->current_pgpath = NULL;
971
972 dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
973 pgpath->path.dev->name, m->nr_valid_paths);
974
975 schedule_work(&m->trigger_event);
976
977 out:
978 spin_unlock_irqrestore(&m->lock, flags);
979
980 return 0;
981 }
982
983 /*
984 * Reinstate a previously-failed path
985 */
986 static int reinstate_path(struct pgpath *pgpath)
987 {
988 int r = 0, run_queue = 0;
989 unsigned long flags;
990 struct multipath *m = pgpath->pg->m;
991
992 spin_lock_irqsave(&m->lock, flags);
993
994 if (pgpath->is_active)
995 goto out;
996
997 if (!pgpath->pg->ps.type->reinstate_path) {
998 DMWARN("Reinstate path not supported by path selector %s",
999 pgpath->pg->ps.type->name);
1000 r = -EINVAL;
1001 goto out;
1002 }
1003
1004 r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
1005 if (r)
1006 goto out;
1007
1008 pgpath->is_active = 1;
1009
1010 if (!m->nr_valid_paths++) {
1011 m->current_pgpath = NULL;
1012 run_queue = 1;
1013 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1014 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1015 m->pg_init_in_progress++;
1016 }
1017
1018 dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
1019 pgpath->path.dev->name, m->nr_valid_paths);
1020
1021 schedule_work(&m->trigger_event);
1022
1023 out:
1024 spin_unlock_irqrestore(&m->lock, flags);
1025 if (run_queue)
1026 dm_table_run_md_queue_async(m->ti->table);
1027
1028 return r;
1029 }
1030
1031 /*
1032 * Fail or reinstate all paths that match the provided struct dm_dev.
1033 */
1034 static int action_dev(struct multipath *m, struct dm_dev *dev,
1035 action_fn action)
1036 {
1037 int r = -EINVAL;
1038 struct pgpath *pgpath;
1039 struct priority_group *pg;
1040
1041 list_for_each_entry(pg, &m->priority_groups, list) {
1042 list_for_each_entry(pgpath, &pg->pgpaths, list) {
1043 if (pgpath->path.dev == dev)
1044 r = action(pgpath);
1045 }
1046 }
1047
1048 return r;
1049 }
1050
1051 /*
1052 * Temporarily try to avoid having to use the specified PG
1053 */
1054 static void bypass_pg(struct multipath *m, struct priority_group *pg,
1055 int bypassed)
1056 {
1057 unsigned long flags;
1058
1059 spin_lock_irqsave(&m->lock, flags);
1060
1061 pg->bypassed = bypassed;
1062 m->current_pgpath = NULL;
1063 m->current_pg = NULL;
1064
1065 spin_unlock_irqrestore(&m->lock, flags);
1066
1067 schedule_work(&m->trigger_event);
1068 }
1069
1070 /*
1071 * Switch to using the specified PG from the next I/O that gets mapped
1072 */
1073 static int switch_pg_num(struct multipath *m, const char *pgstr)
1074 {
1075 struct priority_group *pg;
1076 unsigned pgnum;
1077 unsigned long flags;
1078 char dummy;
1079
1080 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1081 (pgnum > m->nr_priority_groups)) {
1082 DMWARN("invalid PG number supplied to switch_pg_num");
1083 return -EINVAL;
1084 }
1085
1086 spin_lock_irqsave(&m->lock, flags);
1087 list_for_each_entry(pg, &m->priority_groups, list) {
1088 pg->bypassed = 0;
1089 if (--pgnum)
1090 continue;
1091
1092 m->current_pgpath = NULL;
1093 m->current_pg = NULL;
1094 m->next_pg = pg;
1095 }
1096 spin_unlock_irqrestore(&m->lock, flags);
1097
1098 schedule_work(&m->trigger_event);
1099 return 0;
1100 }
1101
1102 /*
1103 * Set/clear bypassed status of a PG.
1104 * PGs are numbered upwards from 1 in the order they were declared.
1105 */
1106 static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
1107 {
1108 struct priority_group *pg;
1109 unsigned pgnum;
1110 char dummy;
1111
1112 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1113 (pgnum > m->nr_priority_groups)) {
1114 DMWARN("invalid PG number supplied to bypass_pg");
1115 return -EINVAL;
1116 }
1117
1118 list_for_each_entry(pg, &m->priority_groups, list) {
1119 if (!--pgnum)
1120 break;
1121 }
1122
1123 bypass_pg(m, pg, bypassed);
1124 return 0;
1125 }
1126
1127 /*
1128 * Should we retry pg_init immediately?
1129 */
1130 static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1131 {
1132 unsigned long flags;
1133 int limit_reached = 0;
1134
1135 spin_lock_irqsave(&m->lock, flags);
1136
1137 if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
1138 m->pg_init_required = 1;
1139 else
1140 limit_reached = 1;
1141
1142 spin_unlock_irqrestore(&m->lock, flags);
1143
1144 return limit_reached;
1145 }
1146
1147 static void pg_init_done(void *data, int errors)
1148 {
1149 struct pgpath *pgpath = data;
1150 struct priority_group *pg = pgpath->pg;
1151 struct multipath *m = pg->m;
1152 unsigned long flags;
1153 unsigned delay_retry = 0;
1154
1155 /* device or driver problems */
1156 switch (errors) {
1157 case SCSI_DH_OK:
1158 break;
1159 case SCSI_DH_NOSYS:
1160 if (!m->hw_handler_name) {
1161 errors = 0;
1162 break;
1163 }
1164 DMERR("Could not failover the device: Handler scsi_dh_%s "
1165 "Error %d.", m->hw_handler_name, errors);
1166 /*
1167 * Fail path for now, so we do not ping pong
1168 */
1169 fail_path(pgpath);
1170 break;
1171 case SCSI_DH_DEV_TEMP_BUSY:
1172 /*
1173 * Probably doing something like FW upgrade on the
1174 * controller so try the other pg.
1175 */
1176 bypass_pg(m, pg, 1);
1177 break;
1178 case SCSI_DH_RETRY:
1179 /* Wait before retrying. */
1180 delay_retry = 1;
1181 case SCSI_DH_IMM_RETRY:
1182 case SCSI_DH_RES_TEMP_UNAVAIL:
1183 if (pg_init_limit_reached(m, pgpath))
1184 fail_path(pgpath);
1185 errors = 0;
1186 break;
1187 default:
1188 /*
1189 * We probably do not want to fail the path for a device
1190 * error, but this is what the old dm did. In future
1191 * patches we can do more advanced handling.
1192 */
1193 fail_path(pgpath);
1194 }
1195
1196 spin_lock_irqsave(&m->lock, flags);
1197 if (errors) {
1198 if (pgpath == m->current_pgpath) {
1199 DMERR("Could not failover device. Error %d.", errors);
1200 m->current_pgpath = NULL;
1201 m->current_pg = NULL;
1202 }
1203 } else if (!m->pg_init_required)
1204 pg->bypassed = 0;
1205
1206 if (--m->pg_init_in_progress)
1207 /* Activations of other paths are still ongoing */
1208 goto out;
1209
1210 if (m->pg_init_required) {
1211 m->pg_init_delay_retry = delay_retry;
1212 if (__pg_init_all_paths(m))
1213 goto out;
1214 }
1215 m->queue_io = 0;
1216
1217 /*
1218 * Wake up any thread waiting to suspend.
1219 */
1220 wake_up(&m->pg_init_wait);
1221
1222 out:
1223 spin_unlock_irqrestore(&m->lock, flags);
1224 }
1225
1226 static void activate_path(struct work_struct *work)
1227 {
1228 struct pgpath *pgpath =
1229 container_of(work, struct pgpath, activate_path.work);
1230
1231 if (pgpath->is_active)
1232 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
1233 pg_init_done, pgpath);
1234 else
1235 pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
1236 }
1237
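/*
 * Errors that indicate a problem with the request or the target itself
 * (unsupported operation, target/medium/integrity error, out of space)
 * rather than with the transport; retrying them on another path will not
 * help, so they are passed straight back.
 */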
1238 static int noretry_error(int error)
1239 {
1240 switch (error) {
1241 case -EOPNOTSUPP:
1242 case -EREMOTEIO:
1243 case -EILSEQ:
1244 case -ENODATA:
1245 case -ENOSPC:
1246 return 1;
1247 }
1248
1249 /* Anything else could be a path failure, so should be retried */
1250 return 0;
1251 }
1252
1253 /*
1254 * end_io handling
1255 */
1256 static int do_end_io(struct multipath *m, struct request *clone,
1257 int error, struct dm_mpath_io *mpio)
1258 {
1259 /*
1260 * We don't queue any clone request inside the multipath target
1261 * during end I/O handling, since those clone requests don't have
1262 * bio clones. If we queue them inside the multipath target,
1263 * we would need to make bio clones, which requires memory allocation.
1264 * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
1265 * don't have bio clones.)
1266 * Instead of queueing the clone request here, we queue the original
1267 * request into dm core, which will remake a clone request and
1268 * clone bios for it and resubmit it later.
1269 */
1270 int r = DM_ENDIO_REQUEUE;
1271 unsigned long flags;
1272
1273 if (!error && !clone->errors)
1274 return 0; /* I/O complete */
1275
1276 if (noretry_error(error))
1277 return error;
1278
1279 if (mpio->pgpath)
1280 fail_path(mpio->pgpath);
1281
1282 spin_lock_irqsave(&m->lock, flags);
1283 if (!m->nr_valid_paths) {
1284 if (!m->queue_if_no_path) {
1285 if (!__must_push_back(m))
1286 r = -EIO;
1287 } else {
1288 if (error == -EBADE)
1289 r = error;
1290 }
1291 }
1292 spin_unlock_irqrestore(&m->lock, flags);
1293
1294 return r;
1295 }
1296
1297 static int multipath_end_io(struct dm_target *ti, struct request *clone,
1298 int error, union map_info *map_context)
1299 {
1300 struct multipath *m = ti->private;
1301 struct dm_mpath_io *mpio = map_context->ptr;
1302 struct pgpath *pgpath;
1303 struct path_selector *ps;
1304 int r;
1305
1306 BUG_ON(!mpio);
1307
1308 r = do_end_io(m, clone, error, mpio);
1309 pgpath = mpio->pgpath;
1310 if (pgpath) {
1311 ps = &pgpath->pg->ps;
1312 if (ps->type->end_io)
1313 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1314 }
1315 clear_mapinfo(m, map_context);
1316
1317 return r;
1318 }
1319
1320 /*
1321 * Suspend can't complete until all the I/O is processed so if
1322 * the last path fails we must error any remaining I/O.
1323 * Note that if the freeze_bdev fails while suspending, the
1324 * queue_if_no_path state is lost - userspace should reset it.
1325 */
1326 static void multipath_presuspend(struct dm_target *ti)
1327 {
1328 struct multipath *m = (struct multipath *) ti->private;
1329
1330 queue_if_no_path(m, 0, 1);
1331 }
1332
1333 static void multipath_postsuspend(struct dm_target *ti)
1334 {
1335 struct multipath *m = ti->private;
1336
1337 mutex_lock(&m->work_mutex);
1338 flush_multipath_work(m);
1339 mutex_unlock(&m->work_mutex);
1340 }
1341
1342 /*
1343 * Restore the queue_if_no_path setting.
1344 */
1345 static void multipath_resume(struct dm_target *ti)
1346 {
1347 struct multipath *m = (struct multipath *) ti->private;
1348 unsigned long flags;
1349
1350 spin_lock_irqsave(&m->lock, flags);
1351 m->queue_if_no_path = m->saved_queue_if_no_path;
1352 spin_unlock_irqrestore(&m->lock, flags);
1353 }
1354
1355 /*
1356 * Info output has the following format:
1357 * num_multipath_feature_args [multipath_feature_args]*
1358 * num_handler_status_args [handler_status_args]*
1359 * num_groups init_group_number
1360 * [A|D|E num_ps_status_args [ps_status_args]*
1361 * num_paths num_selector_args
1362 * [path_dev A|F fail_count [selector_args]* ]+ ]+
1363 *
1364 * Table output has the following format (identical to the constructor string):
1365 * num_feature_args [features_args]*
1366 * num_handler_args hw_handler [hw_handler_args]*
1367 * num_groups init_group_number
1368 * [priority selector-name num_ps_args [ps_args]*
1369 * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1370 */
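/*
 * For the illustrative table from the constructor comment further up, the
 * STATUSTYPE_TABLE output reads back essentially the same argument string,
 * while STATUSTYPE_INFO replaces the feature and handler args with runtime
 * state: queue_io and pg_init_count, per-group A/D/E state, and per-path
 * A/F state plus fail counts.
 */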
1371 static void multipath_status(struct dm_target *ti, status_type_t type,
1372 unsigned status_flags, char *result, unsigned maxlen)
1373 {
1374 int sz = 0;
1375 unsigned long flags;
1376 struct multipath *m = (struct multipath *) ti->private;
1377 struct priority_group *pg;
1378 struct pgpath *p;
1379 unsigned pg_num;
1380 char state;
1381
1382 spin_lock_irqsave(&m->lock, flags);
1383
1384 /* Features */
1385 if (type == STATUSTYPE_INFO)
1386 DMEMIT("2 %u %u ", m->queue_io, m->pg_init_count);
1387 else {
1388 DMEMIT("%u ", m->queue_if_no_path +
1389 (m->pg_init_retries > 0) * 2 +
1390 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
1391 m->retain_attached_hw_handler);
1392 if (m->queue_if_no_path)
1393 DMEMIT("queue_if_no_path ");
1394 if (m->pg_init_retries)
1395 DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1396 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1397 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1398 if (m->retain_attached_hw_handler)
1399 DMEMIT("retain_attached_hw_handler ");
1400 }
1401
1402 if (!m->hw_handler_name || type == STATUSTYPE_INFO)
1403 DMEMIT("0 ");
1404 else
1405 DMEMIT("1 %s ", m->hw_handler_name);
1406
1407 DMEMIT("%u ", m->nr_priority_groups);
1408
1409 if (m->next_pg)
1410 pg_num = m->next_pg->pg_num;
1411 else if (m->current_pg)
1412 pg_num = m->current_pg->pg_num;
1413 else
1414 pg_num = (m->nr_priority_groups ? 1 : 0);
1415
1416 DMEMIT("%u ", pg_num);
1417
1418 switch (type) {
1419 case STATUSTYPE_INFO:
1420 list_for_each_entry(pg, &m->priority_groups, list) {
1421 if (pg->bypassed)
1422 state = 'D'; /* Disabled */
1423 else if (pg == m->current_pg)
1424 state = 'A'; /* Currently Active */
1425 else
1426 state = 'E'; /* Enabled */
1427
1428 DMEMIT("%c ", state);
1429
1430 if (pg->ps.type->status)
1431 sz += pg->ps.type->status(&pg->ps, NULL, type,
1432 result + sz,
1433 maxlen - sz);
1434 else
1435 DMEMIT("0 ");
1436
1437 DMEMIT("%u %u ", pg->nr_pgpaths,
1438 pg->ps.type->info_args);
1439
1440 list_for_each_entry(p, &pg->pgpaths, list) {
1441 DMEMIT("%s %s %u ", p->path.dev->name,
1442 p->is_active ? "A" : "F",
1443 p->fail_count);
1444 if (pg->ps.type->status)
1445 sz += pg->ps.type->status(&pg->ps,
1446 &p->path, type, result + sz,
1447 maxlen - sz);
1448 }
1449 }
1450 break;
1451
1452 case STATUSTYPE_TABLE:
1453 list_for_each_entry(pg, &m->priority_groups, list) {
1454 DMEMIT("%s ", pg->ps.type->name);
1455
1456 if (pg->ps.type->status)
1457 sz += pg->ps.type->status(&pg->ps, NULL, type,
1458 result + sz,
1459 maxlen - sz);
1460 else
1461 DMEMIT("0 ");
1462
1463 DMEMIT("%u %u ", pg->nr_pgpaths,
1464 pg->ps.type->table_args);
1465
1466 list_for_each_entry(p, &pg->pgpaths, list) {
1467 DMEMIT("%s ", p->path.dev->name);
1468 if (pg->ps.type->status)
1469 sz += pg->ps.type->status(&pg->ps,
1470 &p->path, type, result + sz,
1471 maxlen - sz);
1472 }
1473 }
1474 break;
1475 }
1476
1477 spin_unlock_irqrestore(&m->lock, flags);
1478 }
1479
1480 static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1481 {
1482 int r = -EINVAL;
1483 struct dm_dev *dev;
1484 struct multipath *m = (struct multipath *) ti->private;
1485 action_fn action;
1486
1487 mutex_lock(&m->work_mutex);
1488
1489 if (dm_suspended(ti)) {
1490 r = -EBUSY;
1491 goto out;
1492 }
1493
1494 if (argc == 1) {
1495 if (!strcasecmp(argv[0], "queue_if_no_path")) {
1496 r = queue_if_no_path(m, 1, 0);
1497 goto out;
1498 } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
1499 r = queue_if_no_path(m, 0, 0);
1500 goto out;
1501 }
1502 }
1503
1504 if (argc != 2) {
1505 DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
1506 goto out;
1507 }
1508
1509 if (!strcasecmp(argv[0], "disable_group")) {
1510 r = bypass_pg_num(m, argv[1], 1);
1511 goto out;
1512 } else if (!strcasecmp(argv[0], "enable_group")) {
1513 r = bypass_pg_num(m, argv[1], 0);
1514 goto out;
1515 } else if (!strcasecmp(argv[0], "switch_group")) {
1516 r = switch_pg_num(m, argv[1]);
1517 goto out;
1518 } else if (!strcasecmp(argv[0], "reinstate_path"))
1519 action = reinstate_path;
1520 else if (!strcasecmp(argv[0], "fail_path"))
1521 action = fail_path;
1522 else {
1523 DMWARN("Unrecognised multipath message received: %s", argv[0]);
1524 goto out;
1525 }
1526
1527 r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
1528 if (r) {
1529 DMWARN("message: error getting device %s",
1530 argv[1]);
1531 goto out;
1532 }
1533
1534 r = action_dev(m, dev, action);
1535
1536 dm_put_device(ti, dev);
1537
1538 out:
1539 mutex_unlock(&m->work_mutex);
1540 return r;
1541 }
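/*
 * Illustrative message usage (hypothetical device and path names):
 *
 *   dmsetup message mpatha 0 switch_group 2
 *   dmsetup message mpatha 0 fail_path /dev/sdc
 *   dmsetup message mpatha 0 reinstate_path /dev/sdc
 */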
1542
1543 static int multipath_prepare_ioctl(struct dm_target *ti,
1544 struct block_device **bdev, fmode_t *mode)
1545 {
1546 struct multipath *m = ti->private;
1547 unsigned long flags;
1548 int r;
1549
1550 spin_lock_irqsave(&m->lock, flags);
1551
1552 if (!m->current_pgpath)
1553 __choose_pgpath(m, 0);
1554
1555 if (m->current_pgpath) {
1556 if (!m->queue_io) {
1557 *bdev = m->current_pgpath->path.dev->bdev;
1558 *mode = m->current_pgpath->path.dev->mode;
1559 r = 0;
1560 } else {
1561 /* pg_init has not started or completed */
1562 r = -ENOTCONN;
1563 }
1564 } else {
1565 /* No path is available */
1566 if (m->queue_if_no_path)
1567 r = -ENOTCONN;
1568 else
1569 r = -EIO;
1570 }
1571
1572 spin_unlock_irqrestore(&m->lock, flags);
1573
1574 if (r == -ENOTCONN) {
1575 spin_lock_irqsave(&m->lock, flags);
1576 if (!m->current_pg) {
1577 /* Path status changed, redo selection */
1578 __choose_pgpath(m, 0);
1579 }
1580 if (m->pg_init_required)
1581 __pg_init_all_paths(m);
1582 spin_unlock_irqrestore(&m->lock, flags);
1583 dm_table_run_md_queue_async(m->ti->table);
1584 }
1585
1586 /*
1587 * Only pass ioctls through if the device sizes match exactly.
1588 */
1589 if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
1590 return 1;
1591 return r;
1592 }
1593
1594 static int multipath_iterate_devices(struct dm_target *ti,
1595 iterate_devices_callout_fn fn, void *data)
1596 {
1597 struct multipath *m = ti->private;
1598 struct priority_group *pg;
1599 struct pgpath *p;
1600 int ret = 0;
1601
1602 list_for_each_entry(pg, &m->priority_groups, list) {
1603 list_for_each_entry(p, &pg->pgpaths, list) {
1604 ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
1605 if (ret)
1606 goto out;
1607 }
1608 }
1609
1610 out:
1611 return ret;
1612 }
1613
1614 static int __pgpath_busy(struct pgpath *pgpath)
1615 {
1616 struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1617
1618 return blk_lld_busy(q);
1619 }
1620
1621 /*
1622 * We return "busy" only when we can map I/Os but the underlying devices
1623 * are busy (so even if we mapped I/Os now, they would just wait on
1624 * the underlying queue).
1625 * In other words, when we intend to error I/Os or queue them inside the
1626 * target because no path can be mapped, we don't return "busy"; otherwise
1627 * dm core would withhold the I/Os and we couldn't handle them as intended.
1628 */
1629 static int multipath_busy(struct dm_target *ti)
1630 {
1631 int busy = 0, has_active = 0;
1632 struct multipath *m = ti->private;
1633 struct priority_group *pg;
1634 struct pgpath *pgpath;
1635 unsigned long flags;
1636
1637 spin_lock_irqsave(&m->lock, flags);
1638
1639 /* pg_init in progress or no paths available */
1640 if (m->pg_init_in_progress ||
1641 (!m->nr_valid_paths && m->queue_if_no_path)) {
1642 busy = 1;
1643 goto out;
1644 }
1645 /* Guess which priority_group will be used at next mapping time */
1646 if (unlikely(!m->current_pgpath && m->next_pg))
1647 pg = m->next_pg;
1648 else if (likely(m->current_pg))
1649 pg = m->current_pg;
1650 else
1651 /*
1652 * We don't know which pg will be used at next mapping time.
1653 * We don't call __choose_pgpath() here to avoid triggering
1654 * pg_init just by busy checking.
1655 * So we don't know whether underlying devices we will be using
1656 * at next mapping time are busy or not. Just try mapping.
1657 */
1658 goto out;
1659
1660 /*
1661 * If there is one non-busy active path at least, the path selector
1662 * will be able to select it. So we consider such a pg as not busy.
1663 */
1664 busy = 1;
1665 list_for_each_entry(pgpath, &pg->pgpaths, list)
1666 if (pgpath->is_active) {
1667 has_active = 1;
1668
1669 if (!__pgpath_busy(pgpath)) {
1670 busy = 0;
1671 break;
1672 }
1673 }
1674
1675 if (!has_active)
1676 /*
1677 * No active path in this pg, so this pg won't be used and
1678 * the current_pg will be changed at next mapping time.
1679 * We need to try mapping to determine it.
1680 */
1681 busy = 0;
1682
1683 out:
1684 spin_unlock_irqrestore(&m->lock, flags);
1685
1686 return busy;
1687 }
1688
1689 /*-----------------------------------------------------------------
1690 * Module setup
1691 *---------------------------------------------------------------*/
1692 static struct target_type multipath_target = {
1693 .name = "multipath",
1694 .version = {1, 11, 0},
1695 .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
1696 .module = THIS_MODULE,
1697 .ctr = multipath_ctr,
1698 .dtr = multipath_dtr,
1699 .map_rq = multipath_map,
1700 .clone_and_map_rq = multipath_clone_and_map,
1701 .release_clone_rq = multipath_release_clone,
1702 .rq_end_io = multipath_end_io,
1703 .presuspend = multipath_presuspend,
1704 .postsuspend = multipath_postsuspend,
1705 .resume = multipath_resume,
1706 .status = multipath_status,
1707 .message = multipath_message,
1708 .prepare_ioctl = multipath_prepare_ioctl,
1709 .iterate_devices = multipath_iterate_devices,
1710 .busy = multipath_busy,
1711 };
1712
1713 static int __init dm_multipath_init(void)
1714 {
1715 int r;
1716
1717 /* allocate a slab for the dm_mpath_io structs */
1718 _mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
1719 if (!_mpio_cache)
1720 return -ENOMEM;
1721
1722 r = dm_register_target(&multipath_target);
1723 if (r < 0) {
1724 DMERR("register failed %d", r);
1725 r = -EINVAL;
1726 goto bad_register_target;
1727 }
1728
1729 kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
1730 if (!kmultipathd) {
1731 DMERR("failed to create workqueue kmpathd");
1732 r = -ENOMEM;
1733 goto bad_alloc_kmultipathd;
1734 }
1735
1736 /*
1737 * A separate workqueue is used to handle the device handlers
1738 * to avoid overloading the existing workqueue. Overloading the
1739 * old workqueue would also create a bottleneck in the
1740 * path of the storage hardware device activation.
1741 */
1742 kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
1743 WQ_MEM_RECLAIM);
1744 if (!kmpath_handlerd) {
1745 DMERR("failed to create workqueue kmpath_handlerd");
1746 r = -ENOMEM;
1747 goto bad_alloc_kmpath_handlerd;
1748 }
1749
1750 DMINFO("version %u.%u.%u loaded",
1751 multipath_target.version[0], multipath_target.version[1],
1752 multipath_target.version[2]);
1753
1754 return 0;
1755
1756 bad_alloc_kmpath_handlerd:
1757 destroy_workqueue(kmultipathd);
1758 bad_alloc_kmultipathd:
1759 dm_unregister_target(&multipath_target);
1760 bad_register_target:
1761 kmem_cache_destroy(_mpio_cache);
1762
1763 return r;
1764 }
1765
1766 static void __exit dm_multipath_exit(void)
1767 {
1768 destroy_workqueue(kmpath_handlerd);
1769 destroy_workqueue(kmultipathd);
1770
1771 dm_unregister_target(&multipath_target);
1772 kmem_cache_destroy(_mpio_cache);
1773 }
1774
1775 module_init(dm_multipath_init);
1776 module_exit(dm_multipath_exit);
1777
1778 MODULE_DESCRIPTION(DM_NAME " multipath target");
1779 MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
1780 MODULE_LICENSE("GPL");