/*
 * Copyright (C) 2010-2011 Neil Brown
 * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */
#include <linux/slab.h>

#include "md.h"
#include "raid5.h"
#include "bitmap.h"

#include <linux/device-mapper.h>
#define DM_MSG_PREFIX "raid"

/*
 * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
 * make it so the flag doesn't set anything.
 */
#ifndef MD_SYNC_STATE_FORCED
#define MD_SYNC_STATE_FORCED 0
#endif
28 * Two DM devices, one to hold metadata and one to hold the
29 * actual data/parity. The reason for this is to not confuse
30 * ti->len and give more flexibility in altering size and
33 * While it is possible for this device to be associated
34 * with a different physical device than the data_dev, it
35 * is intended for it to be the same.
36 * |--------- Physical Device ---------|
37 * |- meta_dev -|------ data_dev ------|
39 struct dm_dev
*meta_dev
;
40 struct dm_dev
*data_dev
;
41 struct mdk_rdev_s rdev
;
/*
 * Flags for rs->print_flags field.
 *
 * DMPF_SYNC is required: it is tested against rs->print_flags in
 * parse_raid_params()/raid_status() alongside the flags below.
 */
#define DMPF_SYNC              0x1
#define DMPF_NOSYNC            0x2
#define DMPF_REBUILD           0x4
#define DMPF_DAEMON_SLEEP      0x8
#define DMPF_MIN_RECOVERY_RATE 0x10
#define DMPF_MAX_RECOVERY_RATE 0x20
#define DMPF_MAX_WRITE_BEHIND  0x40
#define DMPF_STRIPE_CACHE      0x80
62 struct raid_type
*raid_type
;
63 struct dm_target_callbacks callbacks
;
65 struct raid_dev dev
[0];
68 /* Supported raid types and properties. */
69 static struct raid_type
{
70 const char *name
; /* RAID algorithm. */
71 const char *descr
; /* Descriptor text for logging. */
72 const unsigned parity_devs
; /* # of parity devices. */
73 const unsigned minimal_devs
; /* minimal # of devices in set. */
74 const unsigned level
; /* RAID level. */
75 const unsigned algorithm
; /* RAID algorithm. */
77 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0
},
78 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC
},
79 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC
},
80 {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC
},
81 {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC
},
82 {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART
},
83 {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART
},
84 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE
}
87 static struct raid_type
*get_raid_type(char *name
)
91 for (i
= 0; i
< ARRAY_SIZE(raid_types
); i
++)
92 if (!strcmp(raid_types
[i
].name
, name
))
93 return &raid_types
[i
];
98 static struct raid_set
*context_alloc(struct dm_target
*ti
, struct raid_type
*raid_type
, unsigned raid_devs
)
102 sector_t sectors_per_dev
;
104 if (raid_devs
<= raid_type
->parity_devs
) {
105 ti
->error
= "Insufficient number of devices";
106 return ERR_PTR(-EINVAL
);
109 sectors_per_dev
= ti
->len
;
110 if (sector_div(sectors_per_dev
, (raid_devs
- raid_type
->parity_devs
))) {
111 ti
->error
= "Target length not divisible by number of data devices";
112 return ERR_PTR(-EINVAL
);
115 rs
= kzalloc(sizeof(*rs
) + raid_devs
* sizeof(rs
->dev
[0]), GFP_KERNEL
);
117 ti
->error
= "Cannot allocate raid context";
118 return ERR_PTR(-ENOMEM
);
124 rs
->raid_type
= raid_type
;
125 rs
->md
.raid_disks
= raid_devs
;
126 rs
->md
.level
= raid_type
->level
;
127 rs
->md
.new_level
= rs
->md
.level
;
128 rs
->md
.dev_sectors
= sectors_per_dev
;
129 rs
->md
.layout
= raid_type
->algorithm
;
130 rs
->md
.new_layout
= rs
->md
.layout
;
131 rs
->md
.delta_disks
= 0;
132 rs
->md
.recovery_cp
= 0;
134 for (i
= 0; i
< raid_devs
; i
++)
135 md_rdev_init(&rs
->dev
[i
].rdev
);
138 * Remaining items to be initialized by further RAID params:
141 * rs->md.chunk_sectors
142 * rs->md.new_chunk_sectors
148 static void context_free(struct raid_set
*rs
)
152 for (i
= 0; i
< rs
->md
.raid_disks
; i
++)
153 if (rs
->dev
[i
].data_dev
)
154 dm_put_device(rs
->ti
, rs
->dev
[i
].data_dev
);
160 * For every device we have two words
161 * <meta_dev>: meta device name or '-' if missing
162 * <data_dev>: data device name or '-' if missing
164 * This code parses those words.
166 static int dev_parms(struct raid_set
*rs
, char **argv
)
170 int metadata_available
= 0;
173 for (i
= 0; i
< rs
->md
.raid_disks
; i
++, argv
+= 2) {
174 rs
->dev
[i
].rdev
.raid_disk
= i
;
176 rs
->dev
[i
].meta_dev
= NULL
;
177 rs
->dev
[i
].data_dev
= NULL
;
180 * There are no offsets, since there is a separate device
181 * for data and metadata.
183 rs
->dev
[i
].rdev
.data_offset
= 0;
184 rs
->dev
[i
].rdev
.mddev
= &rs
->md
;
186 if (strcmp(argv
[0], "-")) {
187 rs
->ti
->error
= "Metadata devices not supported";
191 if (!strcmp(argv
[1], "-")) {
192 if (!test_bit(In_sync
, &rs
->dev
[i
].rdev
.flags
) &&
193 (!rs
->dev
[i
].rdev
.recovery_offset
)) {
194 rs
->ti
->error
= "Drive designated for rebuild not specified";
201 ret
= dm_get_device(rs
->ti
, argv
[1],
202 dm_table_get_mode(rs
->ti
->table
),
203 &rs
->dev
[i
].data_dev
);
205 rs
->ti
->error
= "RAID device lookup failure";
209 rs
->dev
[i
].rdev
.bdev
= rs
->dev
[i
].data_dev
->bdev
;
210 list_add(&rs
->dev
[i
].rdev
.same_set
, &rs
->md
.disks
);
211 if (!test_bit(In_sync
, &rs
->dev
[i
].rdev
.flags
))
215 if (metadata_available
) {
217 rs
->md
.persistent
= 1;
218 rs
->md
.major_version
= 2;
219 } else if (rebuild
&& !rs
->md
.recovery_cp
) {
221 * Without metadata, we will not be able to tell if the array
222 * is in-sync or not - we must assume it is not. Therefore,
223 * it is impossible to rebuild a drive.
225 * Even if there is metadata, the on-disk information may
226 * indicate that the array is not in-sync and it will then
229 * User could specify 'nosync' option if desperate.
231 DMERR("Unable to rebuild drive while array is not in-sync");
232 rs
->ti
->error
= "RAID device lookup failure";
240 * Possible arguments are...
242 * <chunk_size> [optional_args]
245 * [[no]sync] Force or prevent recovery of the entire array
246 * [rebuild <idx>] Rebuild the drive indicated by the index
247 * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits
248 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
249 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
250 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
251 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
253 static int parse_raid_params(struct raid_set
*rs
, char **argv
,
254 unsigned num_raid_params
)
256 unsigned i
, rebuild_cnt
= 0;
261 * First, parse the in-order required arguments
263 if ((strict_strtoul(argv
[0], 10, &value
) < 0) ||
264 !is_power_of_2(value
) || (value
< 8)) {
265 rs
->ti
->error
= "Bad chunk size";
269 rs
->md
.new_chunk_sectors
= rs
->md
.chunk_sectors
= value
;
274 * Second, parse the unordered optional arguments
276 for (i
= 0; i
< rs
->md
.raid_disks
; i
++)
277 set_bit(In_sync
, &rs
->dev
[i
].rdev
.flags
);
279 for (i
= 0; i
< num_raid_params
; i
++) {
280 if (!strcasecmp(argv
[i
], "nosync")) {
281 rs
->md
.recovery_cp
= MaxSector
;
282 rs
->print_flags
|= DMPF_NOSYNC
;
283 rs
->md
.flags
|= MD_SYNC_STATE_FORCED
;
286 if (!strcasecmp(argv
[i
], "sync")) {
287 rs
->md
.recovery_cp
= 0;
288 rs
->print_flags
|= DMPF_SYNC
;
289 rs
->md
.flags
|= MD_SYNC_STATE_FORCED
;
293 /* The rest of the optional arguments come in key/value pairs */
294 if ((i
+ 1) >= num_raid_params
) {
295 rs
->ti
->error
= "Wrong number of raid parameters given";
300 if (strict_strtoul(argv
[i
], 10, &value
) < 0) {
301 rs
->ti
->error
= "Bad numerical argument given in raid params";
305 if (!strcasecmp(key
, "rebuild")) {
306 if (++rebuild_cnt
> rs
->raid_type
->parity_devs
) {
307 rs
->ti
->error
= "Too many rebuild drives given";
310 if (value
> rs
->md
.raid_disks
) {
311 rs
->ti
->error
= "Invalid rebuild index given";
314 clear_bit(In_sync
, &rs
->dev
[value
].rdev
.flags
);
315 rs
->dev
[value
].rdev
.recovery_offset
= 0;
316 rs
->print_flags
|= DMPF_REBUILD
;
317 } else if (!strcasecmp(key
, "max_write_behind")) {
318 rs
->print_flags
|= DMPF_MAX_WRITE_BEHIND
;
321 * In device-mapper, we specify things in sectors, but
322 * MD records this value in kB
325 if (value
> COUNTER_MAX
) {
326 rs
->ti
->error
= "Max write-behind limit out of range";
329 rs
->md
.bitmap_info
.max_write_behind
= value
;
330 } else if (!strcasecmp(key
, "daemon_sleep")) {
331 rs
->print_flags
|= DMPF_DAEMON_SLEEP
;
332 if (!value
|| (value
> MAX_SCHEDULE_TIMEOUT
)) {
333 rs
->ti
->error
= "daemon sleep period out of range";
336 rs
->md
.bitmap_info
.daemon_sleep
= value
;
337 } else if (!strcasecmp(key
, "stripe_cache")) {
338 rs
->print_flags
|= DMPF_STRIPE_CACHE
;
341 * In device-mapper, we specify things in sectors, but
342 * MD records this value in kB
346 if (rs
->raid_type
->level
< 5) {
347 rs
->ti
->error
= "Inappropriate argument: stripe_cache";
350 if (raid5_set_cache_size(&rs
->md
, (int)value
)) {
351 rs
->ti
->error
= "Bad stripe_cache size";
354 } else if (!strcasecmp(key
, "min_recovery_rate")) {
355 rs
->print_flags
|= DMPF_MIN_RECOVERY_RATE
;
356 if (value
> INT_MAX
) {
357 rs
->ti
->error
= "min_recovery_rate out of range";
360 rs
->md
.sync_speed_min
= (int)value
;
361 } else if (!strcasecmp(key
, "max_recovery_rate")) {
362 rs
->print_flags
|= DMPF_MAX_RECOVERY_RATE
;
363 if (value
> INT_MAX
) {
364 rs
->ti
->error
= "max_recovery_rate out of range";
367 rs
->md
.sync_speed_max
= (int)value
;
369 DMERR("Unable to parse RAID parameter: %s", key
);
370 rs
->ti
->error
= "Unable to parse RAID parameters";
375 /* Assume there are no metadata devices until the drives are parsed */
376 rs
->md
.persistent
= 0;
382 static void do_table_event(struct work_struct
*ws
)
384 struct raid_set
*rs
= container_of(ws
, struct raid_set
, md
.event_work
);
386 dm_table_event(rs
->ti
->table
);
389 static int raid_is_congested(struct dm_target_callbacks
*cb
, int bits
)
391 struct raid_set
*rs
= container_of(cb
, struct raid_set
, callbacks
);
393 return md_raid5_congested(&rs
->md
, bits
);
397 * Construct a RAID4/5/6 mapping:
399 * <raid_type> <#raid_params> <raid_params> \
400 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
402 * ** metadata devices are not supported yet, use '-' instead **
404 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
405 * details on possible <raid_params>.
407 static int raid_ctr(struct dm_target
*ti
, unsigned argc
, char **argv
)
410 struct raid_type
*rt
;
411 unsigned long num_raid_params
, num_raid_devs
;
412 struct raid_set
*rs
= NULL
;
414 /* Must have at least <raid_type> <#raid_params> */
416 ti
->error
= "Too few arguments";
421 rt
= get_raid_type(argv
[0]);
423 ti
->error
= "Unrecognised raid_type";
429 /* number of RAID parameters */
430 if (strict_strtoul(argv
[0], 10, &num_raid_params
) < 0) {
431 ti
->error
= "Cannot understand number of RAID parameters";
437 /* Skip over RAID params for now and find out # of devices */
438 if (num_raid_params
+ 1 > argc
) {
439 ti
->error
= "Arguments do not agree with counts given";
443 if ((strict_strtoul(argv
[num_raid_params
], 10, &num_raid_devs
) < 0) ||
444 (num_raid_devs
>= INT_MAX
)) {
445 ti
->error
= "Cannot understand number of raid devices";
449 rs
= context_alloc(ti
, rt
, (unsigned)num_raid_devs
);
453 ret
= parse_raid_params(rs
, argv
, (unsigned)num_raid_params
);
459 argc
-= num_raid_params
+ 1; /* +1: we already have num_raid_devs */
460 argv
+= num_raid_params
+ 1;
462 if (argc
!= (num_raid_devs
* 2)) {
463 ti
->error
= "Supplied RAID devices does not match the count given";
467 ret
= dev_parms(rs
, argv
);
471 INIT_WORK(&rs
->md
.event_work
, do_table_event
);
472 ti
->split_io
= rs
->md
.chunk_sectors
;
475 mutex_lock(&rs
->md
.reconfig_mutex
);
476 ret
= md_run(&rs
->md
);
477 rs
->md
.in_sync
= 0; /* Assume already marked dirty */
478 mutex_unlock(&rs
->md
.reconfig_mutex
);
481 ti
->error
= "Fail to run raid array";
485 rs
->callbacks
.congested_fn
= raid_is_congested
;
486 dm_table_add_target_callbacks(ti
->table
, &rs
->callbacks
);
496 static void raid_dtr(struct dm_target
*ti
)
498 struct raid_set
*rs
= ti
->private;
500 list_del_init(&rs
->callbacks
.list
);
505 static int raid_map(struct dm_target
*ti
, struct bio
*bio
, union map_info
*map_context
)
507 struct raid_set
*rs
= ti
->private;
508 mddev_t
*mddev
= &rs
->md
;
510 mddev
->pers
->make_request(mddev
, bio
);
512 return DM_MAPIO_SUBMITTED
;
515 static int raid_status(struct dm_target
*ti
, status_type_t type
,
516 char *result
, unsigned maxlen
)
518 struct raid_set
*rs
= ti
->private;
519 unsigned raid_param_cnt
= 1; /* at least 1 for chunksize */
525 case STATUSTYPE_INFO
:
526 DMEMIT("%s %d ", rs
->raid_type
->name
, rs
->md
.raid_disks
);
528 for (i
= 0; i
< rs
->md
.raid_disks
; i
++) {
529 if (test_bit(Faulty
, &rs
->dev
[i
].rdev
.flags
))
531 else if (test_bit(In_sync
, &rs
->dev
[i
].rdev
.flags
))
537 if (test_bit(MD_RECOVERY_RUNNING
, &rs
->md
.recovery
))
538 sync
= rs
->md
.curr_resync_completed
;
540 sync
= rs
->md
.recovery_cp
;
542 if (sync
> rs
->md
.resync_max_sectors
)
543 sync
= rs
->md
.resync_max_sectors
;
546 (unsigned long long) sync
,
547 (unsigned long long) rs
->md
.resync_max_sectors
);
550 case STATUSTYPE_TABLE
:
551 /* The string you would use to construct this array */
552 for (i
= 0; i
< rs
->md
.raid_disks
; i
++)
553 if ((rs
->print_flags
& DMPF_REBUILD
) &&
554 rs
->dev
[i
].data_dev
&&
555 !test_bit(In_sync
, &rs
->dev
[i
].rdev
.flags
))
556 raid_param_cnt
+= 2; /* for rebuilds */
558 raid_param_cnt
+= (hweight64(rs
->print_flags
& ~DMPF_REBUILD
) * 2);
559 if (rs
->print_flags
& (DMPF_SYNC
| DMPF_NOSYNC
))
562 DMEMIT("%s %u %u", rs
->raid_type
->name
,
563 raid_param_cnt
, rs
->md
.chunk_sectors
);
565 if ((rs
->print_flags
& DMPF_SYNC
) &&
566 (rs
->md
.recovery_cp
== MaxSector
))
568 if (rs
->print_flags
& DMPF_NOSYNC
)
571 for (i
= 0; i
< rs
->md
.raid_disks
; i
++)
572 if ((rs
->print_flags
& DMPF_REBUILD
) &&
573 rs
->dev
[i
].data_dev
&&
574 !test_bit(In_sync
, &rs
->dev
[i
].rdev
.flags
))
575 DMEMIT(" rebuild %u", i
);
577 if (rs
->print_flags
& DMPF_DAEMON_SLEEP
)
578 DMEMIT(" daemon_sleep %lu",
579 rs
->md
.bitmap_info
.daemon_sleep
);
581 if (rs
->print_flags
& DMPF_MIN_RECOVERY_RATE
)
582 DMEMIT(" min_recovery_rate %d", rs
->md
.sync_speed_min
);
584 if (rs
->print_flags
& DMPF_MAX_RECOVERY_RATE
)
585 DMEMIT(" max_recovery_rate %d", rs
->md
.sync_speed_max
);
587 if (rs
->print_flags
& DMPF_MAX_WRITE_BEHIND
)
588 DMEMIT(" max_write_behind %lu",
589 rs
->md
.bitmap_info
.max_write_behind
);
591 if (rs
->print_flags
& DMPF_STRIPE_CACHE
) {
592 raid5_conf_t
*conf
= rs
->md
.private;
594 /* convert from kiB to sectors */
595 DMEMIT(" stripe_cache %d",
596 conf
? conf
->max_nr_stripes
* 2 : 0);
599 DMEMIT(" %d", rs
->md
.raid_disks
);
600 for (i
= 0; i
< rs
->md
.raid_disks
; i
++) {
601 DMEMIT(" -"); /* metadata device */
603 if (rs
->dev
[i
].data_dev
)
604 DMEMIT(" %s", rs
->dev
[i
].data_dev
->name
);
613 static int raid_iterate_devices(struct dm_target
*ti
, iterate_devices_callout_fn fn
, void *data
)
615 struct raid_set
*rs
= ti
->private;
619 for (i
= 0; !ret
&& i
< rs
->md
.raid_disks
; i
++)
620 if (rs
->dev
[i
].data_dev
)
623 0, /* No offset on data devs */
630 static void raid_io_hints(struct dm_target
*ti
, struct queue_limits
*limits
)
632 struct raid_set
*rs
= ti
->private;
633 unsigned chunk_size
= rs
->md
.chunk_sectors
<< 9;
634 raid5_conf_t
*conf
= rs
->md
.private;
636 blk_limits_io_min(limits
, chunk_size
);
637 blk_limits_io_opt(limits
, chunk_size
* (conf
->raid_disks
- conf
->max_degraded
));
640 static void raid_presuspend(struct dm_target
*ti
)
642 struct raid_set
*rs
= ti
->private;
644 md_stop_writes(&rs
->md
);
647 static void raid_postsuspend(struct dm_target
*ti
)
649 struct raid_set
*rs
= ti
->private;
651 mddev_suspend(&rs
->md
);
654 static void raid_resume(struct dm_target
*ti
)
656 struct raid_set
*rs
= ti
->private;
658 mddev_resume(&rs
->md
);
661 static struct target_type raid_target
= {
663 .version
= {1, 0, 0},
664 .module
= THIS_MODULE
,
668 .status
= raid_status
,
669 .iterate_devices
= raid_iterate_devices
,
670 .io_hints
= raid_io_hints
,
671 .presuspend
= raid_presuspend
,
672 .postsuspend
= raid_postsuspend
,
673 .resume
= raid_resume
,
676 static int __init
dm_raid_init(void)
678 return dm_register_target(&raid_target
);
681 static void __exit
dm_raid_exit(void)
683 dm_unregister_target(&raid_target
);
686 module_init(dm_raid_init
);
687 module_exit(dm_raid_exit
);
689 MODULE_DESCRIPTION(DM_NAME
" raid4/5/6 target");
690 MODULE_ALIAS("dm-raid4");
691 MODULE_ALIAS("dm-raid5");
692 MODULE_ALIAS("dm-raid6");
693 MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
694 MODULE_LICENSE("GPL");