/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>

#include <trace/events/block.h>

#define DM_MSG_PREFIX "core"

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_SPINLOCK(_minor_lock);
/*
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	atomic_t io_count;
	struct bio *bio;
	unsigned long start_time;
};

/*
 * One of these is allocated per target within a bio. Hopefully
 * this will be simplified out one day.
 */
struct dm_target_io {
	struct dm_io *io;
	struct dm_target *ti;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per request.
 */
struct dm_rq_target_io {
	struct mapped_device *md;
	struct dm_target *ti;
	struct request *orig, clone;
	int error;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per bio.
 */
struct dm_rq_clone_bio_info {
	struct bio *orig;
	struct request *rq;
};
union map_info *dm_get_mapinfo(struct bio *bio)
{
	if (bio && bio->bi_private)
		return &((struct dm_target_io *)bio->bi_private)->info;
	return NULL;
}
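/*
 * Note: this works because dm points each clone's bi_private at its
 * dm_target_io while the clone is in flight (see __map_bio() below), so the
 * cast above simply recovers the per-target map_info.
 */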
#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_QUEUE_IO_TO_THREAD 6
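/*
 * These bits are set and tested atomically on md->flags and coordinate
 * suspend/resume, barrier handling and device teardown; see dm_suspend(),
 * dm_wq_work() and dm_put() below for how they interact.
 */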
/*
 * Work processed by per-device workqueue.
 */
struct mapped_device {
	struct rw_semaphore io_lock;
	struct mutex suspend_lock;
	rwlock_t map_lock;
	atomic_t holders;
	atomic_t open_count;

	unsigned long flags;

	struct request_queue *queue;
	struct gendisk *disk;
	char name[16];

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending;
	wait_queue_head_t wait;
	struct work_struct work;
	struct bio_list deferred;
	spinlock_t deferred_lock;

	/*
	 * An error from the barrier request currently being processed.
	 */
	int barrier_error;

	/*
	 * Processing queue (flush/barriers)
	 */
	struct workqueue_struct *wq;

	/*
	 * The current mapping.
	 */
	struct dm_table *map;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;
	mempool_t *tio_pool;

	struct bio_set *bs;

	/*
	 * Event handling.
	 */
	atomic_t event_nr;
	wait_queue_head_t eventq;
	atomic_t uevent_seq;
	struct list_head uevent_list;
	spinlock_t uevent_lock; /* Protect access to uevent_list */

	/*
	 * freeze/thaw support requires holding onto a super block
	 */
	struct super_block *frozen_sb;
	struct block_device *bdev;

	/* forced geometry settings */
	struct hd_geometry geometry;

	/* sysfs handle */
	struct kobject kobj;

	/* zero-length barrier that will be cloned and submitted to targets */
	struct bio barrier_bio;
};
#define MIN_IOS 256

static struct kmem_cache *_io_cache;
static struct kmem_cache *_tio_cache;
static struct kmem_cache *_rq_tio_cache;
static struct kmem_cache *_rq_bio_info_cache;
static int __init local_init(void)
{
	int r = -ENOMEM;

	/* allocate a slab for the dm_ios */
	_io_cache = KMEM_CACHE(dm_io, 0);
	if (!_io_cache)
		return r;

	/* allocate a slab for the target ios */
	_tio_cache = KMEM_CACHE(dm_target_io, 0);
	if (!_tio_cache)
		goto out_free_io_cache;

	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
	if (!_rq_tio_cache)
		goto out_free_tio_cache;

	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
	if (!_rq_bio_info_cache)
		goto out_free_rq_tio_cache;

	r = dm_uevent_init();
	if (r)
		goto out_free_rq_bio_info_cache;

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_uevent_exit;

	if (!_major)
		_major = r;

	return 0;

out_uevent_exit:
	dm_uevent_exit();
out_free_rq_bio_info_cache:
	kmem_cache_destroy(_rq_bio_info_cache);
out_free_rq_tio_cache:
	kmem_cache_destroy(_rq_tio_cache);
out_free_tio_cache:
	kmem_cache_destroy(_tio_cache);
out_free_io_cache:
	kmem_cache_destroy(_io_cache);

	return r;
}

static void local_exit(void)
{
	kmem_cache_destroy(_rq_bio_info_cache);
	kmem_cache_destroy(_rq_tio_cache);
	kmem_cache_destroy(_tio_cache);
	kmem_cache_destroy(_io_cache);
	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}
static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_kcopyd_init,
	dm_interface_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
};
static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();
}
/*
 * Block device functions
 */
static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = bdev->bd_disk->private_data;
	if (!md)
		goto out;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    test_bit(DMF_DELETING, &md->flags)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);

out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
}

static int dm_blk_close(struct gendisk *disk, fmode_t mode)
{
	struct mapped_device *md = disk->private_data;

	atomic_dec(&md->open_count);
	dm_put(md);

	return 0;
}

int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}
/*
 * Guarantees nothing is using the device before it's deleted.
 */
int dm_lock_for_deletion(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md))
		r = -EBUSY;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}
static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *map = dm_get_table(md);
	struct dm_target *tgt;
	int r = -ENOTTY;

	if (!map || !dm_table_get_size(map))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(map) != 1)
		goto out;

	tgt = dm_table_get_target(map, 0);

	if (dm_suspended(md)) {
		r = -EAGAIN;
		goto out;
	}

	if (tgt->type->ioctl)
		r = tgt->type->ioctl(tgt, cmd, arg);

out:
	dm_table_put(map);

	return r;
}
static struct dm_io *alloc_io(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}

static void free_io(struct mapped_device *md, struct dm_io *io)
{
	mempool_free(io, md->io_pool);
}

static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
{
	mempool_free(tio, md->tio_pool);
}
static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	int cpu;

	io->start_time = jiffies;

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_unlock();
	dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending);
}

static void end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	unsigned long duration = jiffies - io->start_time;
	int pending, cpu;
	int rw = bio_data_dir(bio);

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
	part_stat_unlock();

	/*
	 * After this is decremented the bio must not be touched if it is
	 * a barrier.
	 */
	dm_disk(md)->part0.in_flight = pending =
		atomic_dec_return(&md->pending);

	/* nudge anyone waiting on suspend queue */
	if (!pending)
		wake_up(&md->wait);
}
/*
 * Add the bio to the list of deferred io.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	down_write(&md->io_lock);

	spin_lock_irq(&md->deferred_lock);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irq(&md->deferred_lock);

	if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
		queue_work(md->wq, &md->work);

	up_write(&md->io_lock);
}
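/*
 * The DMF_QUEUE_IO_TO_THREAD bit set above also tells dm_request() to keep
 * routing new bios through this deferred list until dm_wq_work() has
 * drained it, so ordering against in-flight barriers is preserved.
 */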
/*
 * Everyone (including functions in this file) should use this
 * function to access the md->map field, and make sure they call
 * dm_table_put() when finished.
 */
struct dm_table *dm_get_table(struct mapped_device *md)
{
	struct dm_table *t;

	read_lock(&md->map_lock);
	t = md->map;
	if (t)
		dm_table_get(t);
	read_unlock(&md->map_lock);

	return t;
}
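/*
 * Typical caller pattern (a sketch):
 *
 *	struct dm_table *map = dm_get_table(md);
 *	if (map) {
 *		... inspect or use the table ...
 *		dm_table_put(map);
 *	}
 */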
/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}
/*-----------------------------------------------------------------
 * CRUD START:
 *   A more elegant solution is in the works that uses the queue
 *   merge fn; unfortunately there are a couple of changes to
 *   the block layer that I want to make for this. So in the
 *   interests of getting something for people to use I give
 *   you this clearly demarcated crap.
 *---------------------------------------------------------------*/

static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}
/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static void dec_pending(struct dm_io *io, int error)
{
	unsigned long flags;
	int io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	/* Push-back supersedes any I/O errors */
	if (error && !(io->error > 0 && __noflush_suspending(md)))
		io->error = error;

	if (atomic_dec_and_test(&io->io_count)) {
		if (io->error == DM_ENDIO_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md)) {
				if (!bio_barrier(io->bio))
					bio_list_add_head(&md->deferred,
							  io->bio);
			} else
				/* noflush suspend was interrupted. */
				io->error = -EIO;
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		io_error = io->error;
		bio = io->bio;

		if (bio_barrier(bio)) {
			/*
			 * There can be just one barrier request so we use
			 * a per-device variable for error reporting.
			 * Note that you can't touch the bio after end_io_acct
			 */
			if (!md->barrier_error && io_error != -EOPNOTSUPP)
				md->barrier_error = io_error;
			end_io_acct(io);
		} else {
			end_io_acct(io);

			if (io_error != DM_ENDIO_REQUEUE) {
				trace_block_bio_complete(md->queue, bio);

				bio_endio(bio, io_error);
			}
		}

		free_io(md, io);
	}
}
static void clone_endio(struct bio *bio, int error)
{
	int r = 0;
	struct dm_target_io *tio = bio->bi_private;
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error, &tio->info);
		if (r < 0 || r == DM_ENDIO_REQUEUE)
			/*
			 * error and requeue request are handled
			 * in dec_pending().
			 */
			error = r;
		else if (r == DM_ENDIO_INCOMPLETE)
			/* The target will handle the io */
			return;
		else if (r) {
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	/*
	 * Store md for cleanup instead of tio which is about to get freed.
	 */
	bio->bi_private = md->bs;

	free_tio(md, tio);
	bio_put(bio);
	dec_pending(io, error);
}
static sector_t max_io_len(struct mapped_device *md,
			   sector_t sector, struct dm_target *ti)
{
	sector_t offset = sector - ti->begin;
	sector_t len = ti->len - offset;

	/*
	 * Does the target need to split even further ?
	 */
	if (ti->split_io) {
		sector_t boundary;
		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
			   - offset;
		if (len > boundary)
			len = boundary;
	}

	return len;
}
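/*
 * Worked example of the boundary arithmetic (which assumes split_io is a
 * power of two): with ti->split_io = 8 and offset = 13,
 * boundary = ((13 + 8) & ~7) - 13 = 16 - 13 = 3, so at most 3 sectors are
 * issued before the next split_io boundary.
 */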
static void __map_bio(struct dm_target *ti, struct bio *clone,
		      struct dm_target_io *tio)
{
	int r;
	sector_t sector;
	struct mapped_device *md;

	clone->bi_end_io = clone_endio;
	clone->bi_private = tio;

	/*
	 * Map the clone. If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&tio->io->io_count);
	sector = clone->bi_sector;
	r = ti->type->map(ti, clone, &tio->info);
	if (r == DM_MAPIO_REMAPPED) {
		/* the bio has been remapped so dispatch it */

		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
				  tio->io->bio->bi_bdev->bd_dev, sector);

		generic_make_request(clone);
	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
		/* error the io and bail out, or requeue it if needed */
		md = tio->io->md;
		dec_pending(tio->io, r);
		/*
		 * Store bio_set for cleanup.
		 */
		clone->bi_private = md->bs;
		bio_put(clone);
		free_tio(md, tio);
	} else if (r) {
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}
}
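/*
 * A target's map function returns DM_MAPIO_SUBMITTED (it now owns the
 * clone), DM_MAPIO_REMAPPED (dispatch it here via generic_make_request())
 * or DM_MAPIO_REQUEUE / a negative errno (undo the clone and let
 * dec_pending() handle the outcome), which is what the branches above do.
 */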
struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	sector_t sector_count;
	unsigned short idx;
};

static void dm_bio_destructor(struct bio *bio)
{
	struct bio_set *bs = bio->bi_private;

	bio_free(bio, bs);
}
/*
 * Creates a little bio that just does part of a bvec.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
			      unsigned short idx, unsigned int offset,
			      unsigned int len, struct bio_set *bs)
{
	struct bio *clone;
	struct bio_vec *bv = bio->bi_io_vec + idx;

	clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
	clone->bi_destructor = dm_bio_destructor;
	*clone->bi_io_vec = *bv;

	clone->bi_sector = sector;
	clone->bi_bdev = bio->bi_bdev;
	clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER);
	clone->bi_vcnt = 1;
	clone->bi_size = to_bytes(len);
	clone->bi_io_vec->bv_offset = offset;
	clone->bi_io_vec->bv_len = clone->bi_size;
	clone->bi_flags |= 1 << BIO_CLONED;

	if (bio_integrity(bio)) {
		bio_integrity_clone(clone, bio, GFP_NOIO);
		bio_integrity_trim(clone,
				   bio_sector_offset(bio, idx, offset), len);
	}

	return clone;
}
/*
 * Creates a bio that consists of a range of complete bvecs.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
			     unsigned short idx, unsigned short bv_count,
			     unsigned int len, struct bio_set *bs)
{
	struct bio *clone;

	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
	__bio_clone(clone, bio);
	clone->bi_rw &= ~(1 << BIO_RW_BARRIER);
	clone->bi_destructor = dm_bio_destructor;
	clone->bi_sector = sector;
	clone->bi_idx = idx;
	clone->bi_vcnt = idx + bv_count;
	clone->bi_size = to_bytes(len);
	clone->bi_flags &= ~(1 << BIO_SEG_VALID);

	if (bio_integrity(bio)) {
		bio_integrity_clone(clone, bio, GFP_NOIO);

		if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
			bio_integrity_trim(clone,
					   bio_sector_offset(bio, idx, 0), len);
	}

	return clone;
}
static struct dm_target_io *alloc_tio(struct clone_info *ci,
				      struct dm_target *ti)
{
	struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);

	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	return tio;
}

static void __flush_target(struct clone_info *ci, struct dm_target *ti,
			   unsigned flush_nr)
{
	struct dm_target_io *tio = alloc_tio(ci, ti);
	struct bio *clone;

	tio->info.flush_request = flush_nr;

	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
	__bio_clone(clone, ci->bio);
	clone->bi_destructor = dm_bio_destructor;

	__map_bio(ti, clone, tio);
}
static int __clone_and_map_empty_barrier(struct clone_info *ci)
{
	unsigned target_nr = 0, flush_nr;
	struct dm_target *ti;

	while ((ti = dm_table_get_target(ci->map, target_nr++)))
		for (flush_nr = 0; flush_nr < ti->num_flush_requests;
		     flush_nr++)
			__flush_target(ci, ti, flush_nr);

	ci->sector_count = 0;

	return 0;
}
static int __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti;
	sector_t len = 0, max;
	struct dm_target_io *tio;

	if (unlikely(bio_empty_barrier(bio)))
		return __clone_and_map_empty_barrier(ci);

	ti = dm_table_find_target(ci->map, ci->sector);
	if (!dm_target_is_valid(ti))
		return -EIO;

	max = max_io_len(ci->md, ci->sector, ti);

	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci, ti);

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count,
				  ci->md->bs);
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
				  ci->md->bs);
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Handle a bvec that must be split between two or more targets.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
		sector_t remaining = to_sector(bv->bv_len);
		unsigned int offset = 0;

		do {
			if (offset) {
				ti = dm_table_find_target(ci->map, ci->sector);
				if (!dm_target_is_valid(ti))
					return -EIO;

				max = max_io_len(ci->md, ci->sector, ti);

				tio = alloc_tio(ci, ti);
			}

			len = min(remaining, max);

			clone = split_bvec(bio, ci->sector, ci->idx,
					   bv->bv_offset + offset, len,
					   ci->md->bs);

			__map_bio(ti, clone, tio);

			ci->sector += len;
			ci->sector_count -= len;
			offset += to_bytes(len);
		} while (remaining -= len);

		ci->idx++;
	}

	return 0;
}
/*
 * Split the bio into several clones and submit them to targets.
 */
static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
{
	struct clone_info ci;
	int error = 0;

	ci.map = dm_get_table(md);
	if (unlikely(!ci.map)) {
		if (!bio_barrier(bio))
			bio_io_error(bio);
		else
			if (!md->barrier_error)
				md->barrier_error = -EIO;
		return;
	}

	ci.md = md;
	ci.bio = bio;
	ci.io = alloc_io(md);
	ci.io->error = 0;
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	ci.sector = bio->bi_sector;
	ci.sector_count = bio_sectors(bio);
	if (unlikely(bio_empty_barrier(bio)))
		ci.sector_count = 1;
	ci.idx = bio->bi_idx;

	start_io_acct(ci.io);
	while (ci.sector_count && !error)
		error = __clone_and_map(&ci);

	/* drop the extra reference count */
	dec_pending(ci.io, error);
	dm_table_put(ci.map);
}
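/*
 * ci.io->io_count starts at 1 so the io cannot complete while clones are
 * still being created; the final dec_pending() above drops that extra
 * reference once the whole bio has been mapped.
 */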
/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

static int dm_merge_bvec(struct request_queue *q,
			 struct bvec_merge_data *bvm,
			 struct bio_vec *biovec)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);
	struct dm_target *ti;
	sector_t max_sectors;
	int max_size = 0;

	if (unlikely(!map))
		goto out;

	ti = dm_table_find_target(map, bvm->bi_sector);
	if (!dm_target_is_valid(ti))
		goto out_table;

	/*
	 * Find maximum amount of I/O that won't need splitting
	 */
	max_sectors = min(max_io_len(md, bvm->bi_sector, ti),
			  (sector_t) BIO_MAX_SECTORS);
	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
	if (max_size < 0)
		max_size = 0;

	/*
	 * merge_bvec_fn() returns number of bytes
	 * it can accept at this offset
	 * max is precomputed maximal io size
	 */
	if (max_size && ti->type->merge)
		max_size = ti->type->merge(ti, bvm, biovec, max_size);
	/*
	 * If the target doesn't support merge method and some of the devices
	 * provided their merge_bvec method (we know this by looking at
	 * queue_max_hw_sectors), then we can't allow bios with multiple vector
	 * entries. So always set max_size to 0, and the code below allows
	 * just one page.
	 */
	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
		max_size = 0;

out_table:
	dm_table_put(map);

out:
	/*
	 * Always allow an entire first page
	 */
	if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
		max_size = biovec->bv_len;

	return max_size;
}
/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
static int dm_request(struct request_queue *q, struct bio *bio)
{
	int rw = bio_data_dir(bio);
	struct mapped_device *md = q->queuedata;
	int cpu;

	down_read(&md->io_lock);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
	part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
	part_stat_unlock();

	/*
	 * If we're suspended or the thread is processing barriers
	 * we have to queue this io for later.
	 */
	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
	    unlikely(bio_barrier(bio))) {
		up_read(&md->io_lock);

		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
		    bio_rw(bio) == READA) {
			bio_io_error(bio);
			return 0;
		}

		queue_io(md, bio);

		return 0;
	}

	__split_and_process_bio(md, bio);
	up_read(&md->io_lock);
	return 0;
}
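/*
 * Note: in the common (non-barrier, non-deferred) case the bio is mapped
 * inline above under the read lock; only barriers, and bios that arrive
 * while DMF_QUEUE_IO_TO_THREAD is set, are bounced to the per-device
 * workqueue via queue_io().
 */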
static void dm_unplug_all(struct request_queue *q)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);

	if (map) {
		dm_table_unplug_all(map);
		dm_table_put(map);
	}
}

static int dm_any_congested(void *congested_data, int bdi_bits)
{
	int r = bdi_bits;
	struct mapped_device *md = congested_data;
	struct dm_table *map;

	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		map = dm_get_table(md);
		if (map) {
			r = dm_table_any_congested(map, bdi_bits);
			dm_table_put(map);
		}
	}

	return r;
}
/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static DEFINE_IDR(_minor_idr);

static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}
/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(int minor)
{
	int r, m;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	spin_lock(&_minor_lock);

	if (idr_find(&_minor_idr, minor)) {
		r = -EBUSY;
		goto out;
	}

	r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
	if (r)
		goto out;

	if (m != minor) {
		idr_remove(&_minor_idr, m);
		r = -EBUSY;
		goto out;
	}

out:
	spin_unlock(&_minor_lock);
	return r;
}

static int next_free_minor(int *minor)
{
	int r, m;

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	spin_lock(&_minor_lock);

	r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
	if (r)
		goto out;

	if (m >= (1 << MINORBITS)) {
		idr_remove(&_minor_idr, m);
		r = -ENOSPC;
		goto out;
	}

	*minor = m;

out:
	spin_unlock(&_minor_lock);
	return r;
}
static struct block_device_operations dm_blk_dops;

static void dm_wq_work(struct work_struct *work);
/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(int minor)
{
	int r;
	struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
	void *old_md;

	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	if (!try_module_get(THIS_MODULE))
		goto bad_module_get;

	/* get a minor number for the dev */
	if (minor == DM_ANY_MINOR)
		r = next_free_minor(&minor);
	else
		r = specific_minor(minor);
	if (r < 0)
		goto bad_minor;

	init_rwsem(&md->io_lock);
	mutex_init(&md->suspend_lock);
	spin_lock_init(&md->deferred_lock);
	rwlock_init(&md->map_lock);
	atomic_set(&md->holders, 1);
	atomic_set(&md->open_count, 0);
	atomic_set(&md->event_nr, 0);
	atomic_set(&md->uevent_seq, 0);
	INIT_LIST_HEAD(&md->uevent_list);
	spin_lock_init(&md->uevent_lock);

	md->queue = blk_alloc_queue(GFP_KERNEL);
	if (!md->queue)
		goto bad_queue;

	md->queue->queuedata = md;
	md->queue->backing_dev_info.congested_fn = dm_any_congested;
	md->queue->backing_dev_info.congested_data = md;
	blk_queue_make_request(md->queue, dm_request);
	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
	md->queue->unplug_fn = dm_unplug_all;
	blk_queue_merge_bvec(md->queue, dm_merge_bvec);

	md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
	if (!md->io_pool)
		goto bad_io_pool;

	md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
	if (!md->tio_pool)
		goto bad_tio_pool;

	md->bs = bioset_create(16, 0);
	if (!md->bs)
		goto bad_no_bioset;

	md->disk = alloc_disk(1);
	if (!md->disk)
		goto bad_disk;

	atomic_set(&md->pending, 0);
	init_waitqueue_head(&md->wait);
	INIT_WORK(&md->work, dm_wq_work);
	init_waitqueue_head(&md->eventq);

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);
	add_disk(md->disk);
	format_dev_t(md->name, MKDEV(_major, minor));

	md->wq = create_singlethread_workqueue("kdmflush");
	if (!md->wq)
		goto bad_thread;

	md->bdev = bdget_disk(md->disk, 0);
	if (!md->bdev)
		goto bad_bdev;

	/* Populate the mapping, nobody knows we exist yet */
	spin_lock(&_minor_lock);
	old_md = idr_replace(&_minor_idr, md, minor);
	spin_unlock(&_minor_lock);

	BUG_ON(old_md != MINOR_ALLOCED);

	return md;

bad_bdev:
	destroy_workqueue(md->wq);
bad_thread:
	put_disk(md->disk);
bad_disk:
	bioset_free(md->bs);
bad_no_bioset:
	mempool_destroy(md->tio_pool);
bad_tio_pool:
	mempool_destroy(md->io_pool);
bad_io_pool:
	blk_cleanup_queue(md->queue);
bad_queue:
	free_minor(minor);
bad_minor:
	module_put(THIS_MODULE);
bad_module_get:
	kfree(md);
	return NULL;
}
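/*
 * The minor was reserved in the IDR with the MINOR_ALLOCED placeholder (see
 * specific_minor()/next_free_minor() above); the idr_replace() here swaps in
 * the real mapped_device only once it is fully constructed, so lookups never
 * see a half-initialised device.
 */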
static void unlock_fs(struct mapped_device *md);

static void free_dev(struct mapped_device *md)
{
	int minor = MINOR(disk_devt(md->disk));

	unlock_fs(md);
	bdput(md->bdev);
	destroy_workqueue(md->wq);
	mempool_destroy(md->tio_pool);
	mempool_destroy(md->io_pool);
	bioset_free(md->bs);
	blk_integrity_unregister(md->disk);
	del_gendisk(md->disk);
	free_minor(minor);

	spin_lock(&_minor_lock);
	md->disk->private_data = NULL;
	spin_unlock(&_minor_lock);

	put_disk(md->disk);
	blk_cleanup_queue(md->queue);
	module_put(THIS_MODULE);
	kfree(md);
}
/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
}

static void __set_size(struct mapped_device *md, sector_t size)
{
	set_capacity(md->disk, size);

	mutex_lock(&md->bdev->bd_inode->i_mutex);
	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
	mutex_unlock(&md->bdev->bd_inode->i_mutex);
}
static int __bind(struct mapped_device *md, struct dm_table *t)
{
	struct request_queue *q = md->queue;
	sector_t size;

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != get_capacity(md->disk))
		memset(&md->geometry, 0, sizeof(md->geometry));

	__set_size(md, size);

	if (!size) {
		dm_table_destroy(t);
		return 0;
	}

	dm_table_event_callback(t, event_callback, md);

	write_lock(&md->map_lock);
	md->map = t;
	dm_table_set_restrictions(t, q);
	write_unlock(&md->map_lock);

	return 0;
}

static void __unbind(struct mapped_device *md)
{
	struct dm_table *map = md->map;

	if (!map)
		return;

	dm_table_event_callback(map, NULL, NULL);
	write_lock(&md->map_lock);
	md->map = NULL;
	write_unlock(&md->map_lock);
	dm_table_destroy(map);
}
/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
{
	struct mapped_device *md;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	dm_sysfs_init(md);

	*result = md;
	return 0;
}
static struct mapped_device *dm_find_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (md && (md == MINOR_ALLOCED ||
		   (MINOR(disk_devt(dm_disk(md))) != minor) ||
		   test_bit(DMF_FREEING, &md->flags))) {
		md = NULL;
		goto out;
	}

out:
	spin_unlock(&_minor_lock);

	return md;
}

struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md = dm_find_md(dev);

	if (md)
		dm_get(md);

	return md;
}
void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
}

const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);
void dm_put(struct mapped_device *md)
{
	struct dm_table *map;

	BUG_ON(test_bit(DMF_FREEING, &md->flags));

	if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
		map = dm_get_table(md);
		idr_replace(&_minor_idr, MINOR_ALLOCED,
			    MINOR(disk_devt(dm_disk(md))));
		set_bit(DMF_FREEING, &md->flags);
		spin_unlock(&_minor_lock);
		if (!dm_suspended(md)) {
			dm_table_presuspend_targets(map);
			dm_table_postsuspend_targets(map);
		}
		dm_sysfs_exit(md);
		dm_table_put(map);
		__unbind(md);
		free_dev(md);
	}
}
EXPORT_SYMBOL_GPL(dm_put);
static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
	int r = 0;
	DECLARE_WAITQUEUE(wait, current);

	dm_unplug_all(md->queue);

	add_wait_queue(&md->wait, &wait);

	while (1) {
		set_current_state(interruptible);

		smp_mb();
		if (!atomic_read(&md->pending))
			break;

		if (interruptible == TASK_INTERRUPTIBLE &&
		    signal_pending(current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&md->wait, &wait);

	return r;
}
static void dm_flush(struct mapped_device *md)
{
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);

	bio_init(&md->barrier_bio);
	md->barrier_bio.bi_bdev = md->bdev;
	md->barrier_bio.bi_rw = WRITE_BARRIER;
	__split_and_process_bio(md, &md->barrier_bio);

	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}

static void process_barrier(struct mapped_device *md, struct bio *bio)
{
	md->barrier_error = 0;

	dm_flush(md);

	if (!bio_empty_barrier(bio)) {
		__split_and_process_bio(md, bio);
		dm_flush(md);
	}

	if (md->barrier_error != DM_ENDIO_REQUEUE)
		bio_endio(bio, md->barrier_error);
	else {
		spin_lock_irq(&md->deferred_lock);
		bio_list_add_head(&md->deferred, bio);
		spin_unlock_irq(&md->deferred_lock);
	}
}
/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;

	down_write(&md->io_lock);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c) {
			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
			break;
		}

		up_write(&md->io_lock);

		if (bio_barrier(c))
			process_barrier(md, c);
		else
			__split_and_process_bio(md, c);

		down_write(&md->io_lock);
	}

	up_write(&md->io_lock);
}
static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_clear_bit();
	queue_work(md->wq, &md->work);
}

/*
 * Swap in a new table (destroying old one).
 */
int dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	int r = -EINVAL;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended(md))
		goto out;

	__unbind(md);
	r = __bind(md, table);

out:
	mutex_unlock(&md->suspend_lock);
	return r;
}
/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem. For example we might want to move some data in
 * the background. Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	map = dm_get_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device. noflush supersedes do_lockfs,
	 * because lock_fs() needs to flush I/Os.
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request, we set
	 * DMF_QUEUE_IO_TO_THREAD.
	 *
	 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
	 * and call flush_workqueue(md->wq). flush_workqueue will wait until
	 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
	 * further calls to __split_and_process_bio from dm_wq_work.
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
	up_write(&md->io_lock);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted ? */
	if (r < 0) {
		dm_queue_flush(md);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now. There is no request-processing activity. All new
	 * requests are being added to md->deferred list.
	 */

	dm_table_postsuspend_targets(map);

	set_bit(DMF_SUSPENDED, &md->flags);

out:
	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}
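/*
 * A minimal caller sketch of a table swap (assuming the new table has
 * already been built and validated):
 *
 *	dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *	dm_swap_table(md, new_table);
 *	dm_resume(md);
 */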
int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended(md))
		goto out;

	map = dm_get_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	dm_table_unplug_all(map);

	dm_kobject_uevent(md);

	r = 0;

out:
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}
/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
void dm_kobject_uevent(struct mapped_device *md)
{
	kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE);
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}
/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c
 * so use this check to verify that kobj is part of md structure
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    test_bit(DMF_DELETING, &md->flags))
		return NULL;

	dm_get(md);
	return md;
}
int dm_suspended(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_noflush_suspending(struct dm_target *ti)
{
	struct mapped_device *md = dm_table_get_md(ti->table);
	int r = __noflush_suspending(md);

	dm_put(md);

	return r;
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
static struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");