/*
 * Copyright (C) 2011 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

#define	DM_MSG_PREFIX	"thin"

#define ENDIO_HOOK_POOL_SIZE 1024
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)
/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data).  When you take an internal snapshot you clone the root node
 * of the origin btree.  After this there is no concept of an origin or a
 * snapshot.  They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic.  If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin.  The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block.  Obviously
 * including all devices that share this block.  (see deferred_set code)
 *
 * iii) copy the data block to a newly allocated block.  This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping).  This act of inserting breaks some
 * sharing of btree nodes between the two devices.  Breaking sharing only
 * affects the btree of that specific device.  Btrees for the other
 * devices that share the block never change.  The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues.  We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one).  This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block.  As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that data block in the snapshot device is shared
 * even after the write to the origin has broken sharing.  I suspect data
 * blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block.  At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */
/*----------------------------------------------------------------*/

/*
 * Sometimes we can't deal with a bio straight away.  We put them in prison
 * where they can't cause any mischief.  Bios are put in a cell identified
 * by a key, multiple bios can be in the same cell.  When the cell is
 * subsequently unlocked the bios become available.
 */
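/*
 * Typical usage (as in the bio-processing code further down in this file):
 * build a key for the virtual or data block, then try to bio_detain() the
 * bio.  If the cell was already held the bio simply waits in the cell;
 * otherwise the caller does the work and later frees every waiting bio at
 * once with one of the cell_release variants.
 */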
struct bio_prison;

struct cell_key {
	int virtual;
	dm_thin_id dev;
	dm_block_t block;
};

struct dm_bio_prison_cell {
	struct hlist_node list;
	struct bio_prison *prison;
	struct cell_key key;
	struct bio *holder;
	struct bio_list bios;
};

struct bio_prison {
	spinlock_t lock;
	mempool_t *cell_pool;

	unsigned nr_buckets;
	unsigned hash_mask;
	struct hlist_head *cells;
};
static uint32_t calc_nr_buckets(unsigned nr_cells)
{
	uint32_t n = 128;

	nr_cells = min(nr_cells, 8192u);

	while (n < nr_cells)
		n <<= 1;

	return n;
}

static struct kmem_cache *_cell_cache;
/*
 * @nr_cells should be the number of cells you want in use _concurrently_.
 * Don't confuse it with the number of distinct keys.
 */
static struct bio_prison *prison_create(unsigned nr_cells)
{
	unsigned i;
	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
	size_t len = sizeof(struct bio_prison) +
		(sizeof(struct hlist_head) * nr_buckets);
	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);

	if (!prison)
		return NULL;

	spin_lock_init(&prison->lock);
	prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
	if (!prison->cell_pool) {
		kfree(prison);
		return NULL;
	}

	prison->nr_buckets = nr_buckets;
	prison->hash_mask = nr_buckets - 1;
	prison->cells = (struct hlist_head *) (prison + 1);
	for (i = 0; i < nr_buckets; i++)
		INIT_HLIST_HEAD(prison->cells + i);

	return prison;
}
static void prison_destroy(struct bio_prison *prison)
{
	mempool_destroy(prison->cell_pool);
	kfree(prison);
}

static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
{
	const unsigned long BIG_PRIME = 4294967291UL;
	uint64_t hash = key->block * BIG_PRIME;

	return (uint32_t) (hash & prison->hash_mask);
}
static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
{
	return (lhs->virtual == rhs->virtual) &&
		(lhs->dev == rhs->dev) &&
		(lhs->block == rhs->block);
}

static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
						  struct cell_key *key)
{
	struct dm_bio_prison_cell *cell;
	struct hlist_node *tmp;

	hlist_for_each_entry(cell, tmp, bucket, list)
		if (keys_equal(&cell->key, key))
			return cell;

	return NULL;
}
/*
 * This may block if a new cell needs allocating.  You must ensure that
 * cells will be unlocked even if the calling thread is blocked.
 *
 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
 */
static int bio_detain(struct bio_prison *prison, struct cell_key *key,
		      struct bio *inmate, struct dm_bio_prison_cell **ref)
{
	int r = 1;
	unsigned long flags;
	uint32_t hash = hash_key(prison, key);
	struct dm_bio_prison_cell *cell, *cell2;

	BUG_ON(hash > prison->nr_buckets);

	spin_lock_irqsave(&prison->lock, flags);

	cell = __search_bucket(prison->cells + hash, key);
	if (cell) {
		bio_list_add(&cell->bios, inmate);
		goto out;
	}

	/*
	 * Allocate a new cell
	 */
	spin_unlock_irqrestore(&prison->lock, flags);
	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
	spin_lock_irqsave(&prison->lock, flags);

	/*
	 * We've been unlocked, so we have to double check that
	 * nobody else has inserted this cell in the meantime.
	 */
	cell = __search_bucket(prison->cells + hash, key);
	if (cell) {
		mempool_free(cell2, prison->cell_pool);
		bio_list_add(&cell->bios, inmate);
		goto out;
	}

	/*
	 * Use the new cell.
	 */
	cell = cell2;

	cell->prison = prison;
	memcpy(&cell->key, key, sizeof(cell->key));
	cell->holder = inmate;
	bio_list_init(&cell->bios);
	hlist_add_head(&cell->list, prison->cells + hash);

	r = 0;

out:
	spin_unlock_irqrestore(&prison->lock, flags);

	*ref = cell;

	return r;
}
/*
 * @inmates must have been initialised prior to this call
 */
static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
{
	struct bio_prison *prison = cell->prison;

	hlist_del(&cell->list);

	if (inmates) {
		bio_list_add(inmates, cell->holder);
		bio_list_merge(inmates, &cell->bios);
	}

	mempool_free(cell, prison->cell_pool);
}

static void cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
{
	unsigned long flags;
	struct bio_prison *prison = cell->prison;

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release(cell, bios);
	spin_unlock_irqrestore(&prison->lock, flags);
}

/*
 * There are a couple of places where we put a bio into a cell briefly
 * before taking it out again.  In these situations we know that no other
 * bio may be in the cell.  This function releases the cell, and also does
 * a sanity check.
 */
static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
{
	BUG_ON(cell->holder != bio);
	BUG_ON(!bio_list_empty(&cell->bios));

	__cell_release(cell, NULL);
}

static void cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
{
	unsigned long flags;
	struct bio_prison *prison = cell->prison;

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release_singleton(cell, bio);
	spin_unlock_irqrestore(&prison->lock, flags);
}

/*
 * Sometimes we don't want the holder, just the additional bios.
 */
static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
				     struct bio_list *inmates)
{
	struct bio_prison *prison = cell->prison;

	hlist_del(&cell->list);
	bio_list_merge(inmates, &cell->bios);

	mempool_free(cell, prison->cell_pool);
}

static void cell_release_no_holder(struct dm_bio_prison_cell *cell,
				   struct bio_list *inmates)
{
	unsigned long flags;
	struct bio_prison *prison = cell->prison;

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release_no_holder(cell, inmates);
	spin_unlock_irqrestore(&prison->lock, flags);
}

static void cell_error(struct dm_bio_prison_cell *cell)
{
	struct bio_prison *prison = cell->prison;
	struct bio_list bios;
	struct bio *bio;
	unsigned long flags;

	bio_list_init(&bios);

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release(cell, &bios);
	spin_unlock_irqrestore(&prison->lock, flags);

	while ((bio = bio_list_pop(&bios)))
		bio_io_error(bio);
}
/*----------------------------------------------------------------*/

/*
 * We use the deferred set to keep track of pending reads to shared blocks.
 * We do this to ensure the new mapping caused by a write isn't performed
 * until these prior reads have completed.  Otherwise the insertion of the
 * new mapping could free the old block that the read bios are mapped to.
 */

struct deferred_set;
struct deferred_entry {
	struct deferred_set *ds;
	unsigned count;
	struct list_head work_items;
};

struct deferred_set {
	spinlock_t lock;
	unsigned current_entry;
	unsigned sweeper;
	struct deferred_entry entries[DEFERRED_SET_SIZE];
};

static void ds_init(struct deferred_set *ds)
{
	int i;

	spin_lock_init(&ds->lock);
	ds->current_entry = 0;
	ds->sweeper = 0;
	for (i = 0; i < DEFERRED_SET_SIZE; i++) {
		ds->entries[i].ds = ds;
		ds->entries[i].count = 0;
		INIT_LIST_HEAD(&ds->entries[i].work_items);
	}
}
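/*
 * The deferred set is a small ring of entries.  ds_inc() takes a reference
 * against the current entry, ds_dec() drops one and sweeps the work items
 * of any fully drained entries onto the caller's list, and ds_add_work()
 * queues work against the current entry, advancing to the next entry when
 * it is free.
 */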
static struct deferred_entry *ds_inc(struct deferred_set *ds)
{
	unsigned long flags;
	struct deferred_entry *entry;

	spin_lock_irqsave(&ds->lock, flags);
	entry = ds->entries + ds->current_entry;
	entry->count++;
	spin_unlock_irqrestore(&ds->lock, flags);

	return entry;
}

static unsigned ds_next(unsigned index)
{
	return (index + 1) % DEFERRED_SET_SIZE;
}

static void __sweep(struct deferred_set *ds, struct list_head *head)
{
	while ((ds->sweeper != ds->current_entry) &&
	       !ds->entries[ds->sweeper].count) {
		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
		ds->sweeper = ds_next(ds->sweeper);
	}

	if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
}

static void ds_dec(struct deferred_entry *entry, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&entry->ds->lock, flags);
	BUG_ON(!entry->count);
	--entry->count;
	__sweep(entry->ds, head);
	spin_unlock_irqrestore(&entry->ds->lock, flags);
}

/*
 * Returns 1 if deferred or 0 if no pending items to delay job.
 */
static int ds_add_work(struct deferred_set *ds, struct list_head *work)
{
	int r = 1;
	unsigned long flags;
	unsigned next_entry;

	spin_lock_irqsave(&ds->lock, flags);
	if ((ds->sweeper == ds->current_entry) &&
	    !ds->entries[ds->current_entry].count)
		r = 0;
	else {
		list_add(work, &ds->entries[ds->current_entry].work_items);
		next_entry = ds_next(ds->current_entry);
		if (!ds->entries[next_entry].count)
			ds->current_entry = next_entry;
	}
	spin_unlock_irqrestore(&ds->lock, flags);

	return r;
}
/*----------------------------------------------------------------*/

static void build_data_key(struct dm_thin_device *td,
			   dm_block_t b, struct cell_key *key)
{
	key->virtual = 0;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
			      struct cell_key *key)
{
	key->virtual = 1;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

/*----------------------------------------------------------------*/
/*
 * A pool device ties together a metadata device and a data device.  It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct dm_thin_new_mapping;

struct pool_features {
	unsigned zero_new_blocks:1;
	unsigned discard_enabled:1;
	unsigned discard_passdown:1;
};

struct pool {
	struct list_head list;
	struct dm_target *ti;	/* Only set if a pool target is bound */

	struct mapped_device *pool_md;
	struct block_device *md_dev;
	struct dm_pool_metadata *pmd;

	dm_block_t low_water_blocks;
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	struct pool_features pf;
	unsigned low_water_triggered:1;	/* A dm event has been sent */
	unsigned no_free_space:1;	/* A -ENOSPC warning has been issued */

	struct bio_prison *prison;
	struct dm_kcopyd_client *copier;

	struct workqueue_struct *wq;
	struct work_struct worker;
	struct delayed_work waker;

	unsigned long last_commit_jiffies;
	unsigned ref_count;

	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_flush_bios;
	struct list_head prepared_mappings;
	struct list_head prepared_discards;

	struct bio_list retry_on_resume_list;

	struct deferred_set shared_read_ds;
	struct deferred_set all_io_ds;

	struct dm_thin_new_mapping *next_mapping;
	mempool_t *mapping_pool;
	mempool_t *endio_hook_pool;
};

/*
 * Target context for a pool.
 */
struct pool_c {
	struct dm_target *ti;
	struct pool *pool;
	struct dm_dev *data_dev;
	struct dm_dev *metadata_dev;
	struct dm_target_callbacks callbacks;

	dm_block_t low_water_blocks;
	struct pool_features pf;
};

/*
 * Target context for a thin.
 */
struct thin_c {
	struct dm_dev *pool_dev;
	struct dm_dev *origin_dev;
	dm_thin_id dev_id;

	struct pool *pool;
	struct dm_thin_device *td;
};

/*----------------------------------------------------------------*/

/*
 * A global list of pools that uses a struct mapped_device as a key.
 */
static struct dm_thin_pool_table {
	struct mutex mutex;
	struct list_head pools;
} dm_thin_pool_table;
static void pool_table_init(void)
{
	mutex_init(&dm_thin_pool_table.mutex);
	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
}

static void __pool_table_insert(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_add(&pool->list, &dm_thin_pool_table.pools);
}

static void __pool_table_remove(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_del(&pool->list);
}

static struct pool *__pool_table_lookup(struct mapped_device *md)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->pool_md == md) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->md_dev == md_dev) {
			pool = tmp;
			break;
		}
	}

	return pool;
}
/*----------------------------------------------------------------*/

struct dm_thin_endio_hook {
	struct thin_c *tc;
	struct deferred_entry *shared_read_entry;
	struct deferred_entry *all_io_entry;
	struct dm_thin_new_mapping *overwrite_mapping;
};

static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge(&bios, master);
	bio_list_init(master);

	while ((bio = bio_list_pop(&bios))) {
		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

		if (h->tc == tc)
			bio_endio(bio, DM_ENDIO_REQUEUE);
		else
			bio_list_add(master, bio);
	}
}

static void requeue_io(struct thin_c *tc)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	__requeue_bio_list(tc, &pool->deferred_bios);
	__requeue_bio_list(tc, &pool->retry_on_resume_list);
	spin_unlock_irqrestore(&pool->lock, flags);
}
/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
	sector_t block_nr = bio->bi_sector;

	if (tc->pool->sectors_per_block_shift < 0)
		(void) sector_div(block_nr, tc->pool->sectors_per_block);
	else
		block_nr >>= tc->pool->sectors_per_block_shift;

	return block_nr;
}

static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
	struct pool *pool = tc->pool;
	sector_t bi_sector = bio->bi_sector;

	bio->bi_bdev = tc->pool_dev->bdev;
	if (tc->pool->sectors_per_block_shift < 0)
		bio->bi_sector = (block * pool->sectors_per_block) +
				 sector_div(bi_sector, pool->sectors_per_block);
	else
		bio->bi_sector = (block << pool->sectors_per_block_shift) |
				 (bi_sector & (pool->sectors_per_block - 1));
}

static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
	bio->bi_bdev = tc->origin_dev->bdev;
}
static void issue(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	/*
	 * Batch together any FUA/FLUSH bios we find and then issue
	 * a single commit for them in process_deferred_bios().
	 */
	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
		spin_lock_irqsave(&pool->lock, flags);
		bio_list_add(&pool->deferred_flush_bios, bio);
		spin_unlock_irqrestore(&pool->lock, flags);
	} else
		generic_make_request(bio);
}

static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
			    dm_block_t block)
{
	remap(tc, bio, block);
	issue(tc, bio);
}

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
	queue_work(pool->wq, &pool->worker);
}
/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
struct dm_thin_new_mapping {
	struct list_head list;

	unsigned quiesced:1;
	unsigned prepared:1;
	unsigned pass_discard:1;

	struct thin_c *tc;
	dm_block_t virt_block;
	dm_block_t data_block;
	struct dm_bio_prison_cell *cell, *cell2;
	int err;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying.  Instead this bio is hooked.  The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
	struct bio *bio;
	bio_end_io_t *saved_bi_end_io;
};
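/*
 * A new mapping is handed to the worker only once it is both quiesced
 * (no reads are still outstanding against the old block) and prepared
 * (the copy, zero or overwriting write has completed).
 */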
static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
{
	struct pool *pool = m->tc->pool;

	if (m->quiesced && m->prepared) {
		list_add(&m->list, &pool->prepared_mappings);
		wake_worker(pool);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct dm_thin_new_mapping *m = context;
	struct pool *pool = m->tc->pool;

	m->err = read_err || write_err ? -EIO : 0;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = 1;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void overwrite_endio(struct bio *bio, int err)
{
	unsigned long flags;
	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
	struct dm_thin_new_mapping *m = h->overwrite_mapping;
	struct pool *pool = m->tc->pool;

	m->err = err;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = 1;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}
/*----------------------------------------------------------------*/

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell back to the deferred_bios list.
 */
static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell,
		       dm_block_t data_block)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	cell_release(cell, &pool->deferred_bios);
	spin_unlock_irqrestore(&tc->pool->lock, flags);

	wake_worker(pool);
}

/*
 * Same as cell_defer above, except it omits one particular detainee,
 * a write bio that covers the block and has already been processed.
 */
static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	struct bio_list bios;
	struct pool *pool = tc->pool;
	unsigned long flags;

	bio_list_init(&bios);

	spin_lock_irqsave(&pool->lock, flags);
	cell_release_no_holder(cell, &pool->deferred_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}
static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;
	struct bio *bio;
	int r;

	bio = m->bio;
	if (bio)
		bio->bi_end_io = m->saved_bi_end_io;

	if (m->err) {
		cell_error(m->cell);
		goto out;
	}

	/*
	 * Commit the prepared block into the mapping btree.
	 * Any I/O for this block arriving after this point will get
	 * remapped to it directly.
	 */
	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
	if (r) {
		DMERR("dm_thin_insert_block() failed");
		cell_error(m->cell);
		goto out;
	}

	/*
	 * Release any bios held while the block was being provisioned.
	 * If we are processing a write bio that completely covers the block,
	 * we already processed it so can ignore it now when processing
	 * the bios in the cell.
	 */
	if (bio) {
		cell_defer_except(tc, m->cell);
		bio_endio(bio, 0);
	} else
		cell_defer(tc, m->cell, m->data_block);

out:
	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard(struct dm_thin_new_mapping *m)
{
	int r;
	struct thin_c *tc = m->tc;

	r = dm_thin_remove_block(tc->td, m->virt_block);
	if (r)
		DMERR("dm_thin_remove_block() failed");

	/*
	 * Pass the discard down to the underlying device?
	 */
	if (m->pass_discard)
		remap_and_issue(tc, m->bio, m->data_block);
	else
		bio_endio(m->bio, 0);

	cell_defer_except(tc, m->cell);
	cell_defer_except(tc, m->cell2);
	mempool_free(m, tc->pool->mapping_pool);
}
static void process_prepared(struct pool *pool, struct list_head *head,
			     void (*fn)(struct dm_thin_new_mapping *))
{
	unsigned long flags;
	struct list_head maps;
	struct dm_thin_new_mapping *m, *tmp;

	INIT_LIST_HEAD(&maps);
	spin_lock_irqsave(&pool->lock, flags);
	list_splice_init(head, &maps);
	spin_unlock_irqrestore(&pool->lock, flags);

	list_for_each_entry_safe(m, tmp, &maps, list)
		fn(m);
}

static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
	return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
}

static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		io_overlaps_block(pool, bio);
}

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
			       bio_end_io_t *fn)
{
	*save = bio->bi_end_io;
	bio->bi_end_io = fn;
}

static int ensure_next_mapping(struct pool *pool)
{
	if (pool->next_mapping)
		return 0;

	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);

	return pool->next_mapping ? 0 : -ENOMEM;
}

static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
	struct dm_thin_new_mapping *r = pool->next_mapping;

	BUG_ON(!pool->next_mapping);

	pool->next_mapping = NULL;

	return r;
}
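/*
 * schedule_copy() below fills in a dm_thin_new_mapping and then either
 * hooks a bio that overwrites the whole block (overwrite_endio marks the
 * mapping prepared when it completes) or asks kcopyd to clone the old
 * data into the new block (copy_complete marks it prepared).
 */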
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
			  struct dm_dev *origin, dm_block_t data_origin,
			  dm_block_t data_dest,
			  struct dm_bio_prison_cell *cell, struct bio *bio)
{
	int r;
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	INIT_LIST_HEAD(&m->list);
	m->quiesced = 0;
	m->prepared = 0;
	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_dest;
	m->cell = cell;
	m->err = 0;
	m->bio = NULL;

	if (!ds_add_work(&pool->shared_read_ds, &m->list))
		m->quiesced = 1;

	/*
	 * IO to pool_dev remaps to the pool target's data_dev.
	 *
	 * If the whole block of data is being overwritten, we can issue the
	 * bio immediately. Otherwise we use kcopyd to clone the data first.
	 */
	if (io_overwrites_block(pool, bio)) {
		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

		h->overwrite_mapping = m;
		m->bio = bio;
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		remap_and_issue(tc, bio, data_dest);
	} else {
		struct dm_io_region from, to;

		from.bdev = origin->bdev;
		from.sector = data_origin * pool->sectors_per_block;
		from.count = pool->sectors_per_block;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_dest * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				   0, copy_complete, m);
		if (r < 0) {
			mempool_free(m, pool->mapping_pool);
			DMERR("dm_kcopyd_copy() failed");
			cell_error(cell);
		}
	}
}

static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_origin, dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	schedule_copy(tc, virt_block, tc->pool_dev,
		      data_origin, data_dest, cell, bio);
}

static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	schedule_copy(tc, virt_block, tc->origin_dev,
		      virt_block, data_dest, cell, bio);
}
static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
			  struct bio *bio)
{
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	INIT_LIST_HEAD(&m->list);
	m->quiesced = 1;
	m->prepared = 0;
	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_block;
	m->cell = cell;
	m->err = 0;
	m->bio = NULL;

	/*
	 * If the whole block of data is being overwritten or we are not
	 * zeroing pre-existing data, we can issue the bio immediately.
	 * Otherwise we use kcopyd to zero the data first.
	 */
	if (!pool->pf.zero_new_blocks)
		process_prepared_mapping(m);

	else if (io_overwrites_block(pool, bio)) {
		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

		h->overwrite_mapping = m;
		m->bio = bio;
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		remap_and_issue(tc, bio, data_block);
	} else {
		int r;
		struct dm_io_region to;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_block * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
		if (r < 0) {
			mempool_free(m, pool->mapping_pool);
			DMERR("dm_kcopyd_zero() failed");
			cell_error(cell);
		}
	}
}
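/*
 * alloc_data_block() also handles the low-water-mark event, an attempted
 * metadata commit when the pool looks full (a commit can free blocks), and
 * the sticky no_free_space flag that short-circuits further attempts until
 * the pool is resumed.
 */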
static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
	int r;
	dm_block_t free_blocks;
	unsigned long flags;
	struct pool *pool = tc->pool;

	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
	if (r)
		return r;

	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
		DMWARN("%s: reached low water mark, sending event.",
		       dm_device_name(pool->pool_md));
		spin_lock_irqsave(&pool->lock, flags);
		pool->low_water_triggered = 1;
		spin_unlock_irqrestore(&pool->lock, flags);
		dm_table_event(pool->ti->table);
	}

	if (!free_blocks) {
		if (pool->no_free_space)
			return -ENOSPC;
		else {
			/*
			 * Try to commit to see if that will free up some
			 * metadata space.
			 */
			r = dm_pool_commit_metadata(pool->pmd);
			if (r) {
				DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				      __func__, r);
				return r;
			}

			r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
			if (r)
				return r;

			/*
			 * If we still have no space we set a flag to avoid
			 * doing all this checking and return -ENOSPC.
			 */
			if (!free_blocks) {
				DMWARN("%s: no free space available.",
				       dm_device_name(pool->pool_md));
				spin_lock_irqsave(&pool->lock, flags);
				pool->no_free_space = 1;
				spin_unlock_irqrestore(&pool->lock, flags);
				return -ENOSPC;
			}
		}
	}

	r = dm_pool_alloc_data_block(pool->pmd, result);
	if (r)
		return r;

	return 0;
}
/*
 * If we have run out of space, queue bios until the device is
 * resumed, presumably after having been reloaded with more space.
 */
static void retry_on_resume(struct bio *bio)
{
	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
	struct thin_c *tc = h->tc;
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->retry_on_resume_list, bio);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void no_space(struct dm_bio_prison_cell *cell)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	cell_release(cell, &bios);

	while ((bio = bio_list_pop(&bios)))
		retry_on_resume(bio);
}
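/*
 * Discards that cover a whole block are quiesced via the all_io deferred
 * set and finished later in process_prepared_discard(); partial-block
 * discards are either passed straight down to the data device or simply
 * completed, depending on sharing and the discard_passdown feature.
 */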
static void process_discard(struct thin_c *tc, struct bio *bio)
{
	int r;
	unsigned long flags;
	struct pool *pool = tc->pool;
	struct dm_bio_prison_cell *cell, *cell2;
	struct cell_key key, key2;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_lookup_result lookup_result;
	struct dm_thin_new_mapping *m;

	build_virtual_key(tc->td, block, &key);
	if (bio_detain(tc->pool->prison, &key, bio, &cell))
		return;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		/*
		 * Check nobody is fiddling with this pool block.  This can
		 * happen if someone's in the process of breaking sharing
		 * on this block.
		 */
		build_data_key(tc->td, lookup_result.block, &key2);
		if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
			cell_release_singleton(cell, bio);
			break;
		}

		if (io_overlaps_block(pool, bio)) {
			/*
			 * IO may still be going to the destination block.  We must
			 * quiesce before we can do the removal.
			 */
			m = get_next_mapping(pool);
			m->tc = tc;
			m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
			m->virt_block = block;
			m->data_block = lookup_result.block;
			m->cell = cell;
			m->cell2 = cell2;
			m->err = 0;
			m->bio = bio;

			if (!ds_add_work(&pool->all_io_ds, &m->list)) {
				spin_lock_irqsave(&pool->lock, flags);
				list_add(&m->list, &pool->prepared_discards);
				spin_unlock_irqrestore(&pool->lock, flags);
				wake_worker(pool);
			}
		} else {
			/*
			 * The DM core makes sure that the discard doesn't span
			 * a block boundary.  So we submit the discard of a
			 * partial block appropriately.
			 */
			cell_release_singleton(cell, bio);
			cell_release_singleton(cell2, bio);
			if ((!lookup_result.shared) && pool->pf.discard_passdown)
				remap_and_issue(tc, bio, lookup_result.block);
			else
				bio_endio(bio, 0);
		}
		break;

	case -ENODATA:
		/*
		 * It isn't provisioned, just forget it.
		 */
		cell_release_singleton(cell, bio);
		bio_endio(bio, 0);
		break;

	default:
		DMERR("discard: find block unexpectedly returned %d", r);
		cell_release_singleton(cell, bio);
		bio_io_error(bio);
		break;
	}
}
static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
			  struct cell_key *key,
			  struct dm_thin_lookup_result *lookup_result,
			  struct dm_bio_prison_cell *cell)
{
	int r;
	dm_block_t data_block;

	r = alloc_data_block(tc, &data_block);
	switch (r) {
	case 0:
		schedule_internal_copy(tc, block, lookup_result->block,
				       data_block, cell, bio);
		break;

	case -ENOSPC:
		no_space(cell);
		break;

	default:
		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
		cell_error(cell);
		break;
	}
}
static void process_shared_bio(struct thin_c *tc, struct bio *bio,
			       dm_block_t block,
			       struct dm_thin_lookup_result *lookup_result)
{
	struct dm_bio_prison_cell *cell;
	struct pool *pool = tc->pool;
	struct cell_key key;

	/*
	 * If cell is already occupied, then sharing is already in the process
	 * of being broken so we have nothing further to do here.
	 */
	build_data_key(tc->td, lookup_result->block, &key);
	if (bio_detain(pool->prison, &key, bio, &cell))
		return;

	if (bio_data_dir(bio) == WRITE)
		break_sharing(tc, bio, block, &key, lookup_result, cell);
	else {
		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

		h->shared_read_entry = ds_inc(&pool->shared_read_ds);

		cell_release_singleton(cell, bio);
		remap_and_issue(tc, bio, lookup_result->block);
	}
}
static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
			    struct dm_bio_prison_cell *cell)
{
	int r;
	dm_block_t data_block;

	/*
	 * Remap empty bios (flushes) immediately, without provisioning.
	 */
	if (!bio->bi_size) {
		cell_release_singleton(cell, bio);
		remap_and_issue(tc, bio, 0);
		return;
	}

	/*
	 * Fill read bios with zeroes and complete them immediately.
	 */
	if (bio_data_dir(bio) == READ) {
		zero_fill_bio(bio);
		cell_release_singleton(cell, bio);
		bio_endio(bio, 0);
		return;
	}

	r = alloc_data_block(tc, &data_block);
	switch (r) {
	case 0:
		if (tc->origin_dev)
			schedule_external_copy(tc, block, data_block, cell, bio);
		else
			schedule_zero(tc, block, data_block, cell, bio);
		break;

	case -ENOSPC:
		no_space(cell);
		break;

	default:
		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
		cell_error(cell);
		break;
	}
}
static void process_bio(struct thin_c *tc, struct bio *bio)
{
	int r;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_bio_prison_cell *cell;
	struct cell_key key;
	struct dm_thin_lookup_result lookup_result;

	/*
	 * If cell is already occupied, then the block is already
	 * being provisioned so we have nothing further to do here.
	 */
	build_virtual_key(tc->td, block, &key);
	if (bio_detain(tc->pool->prison, &key, bio, &cell))
		return;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		/*
		 * We can release this cell now.  This thread is the only
		 * one that puts bios into a cell, and we know there were
		 * no preceding bios.
		 */
		/*
		 * TODO: this will probably have to change when discard goes
		 * back in.
		 */
		cell_release_singleton(cell, bio);

		if (lookup_result.shared)
			process_shared_bio(tc, bio, block, &lookup_result);
		else
			remap_and_issue(tc, bio, lookup_result.block);
		break;

	case -ENODATA:
		if (bio_data_dir(bio) == READ && tc->origin_dev) {
			cell_release_singleton(cell, bio);
			remap_to_origin_and_issue(tc, bio);
		} else
			provision_block(tc, bio, block, cell);
		break;

	default:
		DMERR("dm_thin_find_block() failed, error = %d", r);
		cell_release_singleton(cell, bio);
		bio_io_error(bio);
		break;
	}
}
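/*
 * Commit if a full COMMIT_PERIOD has elapsed since the last commit, or if
 * jiffies has wrapped around past last_commit_jiffies.
 */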
static int need_commit_due_to_time(struct pool *pool)
{
	return jiffies < pool->last_commit_jiffies ||
	       jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
}

static void process_deferred_bios(struct pool *pool)
{
	unsigned long flags;
	struct bio *bio;
	struct bio_list bios;
	int r;

	bio_list_init(&bios);

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_merge(&bios, &pool->deferred_bios);
	bio_list_init(&pool->deferred_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	while ((bio = bio_list_pop(&bios))) {
		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
		struct thin_c *tc = h->tc;

		/*
		 * If we've got no free new_mapping structs, and processing
		 * this bio might require one, we pause until there are some
		 * prepared mappings to process.
		 */
		if (ensure_next_mapping(pool)) {
			spin_lock_irqsave(&pool->lock, flags);
			bio_list_merge(&pool->deferred_bios, &bios);
			spin_unlock_irqrestore(&pool->lock, flags);

			break;
		}

		if (bio->bi_rw & REQ_DISCARD)
			process_discard(tc, bio);
		else
			process_bio(tc, bio);
	}

	/*
	 * If there are any deferred flush bios, we must commit
	 * the metadata before issuing them.
	 */
	bio_list_init(&bios);
	spin_lock_irqsave(&pool->lock, flags);
	bio_list_merge(&bios, &pool->deferred_flush_bios);
	bio_list_init(&pool->deferred_flush_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
		return;

	r = dm_pool_commit_metadata(pool->pmd);
	if (r) {
		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
		      __func__, r);
		while ((bio = bio_list_pop(&bios)))
			bio_io_error(bio);
		return;
	}
	pool->last_commit_jiffies = jiffies;

	while ((bio = bio_list_pop(&bios)))
		generic_make_request(bio);
}
static void do_worker(struct work_struct *ws)
{
	struct pool *pool = container_of(ws, struct pool, worker);

	process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
	process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
	process_deferred_bios(pool);
}

/*
 * We want to commit periodically so that not too much
 * unwritten data builds up.
 */
static void do_waker(struct work_struct *ws)
{
	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
	wake_worker(pool);
	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
}
/*----------------------------------------------------------------*/

/*
 * Mapping functions.
 */

/*
 * Called only while mapping a thin bio to hand it over to the workqueue.
 */
static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
{
	unsigned long flags;
	struct pool *pool = tc->pool;

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->deferred_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}

static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);

	h->tc = tc;
	h->shared_read_entry = NULL;
	h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
	h->overwrite_mapping = NULL;

	return h;
}
/*
 * Non-blocking function called from the thin target's map function.
 */
static int thin_bio_map(struct dm_target *ti, struct bio *bio,
			union map_info *map_context)
{
	int r;
	struct thin_c *tc = ti->private;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_device *td = tc->td;
	struct dm_thin_lookup_result result;

	map_context->ptr = thin_hook_bio(tc, bio);
	if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
		thin_defer_bio(tc, bio);
		return DM_MAPIO_SUBMITTED;
	}

	r = dm_thin_find_block(td, block, 0, &result);

	/*
	 * Note that we defer readahead too.
	 */
	switch (r) {
	case 0:
		if (unlikely(result.shared)) {
			/*
			 * We have a race condition here between the
			 * result.shared value returned by the lookup and
			 * snapshot creation, which may cause new
			 * sharing.
			 *
			 * To avoid this always quiesce the origin before
			 * taking the snap.  You want to do this anyway to
			 * ensure a consistent application view
			 * (i.e. lockfs).
			 *
			 * More distant ancestors are irrelevant. The
			 * shared flag will be set in their case.
			 */
			thin_defer_bio(tc, bio);
			r = DM_MAPIO_SUBMITTED;
		} else {
			remap(tc, bio, result.block);
			r = DM_MAPIO_REMAPPED;
		}
		break;

	case -ENODATA:
	case -EWOULDBLOCK:
		/*
		 * In future, the failed dm_thin_find_block above could
		 * provide the hint to load the metadata into cache.
		 */
		thin_defer_bio(tc, bio);
		r = DM_MAPIO_SUBMITTED;
		break;
	}

	return r;
}
static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	int r;
	unsigned long flags;
	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);

	spin_lock_irqsave(&pt->pool->lock, flags);
	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
	spin_unlock_irqrestore(&pt->pool->lock, flags);

	if (!r) {
		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
		r = bdi_congested(&q->backing_dev_info, bdi_bits);
	}

	return r;
}

static void __requeue_bios(struct pool *pool)
{
	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
	bio_list_init(&pool->retry_on_resume_list);
}
/*----------------------------------------------------------------
 * Binding of control targets to a pool object
 *--------------------------------------------------------------*/
static int bind_control_target(struct pool *pool, struct dm_target *ti)
{
	struct pool_c *pt = ti->private;

	pool->ti = ti;
	pool->low_water_blocks = pt->low_water_blocks;
	pool->pf = pt->pf;

	/*
	 * If discard_passdown was enabled verify that the data device
	 * supports discards.  Disable discard_passdown if not; otherwise
	 * -EOPNOTSUPP will be returned.
	 */
	if (pt->pf.discard_passdown) {
		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
		if (!q || !blk_queue_discard(q)) {
			char buf[BDEVNAME_SIZE];
			DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.",
			       bdevname(pt->data_dev->bdev, buf));
			pool->pf.discard_passdown = 0;
		}
	}

	return 0;
}

static void unbind_control_target(struct pool *pool, struct dm_target *ti)
{
	if (pool->ti == ti)
		pool->ti = NULL;
}
/*----------------------------------------------------------------
 * Pool creation
 *--------------------------------------------------------------*/
/* Initialize pool features. */
static void pool_features_init(struct pool_features *pf)
{
	pf->zero_new_blocks = 1;
	pf->discard_enabled = 1;
	pf->discard_passdown = 1;
}

static void __pool_destroy(struct pool *pool)
{
	__pool_table_remove(pool);

	if (dm_pool_metadata_close(pool->pmd) < 0)
		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);

	prison_destroy(pool->prison);
	dm_kcopyd_client_destroy(pool->copier);

	if (pool->wq)
		destroy_workqueue(pool->wq);

	if (pool->next_mapping)
		mempool_free(pool->next_mapping, pool->mapping_pool);
	mempool_destroy(pool->mapping_pool);
	mempool_destroy(pool->endio_hook_pool);
	kfree(pool);
}
static struct kmem_cache *_new_mapping_cache;
static struct kmem_cache *_endio_hook_cache;

static struct pool *pool_create(struct mapped_device *pool_md,
				struct block_device *metadata_dev,
				unsigned long block_size, char **error)
{
	int r;
	void *err_p;
	struct pool *pool;
	struct dm_pool_metadata *pmd;

	pmd = dm_pool_metadata_open(metadata_dev, block_size);
	if (IS_ERR(pmd)) {
		*error = "Error creating metadata object";
		return (struct pool *)pmd;
	}

	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool) {
		*error = "Error allocating memory for pool";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_pool;
	}

	pool->pmd = pmd;
	pool->sectors_per_block = block_size;
	if (block_size & (block_size - 1))
		pool->sectors_per_block_shift = -1;
	else
		pool->sectors_per_block_shift = __ffs(block_size);
	pool->low_water_blocks = 0;
	pool_features_init(&pool->pf);
	pool->prison = prison_create(PRISON_CELLS);
	if (!pool->prison) {
		*error = "Error creating pool's bio prison";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_prison;
	}

	pool->copier = dm_kcopyd_client_create();
	if (IS_ERR(pool->copier)) {
		r = PTR_ERR(pool->copier);
		*error = "Error creating pool's kcopyd client";
		err_p = ERR_PTR(r);
		goto bad_kcopyd_client;
	}

	/*
	 * Create singlethreaded workqueue that will service all devices
	 * that use this metadata.
	 */
	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
	if (!pool->wq) {
		*error = "Error creating pool's workqueue";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_wq;
	}

	INIT_WORK(&pool->worker, do_worker);
	INIT_DELAYED_WORK(&pool->waker, do_waker);
	spin_lock_init(&pool->lock);
	bio_list_init(&pool->deferred_bios);
	bio_list_init(&pool->deferred_flush_bios);
	INIT_LIST_HEAD(&pool->prepared_mappings);
	INIT_LIST_HEAD(&pool->prepared_discards);
	pool->low_water_triggered = 0;
	pool->no_free_space = 0;
	bio_list_init(&pool->retry_on_resume_list);
	ds_init(&pool->shared_read_ds);
	ds_init(&pool->all_io_ds);

	pool->next_mapping = NULL;
	pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
						      _new_mapping_cache);
	if (!pool->mapping_pool) {
		*error = "Error creating pool's mapping mempool";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_mapping_pool;
	}

	pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE,
							 _endio_hook_cache);
	if (!pool->endio_hook_pool) {
		*error = "Error creating pool's endio_hook mempool";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_endio_hook_pool;
	}
	pool->ref_count = 1;
	pool->last_commit_jiffies = jiffies;
	pool->pool_md = pool_md;
	pool->md_dev = metadata_dev;
	__pool_table_insert(pool);

	return pool;

bad_endio_hook_pool:
	mempool_destroy(pool->mapping_pool);
bad_mapping_pool:
	destroy_workqueue(pool->wq);
bad_wq:
	dm_kcopyd_client_destroy(pool->copier);
bad_kcopyd_client:
	prison_destroy(pool->prison);
bad_prison:
	kfree(pool);
bad_pool:
	if (dm_pool_metadata_close(pmd))
		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);

	return err_p;
}
static void __pool_inc(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	pool->ref_count++;
}

static void __pool_dec(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	BUG_ON(!pool->ref_count);
	if (!--pool->ref_count)
		__pool_destroy(pool);
}

static struct pool *__pool_find(struct mapped_device *pool_md,
				struct block_device *metadata_dev,
				unsigned long block_size, char **error,
				int *created)
{
	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);

	if (pool) {
		if (pool->pool_md != pool_md) {
			*error = "metadata device already in use by a pool";
			return ERR_PTR(-EBUSY);
		}
		__pool_inc(pool);

	} else {
		pool = __pool_table_lookup(pool_md);
		if (pool) {
			if (pool->md_dev != metadata_dev) {
				*error = "different pool cannot replace a pool";
				return ERR_PTR(-EINVAL);
			}
			__pool_inc(pool);

		} else {
			pool = pool_create(pool_md, metadata_dev, block_size, error);
			*created = 1;
		}
	}

	return pool;
}
/*----------------------------------------------------------------
 * Pool target methods
 *--------------------------------------------------------------*/
static void pool_dtr(struct dm_target *ti)
{
	struct pool_c *pt = ti->private;

	mutex_lock(&dm_thin_pool_table.mutex);

	unbind_control_target(pt->pool, ti);
	__pool_dec(pt->pool);
	dm_put_device(ti, pt->metadata_dev);
	dm_put_device(ti, pt->data_dev);
	kfree(pt);

	mutex_unlock(&dm_thin_pool_table.mutex);
}
static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
			       struct dm_target *ti)
{
	int r;
	unsigned argc;
	const char *arg_name;

	static struct dm_arg _args[] = {
		{0, 3, "Invalid number of pool feature arguments"},
	};

	/*
	 * No feature arguments supplied.
	 */
	if (!as->argc)
		return 0;

	r = dm_read_arg_group(_args, as, &argc, &ti->error);
	if (r)
		return -EINVAL;

	while (argc && !r) {
		arg_name = dm_shift_arg(as);
		argc--;

		if (!strcasecmp(arg_name, "skip_block_zeroing")) {
			pf->zero_new_blocks = 0;
			continue;
		} else if (!strcasecmp(arg_name, "ignore_discard")) {
			pf->discard_enabled = 0;
			continue;
		} else if (!strcasecmp(arg_name, "no_discard_passdown")) {
			pf->discard_passdown = 0;
			continue;
		}

		ti->error = "Unrecognised pool feature requested";
		r = -EINVAL;
	}

	return r;
}
/*
 * thin-pool <metadata dev> <data dev>
 *	     <data block size (sectors)>
 *	     <low water mark (blocks)>
 *	     [<#feature args> [<arg>]*]
 *
 * Optional feature arguments are:
 *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
 *	     ignore_discard: disable discard
 *	     no_discard_passdown: don't pass discards down to the data device
 */
static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r, pool_created = 0;
	struct pool_c *pt;
	struct pool *pool;
	struct pool_features pf;
	struct dm_arg_set as;
	struct dm_dev *data_dev;
	unsigned long block_size;
	dm_block_t low_water_blocks;
	struct dm_dev *metadata_dev;
	sector_t metadata_dev_size;
	char b[BDEVNAME_SIZE];

	/*
	 * FIXME Remove validation from scope of lock.
	 */
	mutex_lock(&dm_thin_pool_table.mutex);

	if (argc < 4) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto out_unlock;
	}
	as.argc = argc;
	as.argv = argv;

	r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
	if (r) {
		ti->error = "Error opening metadata block device";
		goto out_unlock;
	}

	metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
		       bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);

	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
	if (r) {
		ti->error = "Error getting data device";
		goto out_metadata;
	}

	if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
		ti->error = "Invalid block size";
		r = -EINVAL;
		goto out;
	}

	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
		ti->error = "Invalid low water mark";
		r = -EINVAL;
		goto out;
	}

	/*
	 * Set default pool features.
	 */
	pool_features_init(&pf);

	dm_consume_args(&as, 4);
	r = parse_pool_features(&as, &pf, ti);
	if (r)
		goto out;

	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
	if (!pt) {
		r = -ENOMEM;
		goto out;
	}

	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
			   block_size, &ti->error, &pool_created);
	if (IS_ERR(pool)) {
		r = PTR_ERR(pool);
		goto out_free_pt;
	}

	/*
	 * 'pool_created' reflects whether this is the first table load.
	 * Top level discard support is not allowed to be changed after
	 * initial load.  This would require a pool reload to trigger thin
	 * device changes.
	 */
	if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
		ti->error = "Discard support cannot be disabled once enabled";
		r = -EINVAL;
		goto out_flags_changed;
	}

	/*
	 * The block layer requires discard_granularity to be a power of 2.
	 */
	if (pf.discard_enabled && !is_power_of_2(block_size)) {
		ti->error = "Discard support must be disabled when the block size is not a power of 2";
		r = -EINVAL;
		goto out_flags_changed;
	}

	pt->pool = pool;
	pt->ti = ti;
	pt->metadata_dev = metadata_dev;
	pt->data_dev = data_dev;
	pt->low_water_blocks = low_water_blocks;
	pt->pf = pf;
	ti->num_flush_requests = 1;
	/*
	 * Only need to enable discards if the pool should pass
	 * them down to the data device.  The thin device's discard
	 * processing will cause mappings to be removed from the btree.
	 */
	if (pf.discard_enabled && pf.discard_passdown) {
		ti->num_discard_requests = 1;
		/*
		 * Setting 'discards_supported' circumvents the normal
		 * stacking of discard limits (this keeps the pool and
		 * thin devices' discard limits consistent).
		 */
		ti->discards_supported = 1;
	}
	ti->private = pt;

	pt->callbacks.congested_fn = pool_is_congested;
	dm_table_add_target_callbacks(ti->table, &pt->callbacks);

	mutex_unlock(&dm_thin_pool_table.mutex);

	return 0;

out_flags_changed:
	__pool_dec(pool);
out_free_pt:
	kfree(pt);
out:
	dm_put_device(ti, data_dev);
out_metadata:
	dm_put_device(ti, metadata_dev);
out_unlock:
	mutex_unlock(&dm_thin_pool_table.mutex);

	return r;
}
static int pool_map(struct dm_target *ti, struct bio *bio,
		    union map_info *map_context)
{
	int r;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	unsigned long flags;

	/*
	 * As this is a singleton target, ti->begin is always zero.
	 */
	spin_lock_irqsave(&pool->lock, flags);
	bio->bi_bdev = pt->data_dev->bdev;
	r = DM_MAPIO_REMAPPED;
	spin_unlock_irqrestore(&pool->lock, flags);

	return r;
}
/*
 * Retrieves the number of blocks of the data device from
 * the superblock and compares it to the actual device size,
 * thus resizing the data device in case it has grown.
 *
 * This both copes with opening preallocated data devices in the ctr
 * being followed by a resume
 * -and-
 * calling the resume method individually after userspace has
 * grown the data device in reaction to a table event.
 */
static int pool_preresume(struct dm_target *ti)
{
	int r;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	sector_t data_size = ti->len;
	dm_block_t sb_data_size;

	/*
	 * Take control of the pool object.
	 */
	r = bind_control_target(pool, ti);
	if (r)
		return r;

	(void) sector_div(data_size, pool->sectors_per_block);

	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
	if (r) {
		DMERR("failed to retrieve data device size");
		return r;
	}

	if (data_size < sb_data_size) {
		DMERR("pool target too small, is %llu blocks (expected %llu)",
		      (unsigned long long)data_size, sb_data_size);
		return -EINVAL;

	} else if (data_size > sb_data_size) {
		r = dm_pool_resize_data_dev(pool->pmd, data_size);
		if (r) {
			DMERR("failed to resize data device");
			return r;
		}

		r = dm_pool_commit_metadata(pool->pmd);
		if (r) {
			DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
			      __func__, r);
			return r;
		}
	}

	return 0;
}
static void pool_resume(struct dm_target *ti)
{
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	pool->low_water_triggered = 0;
	pool->no_free_space = 0;
	__requeue_bios(pool);
	spin_unlock_irqrestore(&pool->lock, flags);

	do_waker(&pool->waker.work);
}

static void pool_postsuspend(struct dm_target *ti)
{
	int r;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	cancel_delayed_work(&pool->waker);
	flush_workqueue(pool->wq);

	r = dm_pool_commit_metadata(pool->pmd);
	if (r < 0) {
		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
		      __func__, r);
		/* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
	}
}
static int check_arg_count(unsigned argc, unsigned args_required)
{
	if (argc != args_required) {
		DMWARN("Message received with %u arguments instead of %u.",
		       argc, args_required);
		return -EINVAL;
	}

	return 0;
}

static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
{
	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
	    *dev_id <= MAX_DEV_ID)
		return 0;

	if (warning)
		DMWARN("Message received with invalid device id: %s", arg);

	return -EINVAL;
}

static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	int r;

	r = check_arg_count(argc, 2);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = dm_pool_create_thin(pool->pmd, dev_id);
	if (r) {
		DMWARN("Creation of new thinly-provisioned device with id %s failed.",
		       argv[1]);
		return r;
	}

	return 0;
}

static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	dm_thin_id origin_dev_id;
	int r;

	r = check_arg_count(argc, 3);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = read_dev_id(argv[2], &origin_dev_id, 1);
	if (r)
		return r;

	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
	if (r) {
		DMWARN("Creation of new snapshot %s of device %s failed.",
		       argv[1], argv[2]);
		return r;
	}

	return 0;
}

static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	int r;

	r = check_arg_count(argc, 2);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
	if (r)
		DMWARN("Deletion of thin device %s failed.", argv[1]);

	return r;
}
static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id old_id, new_id;
	int r;

	r = check_arg_count(argc, 3);
	if (r)
		return r;

	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
		DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
		return -EINVAL;
	}

	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
		DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
		return -EINVAL;
	}

	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
	if (r) {
		DMWARN("Failed to change transaction id from %s to %s.",
		       argv[1], argv[2]);
		return r;
	}

	return 0;
}

static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
	int r;

	r = check_arg_count(argc, 1);
	if (r)
		return r;

	r = dm_pool_commit_metadata(pool->pmd);
	if (r) {
		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
		      __func__, r);
		return r;
	}

	r = dm_pool_reserve_metadata_snap(pool->pmd);
	if (r)
		DMWARN("reserve_metadata_snap message failed.");

	return r;
}

static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
	int r;

	r = check_arg_count(argc, 1);
	if (r)
		return r;

	r = dm_pool_release_metadata_snap(pool->pmd);
	if (r)
		DMWARN("release_metadata_snap message failed.");

	return r;
}
/*
 * Messages supported:
 *   create_thin	<dev_id>
 *   create_snap	<dev_id> <origin_id>
 *   delete		<dev_id>
 *   trim		<dev_id> <new_size_in_sectors>
 *   set_transaction_id <current_trans_id> <new_trans_id>
 *   reserve_metadata_snap
 *   release_metadata_snap
 */
static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	if (!strcasecmp(argv[0], "create_thin"))
		r = process_create_thin_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "create_snap"))
		r = process_create_snap_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "delete"))
		r = process_delete_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "set_transaction_id"))
		r = process_set_transaction_id_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
		r = process_reserve_metadata_snap_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "release_metadata_snap"))
		r = process_release_metadata_snap_mesg(argc, argv, pool);

	else
		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);

	if (!r) {
		r = dm_pool_commit_metadata(pool->pmd);
		if (r)
			DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
			      argv[0], r);
	}

	return r;
}
2397 * <transaction id> <used metadata sectors>/<total metadata sectors>
2398 * <used data sectors>/<total data sectors> <held metadata root>

static int pool_status(struct dm_target *ti, status_type_t type,
		       char *result, unsigned maxlen)
{
	int r, count;
	unsigned sz = 0;
	uint64_t transaction_id;
	dm_block_t nr_free_blocks_data;
	dm_block_t nr_free_blocks_metadata;
	dm_block_t nr_blocks_data;
	dm_block_t nr_blocks_metadata;
	dm_block_t held_root;
	char buf[BDEVNAME_SIZE];
	char buf2[BDEVNAME_SIZE];
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	switch (type) {
	case STATUSTYPE_INFO:
		r = dm_pool_get_metadata_transaction_id(pool->pmd,
							&transaction_id);
		if (r)
			return r;

		r = dm_pool_get_free_metadata_block_count(pool->pmd,
							  &nr_free_blocks_metadata);
		if (r)
			return r;

		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
		if (r)
			return r;

		r = dm_pool_get_free_block_count(pool->pmd,
						 &nr_free_blocks_data);
		if (r)
			return r;

		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
		if (r)
			return r;

		r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
		if (r)
			return r;

		DMEMIT("%llu %llu/%llu %llu/%llu ",
		       (unsigned long long)transaction_id,
		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
		       (unsigned long long)nr_blocks_metadata,
		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
		       (unsigned long long)nr_blocks_data);

		if (held_root)
			DMEMIT("%llu", held_root);
		else
			DMEMIT("-");

		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%s %s %lu %llu ",
		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
		       (unsigned long)pool->sectors_per_block,
		       (unsigned long long)pt->low_water_blocks);

		count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
			!pt->pf.discard_passdown;
		DMEMIT("%u ", count);

		if (!pool->pf.zero_new_blocks)
			DMEMIT("skip_block_zeroing ");

		if (!pool->pf.discard_enabled)
			DMEMIT("ignore_discard ");

		if (!pt->pf.discard_passdown)
			DMEMIT("no_discard_passdown ");

		break;
	}

	return 0;
}

static int pool_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	struct pool_c *pt = ti->private;

	return fn(ti, pt->data_dev, 0, ti->len, data);
}

static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
		      struct bio_vec *biovec, int max_size)
{
	struct pool_c *pt = ti->private;
	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = pt->data_dev->bdev;

	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
{
	/*
	 * FIXME: these limits may be incompatible with the pool's data device
	 */
	limits->max_discard_sectors = pool->sectors_per_block;

	/*
	 * This is just a hint, and not enforced.  We have to cope with
	 * bios that cover a block partially.  A discard that spans a block
	 * boundary is not sent to this target.
	 */
	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
	limits->discard_zeroes_data = pool->pf.zero_new_blocks;
}
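
/*
 * As a worked example (the block size below is an assumption, not a
 * default): a pool created with 256-sector (128K) blocks would advertise
 * max_discard_sectors = 256 and
 * discard_granularity = 256 << SECTOR_SHIFT = 131072 bytes.
 */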

static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	blk_limits_io_min(limits, 0);
	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
	if (pool->pf.discard_enabled)
		set_discard_limits(pool, limits);
}

static struct target_type pool_target = {
	.name = "thin-pool",
	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
		    DM_TARGET_IMMUTABLE,
	.version = {1, 2, 0},
	.module = THIS_MODULE,
	.ctr = pool_ctr,
	.dtr = pool_dtr,
	.map = pool_map,
	.postsuspend = pool_postsuspend,
	.preresume = pool_preresume,
	.resume = pool_resume,
	.message = pool_message,
	.status = pool_status,
	.merge = pool_merge,
	.iterate_devices = pool_iterate_devices,
	.io_hints = pool_io_hints,
};

/*----------------------------------------------------------------
 * Thin target methods
 *--------------------------------------------------------------*/
static void thin_dtr(struct dm_target *ti)
{
	struct thin_c *tc = ti->private;

	mutex_lock(&dm_thin_pool_table.mutex);

	__pool_dec(tc->pool);
	dm_pool_close_thin_device(tc->td);
	dm_put_device(ti, tc->pool_dev);
	if (tc->origin_dev)
		dm_put_device(ti, tc->origin_dev);
	kfree(tc);

	mutex_unlock(&dm_thin_pool_table.mutex);
}

/*
 * Thin target parameters:
 *
 * <pool_dev> <dev_id> [origin_dev]
 *
 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
 * dev_id: the internal device identifier
 * origin_dev: a device external to the pool that should act as the origin
 *
 * If the pool device has discards disabled, they get disabled for the thin
 * device as well.
 */
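
/*
 * An illustrative table line (device name, size and id are assumptions):
 *
 *   0 2097152 thin /dev/mapper/my_pool 1
 *
 * i.e. a 1GiB (2097152 sector) thin device using internal device id 1
 * from the pool at /dev/mapper/my_pool.
 */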

static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r;
	struct thin_c *tc;
	struct dm_dev *pool_dev, *origin_dev;
	struct mapped_device *pool_md;

	mutex_lock(&dm_thin_pool_table.mutex);

	if (argc != 2 && argc != 3) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto out_unlock;
	}

	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
	if (!tc) {
		ti->error = "Out of memory";
		r = -ENOMEM;
		goto out_unlock;
	}

	if (argc == 3) {
		r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
		if (r) {
			ti->error = "Error opening origin device";
			goto bad_origin_dev;
		}
		tc->origin_dev = origin_dev;
	}

	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
	if (r) {
		ti->error = "Error opening pool device";
		goto bad_pool_dev;
	}
	tc->pool_dev = pool_dev;

	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
		ti->error = "Invalid device id";
		r = -EINVAL;
		goto bad_common;
	}

	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
	if (!pool_md) {
		ti->error = "Couldn't get pool mapped device";
		r = -EINVAL;
		goto bad_common;
	}

	tc->pool = __pool_table_lookup(pool_md);
	if (!tc->pool) {
		ti->error = "Couldn't find pool object";
		r = -EINVAL;
		goto bad_pool_lookup;
	}
	__pool_inc(tc->pool);

	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
	if (r) {
		ti->error = "Couldn't open thin internal device";
		goto bad_thin_open;
	}

	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
	if (r)
		goto bad_thin_open;

	ti->num_flush_requests = 1;

	/* In case the pool supports discards, pass them on. */
	if (tc->pool->pf.discard_enabled) {
		ti->discards_supported = 1;
		ti->num_discard_requests = 1;
		ti->discard_zeroes_data_unsupported = 1;
		/* Discard requests must be split on a block boundary */
		ti->split_discard_requests = 1;
	}

	dm_put(pool_md);

	mutex_unlock(&dm_thin_pool_table.mutex);

	return 0;

bad_thin_open:
	__pool_dec(tc->pool);
bad_pool_lookup:
	dm_put(pool_md);
bad_common:
	dm_put_device(ti, tc->pool_dev);
bad_pool_dev:
	if (tc->origin_dev)
		dm_put_device(ti, tc->origin_dev);
bad_origin_dev:
	kfree(tc);
out_unlock:
	mutex_unlock(&dm_thin_pool_table.mutex);

	return r;
}

static int thin_map(struct dm_target *ti, struct bio *bio,
		    union map_info *map_context)
{
	bio->bi_sector = dm_target_offset(ti, bio->bi_sector);

	return thin_bio_map(ti, bio, map_context);
}

static int thin_endio(struct dm_target *ti,
		      struct bio *bio, int err,
		      union map_info *map_context)
{
	unsigned long flags;
	struct dm_thin_endio_hook *h = map_context->ptr;
	struct list_head work;
	struct dm_thin_new_mapping *m, *tmp;
	struct pool *pool = h->tc->pool;

	if (h->shared_read_entry) {
		INIT_LIST_HEAD(&work);
		ds_dec(h->shared_read_entry, &work);

		spin_lock_irqsave(&pool->lock, flags);
		list_for_each_entry_safe(m, tmp, &work, list) {
			list_del(&m->list);
			m->quiesced = 1;
			__maybe_add_mapping(m);
		}
		spin_unlock_irqrestore(&pool->lock, flags);
	}

	if (h->all_io_entry) {
		INIT_LIST_HEAD(&work);
		ds_dec(h->all_io_entry, &work);
		spin_lock_irqsave(&pool->lock, flags);
		list_for_each_entry_safe(m, tmp, &work, list)
			list_add(&m->list, &pool->prepared_discards);
		spin_unlock_irqrestore(&pool->lock, flags);
	}

	mempool_free(h, pool->endio_hook_pool);

	return 0;
}

static void thin_postsuspend(struct dm_target *ti)
{
	if (dm_noflush_suspending(ti))
		requeue_io((struct thin_c *)ti->private);
}

/*
 * <nr mapped sectors> <highest mapped sector>
 */
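
/*
 * For example, "1048576 2097151" (figures illustrative only) would mean
 * 1048576 sectors are mapped and the highest mapped sector is 2097151.
 */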

static int thin_status(struct dm_target *ti, status_type_t type,
		       char *result, unsigned maxlen)
{
	int r;
	ssize_t sz = 0;
	dm_block_t mapped, highest;
	char buf[BDEVNAME_SIZE];
	struct thin_c *tc = ti->private;

	if (!tc->td)
		DMEMIT("-");
	else {
		switch (type) {
		case STATUSTYPE_INFO:
			r = dm_thin_get_mapped_count(tc->td, &mapped);
			if (r)
				return r;

			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
			if (r < 0)
				return r;

			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
			if (r)
				DMEMIT("%llu", ((highest + 1) *
						tc->pool->sectors_per_block) - 1);
			else
				DMEMIT("-");
			break;

		case STATUSTYPE_TABLE:
			DMEMIT("%s %lu",
			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
			       (unsigned long) tc->dev_id);
			if (tc->origin_dev)
				DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
			break;
		}
	}

	return 0;
}

static int thin_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	sector_t blocks;
	struct thin_c *tc = ti->private;
	struct pool *pool = tc->pool;

	/*
	 * We can't call dm_pool_get_data_dev_size() since that blocks.  So
	 * we follow a more convoluted path through to the pool's target.
	 */
	if (!pool->ti)
		return 0;	/* nothing is bound */

	blocks = pool->ti->len;
	(void) sector_div(blocks, pool->sectors_per_block);
	if (blocks)
		return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);

	return 0;
}

static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct thin_c *tc = ti->private;
	struct pool *pool = tc->pool;

	blk_limits_io_min(limits, 0);
	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
	set_discard_limits(pool, limits);
}

static struct target_type thin_target = {
	.name = "thin",
	.version = {1, 2, 0},
	.module	= THIS_MODULE,
	.ctr = thin_ctr,
	.dtr = thin_dtr,
	.map = thin_map,
	.end_io = thin_endio,
	.postsuspend = thin_postsuspend,
	.status = thin_status,
	.iterate_devices = thin_iterate_devices,
	.io_hints = thin_io_hints,
};

/*----------------------------------------------------------------*/

static int __init dm_thin_init(void)
{
	int r;

	pool_table_init();

	r = dm_register_target(&thin_target);
	if (r)
		return r;

	r = dm_register_target(&pool_target);
	if (r)
		goto bad_pool_target;

	r = -ENOMEM;

	_cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
	if (!_cell_cache)
		goto bad_cell_cache;

	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
	if (!_new_mapping_cache)
		goto bad_new_mapping_cache;

	_endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0);
	if (!_endio_hook_cache)
		goto bad_endio_hook_cache;

	return 0;

bad_endio_hook_cache:
	kmem_cache_destroy(_new_mapping_cache);
bad_new_mapping_cache:
	kmem_cache_destroy(_cell_cache);
bad_cell_cache:
	dm_unregister_target(&pool_target);
bad_pool_target:
	dm_unregister_target(&thin_target);

	return r;
}

static void dm_thin_exit(void)
{
	dm_unregister_target(&thin_target);
	dm_unregister_target(&pool_target);

	kmem_cache_destroy(_cell_cache);
	kmem_cache_destroy(_new_mapping_cache);
	kmem_cache_destroy(_endio_hook_cache);
}

module_init(dm_thin_init);
module_exit(dm_thin_exit);

MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");