dm thin: support discards
drivers/md/dm-thin.c
1/*
2 * Copyright (C) 2011 Red Hat UK.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-thin-metadata.h"
8
9#include <linux/device-mapper.h>
10#include <linux/dm-io.h>
11#include <linux/dm-kcopyd.h>
12#include <linux/list.h>
13#include <linux/init.h>
14#include <linux/module.h>
15#include <linux/slab.h>
16
17#define DM_MSG_PREFIX "thin"
18
19/*
20 * Tunable constants
21 */
22#define ENDIO_HOOK_POOL_SIZE 10240
23#define DEFERRED_SET_SIZE 64
24#define MAPPING_POOL_SIZE 1024
25#define PRISON_CELLS 1024
26#define COMMIT_PERIOD HZ
27
28/*
29 * The block size of the device holding pool data must be
30 * between 64KB and 1GB.
31 */
32#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
33#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
34
35/*
36 * Device id is restricted to 24 bits.
37 */
38#define MAX_DEV_ID ((1 << 24) - 1)
39
40/*
41 * How do we handle breaking sharing of data blocks?
42 * =================================================
43 *
44 * We use a standard copy-on-write btree to store the mappings for the
45 * devices (note I'm talking about copy-on-write of the metadata here, not
46 * the data). When you take an internal snapshot you clone the root node
47 * of the origin btree. After this there is no concept of an origin or a
48 * snapshot. They are just two device trees that happen to point to the
49 * same data blocks.
50 *
51 * When we get a write in we decide if it's to a shared data block using
52 * some timestamp magic. If it is, we have to break sharing.
53 *
54 * Let's say we write to a shared block in what was the origin. The
55 * steps are:
56 *
57 * i) plug io further to this physical block. (see bio_prison code).
58 *
59 * ii) quiesce any read io to that shared data block. Obviously
60 * including all devices that share this block. (see deferred_set code)
61 *
62 * iii) copy the data block to a newly allocated block. This step can be
63 * missed out if the io covers the block. (schedule_copy).
64 *
65 * iv) insert the new mapping into the origin's btree
66 * (process_prepared_mapping). This act of inserting breaks some
67 * sharing of btree nodes between the two devices. Breaking sharing only
68 * affects the btree of that specific device. Btrees for the other
69 * devices that share the block never change. The btree for the origin
70 * device as it was after the last commit is untouched, ie. we're using
71 * persistent data structures in the functional programming sense.
72 *
73 * v) unplug io to this physical block, including the io that triggered
74 * the breaking of sharing.
75 *
76 * Steps (ii) and (iii) occur in parallel.
77 *
78 * The metadata _doesn't_ need to be committed before the io continues. We
79 * get away with this because the io is always written to a _new_ block.
80 * If there's a crash, then:
81 *
82 * - The origin mapping will point to the old origin block (the shared
83 * one). This will contain the data as it was before the io that triggered
84 * the breaking of sharing came in.
85 *
86 * - The snap mapping still points to the old block. As it would after
87 * the commit.
88 *
89 * The downside of this scheme is the timestamp magic isn't perfect, and
90 * will continue to think that data block in the snapshot device is shared
91 * even after the write to the origin has broken sharing. I suspect data
92 * blocks will typically be shared by many different devices, so we're
93 * breaking sharing n + 1 times, rather than n, where n is the number of
94 * devices that reference this data block. At the moment I think the
95 * benefits far, far outweigh the disadvantages.
96 */
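/*
 * Editorial roadmap (not part of the original comment): the steps above
 * map onto the write-to-a-shared-block path implemented later in this
 * file roughly as follows.
 *
 *   process_shared_bio()                 (i)   bio_detain() plugs the block
 *     -> break_sharing()
 *          -> alloc_data_block()
 *          -> schedule_internal_copy()   (ii)  ds_add_work() quiesces reads
 *                                        (iii) dm_kcopyd_copy(), or the bio
 *                                              itself if it overwrites the
 *                                              whole block
 *   copy_complete() / overwrite_endio()
 *     -> process_prepared_mapping()      (iv)  dm_thin_insert_block()
 *          -> cell_defer{,_except}()     (v)   held bios are reissued
 */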
97
98/*----------------------------------------------------------------*/
99
100/*
101 * Sometimes we can't deal with a bio straight away. We put them in prison
102 * where they can't cause any mischief. Bios are put in a cell identified
103 * by a key; multiple bios can be in the same cell. When the cell is
104 * subsequently unlocked the bios become available.
105 */
106struct bio_prison;
107
108struct cell_key {
109 int virtual;
110 dm_thin_id dev;
111 dm_block_t block;
112};
113
114struct cell {
115 struct hlist_node list;
116 struct bio_prison *prison;
117 struct cell_key key;
118 struct bio *holder;
119 struct bio_list bios;
120};
121
122struct bio_prison {
123 spinlock_t lock;
124 mempool_t *cell_pool;
125
126 unsigned nr_buckets;
127 unsigned hash_mask;
128 struct hlist_head *cells;
129};
130
131static uint32_t calc_nr_buckets(unsigned nr_cells)
132{
133 uint32_t n = 128;
134
135 nr_cells /= 4;
136 nr_cells = min(nr_cells, 8192u);
137
138 while (n < nr_cells)
139 n <<= 1;
140
141 return n;
142}
143
144/*
145 * @nr_cells should be the number of cells you want in use _concurrently_.
146 * Don't confuse it with the number of distinct keys.
147 */
148static struct bio_prison *prison_create(unsigned nr_cells)
149{
150 unsigned i;
151 uint32_t nr_buckets = calc_nr_buckets(nr_cells);
152 size_t len = sizeof(struct bio_prison) +
153 (sizeof(struct hlist_head) * nr_buckets);
154 struct bio_prison *prison = kmalloc(len, GFP_KERNEL);
155
156 if (!prison)
157 return NULL;
158
159 spin_lock_init(&prison->lock);
160 prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
161 sizeof(struct cell));
162 if (!prison->cell_pool) {
163 kfree(prison);
164 return NULL;
165 }
166
167 prison->nr_buckets = nr_buckets;
168 prison->hash_mask = nr_buckets - 1;
169 prison->cells = (struct hlist_head *) (prison + 1);
170 for (i = 0; i < nr_buckets; i++)
171 INIT_HLIST_HEAD(prison->cells + i);
172
173 return prison;
174}
175
176static void prison_destroy(struct bio_prison *prison)
177{
178 mempool_destroy(prison->cell_pool);
179 kfree(prison);
180}
181
182static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
183{
184 const unsigned long BIG_PRIME = 4294967291UL;
185 uint64_t hash = key->block * BIG_PRIME;
186
187 return (uint32_t) (hash & prison->hash_mask);
188}
189
190static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
191{
192 return (lhs->virtual == rhs->virtual) &&
193 (lhs->dev == rhs->dev) &&
194 (lhs->block == rhs->block);
195}
196
197static struct cell *__search_bucket(struct hlist_head *bucket,
198 struct cell_key *key)
199{
200 struct cell *cell;
201 struct hlist_node *tmp;
202
203 hlist_for_each_entry(cell, tmp, bucket, list)
204 if (keys_equal(&cell->key, key))
205 return cell;
206
207 return NULL;
208}
209
210/*
211 * This may block if a new cell needs allocating. You must ensure that
212 * cells will be unlocked even if the calling thread is blocked.
213 *
214 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
215 */
216static int bio_detain(struct bio_prison *prison, struct cell_key *key,
217 struct bio *inmate, struct cell **ref)
218{
219 int r = 1;
220 unsigned long flags;
221 uint32_t hash = hash_key(prison, key);
222 struct cell *cell, *cell2;
223
224 BUG_ON(hash > prison->nr_buckets);
225
226 spin_lock_irqsave(&prison->lock, flags);
227
228 cell = __search_bucket(prison->cells + hash, key);
229 if (cell) {
230 bio_list_add(&cell->bios, inmate);
231 goto out;
232 }
233
234 /*
235 * Allocate a new cell
236 */
237 spin_unlock_irqrestore(&prison->lock, flags);
238 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
239 spin_lock_irqsave(&prison->lock, flags);
240
241 /*
242 * We've been unlocked, so we have to double check that
243 * nobody else has inserted this cell in the meantime.
244 */
245 cell = __search_bucket(prison->cells + hash, key);
246 if (cell) {
247 mempool_free(cell2, prison->cell_pool);
248 bio_list_add(&cell->bios, inmate);
249 goto out;
250 }
251
252 /*
253 * Use new cell.
254 */
255 cell = cell2;
256
257 cell->prison = prison;
258 memcpy(&cell->key, key, sizeof(cell->key));
259 cell->holder = inmate;
260 bio_list_init(&cell->bios);
261 hlist_add_head(&cell->list, prison->cells + hash);
262
263 r = 0;
264
265out:
266 spin_unlock_irqrestore(&prison->lock, flags);
267
268 *ref = cell;
269
270 return r;
271}
272
273/*
274 * @inmates must have been initialised prior to this call
275 */
276static void __cell_release(struct cell *cell, struct bio_list *inmates)
277{
278 struct bio_prison *prison = cell->prison;
279
280 hlist_del(&cell->list);
281
282 bio_list_add(inmates, cell->holder);
283 bio_list_merge(inmates, &cell->bios);
284
285 mempool_free(cell, prison->cell_pool);
286}
287
288static void cell_release(struct cell *cell, struct bio_list *bios)
289{
290 unsigned long flags;
291 struct bio_prison *prison = cell->prison;
292
293 spin_lock_irqsave(&prison->lock, flags);
294 __cell_release(cell, bios);
295 spin_unlock_irqrestore(&prison->lock, flags);
296}
297
298/*
299 * There are a couple of places where we put a bio into a cell briefly
300 * before taking it out again. In these situations we know that no other
301 * bio may be in the cell. This function releases the cell, and also does
302 * a sanity check.
303 */
304static void __cell_release_singleton(struct cell *cell, struct bio *bio)
305{
306 hlist_del(&cell->list);
307 BUG_ON(cell->holder != bio);
308 BUG_ON(!bio_list_empty(&cell->bios));
309}
310
311static void cell_release_singleton(struct cell *cell, struct bio *bio)
312{
313 unsigned long flags;
314 struct bio_prison *prison = cell->prison;
315
316 spin_lock_irqsave(&prison->lock, flags);
317 __cell_release_singleton(cell, bio);
318 spin_unlock_irqrestore(&prison->lock, flags);
319}
320
321/*
322 * Sometimes we don't want the holder, just the additional bios.
323 */
324static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
325{
326 struct bio_prison *prison = cell->prison;
327
328 hlist_del(&cell->list);
329 bio_list_merge(inmates, &cell->bios);
330
331 mempool_free(cell, prison->cell_pool);
332}
333
334static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
335{
336 unsigned long flags;
337 struct bio_prison *prison = cell->prison;
338
339 spin_lock_irqsave(&prison->lock, flags);
340 __cell_release_no_holder(cell, inmates);
341 spin_unlock_irqrestore(&prison->lock, flags);
342}
343
344static void cell_error(struct cell *cell)
345{
346 struct bio_prison *prison = cell->prison;
347 struct bio_list bios;
348 struct bio *bio;
349 unsigned long flags;
350
351 bio_list_init(&bios);
352
353 spin_lock_irqsave(&prison->lock, flags);
354 __cell_release(cell, &bios);
355 spin_unlock_irqrestore(&prison->lock, flags);
356
357 while ((bio = bio_list_pop(&bios)))
358 bio_io_error(bio);
359}
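/*
 * Illustrative sketch (editorial, not part of the original file): the
 * typical lifecycle of a cell as used by the process_* functions further
 * down.  The key layout mirrors build_virtual_key() below; the "work"
 * placeholder stands for whatever provisioning or copying is done while
 * io to the block is plugged.
 */
static void example_cell_usage(struct bio_prison *prison, dm_thin_id dev,
			       dm_block_t block, struct bio *bio)
{
	struct cell *cell;
	struct bio_list bios;
	struct bio *released;
	struct cell_key key = { .virtual = 1, .dev = dev, .block = block };

	if (bio_detain(prison, &key, bio, &cell))
		return;	/* cell already held; our bio now waits inside it */

	/* ... do the work for this block while io to it is plugged ... */

	/* Unplug: collect the holder plus anything detained meanwhile. */
	bio_list_init(&bios);
	cell_release(cell, &bios);
	while ((released = bio_list_pop(&bios)))
		generic_make_request(released);
}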
360
361/*----------------------------------------------------------------*/
362
363/*
364 * We use the deferred set to keep track of pending reads to shared blocks.
365 * We do this to ensure the new mapping caused by a write isn't performed
366 * until these prior reads have completed. Otherwise the insertion of the
367 * new mapping could free the old block that the read bios are mapped to.
368 */
369
370struct deferred_set;
371struct deferred_entry {
372 struct deferred_set *ds;
373 unsigned count;
374 struct list_head work_items;
375};
376
377struct deferred_set {
378 spinlock_t lock;
379 unsigned current_entry;
380 unsigned sweeper;
381 struct deferred_entry entries[DEFERRED_SET_SIZE];
382};
383
384static void ds_init(struct deferred_set *ds)
385{
386 int i;
387
388 spin_lock_init(&ds->lock);
389 ds->current_entry = 0;
390 ds->sweeper = 0;
391 for (i = 0; i < DEFERRED_SET_SIZE; i++) {
392 ds->entries[i].ds = ds;
393 ds->entries[i].count = 0;
394 INIT_LIST_HEAD(&ds->entries[i].work_items);
395 }
396}
397
398static struct deferred_entry *ds_inc(struct deferred_set *ds)
399{
400 unsigned long flags;
401 struct deferred_entry *entry;
402
403 spin_lock_irqsave(&ds->lock, flags);
404 entry = ds->entries + ds->current_entry;
405 entry->count++;
406 spin_unlock_irqrestore(&ds->lock, flags);
407
408 return entry;
409}
410
411static unsigned ds_next(unsigned index)
412{
413 return (index + 1) % DEFERRED_SET_SIZE;
414}
415
416static void __sweep(struct deferred_set *ds, struct list_head *head)
417{
418 while ((ds->sweeper != ds->current_entry) &&
419 !ds->entries[ds->sweeper].count) {
420 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
421 ds->sweeper = ds_next(ds->sweeper);
422 }
423
424 if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
425 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
426}
427
428static void ds_dec(struct deferred_entry *entry, struct list_head *head)
429{
430 unsigned long flags;
431
432 spin_lock_irqsave(&entry->ds->lock, flags);
433 BUG_ON(!entry->count);
434 --entry->count;
435 __sweep(entry->ds, head);
436 spin_unlock_irqrestore(&entry->ds->lock, flags);
437}
438
439/*
440 * Returns 1 if the work was deferred, or 0 if there were no pending items to delay the job.
441 */
442static int ds_add_work(struct deferred_set *ds, struct list_head *work)
443{
444 int r = 1;
445 unsigned long flags;
446 unsigned next_entry;
447
448 spin_lock_irqsave(&ds->lock, flags);
449 if ((ds->sweeper == ds->current_entry) &&
450 !ds->entries[ds->current_entry].count)
451 r = 0;
452 else {
453 list_add(work, &ds->entries[ds->current_entry].work_items);
454 next_entry = ds_next(ds->current_entry);
455 if (!ds->entries[next_entry].count)
456 ds->current_entry = next_entry;
457 }
458 spin_unlock_irqrestore(&ds->lock, flags);
459
460 return r;
461}
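/*
 * Editorial usage note (not part of the original file): below, the
 * deferred set acts as a drain counter.  A read to a shared block takes a
 * reference with
 *
 *	h->shared_read_entry = ds_inc(&pool->shared_read_ds);
 *
 * and drops it with ds_dec() when the read completes.  A new mapping that
 * must wait for such reads queues itself rather than blocking:
 *
 *	if (!ds_add_work(&pool->shared_read_ds, &m->list))
 *		m->quiesced = 1;	(no readers pending, proceed now)
 *
 * Once the blocking readers have drained, ds_dec() sweeps the queued work
 * items onto the list supplied by its caller, which then processes them
 * (see __maybe_add_mapping() below).
 */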
462
463/*----------------------------------------------------------------*/
464
465/*
466 * Key building.
467 */
468static void build_data_key(struct dm_thin_device *td,
469 dm_block_t b, struct cell_key *key)
470{
471 key->virtual = 0;
472 key->dev = dm_thin_dev_id(td);
473 key->block = b;
474}
475
476static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
477 struct cell_key *key)
478{
479 key->virtual = 1;
480 key->dev = dm_thin_dev_id(td);
481 key->block = b;
482}
483
484/*----------------------------------------------------------------*/
485
486/*
487 * A pool device ties together a metadata device and a data device. It
488 * also provides the interface for creating and destroying internal
489 * devices.
490 */
491struct new_mapping;
492struct pool {
493 struct list_head list;
494 struct dm_target *ti; /* Only set if a pool target is bound */
495
496 struct mapped_device *pool_md;
497 struct block_device *md_dev;
498 struct dm_pool_metadata *pmd;
499
500 uint32_t sectors_per_block;
501 unsigned block_shift;
502 dm_block_t offset_mask;
503 dm_block_t low_water_blocks;
504
505 unsigned zero_new_blocks:1;
506 unsigned low_water_triggered:1; /* A dm event has been sent */
507 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
508
509 struct bio_prison *prison;
510 struct dm_kcopyd_client *copier;
511
512 struct workqueue_struct *wq;
513 struct work_struct worker;
514 struct delayed_work waker;
515
516 unsigned ref_count;
517 unsigned long last_commit_jiffies;
518
519 spinlock_t lock;
520 struct bio_list deferred_bios;
521 struct bio_list deferred_flush_bios;
522 struct list_head prepared_mappings;
523 struct list_head prepared_discards;
524
525 struct bio_list retry_on_resume_list;
526
527 struct deferred_set shared_read_ds;
528 struct deferred_set all_io_ds;
529
530 struct new_mapping *next_mapping;
531 mempool_t *mapping_pool;
532 mempool_t *endio_hook_pool;
533};
534
535/*
536 * Target context for a pool.
537 */
538struct pool_c {
539 struct dm_target *ti;
540 struct pool *pool;
541 struct dm_dev *data_dev;
542 struct dm_dev *metadata_dev;
543 struct dm_target_callbacks callbacks;
544
545 dm_block_t low_water_blocks;
546 unsigned zero_new_blocks:1;
547};
548
549/*
550 * Target context for a thin.
551 */
552struct thin_c {
553 struct dm_dev *pool_dev;
554 struct dm_dev *origin_dev;
555 dm_thin_id dev_id;
556
557 struct pool *pool;
558 struct dm_thin_device *td;
559};
560
561/*----------------------------------------------------------------*/
562
563/*
564 * A global list of pools that uses a struct mapped_device as a key.
565 */
566static struct dm_thin_pool_table {
567 struct mutex mutex;
568 struct list_head pools;
569} dm_thin_pool_table;
570
571static void pool_table_init(void)
572{
573 mutex_init(&dm_thin_pool_table.mutex);
574 INIT_LIST_HEAD(&dm_thin_pool_table.pools);
575}
576
577static void __pool_table_insert(struct pool *pool)
578{
579 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
580 list_add(&pool->list, &dm_thin_pool_table.pools);
581}
582
583static void __pool_table_remove(struct pool *pool)
584{
585 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
586 list_del(&pool->list);
587}
588
589static struct pool *__pool_table_lookup(struct mapped_device *md)
590{
591 struct pool *pool = NULL, *tmp;
592
593 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
594
595 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
596 if (tmp->pool_md == md) {
597 pool = tmp;
598 break;
599 }
600 }
601
602 return pool;
603}
604
605static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
606{
607 struct pool *pool = NULL, *tmp;
608
609 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
610
611 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
612 if (tmp->md_dev == md_dev) {
613 pool = tmp;
614 break;
615 }
616 }
617
618 return pool;
619}
620
621/*----------------------------------------------------------------*/
622
623struct endio_hook {
624 struct thin_c *tc;
625 struct deferred_entry *shared_read_entry;
626 struct deferred_entry *all_io_entry;
627 struct new_mapping *overwrite_mapping;
628};
629
630static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
631{
632 struct bio *bio;
633 struct bio_list bios;
634
635 bio_list_init(&bios);
636 bio_list_merge(&bios, master);
637 bio_list_init(master);
638
639 while ((bio = bio_list_pop(&bios))) {
640 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
641 if (h->tc == tc)
642 bio_endio(bio, DM_ENDIO_REQUEUE);
643 else
644 bio_list_add(master, bio);
645 }
646}
647
648static void requeue_io(struct thin_c *tc)
649{
650 struct pool *pool = tc->pool;
651 unsigned long flags;
652
653 spin_lock_irqsave(&pool->lock, flags);
654 __requeue_bio_list(tc, &pool->deferred_bios);
655 __requeue_bio_list(tc, &pool->retry_on_resume_list);
656 spin_unlock_irqrestore(&pool->lock, flags);
657}
658
659/*
660 * This section of code contains the logic for processing a thin device's IO.
661 * Much of the code depends on pool object resources (lists, workqueues, etc)
662 * but most is exclusively called from the thin target rather than the thin-pool
663 * target.
664 */
665
666static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
667{
668 return bio->bi_sector >> tc->pool->block_shift;
669}
670
671static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
672{
673 struct pool *pool = tc->pool;
674
675 bio->bi_bdev = tc->pool_dev->bdev;
676 bio->bi_sector = (block << pool->block_shift) +
677 (bio->bi_sector & pool->offset_mask);
678}
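/*
 * Worked example (editorial, values made up): with a 64KiB block size the
 * pool has sectors_per_block = 128, so block_shift = 7 and offset_mask =
 * 127 (see pool_create() below).  A bio at bi_sector 1000 targets virtual
 * block 1000 >> 7 = 7 at offset 1000 & 127 = 104; if that block maps to
 * data block 42, remap() rewrites bi_sector to (42 << 7) + 104 = 5480 on
 * the pool's data device.
 */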
679
680static void remap_to_origin(struct thin_c *tc, struct bio *bio)
681{
682 bio->bi_bdev = tc->origin_dev->bdev;
683}
684
685static void issue(struct thin_c *tc, struct bio *bio)
686{
687 struct pool *pool = tc->pool;
688 unsigned long flags;
689
690 /*
691 * Batch together any FUA/FLUSH bios we find and then issue
692 * a single commit for them in process_deferred_bios().
693 */
694 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
695 spin_lock_irqsave(&pool->lock, flags);
696 bio_list_add(&pool->deferred_flush_bios, bio);
697 spin_unlock_irqrestore(&pool->lock, flags);
698 } else
699 generic_make_request(bio);
700}
701
702static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
703{
704 remap_to_origin(tc, bio);
705 issue(tc, bio);
706}
707
708static void remap_and_issue(struct thin_c *tc, struct bio *bio,
709 dm_block_t block)
710{
711 remap(tc, bio, block);
712 issue(tc, bio);
713}
714
715/*
716 * wake_worker() is used when new work is queued and when pool_resume is
717 * ready to continue deferred IO processing.
718 */
719static void wake_worker(struct pool *pool)
720{
721 queue_work(pool->wq, &pool->worker);
722}
723
724/*----------------------------------------------------------------*/
725
726/*
727 * Bio endio functions.
728 */
729struct new_mapping {
730 struct list_head list;
731
732 unsigned quiesced:1;
733 unsigned prepared:1;
734 unsigned pass_discard:1;
735
736 struct thin_c *tc;
737 dm_block_t virt_block;
738 dm_block_t data_block;
739 struct cell *cell, *cell2;
740 int err;
741
742 /*
743 * If the bio covers the whole area of a block then we can avoid
744 * zeroing or copying. Instead this bio is hooked. The bio will
745 * still be in the cell, so care has to be taken to avoid issuing
746 * the bio twice.
747 */
748 struct bio *bio;
749 bio_end_io_t *saved_bi_end_io;
750};
751
752static void __maybe_add_mapping(struct new_mapping *m)
753{
754 struct pool *pool = m->tc->pool;
755
756 if (m->quiesced && m->prepared) {
757 list_add(&m->list, &pool->prepared_mappings);
758 wake_worker(pool);
759 }
760}
761
762static void copy_complete(int read_err, unsigned long write_err, void *context)
763{
764 unsigned long flags;
765 struct new_mapping *m = context;
766 struct pool *pool = m->tc->pool;
767
768 m->err = read_err || write_err ? -EIO : 0;
769
770 spin_lock_irqsave(&pool->lock, flags);
771 m->prepared = 1;
772 __maybe_add_mapping(m);
773 spin_unlock_irqrestore(&pool->lock, flags);
774}
775
776static void overwrite_endio(struct bio *bio, int err)
777{
778 unsigned long flags;
779 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
780 struct new_mapping *m = h->overwrite_mapping;
781 struct pool *pool = m->tc->pool;
782
783 m->err = err;
784
785 spin_lock_irqsave(&pool->lock, flags);
786 m->prepared = 1;
787 __maybe_add_mapping(m);
788 spin_unlock_irqrestore(&pool->lock, flags);
789}
790
791/*----------------------------------------------------------------*/
792
793/*
794 * Workqueue.
795 */
796
797/*
798 * Prepared mapping jobs.
799 */
800
801/*
802 * This sends the bios in the cell back to the deferred_bios list.
803 */
804static void cell_defer(struct thin_c *tc, struct cell *cell,
805 dm_block_t data_block)
806{
807 struct pool *pool = tc->pool;
808 unsigned long flags;
809
810 spin_lock_irqsave(&pool->lock, flags);
811 cell_release(cell, &pool->deferred_bios);
812 spin_unlock_irqrestore(&tc->pool->lock, flags);
813
814 wake_worker(pool);
815}
816
817/*
818 * Same as cell_defer above, except it omits one particular detainee,
819 * a write bio that covers the block and has already been processed.
820 */
821static void cell_defer_except(struct thin_c *tc, struct cell *cell)
822{
823 struct bio_list bios;
824 struct pool *pool = tc->pool;
825 unsigned long flags;
826
827 bio_list_init(&bios);
828
829 spin_lock_irqsave(&pool->lock, flags);
830 cell_release_no_holder(cell, &pool->deferred_bios);
831 spin_unlock_irqrestore(&pool->lock, flags);
832
833 wake_worker(pool);
834}
835
836static void process_prepared_mapping(struct new_mapping *m)
837{
838 struct thin_c *tc = m->tc;
839 struct bio *bio;
840 int r;
841
842 bio = m->bio;
843 if (bio)
844 bio->bi_end_io = m->saved_bi_end_io;
845
846 if (m->err) {
847 cell_error(m->cell);
848 return;
849 }
850
851 /*
852 * Commit the prepared block into the mapping btree.
853 * Any I/O for this block arriving after this point will get
854 * remapped to it directly.
855 */
856 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
857 if (r) {
858 DMERR("dm_thin_insert_block() failed");
859 cell_error(m->cell);
860 return;
861 }
862
863 /*
864 * Release any bios held while the block was being provisioned.
865 * If we are processing a write bio that completely covers the block,
866 * we already processed it so can ignore it now when processing
867 * the bios in the cell.
868 */
869 if (bio) {
870 cell_defer_except(tc, m->cell);
871 bio_endio(bio, 0);
872 } else
873 cell_defer(tc, m->cell, m->data_block);
874
875 list_del(&m->list);
876 mempool_free(m, tc->pool->mapping_pool);
877}
878
879static void process_prepared_discard(struct new_mapping *m)
880{
881 int r;
882 struct thin_c *tc = m->tc;
883
884 r = dm_thin_remove_block(tc->td, m->virt_block);
885 if (r)
886 DMERR("dm_thin_remove_block() failed");
887
888 /*
889 * Pass the discard down to the underlying device?
890 */
891 if (m->pass_discard)
892 remap_and_issue(tc, m->bio, m->data_block);
893 else
894 bio_endio(m->bio, 0);
895
896 cell_defer_except(tc, m->cell);
897 cell_defer_except(tc, m->cell2);
898 mempool_free(m, tc->pool->mapping_pool);
899}
900
901static void process_prepared(struct pool *pool, struct list_head *head,
902 void (*fn)(struct new_mapping *))
903{
904 unsigned long flags;
905 struct list_head maps;
906 struct new_mapping *m, *tmp;
907
908 INIT_LIST_HEAD(&maps);
909 spin_lock_irqsave(&pool->lock, flags);
910 list_splice_init(head, &maps);
911 spin_unlock_irqrestore(&pool->lock, flags);
912
913 list_for_each_entry_safe(m, tmp, &maps, list)
914 fn(m);
915}
916
917/*
918 * Deferred bio jobs.
919 */
920static int io_overlaps_block(struct pool *pool, struct bio *bio)
921{
922 return !(bio->bi_sector & pool->offset_mask) &&
923 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
924
925}
926
927static int io_overwrites_block(struct pool *pool, struct bio *bio)
928{
929 return (bio_data_dir(bio) == WRITE) &&
930 io_overlaps_block(pool, bio);
931}
932
933static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
934 bio_end_io_t *fn)
935{
936 *save = bio->bi_end_io;
937 bio->bi_end_io = fn;
938}
939
940static int ensure_next_mapping(struct pool *pool)
941{
942 if (pool->next_mapping)
943 return 0;
944
945 pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
946
947 return pool->next_mapping ? 0 : -ENOMEM;
948}
949
950static struct new_mapping *get_next_mapping(struct pool *pool)
951{
952 struct new_mapping *r = pool->next_mapping;
953
954 BUG_ON(!pool->next_mapping);
955
956 pool->next_mapping = NULL;
957
958 return r;
959}
960
961static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
962 struct dm_dev *origin, dm_block_t data_origin,
963 dm_block_t data_dest,
964 struct cell *cell, struct bio *bio)
965{
966 int r;
967 struct pool *pool = tc->pool;
968 struct new_mapping *m = get_next_mapping(pool);
969
970 INIT_LIST_HEAD(&m->list);
971 m->quiesced = 0;
972 m->prepared = 0;
973 m->tc = tc;
974 m->virt_block = virt_block;
975 m->data_block = data_dest;
976 m->cell = cell;
977 m->err = 0;
978 m->bio = NULL;
979
980 if (!ds_add_work(&pool->shared_read_ds, &m->list))
981 m->quiesced = 1;
982
983 /*
984 * IO to pool_dev remaps to the pool target's data_dev.
985 *
986 * If the whole block of data is being overwritten, we can issue the
987 * bio immediately. Otherwise we use kcopyd to clone the data first.
988 */
989 if (io_overwrites_block(pool, bio)) {
990 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
991 h->overwrite_mapping = m;
992 m->bio = bio;
993 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
994 remap_and_issue(tc, bio, data_dest);
995 } else {
996 struct dm_io_region from, to;
997
998 from.bdev = origin->bdev;
999 from.sector = data_origin * pool->sectors_per_block;
1000 from.count = pool->sectors_per_block;
1001
1002 to.bdev = tc->pool_dev->bdev;
1003 to.sector = data_dest * pool->sectors_per_block;
1004 to.count = pool->sectors_per_block;
1005
1006 r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
1007 0, copy_complete, m);
1008 if (r < 0) {
1009 mempool_free(m, pool->mapping_pool);
1010 DMERR("dm_kcopyd_copy() failed");
1011 cell_error(cell);
1012 }
1013 }
1014}
1015
1016static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
1017 dm_block_t data_origin, dm_block_t data_dest,
1018 struct cell *cell, struct bio *bio)
1019{
1020 schedule_copy(tc, virt_block, tc->pool_dev,
1021 data_origin, data_dest, cell, bio);
1022}
1023
1024static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
1025 dm_block_t data_dest,
1026 struct cell *cell, struct bio *bio)
1027{
1028 schedule_copy(tc, virt_block, tc->origin_dev,
1029 virt_block, data_dest, cell, bio);
1030}
1031
1032static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
1033 dm_block_t data_block, struct cell *cell,
1034 struct bio *bio)
1035{
1036 struct pool *pool = tc->pool;
1037 struct new_mapping *m = get_next_mapping(pool);
1038
1039 INIT_LIST_HEAD(&m->list);
1040 m->quiesced = 1;
1041 m->prepared = 0;
1042 m->tc = tc;
1043 m->virt_block = virt_block;
1044 m->data_block = data_block;
1045 m->cell = cell;
1046 m->err = 0;
1047 m->bio = NULL;
1048
1049 /*
1050 * If the whole block of data is being overwritten or we are not
1051 * zeroing pre-existing data, we can issue the bio immediately.
1052 * Otherwise we use kcopyd to zero the data first.
1053 */
1054 if (!pool->zero_new_blocks)
1055 process_prepared_mapping(m);
1056
1057 else if (io_overwrites_block(pool, bio)) {
1058 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1059 h->overwrite_mapping = m;
1060 m->bio = bio;
1061 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
1062 remap_and_issue(tc, bio, data_block);
1063
1064 } else {
1065 int r;
1066 struct dm_io_region to;
1067
1068 to.bdev = tc->pool_dev->bdev;
1069 to.sector = data_block * pool->sectors_per_block;
1070 to.count = pool->sectors_per_block;
1071
1072 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
1073 if (r < 0) {
1074 mempool_free(m, pool->mapping_pool);
1075 DMERR("dm_kcopyd_zero() failed");
1076 cell_error(cell);
1077 }
1078 }
1079}
1080
1081static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1082{
1083 int r;
1084 dm_block_t free_blocks;
1085 unsigned long flags;
1086 struct pool *pool = tc->pool;
1087
1088 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1089 if (r)
1090 return r;
1091
1092 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
1093 DMWARN("%s: reached low water mark, sending event.",
1094 dm_device_name(pool->pool_md));
1095 spin_lock_irqsave(&pool->lock, flags);
1096 pool->low_water_triggered = 1;
1097 spin_unlock_irqrestore(&pool->lock, flags);
1098 dm_table_event(pool->ti->table);
1099 }
1100
1101 if (!free_blocks) {
1102 if (pool->no_free_space)
1103 return -ENOSPC;
1104 else {
1105 /*
1106 * Try to commit to see if that will free up some
1107 * more space.
1108 */
1109 r = dm_pool_commit_metadata(pool->pmd);
1110 if (r) {
1111 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1112 __func__, r);
1113 return r;
1114 }
1115
1116 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1117 if (r)
1118 return r;
1119
1120 /*
1121 * If we still have no space we set a flag to avoid
1122 * doing all this checking and return -ENOSPC.
1123 */
1124 if (!free_blocks) {
1125 DMWARN("%s: no free space available.",
1126 dm_device_name(pool->pool_md));
1127 spin_lock_irqsave(&pool->lock, flags);
1128 pool->no_free_space = 1;
1129 spin_unlock_irqrestore(&pool->lock, flags);
1130 return -ENOSPC;
1131 }
1132 }
1133 }
1134
1135 r = dm_pool_alloc_data_block(pool->pmd, result);
1136 if (r)
1137 return r;
1138
1139 return 0;
1140}
1141
1142/*
1143 * If we have run out of space, queue bios until the device is
1144 * resumed, presumably after having been reloaded with more space.
1145 */
1146static void retry_on_resume(struct bio *bio)
1147{
1148 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1149 struct thin_c *tc = h->tc;
1150 struct pool *pool = tc->pool;
1151 unsigned long flags;
1152
1153 spin_lock_irqsave(&pool->lock, flags);
1154 bio_list_add(&pool->retry_on_resume_list, bio);
1155 spin_unlock_irqrestore(&pool->lock, flags);
1156}
1157
1158static void no_space(struct cell *cell)
1159{
1160 struct bio *bio;
1161 struct bio_list bios;
1162
1163 bio_list_init(&bios);
1164 cell_release(cell, &bios);
1165
1166 while ((bio = bio_list_pop(&bios)))
1167 retry_on_resume(bio);
1168}
1169
1170static void process_discard(struct thin_c *tc, struct bio *bio)
1171{
1172 int r;
1173 struct pool *pool = tc->pool;
1174 struct cell *cell, *cell2;
1175 struct cell_key key, key2;
1176 dm_block_t block = get_bio_block(tc, bio);
1177 struct dm_thin_lookup_result lookup_result;
1178 struct new_mapping *m;
1179
1180 build_virtual_key(tc->td, block, &key);
1181 if (bio_detain(tc->pool->prison, &key, bio, &cell))
1182 return;
1183
1184 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1185 switch (r) {
1186 case 0:
1187 /*
1188 * Check nobody is fiddling with this pool block. This can
1189 * happen if someone's in the process of breaking sharing
1190 * on this block.
1191 */
1192 build_data_key(tc->td, lookup_result.block, &key2);
1193 if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
1194 cell_release_singleton(cell, bio);
1195 break;
1196 }
1197
1198 if (io_overlaps_block(pool, bio)) {
1199 /*
1200 * IO may still be going to the destination block. We must
1201 * quiesce before we can do the removal.
1202 */
1203 m = get_next_mapping(pool);
1204 m->tc = tc;
1205 m->pass_discard = !lookup_result.shared;
1206 m->virt_block = block;
1207 m->data_block = lookup_result.block;
1208 m->cell = cell;
1209 m->cell2 = cell2;
1210 m->err = 0;
1211 m->bio = bio;
1212
1213 if (!ds_add_work(&pool->all_io_ds, &m->list)) {
1214 list_add(&m->list, &pool->prepared_discards);
1215 wake_worker(pool);
1216 }
1217 } else {
1218 /*
1219 * This path is hit if people are ignoring
1220 * limits->discard_granularity. It ignores any
1221 * part of the discard that is in a subsequent
1222 * block.
1223 */
1224 sector_t offset = bio->bi_sector - (block << pool->block_shift);
1225 unsigned remaining = (pool->sectors_per_block - offset) << 9;
1226 bio->bi_size = min(bio->bi_size, remaining);
1227
1228 cell_release_singleton(cell, bio);
1229 cell_release_singleton(cell2, bio);
1230 remap_and_issue(tc, bio, lookup_result.block);
1231 }
1232 break;
1233
1234 case -ENODATA:
1235 /*
1236 * It isn't provisioned, just forget it.
1237 */
1238 cell_release_singleton(cell, bio);
1239 bio_endio(bio, 0);
1240 break;
1241
1242 default:
1243 DMERR("discard: find block unexpectedly returned %d", r);
1244 cell_release_singleton(cell, bio);
1245 bio_io_error(bio);
1246 break;
1247 }
1248}
1249
1250static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1251 struct cell_key *key,
1252 struct dm_thin_lookup_result *lookup_result,
1253 struct cell *cell)
1254{
1255 int r;
1256 dm_block_t data_block;
1257
1258 r = alloc_data_block(tc, &data_block);
1259 switch (r) {
1260 case 0:
1261 schedule_internal_copy(tc, block, lookup_result->block,
1262 data_block, cell, bio);
1263 break;
1264
1265 case -ENOSPC:
1266 no_space(cell);
1267 break;
1268
1269 default:
1270 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1271 cell_error(cell);
1272 break;
1273 }
1274}
1275
1276static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1277 dm_block_t block,
1278 struct dm_thin_lookup_result *lookup_result)
1279{
1280 struct cell *cell;
1281 struct pool *pool = tc->pool;
1282 struct cell_key key;
1283
1284 /*
1285 * If cell is already occupied, then sharing is already in the process
1286 * of being broken so we have nothing further to do here.
1287 */
1288 build_data_key(tc->td, lookup_result->block, &key);
1289 if (bio_detain(pool->prison, &key, bio, &cell))
1290 return;
1291
1292 if (bio_data_dir(bio) == WRITE)
1293 break_sharing(tc, bio, block, &key, lookup_result, cell);
1294 else {
1295 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1296
1297 h->shared_read_entry = ds_inc(&pool->shared_read_ds);
1298
1299 cell_release_singleton(cell, bio);
1300 remap_and_issue(tc, bio, lookup_result->block);
1301 }
1302}
1303
1304static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1305 struct cell *cell)
1306{
1307 int r;
1308 dm_block_t data_block;
1309
1310 /*
1311 * Remap empty bios (flushes) immediately, without provisioning.
1312 */
1313 if (!bio->bi_size) {
1314 cell_release_singleton(cell, bio);
1315 remap_and_issue(tc, bio, 0);
1316 return;
1317 }
1318
1319 /*
1320 * Fill read bios with zeroes and complete them immediately.
1321 */
1322 if (bio_data_dir(bio) == READ) {
1323 zero_fill_bio(bio);
1324 cell_release_singleton(cell, bio);
1325 bio_endio(bio, 0);
1326 return;
1327 }
1328
1329 r = alloc_data_block(tc, &data_block);
1330 switch (r) {
1331 case 0:
1332 if (tc->origin_dev)
1333 schedule_external_copy(tc, block, data_block, cell, bio);
1334 else
1335 schedule_zero(tc, block, data_block, cell, bio);
1336 break;
1337
1338 case -ENOSPC:
1339 no_space(cell);
1340 break;
1341
1342 default:
1343 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1344 cell_error(cell);
1345 break;
1346 }
1347}
1348
1349static void process_bio(struct thin_c *tc, struct bio *bio)
1350{
1351 int r;
1352 dm_block_t block = get_bio_block(tc, bio);
1353 struct cell *cell;
1354 struct cell_key key;
1355 struct dm_thin_lookup_result lookup_result;
1356
1357 /*
1358 * If cell is already occupied, then the block is already
1359 * being provisioned so we have nothing further to do here.
1360 */
1361 build_virtual_key(tc->td, block, &key);
1362 if (bio_detain(tc->pool->prison, &key, bio, &cell))
1363 return;
1364
1365 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1366 switch (r) {
1367 case 0:
1368 /*
1369 * We can release this cell now. This thread is the only
1370 * one that puts bios into a cell, and we know there were
1371 * no preceding bios.
1372 */
1373 /*
1374 * TODO: this will probably have to change when discard goes
1375 * back in.
1376 */
1377 cell_release_singleton(cell, bio);
1378
1379 if (lookup_result.shared)
1380 process_shared_bio(tc, bio, block, &lookup_result);
1381 else
1382 remap_and_issue(tc, bio, lookup_result.block);
1383 break;
1384
1385 case -ENODATA:
1386 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1387 cell_release_singleton(cell, bio);
1388 remap_to_origin_and_issue(tc, bio);
1389 } else
1390 provision_block(tc, bio, block, cell);
1391 break;
1392
1393 default:
1394 DMERR("dm_thin_find_block() failed, error = %d", r);
1395 cell_release_singleton(cell, bio);
1396 bio_io_error(bio);
1397 break;
1398 }
1399}
1400
1401static int need_commit_due_to_time(struct pool *pool)
1402{
1403 return jiffies < pool->last_commit_jiffies ||
1404 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1405}
1406
1407static void process_deferred_bios(struct pool *pool)
1408{
1409 unsigned long flags;
1410 struct bio *bio;
1411 struct bio_list bios;
1412 int r;
1413
1414 bio_list_init(&bios);
1415
1416 spin_lock_irqsave(&pool->lock, flags);
1417 bio_list_merge(&bios, &pool->deferred_bios);
1418 bio_list_init(&pool->deferred_bios);
1419 spin_unlock_irqrestore(&pool->lock, flags);
1420
1421 while ((bio = bio_list_pop(&bios))) {
1422 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1423 struct thin_c *tc = h->tc;
1424
1425 /*
1426 * If we've got no free new_mapping structs, and processing
1427 * this bio might require one, we pause until there are some
1428 * prepared mappings to process.
1429 */
1430 if (ensure_next_mapping(pool)) {
1431 spin_lock_irqsave(&pool->lock, flags);
1432 bio_list_merge(&pool->deferred_bios, &bios);
1433 spin_unlock_irqrestore(&pool->lock, flags);
1434
1435 break;
1436 }
1437
1438 if (bio->bi_rw & REQ_DISCARD)
1439 process_discard(tc, bio);
1440 else
1441 process_bio(tc, bio);
1442 }
1443
1444 /*
1445 * If there are any deferred flush bios, we must commit
1446 * the metadata before issuing them.
1447 */
1448 bio_list_init(&bios);
1449 spin_lock_irqsave(&pool->lock, flags);
1450 bio_list_merge(&bios, &pool->deferred_flush_bios);
1451 bio_list_init(&pool->deferred_flush_bios);
1452 spin_unlock_irqrestore(&pool->lock, flags);
1453
1454 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1455 return;
1456
1457 r = dm_pool_commit_metadata(pool->pmd);
1458 if (r) {
1459 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1460 __func__, r);
1461 while ((bio = bio_list_pop(&bios)))
1462 bio_io_error(bio);
1463 return;
1464 }
1465 pool->last_commit_jiffies = jiffies;
1466
1467 while ((bio = bio_list_pop(&bios)))
1468 generic_make_request(bio);
1469}
1470
1471static void do_worker(struct work_struct *ws)
1472{
1473 struct pool *pool = container_of(ws, struct pool, worker);
1474
1475 process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
1476 process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
1477 process_deferred_bios(pool);
1478}
1479
1480/*
1481 * We want to commit periodically so that not too much
1482 * unwritten data builds up.
1483 */
1484static void do_waker(struct work_struct *ws)
1485{
1486 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1487 wake_worker(pool);
1488 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1489}
1490
1491/*----------------------------------------------------------------*/
1492
1493/*
1494 * Mapping functions.
1495 */
1496
1497/*
1498 * Called only while mapping a thin bio to hand it over to the workqueue.
1499 */
1500static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1501{
1502 unsigned long flags;
1503 struct pool *pool = tc->pool;
1504
1505 spin_lock_irqsave(&pool->lock, flags);
1506 bio_list_add(&pool->deferred_bios, bio);
1507 spin_unlock_irqrestore(&pool->lock, flags);
1508
1509 wake_worker(pool);
1510}
1511
1512static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
1513{
1514 struct pool *pool = tc->pool;
1515 struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1516
1517 h->tc = tc;
1518 h->shared_read_entry = NULL;
1519 h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
1520 h->overwrite_mapping = NULL;
1521
1522 return h;
1523}
1524
1525/*
1526 * Non-blocking function called from the thin target's map function.
1527 */
1528static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1529 union map_info *map_context)
1530{
1531 int r;
1532 struct thin_c *tc = ti->private;
1533 dm_block_t block = get_bio_block(tc, bio);
1534 struct dm_thin_device *td = tc->td;
1535 struct dm_thin_lookup_result result;
1536
1537 map_context->ptr = thin_hook_bio(tc, bio);
1538 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1539 thin_defer_bio(tc, bio);
1540 return DM_MAPIO_SUBMITTED;
1541 }
1542
1543 r = dm_thin_find_block(td, block, 0, &result);
1544
1545 /*
1546 * Note that we defer readahead too.
1547 */
1548 switch (r) {
1549 case 0:
1550 if (unlikely(result.shared)) {
1551 /*
1552 * We have a race condition here between the
1553 * result.shared value returned by the lookup and
1554 * snapshot creation, which may cause new
1555 * sharing.
1556 *
1557 * To avoid this always quiesce the origin before
1558 * taking the snap. You want to do this anyway to
1559 * ensure a consistent application view
1560 * (i.e. lockfs).
1561 *
1562 * More distant ancestors are irrelevant. The
1563 * shared flag will be set in their case.
1564 */
1565 thin_defer_bio(tc, bio);
1566 r = DM_MAPIO_SUBMITTED;
1567 } else {
1568 remap(tc, bio, result.block);
1569 r = DM_MAPIO_REMAPPED;
1570 }
1571 break;
1572
1573 case -ENODATA:
1574 /*
1575 * In future, the failed dm_thin_find_block above could
1576 * provide the hint to load the metadata into cache.
1577 */
1578 case -EWOULDBLOCK:
1579 thin_defer_bio(tc, bio);
1580 r = DM_MAPIO_SUBMITTED;
1581 break;
1582 }
1583
1584 return r;
1585}
1586
1587static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1588{
1589 int r;
1590 unsigned long flags;
1591 struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1592
1593 spin_lock_irqsave(&pt->pool->lock, flags);
1594 r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1595 spin_unlock_irqrestore(&pt->pool->lock, flags);
1596
1597 if (!r) {
1598 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1599 r = bdi_congested(&q->backing_dev_info, bdi_bits);
1600 }
1601
1602 return r;
1603}
1604
1605static void __requeue_bios(struct pool *pool)
1606{
1607 bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1608 bio_list_init(&pool->retry_on_resume_list);
1609}
1610
1611/*----------------------------------------------------------------
1612 * Binding of control targets to a pool object
1613 *--------------------------------------------------------------*/
1614static int bind_control_target(struct pool *pool, struct dm_target *ti)
1615{
1616 struct pool_c *pt = ti->private;
1617
1618 pool->ti = ti;
1619 pool->low_water_blocks = pt->low_water_blocks;
1620 pool->zero_new_blocks = pt->zero_new_blocks;
1621
1622 return 0;
1623}
1624
1625static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1626{
1627 if (pool->ti == ti)
1628 pool->ti = NULL;
1629}
1630
1631/*----------------------------------------------------------------
1632 * Pool creation
1633 *--------------------------------------------------------------*/
1634static void __pool_destroy(struct pool *pool)
1635{
1636 __pool_table_remove(pool);
1637
1638 if (dm_pool_metadata_close(pool->pmd) < 0)
1639 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1640
1641 prison_destroy(pool->prison);
1642 dm_kcopyd_client_destroy(pool->copier);
1643
1644 if (pool->wq)
1645 destroy_workqueue(pool->wq);
1646
1647 if (pool->next_mapping)
1648 mempool_free(pool->next_mapping, pool->mapping_pool);
1649 mempool_destroy(pool->mapping_pool);
1650 mempool_destroy(pool->endio_hook_pool);
1651 kfree(pool);
1652}
1653
1654static struct pool *pool_create(struct mapped_device *pool_md,
1655 struct block_device *metadata_dev,
1656 unsigned long block_size, char **error)
1657{
1658 int r;
1659 void *err_p;
1660 struct pool *pool;
1661 struct dm_pool_metadata *pmd;
1662
1663 pmd = dm_pool_metadata_open(metadata_dev, block_size);
1664 if (IS_ERR(pmd)) {
1665 *error = "Error creating metadata object";
1666 return (struct pool *)pmd;
1667 }
1668
1669 pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1670 if (!pool) {
1671 *error = "Error allocating memory for pool";
1672 err_p = ERR_PTR(-ENOMEM);
1673 goto bad_pool;
1674 }
1675
1676 pool->pmd = pmd;
1677 pool->sectors_per_block = block_size;
1678 pool->block_shift = ffs(block_size) - 1;
1679 pool->offset_mask = block_size - 1;
1680 pool->low_water_blocks = 0;
1681 pool->zero_new_blocks = 1;
1682 pool->prison = prison_create(PRISON_CELLS);
1683 if (!pool->prison) {
1684 *error = "Error creating pool's bio prison";
1685 err_p = ERR_PTR(-ENOMEM);
1686 goto bad_prison;
1687 }
1688
1689 pool->copier = dm_kcopyd_client_create();
1690 if (IS_ERR(pool->copier)) {
1691 r = PTR_ERR(pool->copier);
1692 *error = "Error creating pool's kcopyd client";
1693 err_p = ERR_PTR(r);
1694 goto bad_kcopyd_client;
1695 }
1696
1697 /*
1698 * Create singlethreaded workqueue that will service all devices
1699 * that use this metadata.
1700 */
1701 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1702 if (!pool->wq) {
1703 *error = "Error creating pool's workqueue";
1704 err_p = ERR_PTR(-ENOMEM);
1705 goto bad_wq;
1706 }
1707
1708 INIT_WORK(&pool->worker, do_worker);
1709 INIT_DELAYED_WORK(&pool->waker, do_waker);
1710 spin_lock_init(&pool->lock);
1711 bio_list_init(&pool->deferred_bios);
1712 bio_list_init(&pool->deferred_flush_bios);
1713 INIT_LIST_HEAD(&pool->prepared_mappings);
1714 INIT_LIST_HEAD(&pool->prepared_discards);
1715 pool->low_water_triggered = 0;
1716 pool->no_free_space = 0;
1717 bio_list_init(&pool->retry_on_resume_list);
1718 ds_init(&pool->shared_read_ds);
1719 ds_init(&pool->all_io_ds);
1720
1721 pool->next_mapping = NULL;
1722 pool->mapping_pool =
1723 mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping));
1724 if (!pool->mapping_pool) {
1725 *error = "Error creating pool's mapping mempool";
1726 err_p = ERR_PTR(-ENOMEM);
1727 goto bad_mapping_pool;
1728 }
1729
1730 pool->endio_hook_pool =
1731 mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook));
1732 if (!pool->endio_hook_pool) {
1733 *error = "Error creating pool's endio_hook mempool";
1734 err_p = ERR_PTR(-ENOMEM);
1735 goto bad_endio_hook_pool;
1736 }
1737 pool->ref_count = 1;
1738 pool->last_commit_jiffies = jiffies;
1739 pool->pool_md = pool_md;
1740 pool->md_dev = metadata_dev;
1741 __pool_table_insert(pool);
1742
1743 return pool;
1744
1745bad_endio_hook_pool:
1746 mempool_destroy(pool->mapping_pool);
1747bad_mapping_pool:
1748 destroy_workqueue(pool->wq);
1749bad_wq:
1750 dm_kcopyd_client_destroy(pool->copier);
1751bad_kcopyd_client:
1752 prison_destroy(pool->prison);
1753bad_prison:
1754 kfree(pool);
1755bad_pool:
1756 if (dm_pool_metadata_close(pmd))
1757 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1758
1759 return err_p;
1760}
1761
1762static void __pool_inc(struct pool *pool)
1763{
1764 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1765 pool->ref_count++;
1766}
1767
1768static void __pool_dec(struct pool *pool)
1769{
1770 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1771 BUG_ON(!pool->ref_count);
1772 if (!--pool->ref_count)
1773 __pool_destroy(pool);
1774}
1775
1776static struct pool *__pool_find(struct mapped_device *pool_md,
1777 struct block_device *metadata_dev,
1778 unsigned long block_size, char **error)
1779{
1780 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1781
1782 if (pool) {
1783 if (pool->pool_md != pool_md)
1784 return ERR_PTR(-EBUSY);
1785 __pool_inc(pool);
1786
1787 } else {
1788 pool = __pool_table_lookup(pool_md);
1789 if (pool) {
1790 if (pool->md_dev != metadata_dev)
1791 return ERR_PTR(-EINVAL);
1792 __pool_inc(pool);
1793
1794 } else
1795 pool = pool_create(pool_md, metadata_dev, block_size, error);
1796 }
1797
1798 return pool;
1799}
1800
1801/*----------------------------------------------------------------
1802 * Pool target methods
1803 *--------------------------------------------------------------*/
1804static void pool_dtr(struct dm_target *ti)
1805{
1806 struct pool_c *pt = ti->private;
1807
1808 mutex_lock(&dm_thin_pool_table.mutex);
1809
1810 unbind_control_target(pt->pool, ti);
1811 __pool_dec(pt->pool);
1812 dm_put_device(ti, pt->metadata_dev);
1813 dm_put_device(ti, pt->data_dev);
1814 kfree(pt);
1815
1816 mutex_unlock(&dm_thin_pool_table.mutex);
1817}
1818
1819struct pool_features {
1820 unsigned zero_new_blocks:1;
1821};
1822
1823static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1824 struct dm_target *ti)
1825{
1826 int r;
1827 unsigned argc;
1828 const char *arg_name;
1829
1830 static struct dm_arg _args[] = {
1831 {0, 1, "Invalid number of pool feature arguments"},
1832 };
1833
1834 /*
1835 * No feature arguments supplied.
1836 */
1837 if (!as->argc)
1838 return 0;
1839
1840 r = dm_read_arg_group(_args, as, &argc, &ti->error);
1841 if (r)
1842 return -EINVAL;
1843
1844 while (argc && !r) {
1845 arg_name = dm_shift_arg(as);
1846 argc--;
1847
1848 if (!strcasecmp(arg_name, "skip_block_zeroing")) {
1849 pf->zero_new_blocks = 0;
1850 continue;
1851 }
1852
1853 ti->error = "Unrecognised pool feature requested";
1854 r = -EINVAL;
1855 }
1856
1857 return r;
1858}
1859
1860/*
1861 * thin-pool <metadata dev> <data dev>
1862 * <data block size (sectors)>
1863 * <low water mark (blocks)>
1864 * [<#feature args> [<arg>]*]
1865 *
1866 * Optional feature arguments are:
1867 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1868 */
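/*
 * Example table line (editorial; device names and sizes are made up): a
 * 16GiB data device (33554432 sectors) with 64KiB blocks (128 sectors)
 * and a low water mark of 1024 blocks, leaving block zeroing enabled:
 *
 *	0 33554432 thin-pool /dev/mapper/meta /dev/mapper/data 128 1024 0
 *
 * Passing "1 skip_block_zeroing" instead of the trailing 0 disables the
 * zeroing of newly-provisioned blocks.
 */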
1869static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1870{
1871 int r;
1872 struct pool_c *pt;
1873 struct pool *pool;
1874 struct pool_features pf;
1875 struct dm_arg_set as;
1876 struct dm_dev *data_dev;
1877 unsigned long block_size;
1878 dm_block_t low_water_blocks;
1879 struct dm_dev *metadata_dev;
1880 sector_t metadata_dev_size;
1881 char b[BDEVNAME_SIZE];
1882
1883 /*
1884 * FIXME Remove validation from scope of lock.
1885 */
1886 mutex_lock(&dm_thin_pool_table.mutex);
1887
1888 if (argc < 4) {
1889 ti->error = "Invalid argument count";
1890 r = -EINVAL;
1891 goto out_unlock;
1892 }
1893 as.argc = argc;
1894 as.argv = argv;
1895
1896 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1897 if (r) {
1898 ti->error = "Error opening metadata block device";
1899 goto out_unlock;
1900 }
1901
1902 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1903 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1904 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1905 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1906
1907 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1908 if (r) {
1909 ti->error = "Error getting data device";
1910 goto out_metadata;
1911 }
1912
1913 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1914 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1915 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1916 !is_power_of_2(block_size)) {
1917 ti->error = "Invalid block size";
1918 r = -EINVAL;
1919 goto out;
1920 }
1921
1922 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
1923 ti->error = "Invalid low water mark";
1924 r = -EINVAL;
1925 goto out;
1926 }
1927
1928 /*
1929 * Set default pool features.
1930 */
1931 memset(&pf, 0, sizeof(pf));
1932 pf.zero_new_blocks = 1;
1933
1934 dm_consume_args(&as, 4);
1935 r = parse_pool_features(&as, &pf, ti);
1936 if (r)
1937 goto out;
1938
1939 pt = kzalloc(sizeof(*pt), GFP_KERNEL);
1940 if (!pt) {
1941 r = -ENOMEM;
1942 goto out;
1943 }
1944
1945 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1946 block_size, &ti->error);
1947 if (IS_ERR(pool)) {
1948 r = PTR_ERR(pool);
1949 goto out_free_pt;
1950 }
1951
1952 pt->pool = pool;
1953 pt->ti = ti;
1954 pt->metadata_dev = metadata_dev;
1955 pt->data_dev = data_dev;
1956 pt->low_water_blocks = low_water_blocks;
1957 pt->zero_new_blocks = pf.zero_new_blocks;
1958 ti->num_flush_requests = 1;
1959 ti->num_discard_requests = 1;
1960 ti->discards_supported = 1;
1961 ti->private = pt;
1962
1963 pt->callbacks.congested_fn = pool_is_congested;
1964 dm_table_add_target_callbacks(ti->table, &pt->callbacks);
1965
1966 mutex_unlock(&dm_thin_pool_table.mutex);
1967
1968 return 0;
1969
1970out_free_pt:
1971 kfree(pt);
1972out:
1973 dm_put_device(ti, data_dev);
1974out_metadata:
1975 dm_put_device(ti, metadata_dev);
1976out_unlock:
1977 mutex_unlock(&dm_thin_pool_table.mutex);
1978
1979 return r;
1980}
1981
1982static int pool_map(struct dm_target *ti, struct bio *bio,
1983 union map_info *map_context)
1984{
1985 int r;
1986 struct pool_c *pt = ti->private;
1987 struct pool *pool = pt->pool;
1988 unsigned long flags;
1989
1990 /*
1991 * As this is a singleton target, ti->begin is always zero.
1992 */
1993 spin_lock_irqsave(&pool->lock, flags);
1994 bio->bi_bdev = pt->data_dev->bdev;
1995 r = DM_MAPIO_REMAPPED;
1996 spin_unlock_irqrestore(&pool->lock, flags);
1997
1998 return r;
1999}
2000
2001/*
 2002 * Retrieves the number of blocks of the data device from
 2003 * the superblock and compares it to the actual device size,
 2004 * resizing the pool's record of the data device if it has grown.
 2005 *
 2006 * This covers both opening a preallocated data device in the ctr
 2007 * followed by a resume,
 2008 * -and-
 2009 * calling the resume method on its own after userspace has
 2010 * grown the data device in response to a table event.
2011 */
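/*
 * Illustrative userspace sequence (not from this file; the device name
 * is made up) for growing the pool after the underlying data device has
 * been extended, so that pool_preresume() below picks up the new size:
 *
 *   dmsetup suspend pool
 *   dmsetup reload pool --table "0 <new length in sectors> thin-pool ..."
 *   dmsetup resume pool
 *
 * The reload supplies the larger ti->len; preresume compares it with the
 * size recorded in the metadata and extends the pool accordingly.
 */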
2012static int pool_preresume(struct dm_target *ti)
2013{
2014 int r;
2015 struct pool_c *pt = ti->private;
2016 struct pool *pool = pt->pool;
2017 dm_block_t data_size, sb_data_size;
2018
2019 /*
2020 * Take control of the pool object.
2021 */
2022 r = bind_control_target(pool, ti);
2023 if (r)
2024 return r;
2025
2026 data_size = ti->len >> pool->block_shift;
2027 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2028 if (r) {
2029 DMERR("failed to retrieve data device size");
2030 return r;
2031 }
2032
2033 if (data_size < sb_data_size) {
2034 DMERR("pool target too small, is %llu blocks (expected %llu)",
 2035 (unsigned long long)data_size, (unsigned long long)sb_data_size);
2036 return -EINVAL;
2037
2038 } else if (data_size > sb_data_size) {
2039 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2040 if (r) {
2041 DMERR("failed to resize data device");
2042 return r;
2043 }
2044
2045 r = dm_pool_commit_metadata(pool->pmd);
2046 if (r) {
2047 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
2048 __func__, r);
2049 return r;
2050 }
2051 }
2052
2053 return 0;
2054}
2055
2056static void pool_resume(struct dm_target *ti)
2057{
2058 struct pool_c *pt = ti->private;
2059 struct pool *pool = pt->pool;
2060 unsigned long flags;
2061
2062 spin_lock_irqsave(&pool->lock, flags);
2063 pool->low_water_triggered = 0;
2064 pool->no_free_space = 0;
2065 __requeue_bios(pool);
2066 spin_unlock_irqrestore(&pool->lock, flags);
2067
905e51b3 2068 do_waker(&pool->waker.work);
2069}
2070
2071static void pool_postsuspend(struct dm_target *ti)
2072{
2073 int r;
2074 struct pool_c *pt = ti->private;
2075 struct pool *pool = pt->pool;
2076
905e51b3 2077 cancel_delayed_work(&pool->waker);
2078 flush_workqueue(pool->wq);
2079
2080 r = dm_pool_commit_metadata(pool->pmd);
2081 if (r < 0) {
2082 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
2083 __func__, r);
2084 /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
2085 }
2086}
2087
2088static int check_arg_count(unsigned argc, unsigned args_required)
2089{
2090 if (argc != args_required) {
2091 DMWARN("Message received with %u arguments instead of %u.",
2092 argc, args_required);
2093 return -EINVAL;
2094 }
2095
2096 return 0;
2097}
2098
2099static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2100{
2101 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2102 *dev_id <= MAX_DEV_ID)
2103 return 0;
2104
2105 if (warning)
2106 DMWARN("Message received with invalid device id: %s", arg);
2107
2108 return -EINVAL;
2109}
2110
2111static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2112{
2113 dm_thin_id dev_id;
2114 int r;
2115
2116 r = check_arg_count(argc, 2);
2117 if (r)
2118 return r;
2119
2120 r = read_dev_id(argv[1], &dev_id, 1);
2121 if (r)
2122 return r;
2123
2124 r = dm_pool_create_thin(pool->pmd, dev_id);
2125 if (r) {
2126 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2127 argv[1]);
2128 return r;
2129 }
2130
2131 return 0;
2132}
2133
2134static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2135{
2136 dm_thin_id dev_id;
2137 dm_thin_id origin_dev_id;
2138 int r;
2139
2140 r = check_arg_count(argc, 3);
2141 if (r)
2142 return r;
2143
2144 r = read_dev_id(argv[1], &dev_id, 1);
2145 if (r)
2146 return r;
2147
2148 r = read_dev_id(argv[2], &origin_dev_id, 1);
2149 if (r)
2150 return r;
2151
2152 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2153 if (r) {
2154 DMWARN("Creation of new snapshot %s of device %s failed.",
2155 argv[1], argv[2]);
2156 return r;
2157 }
2158
2159 return 0;
2160}
2161
2162static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2163{
2164 dm_thin_id dev_id;
2165 int r;
2166
2167 r = check_arg_count(argc, 2);
2168 if (r)
2169 return r;
2170
2171 r = read_dev_id(argv[1], &dev_id, 1);
2172 if (r)
2173 return r;
2174
2175 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2176 if (r)
2177 DMWARN("Deletion of thin device %s failed.", argv[1]);
2178
2179 return r;
2180}
2181
2182static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2183{
2184 dm_thin_id old_id, new_id;
2185 int r;
2186
2187 r = check_arg_count(argc, 3);
2188 if (r)
2189 return r;
2190
2191 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2192 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2193 return -EINVAL;
2194 }
2195
2196 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2197 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2198 return -EINVAL;
2199 }
2200
2201 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2202 if (r) {
2203 DMWARN("Failed to change transaction id from %s to %s.",
2204 argv[1], argv[2]);
2205 return r;
2206 }
2207
2208 return 0;
2209}
2210
2211/*
2212 * Messages supported:
2213 * create_thin <dev_id>
2214 * create_snap <dev_id> <origin_id>
2215 * delete <dev_id>
2217 * set_transaction_id <current_trans_id> <new_trans_id>
2218 */
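/*
 * Illustrative usage (not part of this file; the device name and ids are
 * made up) -- messages are sent to the pool target with dmsetup:
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup message /dev/mapper/pool 0 "delete 1"
 *   dmsetup message /dev/mapper/pool 0 "set_transaction_id 0 1"
 *
 * On success pool_message() below commits the metadata immediately.
 */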
2219static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2220{
2221 int r = -EINVAL;
2222 struct pool_c *pt = ti->private;
2223 struct pool *pool = pt->pool;
2224
2225 if (!strcasecmp(argv[0], "create_thin"))
2226 r = process_create_thin_mesg(argc, argv, pool);
2227
2228 else if (!strcasecmp(argv[0], "create_snap"))
2229 r = process_create_snap_mesg(argc, argv, pool);
2230
2231 else if (!strcasecmp(argv[0], "delete"))
2232 r = process_delete_mesg(argc, argv, pool);
2233
2234 else if (!strcasecmp(argv[0], "set_transaction_id"))
2235 r = process_set_transaction_id_mesg(argc, argv, pool);
2236
2237 else
2238 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2239
2240 if (!r) {
2241 r = dm_pool_commit_metadata(pool->pmd);
2242 if (r)
2243 DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
2244 argv[0], r);
2245 }
2246
2247 return r;
2248}
2249
2250/*
2251 * Status line is:
 2252 * <transaction id> <used metadata blocks>/<total metadata blocks>
 2253 * <used data blocks>/<total data blocks> <held metadata root>
2254 */
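/*
 * Illustrative STATUSTYPE_INFO output (values made up):
 *
 *   0 128/4096 1024/1048576 -
 *
 * i.e. transaction id 0, 128 of 4096 metadata blocks used, 1024 of
 * 1048576 data blocks used, and no held metadata root.
 */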
2255static int pool_status(struct dm_target *ti, status_type_t type,
2256 char *result, unsigned maxlen)
2257{
2258 int r;
2259 unsigned sz = 0;
2260 uint64_t transaction_id;
2261 dm_block_t nr_free_blocks_data;
2262 dm_block_t nr_free_blocks_metadata;
2263 dm_block_t nr_blocks_data;
2264 dm_block_t nr_blocks_metadata;
2265 dm_block_t held_root;
2266 char buf[BDEVNAME_SIZE];
2267 char buf2[BDEVNAME_SIZE];
2268 struct pool_c *pt = ti->private;
2269 struct pool *pool = pt->pool;
2270
2271 switch (type) {
2272 case STATUSTYPE_INFO:
2273 r = dm_pool_get_metadata_transaction_id(pool->pmd,
2274 &transaction_id);
2275 if (r)
2276 return r;
2277
2278 r = dm_pool_get_free_metadata_block_count(pool->pmd,
2279 &nr_free_blocks_metadata);
2280 if (r)
2281 return r;
2282
2283 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2284 if (r)
2285 return r;
2286
2287 r = dm_pool_get_free_block_count(pool->pmd,
2288 &nr_free_blocks_data);
2289 if (r)
2290 return r;
2291
2292 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2293 if (r)
2294 return r;
2295
2296 r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
2297 if (r)
2298 return r;
2299
2300 DMEMIT("%llu %llu/%llu %llu/%llu ",
2301 (unsigned long long)transaction_id,
2302 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2303 (unsigned long long)nr_blocks_metadata,
2304 (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2305 (unsigned long long)nr_blocks_data);
2306
2307 if (held_root)
2308 DMEMIT("%llu", held_root);
2309 else
2310 DMEMIT("-");
2311
2312 break;
2313
2314 case STATUSTYPE_TABLE:
2315 DMEMIT("%s %s %lu %llu ",
2316 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2317 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2318 (unsigned long)pool->sectors_per_block,
2319 (unsigned long long)pt->low_water_blocks);
2320
2321 DMEMIT("%u ", !pool->zero_new_blocks);
2322
2323 if (!pool->zero_new_blocks)
2324 DMEMIT("skip_block_zeroing ");
2325 break;
2326 }
2327
2328 return 0;
2329}
2330
2331static int pool_iterate_devices(struct dm_target *ti,
2332 iterate_devices_callout_fn fn, void *data)
2333{
2334 struct pool_c *pt = ti->private;
2335
2336 return fn(ti, pt->data_dev, 0, ti->len, data);
2337}
2338
2339static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2340 struct bio_vec *biovec, int max_size)
2341{
2342 struct pool_c *pt = ti->private;
2343 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2344
2345 if (!q->merge_bvec_fn)
2346 return max_size;
2347
2348 bvm->bi_bdev = pt->data_dev->bdev;
2349
2350 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2351}
2352
2353static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
2354{
2355 limits->max_discard_sectors = pool->sectors_per_block;
2356
2357 /*
 2358 * This is just a hint and is not enforced.  We have to cope with
 2359 * discard bios that span two blocks.
2360 */
2361 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2362}
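/*
 * Worked example (assuming 512-byte sectors, i.e. SECTOR_SHIFT == 9):
 * with pool->sectors_per_block == 128 the pool block is 64KiB, so
 * max_discard_sectors is 128 and discard_granularity is
 * 128 << 9 == 65536 bytes.
 */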
2363
2364static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2365{
2366 struct pool_c *pt = ti->private;
2367 struct pool *pool = pt->pool;
2368
2369 blk_limits_io_min(limits, 0);
2370 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
104655fd 2371 set_discard_limits(pool, limits);
2372}
2373
2374static struct target_type pool_target = {
2375 .name = "thin-pool",
2376 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2377 DM_TARGET_IMMUTABLE,
2378 .version = {1, 0, 0},
2379 .module = THIS_MODULE,
2380 .ctr = pool_ctr,
2381 .dtr = pool_dtr,
2382 .map = pool_map,
2383 .postsuspend = pool_postsuspend,
2384 .preresume = pool_preresume,
2385 .resume = pool_resume,
2386 .message = pool_message,
2387 .status = pool_status,
2388 .merge = pool_merge,
2389 .iterate_devices = pool_iterate_devices,
2390 .io_hints = pool_io_hints,
2391};
2392
2393/*----------------------------------------------------------------
2394 * Thin target methods
2395 *--------------------------------------------------------------*/
2396static void thin_dtr(struct dm_target *ti)
2397{
2398 struct thin_c *tc = ti->private;
2399
2400 mutex_lock(&dm_thin_pool_table.mutex);
2401
2402 __pool_dec(tc->pool);
2403 dm_pool_close_thin_device(tc->td);
2404 dm_put_device(ti, tc->pool_dev);
2405 if (tc->origin_dev)
2406 dm_put_device(ti, tc->origin_dev);
2407 kfree(tc);
2408
2409 mutex_unlock(&dm_thin_pool_table.mutex);
2410}
2411
2412/*
2413 * Thin target parameters:
2414 *
2dd9c257 2415 * <pool_dev> <dev_id> [origin_dev]
2416 *
2417 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2418 * dev_id: the internal device identifier
2dd9c257 2419 * origin_dev: a device external to the pool that should act as the origin
2420 */
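/*
 * Illustrative examples (not from this file; names, sizes and ids are
 * made up): activating a 1GiB thin device with internal id 0, and a
 * second device with id 1 backed by an external read-only origin:
 *
 *   dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
 *   dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1 /dev/image"
 */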
2421static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2422{
2423 int r;
2424 struct thin_c *tc;
2dd9c257 2425 struct dm_dev *pool_dev, *origin_dev;
2426 struct mapped_device *pool_md;
2427
2428 mutex_lock(&dm_thin_pool_table.mutex);
2429
2dd9c257 2430 if (argc != 2 && argc != 3) {
2431 ti->error = "Invalid argument count";
2432 r = -EINVAL;
2433 goto out_unlock;
2434 }
2435
2436 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2437 if (!tc) {
2438 ti->error = "Out of memory";
2439 r = -ENOMEM;
2440 goto out_unlock;
2441 }
2442
2443 if (argc == 3) {
2444 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2445 if (r) {
2446 ti->error = "Error opening origin device";
2447 goto bad_origin_dev;
2448 }
2449 tc->origin_dev = origin_dev;
2450 }
2451
2452 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2453 if (r) {
2454 ti->error = "Error opening pool device";
2455 goto bad_pool_dev;
2456 }
2457 tc->pool_dev = pool_dev;
2458
2459 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
2460 ti->error = "Invalid device id";
2461 r = -EINVAL;
2462 goto bad_common;
2463 }
2464
2465 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2466 if (!pool_md) {
2467 ti->error = "Couldn't get pool mapped device";
2468 r = -EINVAL;
2469 goto bad_common;
2470 }
2471
2472 tc->pool = __pool_table_lookup(pool_md);
2473 if (!tc->pool) {
2474 ti->error = "Couldn't find pool object";
2475 r = -EINVAL;
2476 goto bad_pool_lookup;
2477 }
2478 __pool_inc(tc->pool);
2479
2480 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2481 if (r) {
2482 ti->error = "Couldn't open thin internal device";
2483 goto bad_thin_open;
2484 }
2485
2486 ti->split_io = tc->pool->sectors_per_block;
2487 ti->num_flush_requests = 1;
2488 ti->num_discard_requests = 1;
2489 ti->discards_supported = 1;
2490
2491 dm_put(pool_md);
2492
2493 mutex_unlock(&dm_thin_pool_table.mutex);
2494
2495 return 0;
2496
2497bad_thin_open:
2498 __pool_dec(tc->pool);
2499bad_pool_lookup:
2500 dm_put(pool_md);
2501bad_common:
2502 dm_put_device(ti, tc->pool_dev);
2503bad_pool_dev:
2504 if (tc->origin_dev)
2505 dm_put_device(ti, tc->origin_dev);
2506bad_origin_dev:
2507 kfree(tc);
2508out_unlock:
2509 mutex_unlock(&dm_thin_pool_table.mutex);
2510
2511 return r;
2512}
2513
2514static int thin_map(struct dm_target *ti, struct bio *bio,
2515 union map_info *map_context)
2516{
6efd6e83 2517 bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
2518
2519 return thin_bio_map(ti, bio, map_context);
2520}
2521
2522static int thin_endio(struct dm_target *ti,
2523 struct bio *bio, int err,
2524 union map_info *map_context)
2525{
2526 unsigned long flags;
2527 struct endio_hook *h = map_context->ptr;
2528 struct list_head work;
2529 struct new_mapping *m, *tmp;
2530 struct pool *pool = h->tc->pool;
2531
2532 if (h->shared_read_entry) {
2533 INIT_LIST_HEAD(&work);
2534 ds_dec(h->shared_read_entry, &work);
2535
2536 spin_lock_irqsave(&pool->lock, flags);
2537 list_for_each_entry_safe(m, tmp, &work, list) {
2538 list_del(&m->list);
2539 m->quiesced = 1;
2540 __maybe_add_mapping(m);
2541 }
2542 spin_unlock_irqrestore(&pool->lock, flags);
2543 }
2544
2545 if (h->all_io_entry) {
2546 INIT_LIST_HEAD(&work);
2547 ds_dec(h->all_io_entry, &work);
2548 list_for_each_entry_safe(m, tmp, &work, list)
2549 list_add(&m->list, &pool->prepared_discards);
2550 }
2551
2552 mempool_free(h, pool->endio_hook_pool);
2553
2554 return 0;
2555}
2556
2557static void thin_postsuspend(struct dm_target *ti)
2558{
2559 if (dm_noflush_suspending(ti))
2560 requeue_io((struct thin_c *)ti->private);
2561}
2562
2563/*
2564 * <nr mapped sectors> <highest mapped sector>
2565 */
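/*
 * Illustrative STATUSTYPE_INFO output (values made up): "1048576 1048575"
 * means 1048576 sectors are mapped and the highest mapped sector is
 * 1048575; a device with nothing mapped reports "-" for the highest
 * mapped sector.
 */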
2566static int thin_status(struct dm_target *ti, status_type_t type,
2567 char *result, unsigned maxlen)
2568{
2569 int r;
2570 ssize_t sz = 0;
2571 dm_block_t mapped, highest;
2572 char buf[BDEVNAME_SIZE];
2573 struct thin_c *tc = ti->private;
2574
2575 if (!tc->td)
2576 DMEMIT("-");
2577 else {
2578 switch (type) {
2579 case STATUSTYPE_INFO:
2580 r = dm_thin_get_mapped_count(tc->td, &mapped);
2581 if (r)
2582 return r;
2583
2584 r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2585 if (r < 0)
2586 return r;
2587
2588 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2589 if (r)
2590 DMEMIT("%llu", ((highest + 1) *
2591 tc->pool->sectors_per_block) - 1);
2592 else
2593 DMEMIT("-");
2594 break;
2595
2596 case STATUSTYPE_TABLE:
2597 DMEMIT("%s %lu",
2598 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2599 (unsigned long) tc->dev_id);
2600 if (tc->origin_dev)
2601 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
2602 break;
2603 }
2604 }
2605
2606 return 0;
2607}
2608
2609static int thin_iterate_devices(struct dm_target *ti,
2610 iterate_devices_callout_fn fn, void *data)
2611{
2612 dm_block_t blocks;
2613 struct thin_c *tc = ti->private;
2614
2615 /*
2616 * We can't call dm_pool_get_data_dev_size() since that blocks. So
2617 * we follow a more convoluted path through to the pool's target.
2618 */
2619 if (!tc->pool->ti)
2620 return 0; /* nothing is bound */
2621
2622 blocks = tc->pool->ti->len >> tc->pool->block_shift;
2623 if (blocks)
2624 return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);
2625
2626 return 0;
2627}
2628
2629static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2630{
2631 struct thin_c *tc = ti->private;
104655fd 2632 struct pool *pool = tc->pool;
2633
2634 blk_limits_io_min(limits, 0);
2635 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2636 set_discard_limits(pool, limits);
2637}
2638
2639static struct target_type thin_target = {
2640 .name = "thin",
2dd9c257 2641 .version = {1, 1, 0},
2642 .module = THIS_MODULE,
2643 .ctr = thin_ctr,
2644 .dtr = thin_dtr,
2645 .map = thin_map,
eb2aa48d 2646 .end_io = thin_endio,
2647 .postsuspend = thin_postsuspend,
2648 .status = thin_status,
2649 .iterate_devices = thin_iterate_devices,
2650 .io_hints = thin_io_hints,
2651};
2652
2653/*----------------------------------------------------------------*/
2654
2655static int __init dm_thin_init(void)
2656{
2657 int r;
2658
2659 pool_table_init();
2660
2661 r = dm_register_target(&thin_target);
2662 if (r)
2663 return r;
2664
2665 r = dm_register_target(&pool_target);
2666 if (r)
2667 dm_unregister_target(&thin_target);
2668
2669 return r;
2670}
2671
2672static void dm_thin_exit(void)
2673{
2674 dm_unregister_target(&thin_target);
2675 dm_unregister_target(&pool_target);
2676}
2677
2678module_init(dm_thin_init);
2679module_exit(dm_thin_exit);
2680
 2681 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
2682MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2683MODULE_LICENSE("GPL");