drivers/md/dm-thin.c
991d9fa0 1/*
e49e5829 2 * Copyright (C) 2011-2012 Red Hat UK.
991d9fa0
JT
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-thin-metadata.h"
4f81a417 8#include "dm-bio-prison.h"
1f4e0ff0 9#include "dm.h"
991d9fa0
JT
10
11#include <linux/device-mapper.h>
12#include <linux/dm-io.h>
13#include <linux/dm-kcopyd.h>
14#include <linux/list.h>
c140e1c4 15#include <linux/rculist.h>
991d9fa0
JT
16#include <linux/init.h>
17#include <linux/module.h>
18#include <linux/slab.h>
67324ea1 19#include <linux/rbtree.h>
991d9fa0
JT
20
21#define DM_MSG_PREFIX "thin"
22
23/*
24 * Tunable constants
25 */
7768ed33 26#define ENDIO_HOOK_POOL_SIZE 1024
991d9fa0
JT
27#define MAPPING_POOL_SIZE 1024
28#define PRISON_CELLS 1024
905e51b3 29#define COMMIT_PERIOD HZ
85ad643b 30#define NO_SPACE_TIMEOUT (HZ * 60)
991d9fa0 31
df5d2e90
MP
32DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
33 "A percentage of time allocated for copy on write");
34
991d9fa0
JT
35/*
36 * The block size of the device holding pool data must be
37 * between 64KB and 1GB.
38 */
39#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
40#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
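/*
 * For reference: with 512-byte sectors (SECTOR_SHIFT == 9) these work
 * out to 128 and 2097152 sectors respectively, i.e. 64KB and 1GB
 * expressed in sectors.
 */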
41
991d9fa0
JT
42/*
43 * Device id is restricted to 24 bits.
44 */
45#define MAX_DEV_ID ((1 << 24) - 1)
46
47/*
48 * How do we handle breaking sharing of data blocks?
49 * =================================================
50 *
51 * We use a standard copy-on-write btree to store the mappings for the
52 * devices (note I'm talking about copy-on-write of the metadata here, not
53 * the data). When you take an internal snapshot you clone the root node
54 * of the origin btree. After this there is no concept of an origin or a
55 * snapshot. They are just two device trees that happen to point to the
56 * same data blocks.
57 *
58 * When we get a write in we decide if it's to a shared data block using
59 * some timestamp magic. If it is, we have to break sharing.
60 *
61 * Let's say we write to a shared block in what was the origin. The
62 * steps are:
63 *
 64 * i) plug further io to this physical block. (see bio_prison code).
65 *
66 * ii) quiesce any read io to that shared data block. Obviously
44feb387 67 * including all devices that share this block. (see dm_deferred_set code)
68 *
 69 * iii) copy the data block to a newly allocated block. This step can be
 70 * skipped if the io covers the whole block. (schedule_copy).
 71 *
 72 * iv) insert the new mapping into the origin's btree
fe878f34 73 * (process_prepared_mapping). This act of inserting breaks some
 74 * sharing of btree nodes between the two devices. Breaking sharing only
 75 * affects the btree of that specific device. Btrees for the other
76 * devices that share the block never change. The btree for the origin
 77 * device as it was after the last commit is untouched, i.e. we're using
78 * persistent data structures in the functional programming sense.
79 *
80 * v) unplug io to this physical block, including the io that triggered
81 * the breaking of sharing.
82 *
83 * Steps (ii) and (iii) occur in parallel.
84 *
85 * The metadata _doesn't_ need to be committed before the io continues. We
86 * get away with this because the io is always written to a _new_ block.
87 * If there's a crash, then:
88 *
89 * - The origin mapping will point to the old origin block (the shared
90 * one). This will contain the data as it was before the io that triggered
91 * the breaking of sharing came in.
92 *
93 * - The snap mapping still points to the old block. As it would after
94 * the commit.
95 *
 96 * The downside of this scheme is that the timestamp magic isn't perfect,
 97 * and will continue to think that the data block in the snapshot device is
 98 * shared even after the write to the origin has broken sharing. I suspect data
99 * blocks will typically be shared by many different devices, so we're
100 * breaking sharing n + 1 times, rather than n, where n is the number of
101 * devices that reference this data block. At the moment I think the
102 * benefits far, far outweigh the disadvantages.
103 */
104
105/*----------------------------------------------------------------*/
106
991d9fa0
JT
107/*
108 * Key building.
109 */
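/*
 * Cells are keyed either by a physical data block (virtual = 0) or by a
 * thin device's virtual block (virtual = 1); both key types also carry
 * the thin device id.
 */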
110static void build_data_key(struct dm_thin_device *td,
44feb387 111 dm_block_t b, struct dm_cell_key *key)
991d9fa0
JT
112{
113 key->virtual = 0;
114 key->dev = dm_thin_dev_id(td);
115 key->block = b;
116}
117
118static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
44feb387 119 struct dm_cell_key *key)
991d9fa0
JT
120{
121 key->virtual = 1;
122 key->dev = dm_thin_dev_id(td);
123 key->block = b;
124}
125
126/*----------------------------------------------------------------*/
127
128/*
129 * A pool device ties together a metadata device and a data device. It
130 * also provides the interface for creating and destroying internal
131 * devices.
132 */
a24c2569 133struct dm_thin_new_mapping;
67e2e2b2 134
e49e5829 135/*
3e1a0699 136 * The pool runs in 4 modes, ordered from least to most degraded so the modes can be compared numerically.
e49e5829
JT
137 */
138enum pool_mode {
139 PM_WRITE, /* metadata may be changed */
3e1a0699 140 PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
e49e5829
JT
141 PM_READ_ONLY, /* metadata may not be changed */
142 PM_FAIL, /* all I/O fails */
143};
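/*
 * Because the modes above are ordered, callers can compare them
 * numerically; commit(), for example, refuses to run once
 * get_pool_mode(pool) >= PM_READ_ONLY.
 */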
144
67e2e2b2 145struct pool_features {
e49e5829
JT
146 enum pool_mode mode;
147
9bc142dd
MS
148 bool zero_new_blocks:1;
149 bool discard_enabled:1;
150 bool discard_passdown:1;
787a996c 151 bool error_if_no_space:1;
67e2e2b2
JT
152};
153
e49e5829
JT
154struct thin_c;
155typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
156typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
157
991d9fa0
JT
158struct pool {
159 struct list_head list;
160 struct dm_target *ti; /* Only set if a pool target is bound */
161
162 struct mapped_device *pool_md;
163 struct block_device *md_dev;
164 struct dm_pool_metadata *pmd;
165
991d9fa0 166 dm_block_t low_water_blocks;
55f2b8bd 167 uint32_t sectors_per_block;
f9a8e0cd 168 int sectors_per_block_shift;
991d9fa0 169
67e2e2b2 170 struct pool_features pf;
88a6621b 171 bool low_water_triggered:1; /* A dm event has been sent */
991d9fa0 172
44feb387 173 struct dm_bio_prison *prison;
991d9fa0
JT
174 struct dm_kcopyd_client *copier;
175
176 struct workqueue_struct *wq;
177 struct work_struct worker;
905e51b3 178 struct delayed_work waker;
85ad643b 179 struct delayed_work no_space_timeout;
991d9fa0 180
905e51b3 181 unsigned long last_commit_jiffies;
55f2b8bd 182 unsigned ref_count;
991d9fa0
JT
183
184 spinlock_t lock;
991d9fa0
JT
185 struct bio_list deferred_flush_bios;
186 struct list_head prepared_mappings;
104655fd 187 struct list_head prepared_discards;
c140e1c4 188 struct list_head active_thins;
991d9fa0 189
44feb387
MS
190 struct dm_deferred_set *shared_read_ds;
191 struct dm_deferred_set *all_io_ds;
991d9fa0 192
a24c2569 193 struct dm_thin_new_mapping *next_mapping;
991d9fa0 194 mempool_t *mapping_pool;
e49e5829
JT
195
196 process_bio_fn process_bio;
197 process_bio_fn process_discard;
198
199 process_mapping_fn process_prepared_mapping;
200 process_mapping_fn process_prepared_discard;
991d9fa0
JT
201};
202
e49e5829 203static enum pool_mode get_pool_mode(struct pool *pool);
b5330655 204static void metadata_operation_failed(struct pool *pool, const char *op, int r);
e49e5829 205
991d9fa0
JT
206/*
207 * Target context for a pool.
208 */
209struct pool_c {
210 struct dm_target *ti;
211 struct pool *pool;
212 struct dm_dev *data_dev;
213 struct dm_dev *metadata_dev;
214 struct dm_target_callbacks callbacks;
215
216 dm_block_t low_water_blocks;
0424caa1
MS
217 struct pool_features requested_pf; /* Features requested during table load */
218 struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
991d9fa0
JT
219};
220
221/*
222 * Target context for a thin.
223 */
224struct thin_c {
c140e1c4 225 struct list_head list;
991d9fa0 226 struct dm_dev *pool_dev;
2dd9c257 227 struct dm_dev *origin_dev;
991d9fa0
JT
228 dm_thin_id dev_id;
229
230 struct pool *pool;
231 struct dm_thin_device *td;
738211f7 232 bool requeue_mode:1;
c140e1c4
MS
233 spinlock_t lock;
234 struct bio_list deferred_bio_list;
235 struct bio_list retry_on_resume_list;
67324ea1 236 struct rb_root sort_bio_list; /* sorted list of deferred bios */
b10ebd34
JT
237
238 /*
239 * Ensures the thin is not destroyed until the worker has finished
240 * iterating the active_thins list.
241 */
242 atomic_t refcount;
243 struct completion can_destroy;
991d9fa0
JT
244};
245
246/*----------------------------------------------------------------*/
247
025b9685
JT
248/*
249 * wake_worker() is used when new work is queued and when pool_resume is
250 * ready to continue deferred IO processing.
251 */
252static void wake_worker(struct pool *pool)
253{
254 queue_work(pool->wq, &pool->worker);
255}
256
257/*----------------------------------------------------------------*/
258
6beca5eb
JT
259static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
260 struct dm_bio_prison_cell **cell_result)
261{
262 int r;
263 struct dm_bio_prison_cell *cell_prealloc;
264
265 /*
266 * Allocate a cell from the prison's mempool.
267 * This might block but it can't fail.
268 */
269 cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
270
271 r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
272 if (r)
273 /*
274 * We reused an old cell; we can get rid of
275 * the new one.
276 */
277 dm_bio_prison_free_cell(pool->prison, cell_prealloc);
278
279 return r;
280}
281
282static void cell_release(struct pool *pool,
283 struct dm_bio_prison_cell *cell,
284 struct bio_list *bios)
285{
286 dm_cell_release(pool->prison, cell, bios);
287 dm_bio_prison_free_cell(pool->prison, cell);
288}
289
290static void cell_release_no_holder(struct pool *pool,
291 struct dm_bio_prison_cell *cell,
292 struct bio_list *bios)
293{
294 dm_cell_release_no_holder(pool->prison, cell, bios);
295 dm_bio_prison_free_cell(pool->prison, cell);
296}
297
025b9685
JT
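/*
 * Releases a cell's bios onto the thin's deferred list without handing
 * the cell back to the prison mempool; used for cells that were not
 * allocated from it, such as the on-stack cells in thin_bio_map().
 */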
298static void cell_defer_no_holder_no_free(struct thin_c *tc,
299 struct dm_bio_prison_cell *cell)
300{
301 struct pool *pool = tc->pool;
302 unsigned long flags;
303
c140e1c4
MS
304 spin_lock_irqsave(&tc->lock, flags);
305 dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list);
306 spin_unlock_irqrestore(&tc->lock, flags);
025b9685
JT
307
308 wake_worker(pool);
309}
310
6beca5eb
JT
311static void cell_error(struct pool *pool,
312 struct dm_bio_prison_cell *cell)
313{
314 dm_cell_error(pool->prison, cell);
315 dm_bio_prison_free_cell(pool->prison, cell);
316}
317
318/*----------------------------------------------------------------*/
319
991d9fa0
JT
320/*
321 * A global list of pools that uses a struct mapped_device as a key.
322 */
323static struct dm_thin_pool_table {
324 struct mutex mutex;
325 struct list_head pools;
326} dm_thin_pool_table;
327
328static void pool_table_init(void)
329{
330 mutex_init(&dm_thin_pool_table.mutex);
331 INIT_LIST_HEAD(&dm_thin_pool_table.pools);
332}
333
334static void __pool_table_insert(struct pool *pool)
335{
336 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
337 list_add(&pool->list, &dm_thin_pool_table.pools);
338}
339
340static void __pool_table_remove(struct pool *pool)
341{
342 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
343 list_del(&pool->list);
344}
345
346static struct pool *__pool_table_lookup(struct mapped_device *md)
347{
348 struct pool *pool = NULL, *tmp;
349
350 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
351
352 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
353 if (tmp->pool_md == md) {
354 pool = tmp;
355 break;
356 }
357 }
358
359 return pool;
360}
361
362static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
363{
364 struct pool *pool = NULL, *tmp;
365
366 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
367
368 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
369 if (tmp->md_dev == md_dev) {
370 pool = tmp;
371 break;
372 }
373 }
374
375 return pool;
376}
377
378/*----------------------------------------------------------------*/
379
a24c2569 380struct dm_thin_endio_hook {
eb2aa48d 381 struct thin_c *tc;
44feb387
MS
382 struct dm_deferred_entry *shared_read_entry;
383 struct dm_deferred_entry *all_io_entry;
a24c2569 384 struct dm_thin_new_mapping *overwrite_mapping;
67324ea1 385 struct rb_node rb_node;
eb2aa48d
JT
386};
387
18adc577 388static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
991d9fa0
JT
389{
390 struct bio *bio;
391 struct bio_list bios;
18adc577 392 unsigned long flags;
991d9fa0
JT
393
394 bio_list_init(&bios);
18adc577 395
c140e1c4 396 spin_lock_irqsave(&tc->lock, flags);
991d9fa0
JT
397 bio_list_merge(&bios, master);
398 bio_list_init(master);
c140e1c4 399 spin_unlock_irqrestore(&tc->lock, flags);
991d9fa0 400
c140e1c4
MS
401 while ((bio = bio_list_pop(&bios)))
402 bio_endio(bio, DM_ENDIO_REQUEUE);
991d9fa0
JT
403}
404
405static void requeue_io(struct thin_c *tc)
406{
c140e1c4
MS
407 requeue_bio_list(tc, &tc->deferred_bio_list);
408 requeue_bio_list(tc, &tc->retry_on_resume_list);
991d9fa0
JT
409}
410
c140e1c4 411static void error_thin_retry_list(struct thin_c *tc)
3e1a0699
JT
412{
413 struct bio *bio;
414 unsigned long flags;
415 struct bio_list bios;
416
417 bio_list_init(&bios);
418
c140e1c4
MS
419 spin_lock_irqsave(&tc->lock, flags);
420 bio_list_merge(&bios, &tc->retry_on_resume_list);
421 bio_list_init(&tc->retry_on_resume_list);
422 spin_unlock_irqrestore(&tc->lock, flags);
3e1a0699
JT
423
424 while ((bio = bio_list_pop(&bios)))
425 bio_io_error(bio);
426}
427
c140e1c4
MS
428static void error_retry_list(struct pool *pool)
429{
430 struct thin_c *tc;
431
432 rcu_read_lock();
433 list_for_each_entry_rcu(tc, &pool->active_thins, list)
434 error_thin_retry_list(tc);
435 rcu_read_unlock();
436}
437
991d9fa0
JT
438/*
439 * This section of code contains the logic for processing a thin device's IO.
440 * Much of the code depends on pool object resources (lists, workqueues, etc)
441 * but most is exclusively called from the thin target rather than the thin-pool
442 * target.
443 */
444
58f77a21
MS
445static bool block_size_is_power_of_two(struct pool *pool)
446{
447 return pool->sectors_per_block_shift >= 0;
448}
449
991d9fa0
JT
450static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
451{
58f77a21 452 struct pool *pool = tc->pool;
4f024f37 453 sector_t block_nr = bio->bi_iter.bi_sector;
55f2b8bd 454
58f77a21
MS
455 if (block_size_is_power_of_two(pool))
456 block_nr >>= pool->sectors_per_block_shift;
f9a8e0cd 457 else
58f77a21 458 (void) sector_div(block_nr, pool->sectors_per_block);
55f2b8bd
MS
459
460 return block_nr;
991d9fa0
JT
461}
462
463static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
464{
465 struct pool *pool = tc->pool;
4f024f37 466 sector_t bi_sector = bio->bi_iter.bi_sector;
991d9fa0
JT
467
468 bio->bi_bdev = tc->pool_dev->bdev;
58f77a21 469 if (block_size_is_power_of_two(pool))
4f024f37
KO
470 bio->bi_iter.bi_sector =
471 (block << pool->sectors_per_block_shift) |
472 (bi_sector & (pool->sectors_per_block - 1));
58f77a21 473 else
4f024f37 474 bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
58f77a21 475 sector_div(bi_sector, pool->sectors_per_block);
991d9fa0
JT
476}
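/*
 * Illustrative example: with sectors_per_block = 128 the pool takes the
 * power-of-two path (shift = 7), so a bio at sector 1000 lives in
 * virtual block 1000 >> 7 = 7, and remap() sends it to pool sector
 * (block << 7) | (1000 & 127) on the data device.
 */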
477
2dd9c257
JT
478static void remap_to_origin(struct thin_c *tc, struct bio *bio)
479{
480 bio->bi_bdev = tc->origin_dev->bdev;
481}
482
4afdd680
JT
483static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
484{
485 return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
486 dm_thin_changed_this_transaction(tc->td);
487}
488
e8088073
JT
489static void inc_all_io_entry(struct pool *pool, struct bio *bio)
490{
491 struct dm_thin_endio_hook *h;
492
493 if (bio->bi_rw & REQ_DISCARD)
494 return;
495
59c3d2c6 496 h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
e8088073
JT
497 h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
498}
499
2dd9c257 500static void issue(struct thin_c *tc, struct bio *bio)
991d9fa0
JT
501{
502 struct pool *pool = tc->pool;
503 unsigned long flags;
504
e49e5829
JT
505 if (!bio_triggers_commit(tc, bio)) {
506 generic_make_request(bio);
507 return;
508 }
509
991d9fa0 510 /*
e49e5829
JT
511 * Complete bio with an error if earlier I/O caused changes to
512 * the metadata that can't be committed e.g, due to I/O errors
513 * on the metadata device.
991d9fa0 514 */
e49e5829
JT
515 if (dm_thin_aborted_changes(tc->td)) {
516 bio_io_error(bio);
517 return;
518 }
519
520 /*
521 * Batch together any bios that trigger commits and then issue a
522 * single commit for them in process_deferred_bios().
523 */
524 spin_lock_irqsave(&pool->lock, flags);
525 bio_list_add(&pool->deferred_flush_bios, bio);
526 spin_unlock_irqrestore(&pool->lock, flags);
991d9fa0
JT
527}
528
2dd9c257
JT
529static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
530{
531 remap_to_origin(tc, bio);
532 issue(tc, bio);
533}
534
535static void remap_and_issue(struct thin_c *tc, struct bio *bio,
536 dm_block_t block)
537{
538 remap(tc, bio, block);
539 issue(tc, bio);
540}
541
991d9fa0
JT
542/*----------------------------------------------------------------*/
543
544/*
545 * Bio endio functions.
546 */
a24c2569 547struct dm_thin_new_mapping {
991d9fa0
JT
548 struct list_head list;
549
7f214665
MS
550 bool quiesced:1;
551 bool prepared:1;
552 bool pass_discard:1;
553 bool definitely_not_shared:1;
991d9fa0 554
7f214665 555 int err;
991d9fa0
JT
556 struct thin_c *tc;
557 dm_block_t virt_block;
558 dm_block_t data_block;
a24c2569 559 struct dm_bio_prison_cell *cell, *cell2;
991d9fa0
JT
560
561 /*
562 * If the bio covers the whole area of a block then we can avoid
563 * zeroing or copying. Instead this bio is hooked. The bio will
564 * still be in the cell, so care has to be taken to avoid issuing
565 * the bio twice.
566 */
567 struct bio *bio;
568 bio_end_io_t *saved_bi_end_io;
569};
570
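/*
 * A mapping is handed to the worker only once it is both quiesced
 * (typically meaning reads of a shared origin block, tracked via the
 * shared_read deferred set, have drained) and prepared (the copy/zero
 * or overwrite has completed).
 */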
a24c2569 571static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
991d9fa0
JT
572{
573 struct pool *pool = m->tc->pool;
574
eb2aa48d 575 if (m->quiesced && m->prepared) {
daec338b 576 list_add_tail(&m->list, &pool->prepared_mappings);
991d9fa0
JT
577 wake_worker(pool);
578 }
579}
580
581static void copy_complete(int read_err, unsigned long write_err, void *context)
582{
583 unsigned long flags;
a24c2569 584 struct dm_thin_new_mapping *m = context;
991d9fa0
JT
585 struct pool *pool = m->tc->pool;
586
587 m->err = read_err || write_err ? -EIO : 0;
588
589 spin_lock_irqsave(&pool->lock, flags);
7f214665 590 m->prepared = true;
991d9fa0
JT
591 __maybe_add_mapping(m);
592 spin_unlock_irqrestore(&pool->lock, flags);
593}
594
595static void overwrite_endio(struct bio *bio, int err)
596{
597 unsigned long flags;
59c3d2c6 598 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
a24c2569 599 struct dm_thin_new_mapping *m = h->overwrite_mapping;
991d9fa0
JT
600 struct pool *pool = m->tc->pool;
601
602 m->err = err;
603
604 spin_lock_irqsave(&pool->lock, flags);
7f214665 605 m->prepared = true;
991d9fa0
JT
606 __maybe_add_mapping(m);
607 spin_unlock_irqrestore(&pool->lock, flags);
608}
609
991d9fa0
JT
610/*----------------------------------------------------------------*/
611
612/*
613 * Workqueue.
614 */
615
616/*
617 * Prepared mapping jobs.
618 */
619
620/*
621 * This sends the bios in the cell back to the deferred_bios list.
622 */
2aab3850 623static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
991d9fa0
JT
624{
625 struct pool *pool = tc->pool;
626 unsigned long flags;
627
c140e1c4
MS
628 spin_lock_irqsave(&tc->lock, flags);
629 cell_release(pool, cell, &tc->deferred_bio_list);
630 spin_unlock_irqrestore(&tc->lock, flags);
991d9fa0
JT
631
632 wake_worker(pool);
633}
634
635/*
6beca5eb 636 * Same as cell_defer above, except it omits the original holder of the cell.
991d9fa0 637 */
f286ba0e 638static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
991d9fa0 639{
991d9fa0
JT
640 struct pool *pool = tc->pool;
641 unsigned long flags;
642
c140e1c4
MS
643 spin_lock_irqsave(&tc->lock, flags);
644 cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
645 spin_unlock_irqrestore(&tc->lock, flags);
991d9fa0
JT
646
647 wake_worker(pool);
648}
649
e49e5829
JT
650static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
651{
196d38bc 652 if (m->bio) {
e49e5829 653 m->bio->bi_end_io = m->saved_bi_end_io;
196d38bc
KO
654 atomic_inc(&m->bio->bi_remaining);
655 }
6beca5eb 656 cell_error(m->tc->pool, m->cell);
e49e5829
JT
657 list_del(&m->list);
658 mempool_free(m, m->tc->pool->mapping_pool);
659}
025b9685 660
a24c2569 661static void process_prepared_mapping(struct dm_thin_new_mapping *m)
991d9fa0
JT
662{
663 struct thin_c *tc = m->tc;
6beca5eb 664 struct pool *pool = tc->pool;
991d9fa0
JT
665 struct bio *bio;
666 int r;
667
668 bio = m->bio;
196d38bc 669 if (bio) {
991d9fa0 670 bio->bi_end_io = m->saved_bi_end_io;
196d38bc
KO
671 atomic_inc(&bio->bi_remaining);
672 }
991d9fa0
JT
673
674 if (m->err) {
6beca5eb 675 cell_error(pool, m->cell);
905386f8 676 goto out;
991d9fa0
JT
677 }
678
679 /*
680 * Commit the prepared block into the mapping btree.
681 * Any I/O for this block arriving after this point will get
682 * remapped to it directly.
683 */
684 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
685 if (r) {
b5330655 686 metadata_operation_failed(pool, "dm_thin_insert_block", r);
6beca5eb 687 cell_error(pool, m->cell);
905386f8 688 goto out;
991d9fa0
JT
689 }
690
691 /*
692 * Release any bios held while the block was being provisioned.
693 * If we are processing a write bio that completely covers the block,
694 * we already processed it so can ignore it now when processing
695 * the bios in the cell.
696 */
697 if (bio) {
f286ba0e 698 cell_defer_no_holder(tc, m->cell);
991d9fa0
JT
699 bio_endio(bio, 0);
700 } else
2aab3850 701 cell_defer(tc, m->cell);
991d9fa0 702
905386f8 703out:
991d9fa0 704 list_del(&m->list);
6beca5eb 705 mempool_free(m, pool->mapping_pool);
991d9fa0
JT
706}
707
e49e5829 708static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
104655fd 709{
104655fd
JT
710 struct thin_c *tc = m->tc;
711
e49e5829 712 bio_io_error(m->bio);
f286ba0e
JT
713 cell_defer_no_holder(tc, m->cell);
714 cell_defer_no_holder(tc, m->cell2);
e49e5829
JT
715 mempool_free(m, tc->pool->mapping_pool);
716}
717
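/*
 * The discard is passed down to the data device only when passdown is
 * enabled and the block is no longer shared: either it was known
 * unshared when the discard was prepared, or dm_pool_block_is_used()
 * now reports it unused. Otherwise the bio is simply completed.
 */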
718static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
719{
720 struct thin_c *tc = m->tc;
104655fd 721
e8088073 722 inc_all_io_entry(tc->pool, m->bio);
f286ba0e
JT
723 cell_defer_no_holder(tc, m->cell);
724 cell_defer_no_holder(tc, m->cell2);
e8088073 725
104655fd 726 if (m->pass_discard)
19fa1a67
JT
727 if (m->definitely_not_shared)
728 remap_and_issue(tc, m->bio, m->data_block);
729 else {
730 bool used = false;
731 if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
732 bio_endio(m->bio, 0);
733 else
734 remap_and_issue(tc, m->bio, m->data_block);
735 }
104655fd
JT
736 else
737 bio_endio(m->bio, 0);
738
104655fd
JT
739 mempool_free(m, tc->pool->mapping_pool);
740}
741
e49e5829
JT
742static void process_prepared_discard(struct dm_thin_new_mapping *m)
743{
744 int r;
745 struct thin_c *tc = m->tc;
746
747 r = dm_thin_remove_block(tc->td, m->virt_block);
748 if (r)
c397741c 749 DMERR_LIMIT("dm_thin_remove_block() failed");
e49e5829
JT
750
751 process_prepared_discard_passdown(m);
752}
753
104655fd 754static void process_prepared(struct pool *pool, struct list_head *head,
e49e5829 755 process_mapping_fn *fn)
991d9fa0
JT
756{
757 unsigned long flags;
758 struct list_head maps;
a24c2569 759 struct dm_thin_new_mapping *m, *tmp;
991d9fa0
JT
760
761 INIT_LIST_HEAD(&maps);
762 spin_lock_irqsave(&pool->lock, flags);
104655fd 763 list_splice_init(head, &maps);
991d9fa0
JT
764 spin_unlock_irqrestore(&pool->lock, flags);
765
766 list_for_each_entry_safe(m, tmp, &maps, list)
e49e5829 767 (*fn)(m);
991d9fa0
JT
768}
769
770/*
771 * Deferred bio jobs.
772 */
104655fd 773static int io_overlaps_block(struct pool *pool, struct bio *bio)
991d9fa0 774{
4f024f37
KO
775 return bio->bi_iter.bi_size ==
776 (pool->sectors_per_block << SECTOR_SHIFT);
104655fd
JT
777}
778
779static int io_overwrites_block(struct pool *pool, struct bio *bio)
780{
781 return (bio_data_dir(bio) == WRITE) &&
782 io_overlaps_block(pool, bio);
991d9fa0
JT
783}
784
785static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
786 bio_end_io_t *fn)
787{
788 *save = bio->bi_end_io;
789 bio->bi_end_io = fn;
790}
791
792static int ensure_next_mapping(struct pool *pool)
793{
794 if (pool->next_mapping)
795 return 0;
796
797 pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
798
799 return pool->next_mapping ? 0 : -ENOMEM;
800}
801
a24c2569 802static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
991d9fa0 803{
16961b04 804 struct dm_thin_new_mapping *m = pool->next_mapping;
991d9fa0
JT
805
806 BUG_ON(!pool->next_mapping);
807
16961b04
MS
808 memset(m, 0, sizeof(struct dm_thin_new_mapping));
809 INIT_LIST_HEAD(&m->list);
810 m->bio = NULL;
811
991d9fa0
JT
812 pool->next_mapping = NULL;
813
16961b04 814 return m;
991d9fa0
JT
815}
816
817static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
2dd9c257
JT
818 struct dm_dev *origin, dm_block_t data_origin,
819 dm_block_t data_dest,
a24c2569 820 struct dm_bio_prison_cell *cell, struct bio *bio)
991d9fa0
JT
821{
822 int r;
823 struct pool *pool = tc->pool;
a24c2569 824 struct dm_thin_new_mapping *m = get_next_mapping(pool);
991d9fa0 825
991d9fa0
JT
826 m->tc = tc;
827 m->virt_block = virt_block;
828 m->data_block = data_dest;
829 m->cell = cell;
991d9fa0 830
44feb387 831 if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
7f214665 832 m->quiesced = true;
991d9fa0
JT
833
834 /*
835 * IO to pool_dev remaps to the pool target's data_dev.
836 *
837 * If the whole block of data is being overwritten, we can issue the
838 * bio immediately. Otherwise we use kcopyd to clone the data first.
839 */
840 if (io_overwrites_block(pool, bio)) {
59c3d2c6 841 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
a24c2569 842
eb2aa48d 843 h->overwrite_mapping = m;
991d9fa0
JT
844 m->bio = bio;
845 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
e8088073 846 inc_all_io_entry(pool, bio);
991d9fa0
JT
847 remap_and_issue(tc, bio, data_dest);
848 } else {
849 struct dm_io_region from, to;
850
2dd9c257 851 from.bdev = origin->bdev;
991d9fa0
JT
852 from.sector = data_origin * pool->sectors_per_block;
853 from.count = pool->sectors_per_block;
854
855 to.bdev = tc->pool_dev->bdev;
856 to.sector = data_dest * pool->sectors_per_block;
857 to.count = pool->sectors_per_block;
858
859 r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
860 0, copy_complete, m);
861 if (r < 0) {
862 mempool_free(m, pool->mapping_pool);
c397741c 863 DMERR_LIMIT("dm_kcopyd_copy() failed");
6beca5eb 864 cell_error(pool, cell);
991d9fa0
JT
865 }
866 }
867}
868
2dd9c257
JT
869static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
870 dm_block_t data_origin, dm_block_t data_dest,
a24c2569 871 struct dm_bio_prison_cell *cell, struct bio *bio)
2dd9c257
JT
872{
873 schedule_copy(tc, virt_block, tc->pool_dev,
874 data_origin, data_dest, cell, bio);
875}
876
877static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
878 dm_block_t data_dest,
a24c2569 879 struct dm_bio_prison_cell *cell, struct bio *bio)
2dd9c257
JT
880{
881 schedule_copy(tc, virt_block, tc->origin_dev,
882 virt_block, data_dest, cell, bio);
883}
884
991d9fa0 885static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
a24c2569 886 dm_block_t data_block, struct dm_bio_prison_cell *cell,
991d9fa0
JT
887 struct bio *bio)
888{
889 struct pool *pool = tc->pool;
a24c2569 890 struct dm_thin_new_mapping *m = get_next_mapping(pool);
991d9fa0 891
7f214665
MS
892 m->quiesced = true;
893 m->prepared = false;
991d9fa0
JT
894 m->tc = tc;
895 m->virt_block = virt_block;
896 m->data_block = data_block;
897 m->cell = cell;
991d9fa0
JT
898
899 /*
900 * If the whole block of data is being overwritten or we are not
901 * zeroing pre-existing data, we can issue the bio immediately.
902 * Otherwise we use kcopyd to zero the data first.
903 */
67e2e2b2 904 if (!pool->pf.zero_new_blocks)
991d9fa0
JT
905 process_prepared_mapping(m);
906
907 else if (io_overwrites_block(pool, bio)) {
59c3d2c6 908 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
a24c2569 909
eb2aa48d 910 h->overwrite_mapping = m;
991d9fa0
JT
911 m->bio = bio;
912 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
e8088073 913 inc_all_io_entry(pool, bio);
991d9fa0 914 remap_and_issue(tc, bio, data_block);
991d9fa0
JT
915 } else {
916 int r;
917 struct dm_io_region to;
918
919 to.bdev = tc->pool_dev->bdev;
920 to.sector = data_block * pool->sectors_per_block;
921 to.count = pool->sectors_per_block;
922
923 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
924 if (r < 0) {
925 mempool_free(m, pool->mapping_pool);
c397741c 926 DMERR_LIMIT("dm_kcopyd_zero() failed");
6beca5eb 927 cell_error(pool, cell);
991d9fa0
JT
928 }
929 }
930}
931
e49e5829
JT
932/*
933 * A non-zero return indicates read_only or fail_io mode.
934 * Many callers don't care about the return value.
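 *
 * commit() is invoked both from alloc_data_block(), when the pool looks
 * out of space, and from process_deferred_bios() before any deferred
 * REQ_FLUSH/REQ_FUA bios are issued.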
935 */
020cc3b5 936static int commit(struct pool *pool)
e49e5829
JT
937{
938 int r;
939
8d07e8a5 940 if (get_pool_mode(pool) >= PM_READ_ONLY)
e49e5829
JT
941 return -EINVAL;
942
020cc3b5 943 r = dm_pool_commit_metadata(pool->pmd);
b5330655
JT
944 if (r)
945 metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
e49e5829
JT
946
947 return r;
948}
949
88a6621b
JT
950static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
951{
952 unsigned long flags;
953
954 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
955 DMWARN("%s: reached low water mark for data device: sending event.",
956 dm_device_name(pool->pool_md));
957 spin_lock_irqsave(&pool->lock, flags);
958 pool->low_water_triggered = true;
959 spin_unlock_irqrestore(&pool->lock, flags);
960 dm_table_event(pool->ti->table);
961 }
962}
963
3e1a0699
JT
964static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
965
991d9fa0
JT
966static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
967{
968 int r;
969 dm_block_t free_blocks;
991d9fa0
JT
970 struct pool *pool = tc->pool;
971
3e1a0699 972 if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
8d30abff
JT
973 return -EINVAL;
974
991d9fa0 975 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
b5330655
JT
976 if (r) {
977 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
991d9fa0 978 return r;
b5330655 979 }
991d9fa0 980
88a6621b 981 check_low_water_mark(pool, free_blocks);
991d9fa0
JT
982
983 if (!free_blocks) {
94563bad
MS
984 /*
985 * Try to commit to see if that will free up some
986 * more space.
987 */
020cc3b5
JT
988 r = commit(pool);
989 if (r)
990 return r;
991d9fa0 991
94563bad 992 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
b5330655
JT
993 if (r) {
994 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
94563bad 995 return r;
b5330655 996 }
991d9fa0 997
94563bad 998 if (!free_blocks) {
3e1a0699 999 set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
94563bad 1000 return -ENOSPC;
991d9fa0
JT
1001 }
1002 }
1003
1004 r = dm_pool_alloc_data_block(pool->pmd, result);
4a02b34e 1005 if (r) {
b5330655 1006 metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
991d9fa0 1007 return r;
4a02b34e 1008 }
991d9fa0
JT
1009
1010 return 0;
1011}
1012
1013/*
1014 * If we have run out of space, queue bios until the device is
1015 * resumed, presumably after having been reloaded with more space.
1016 */
1017static void retry_on_resume(struct bio *bio)
1018{
59c3d2c6 1019 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
eb2aa48d 1020 struct thin_c *tc = h->tc;
991d9fa0
JT
1021 unsigned long flags;
1022
c140e1c4
MS
1023 spin_lock_irqsave(&tc->lock, flags);
1024 bio_list_add(&tc->retry_on_resume_list, bio);
1025 spin_unlock_irqrestore(&tc->lock, flags);
991d9fa0
JT
1026}
1027
3e1a0699 1028static bool should_error_unserviceable_bio(struct pool *pool)
8c0f0e8c 1029{
3e1a0699
JT
1030 enum pool_mode m = get_pool_mode(pool);
1031
1032 switch (m) {
1033 case PM_WRITE:
1034 /* Shouldn't get here */
1035 DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1036 return true;
1037
1038 case PM_OUT_OF_DATA_SPACE:
1039 return pool->pf.error_if_no_space;
1040
1041 case PM_READ_ONLY:
1042 case PM_FAIL:
1043 return true;
1044 default:
1045 /* Shouldn't get here */
1046 DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1047 return true;
1048 }
1049}
8c0f0e8c 1050
3e1a0699
JT
1051static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1052{
1053 if (should_error_unserviceable_bio(pool))
8c0f0e8c 1054 bio_io_error(bio);
6d16202b
MS
1055 else
1056 retry_on_resume(bio);
8c0f0e8c
MS
1057}
1058
399caddf 1059static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
991d9fa0
JT
1060{
1061 struct bio *bio;
1062 struct bio_list bios;
1063
3e1a0699
JT
1064 if (should_error_unserviceable_bio(pool)) {
1065 cell_error(pool, cell);
1066 return;
1067 }
1068
991d9fa0 1069 bio_list_init(&bios);
6beca5eb 1070 cell_release(pool, cell, &bios);
991d9fa0 1071
3e1a0699
JT
1072 if (should_error_unserviceable_bio(pool))
1073 while ((bio = bio_list_pop(&bios)))
1074 bio_io_error(bio);
1075 else
1076 while ((bio = bio_list_pop(&bios)))
1077 retry_on_resume(bio);
991d9fa0
JT
1078}
1079
104655fd
JT
1080static void process_discard(struct thin_c *tc, struct bio *bio)
1081{
1082 int r;
c3a0ce2e 1083 unsigned long flags;
104655fd 1084 struct pool *pool = tc->pool;
a24c2569 1085 struct dm_bio_prison_cell *cell, *cell2;
44feb387 1086 struct dm_cell_key key, key2;
104655fd
JT
1087 dm_block_t block = get_bio_block(tc, bio);
1088 struct dm_thin_lookup_result lookup_result;
a24c2569 1089 struct dm_thin_new_mapping *m;
104655fd
JT
1090
1091 build_virtual_key(tc->td, block, &key);
6beca5eb 1092 if (bio_detain(tc->pool, &key, bio, &cell))
104655fd
JT
1093 return;
1094
1095 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1096 switch (r) {
1097 case 0:
1098 /*
1099 * Check nobody is fiddling with this pool block. This can
1100 * happen if someone's in the process of breaking sharing
1101 * on this block.
1102 */
1103 build_data_key(tc->td, lookup_result.block, &key2);
6beca5eb 1104 if (bio_detain(tc->pool, &key2, bio, &cell2)) {
f286ba0e 1105 cell_defer_no_holder(tc, cell);
104655fd
JT
1106 break;
1107 }
1108
1109 if (io_overlaps_block(pool, bio)) {
1110 /*
1111 * IO may still be going to the destination block. We must
1112 * quiesce before we can do the removal.
1113 */
1114 m = get_next_mapping(pool);
1115 m->tc = tc;
19fa1a67
JT
1116 m->pass_discard = pool->pf.discard_passdown;
1117 m->definitely_not_shared = !lookup_result.shared;
104655fd
JT
1118 m->virt_block = block;
1119 m->data_block = lookup_result.block;
1120 m->cell = cell;
1121 m->cell2 = cell2;
104655fd
JT
1122 m->bio = bio;
1123
44feb387 1124 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
c3a0ce2e 1125 spin_lock_irqsave(&pool->lock, flags);
daec338b 1126 list_add_tail(&m->list, &pool->prepared_discards);
c3a0ce2e 1127 spin_unlock_irqrestore(&pool->lock, flags);
104655fd
JT
1128 wake_worker(pool);
1129 }
1130 } else {
e8088073 1131 inc_all_io_entry(pool, bio);
f286ba0e
JT
1132 cell_defer_no_holder(tc, cell);
1133 cell_defer_no_holder(tc, cell2);
e8088073 1134
104655fd 1135 /*
49296309
MP
1136 * The DM core makes sure that the discard doesn't span
1137 * a block boundary. So we submit the discard of a
1138 * partial block appropriately.
104655fd 1139 */
650d2a06
MP
1140 if ((!lookup_result.shared) && pool->pf.discard_passdown)
1141 remap_and_issue(tc, bio, lookup_result.block);
1142 else
1143 bio_endio(bio, 0);
104655fd
JT
1144 }
1145 break;
1146
1147 case -ENODATA:
1148 /*
1149 * It isn't provisioned, just forget it.
1150 */
f286ba0e 1151 cell_defer_no_holder(tc, cell);
104655fd
JT
1152 bio_endio(bio, 0);
1153 break;
1154
1155 default:
c397741c
MS
1156 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1157 __func__, r);
f286ba0e 1158 cell_defer_no_holder(tc, cell);
104655fd
JT
1159 bio_io_error(bio);
1160 break;
1161 }
1162}
1163
991d9fa0 1164static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
44feb387 1165 struct dm_cell_key *key,
991d9fa0 1166 struct dm_thin_lookup_result *lookup_result,
a24c2569 1167 struct dm_bio_prison_cell *cell)
991d9fa0
JT
1168{
1169 int r;
1170 dm_block_t data_block;
d6fc2042 1171 struct pool *pool = tc->pool;
991d9fa0
JT
1172
1173 r = alloc_data_block(tc, &data_block);
1174 switch (r) {
1175 case 0:
2dd9c257
JT
1176 schedule_internal_copy(tc, block, lookup_result->block,
1177 data_block, cell, bio);
991d9fa0
JT
1178 break;
1179
1180 case -ENOSPC:
399caddf 1181 retry_bios_on_resume(pool, cell);
991d9fa0
JT
1182 break;
1183
1184 default:
c397741c
MS
1185 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1186 __func__, r);
d6fc2042 1187 cell_error(pool, cell);
991d9fa0
JT
1188 break;
1189 }
1190}
1191
1192static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1193 dm_block_t block,
1194 struct dm_thin_lookup_result *lookup_result)
1195{
a24c2569 1196 struct dm_bio_prison_cell *cell;
991d9fa0 1197 struct pool *pool = tc->pool;
44feb387 1198 struct dm_cell_key key;
991d9fa0
JT
1199
1200 /*
1201 * If cell is already occupied, then sharing is already in the process
1202 * of being broken so we have nothing further to do here.
1203 */
1204 build_data_key(tc->td, lookup_result->block, &key);
6beca5eb 1205 if (bio_detain(pool, &key, bio, &cell))
991d9fa0
JT
1206 return;
1207
4f024f37 1208 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
991d9fa0
JT
1209 break_sharing(tc, bio, block, &key, lookup_result, cell);
1210 else {
59c3d2c6 1211 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
991d9fa0 1212
44feb387 1213 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
e8088073 1214 inc_all_io_entry(pool, bio);
f286ba0e 1215 cell_defer_no_holder(tc, cell);
e8088073 1216
991d9fa0
JT
1217 remap_and_issue(tc, bio, lookup_result->block);
1218 }
1219}
1220
1221static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
a24c2569 1222 struct dm_bio_prison_cell *cell)
991d9fa0
JT
1223{
1224 int r;
1225 dm_block_t data_block;
6beca5eb 1226 struct pool *pool = tc->pool;
991d9fa0
JT
1227
1228 /*
1229 * Remap empty bios (flushes) immediately, without provisioning.
1230 */
4f024f37 1231 if (!bio->bi_iter.bi_size) {
6beca5eb 1232 inc_all_io_entry(pool, bio);
f286ba0e 1233 cell_defer_no_holder(tc, cell);
e8088073 1234
991d9fa0
JT
1235 remap_and_issue(tc, bio, 0);
1236 return;
1237 }
1238
1239 /*
1240 * Fill read bios with zeroes and complete them immediately.
1241 */
1242 if (bio_data_dir(bio) == READ) {
1243 zero_fill_bio(bio);
f286ba0e 1244 cell_defer_no_holder(tc, cell);
991d9fa0
JT
1245 bio_endio(bio, 0);
1246 return;
1247 }
1248
1249 r = alloc_data_block(tc, &data_block);
1250 switch (r) {
1251 case 0:
2dd9c257
JT
1252 if (tc->origin_dev)
1253 schedule_external_copy(tc, block, data_block, cell, bio);
1254 else
1255 schedule_zero(tc, block, data_block, cell, bio);
991d9fa0
JT
1256 break;
1257
1258 case -ENOSPC:
399caddf 1259 retry_bios_on_resume(pool, cell);
991d9fa0
JT
1260 break;
1261
1262 default:
c397741c
MS
1263 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1264 __func__, r);
6beca5eb 1265 cell_error(pool, cell);
991d9fa0
JT
1266 break;
1267 }
1268}
1269
1270static void process_bio(struct thin_c *tc, struct bio *bio)
1271{
1272 int r;
6beca5eb 1273 struct pool *pool = tc->pool;
991d9fa0 1274 dm_block_t block = get_bio_block(tc, bio);
a24c2569 1275 struct dm_bio_prison_cell *cell;
44feb387 1276 struct dm_cell_key key;
991d9fa0
JT
1277 struct dm_thin_lookup_result lookup_result;
1278
1279 /*
1280 * If cell is already occupied, then the block is already
1281 * being provisioned so we have nothing further to do here.
1282 */
1283 build_virtual_key(tc->td, block, &key);
6beca5eb 1284 if (bio_detain(pool, &key, bio, &cell))
991d9fa0
JT
1285 return;
1286
1287 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1288 switch (r) {
1289 case 0:
e8088073 1290 if (lookup_result.shared) {
991d9fa0 1291 process_shared_bio(tc, bio, block, &lookup_result);
6beca5eb 1292 cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
e8088073 1293 } else {
6beca5eb 1294 inc_all_io_entry(pool, bio);
f286ba0e 1295 cell_defer_no_holder(tc, cell);
e8088073 1296
991d9fa0 1297 remap_and_issue(tc, bio, lookup_result.block);
e8088073 1298 }
991d9fa0
JT
1299 break;
1300
1301 case -ENODATA:
2dd9c257 1302 if (bio_data_dir(bio) == READ && tc->origin_dev) {
6beca5eb 1303 inc_all_io_entry(pool, bio);
f286ba0e 1304 cell_defer_no_holder(tc, cell);
e8088073 1305
2dd9c257
JT
1306 remap_to_origin_and_issue(tc, bio);
1307 } else
1308 provision_block(tc, bio, block, cell);
991d9fa0
JT
1309 break;
1310
1311 default:
c397741c
MS
1312 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1313 __func__, r);
f286ba0e 1314 cell_defer_no_holder(tc, cell);
991d9fa0
JT
1315 bio_io_error(bio);
1316 break;
1317 }
1318}
1319
e49e5829
JT
1320static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1321{
1322 int r;
1323 int rw = bio_data_dir(bio);
1324 dm_block_t block = get_bio_block(tc, bio);
1325 struct dm_thin_lookup_result lookup_result;
1326
1327 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1328 switch (r) {
1329 case 0:
4f024f37 1330 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
8c0f0e8c 1331 handle_unserviceable_bio(tc->pool, bio);
e8088073
JT
1332 else {
1333 inc_all_io_entry(tc->pool, bio);
e49e5829 1334 remap_and_issue(tc, bio, lookup_result.block);
e8088073 1335 }
e49e5829
JT
1336 break;
1337
1338 case -ENODATA:
1339 if (rw != READ) {
8c0f0e8c 1340 handle_unserviceable_bio(tc->pool, bio);
e49e5829
JT
1341 break;
1342 }
1343
1344 if (tc->origin_dev) {
e8088073 1345 inc_all_io_entry(tc->pool, bio);
e49e5829
JT
1346 remap_to_origin_and_issue(tc, bio);
1347 break;
1348 }
1349
1350 zero_fill_bio(bio);
1351 bio_endio(bio, 0);
1352 break;
1353
1354 default:
c397741c
MS
1355 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1356 __func__, r);
e49e5829
JT
1357 bio_io_error(bio);
1358 break;
1359 }
1360}
1361
3e1a0699
JT
1362static void process_bio_success(struct thin_c *tc, struct bio *bio)
1363{
1364 bio_endio(bio, 0);
1365}
1366
e49e5829
JT
1367static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1368{
1369 bio_io_error(bio);
1370}
1371
ac8c3f3d
JT
1372/*
1373 * FIXME: should we also commit due to size of transaction, measured in
1374 * metadata blocks?
1375 */
905e51b3
JT
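/*
 * The first comparison also forces a commit if jiffies has wrapped past
 * last_commit_jiffies.
 */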
1376static int need_commit_due_to_time(struct pool *pool)
1377{
1378 return jiffies < pool->last_commit_jiffies ||
1379 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1380}
1381
67324ea1
MS
1382#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
1383#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
1384
1385static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
1386{
1387 struct rb_node **rbp, *parent;
1388 struct dm_thin_endio_hook *pbd;
1389 sector_t bi_sector = bio->bi_iter.bi_sector;
1390
1391 rbp = &tc->sort_bio_list.rb_node;
1392 parent = NULL;
1393 while (*rbp) {
1394 parent = *rbp;
1395 pbd = thin_pbd(parent);
1396
1397 if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
1398 rbp = &(*rbp)->rb_left;
1399 else
1400 rbp = &(*rbp)->rb_right;
1401 }
1402
1403 pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1404 rb_link_node(&pbd->rb_node, parent, rbp);
1405 rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
1406}
1407
1408static void __extract_sorted_bios(struct thin_c *tc)
1409{
1410 struct rb_node *node;
1411 struct dm_thin_endio_hook *pbd;
1412 struct bio *bio;
1413
1414 for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
1415 pbd = thin_pbd(node);
1416 bio = thin_bio(pbd);
1417
1418 bio_list_add(&tc->deferred_bio_list, bio);
1419 rb_erase(&pbd->rb_node, &tc->sort_bio_list);
1420 }
1421
1422 WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
1423}
1424
1425static void __sort_thin_deferred_bios(struct thin_c *tc)
1426{
1427 struct bio *bio;
1428 struct bio_list bios;
1429
1430 bio_list_init(&bios);
1431 bio_list_merge(&bios, &tc->deferred_bio_list);
1432 bio_list_init(&tc->deferred_bio_list);
1433
1434 /* Sort deferred_bio_list using rb-tree */
1435 while ((bio = bio_list_pop(&bios)))
1436 __thin_bio_rb_add(tc, bio);
1437
1438 /*
1439 * Transfer the sorted bios in sort_bio_list back to
1440 * deferred_bio_list to allow lockless submission of
1441 * all bios.
1442 */
1443 __extract_sorted_bios(tc);
1444}
1445
c140e1c4 1446static void process_thin_deferred_bios(struct thin_c *tc)
991d9fa0 1447{
c140e1c4 1448 struct pool *pool = tc->pool;
991d9fa0
JT
1449 unsigned long flags;
1450 struct bio *bio;
1451 struct bio_list bios;
67324ea1 1452 struct blk_plug plug;
991d9fa0 1453
c140e1c4
MS
1454 if (tc->requeue_mode) {
1455 requeue_bio_list(tc, &tc->deferred_bio_list);
1456 return;
1457 }
1458
991d9fa0
JT
1459 bio_list_init(&bios);
1460
c140e1c4 1461 spin_lock_irqsave(&tc->lock, flags);
67324ea1
MS
1462
1463 if (bio_list_empty(&tc->deferred_bio_list)) {
1464 spin_unlock_irqrestore(&tc->lock, flags);
1465 return;
1466 }
1467
1468 __sort_thin_deferred_bios(tc);
1469
c140e1c4
MS
1470 bio_list_merge(&bios, &tc->deferred_bio_list);
1471 bio_list_init(&tc->deferred_bio_list);
67324ea1 1472
c140e1c4 1473 spin_unlock_irqrestore(&tc->lock, flags);
991d9fa0 1474
67324ea1 1475 blk_start_plug(&plug);
991d9fa0 1476 while ((bio = bio_list_pop(&bios))) {
991d9fa0
JT
1477 /*
1478 * If we've got no free new_mapping structs, and processing
1479 * this bio might require one, we pause until there are some
1480 * prepared mappings to process.
1481 */
1482 if (ensure_next_mapping(pool)) {
c140e1c4
MS
1483 spin_lock_irqsave(&tc->lock, flags);
1484 bio_list_add(&tc->deferred_bio_list, bio);
1485 bio_list_merge(&tc->deferred_bio_list, &bios);
1486 spin_unlock_irqrestore(&tc->lock, flags);
991d9fa0
JT
1487 break;
1488 }
104655fd
JT
1489
1490 if (bio->bi_rw & REQ_DISCARD)
e49e5829 1491 pool->process_discard(tc, bio);
104655fd 1492 else
e49e5829 1493 pool->process_bio(tc, bio);
991d9fa0 1494 }
67324ea1 1495 blk_finish_plug(&plug);
c140e1c4
MS
1496}
1497
b10ebd34
JT
1498static void thin_get(struct thin_c *tc);
1499static void thin_put(struct thin_c *tc);
1500
1501/*
1502 * We can't hold rcu_read_lock() around code that can block. So we
1503 * find a thin with the rcu lock held; bump a refcount; then drop
1504 * the lock.
1505 */
1506static struct thin_c *get_first_thin(struct pool *pool)
1507{
1508 struct thin_c *tc = NULL;
1509
1510 rcu_read_lock();
1511 if (!list_empty(&pool->active_thins)) {
1512 tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
1513 thin_get(tc);
1514 }
1515 rcu_read_unlock();
1516
1517 return tc;
1518}
1519
1520static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
1521{
1522 struct thin_c *old_tc = tc;
1523
1524 rcu_read_lock();
1525 list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
1526 thin_get(tc);
1527 thin_put(old_tc);
1528 rcu_read_unlock();
1529 return tc;
1530 }
1531 thin_put(old_tc);
1532 rcu_read_unlock();
1533
1534 return NULL;
1535}
1536
c140e1c4
MS
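/*
 * Walks the active thins via get_first_thin()/get_next_thin() so each
 * thin stays pinned while its deferred bios are processed, then commits
 * (if required) and issues any deferred flush bios.
 */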
1537static void process_deferred_bios(struct pool *pool)
1538{
1539 unsigned long flags;
1540 struct bio *bio;
1541 struct bio_list bios;
1542 struct thin_c *tc;
1543
b10ebd34
JT
1544 tc = get_first_thin(pool);
1545 while (tc) {
c140e1c4 1546 process_thin_deferred_bios(tc);
b10ebd34
JT
1547 tc = get_next_thin(pool, tc);
1548 }
991d9fa0
JT
1549
1550 /*
1551 * If there are any deferred flush bios, we must commit
1552 * the metadata before issuing them.
1553 */
1554 bio_list_init(&bios);
1555 spin_lock_irqsave(&pool->lock, flags);
1556 bio_list_merge(&bios, &pool->deferred_flush_bios);
1557 bio_list_init(&pool->deferred_flush_bios);
1558 spin_unlock_irqrestore(&pool->lock, flags);
1559
4d1662a3
MS
1560 if (bio_list_empty(&bios) &&
1561 !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
991d9fa0
JT
1562 return;
1563
020cc3b5 1564 if (commit(pool)) {
991d9fa0
JT
1565 while ((bio = bio_list_pop(&bios)))
1566 bio_io_error(bio);
1567 return;
1568 }
905e51b3 1569 pool->last_commit_jiffies = jiffies;
991d9fa0
JT
1570
1571 while ((bio = bio_list_pop(&bios)))
1572 generic_make_request(bio);
1573}
1574
1575static void do_worker(struct work_struct *ws)
1576{
1577 struct pool *pool = container_of(ws, struct pool, worker);
1578
e49e5829
JT
1579 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1580 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
991d9fa0
JT
1581 process_deferred_bios(pool);
1582}
1583
905e51b3
JT
1584/*
1585 * We want to commit periodically so that not too much
1586 * unwritten data builds up.
1587 */
1588static void do_waker(struct work_struct *ws)
1589{
1590 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1591 wake_worker(pool);
1592 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1593}
1594
85ad643b
JT
1595/*
1596 * We're holding onto IO to allow userland time to react. After the
1597 * timeout either the pool will have been resized (and thus back in
1598 * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
1599 */
1600static void do_no_space_timeout(struct work_struct *ws)
1601{
1602 struct pool *pool = container_of(to_delayed_work(ws), struct pool,
1603 no_space_timeout);
1604
1605 if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
1606 set_pool_mode(pool, PM_READ_ONLY);
1607}
1608
991d9fa0
JT
1609/*----------------------------------------------------------------*/
1610
738211f7
JT
1611struct noflush_work {
1612 struct work_struct worker;
1613 struct thin_c *tc;
1614
1615 atomic_t complete;
1616 wait_queue_head_t wait;
1617};
1618
1619static void complete_noflush_work(struct noflush_work *w)
1620{
1621 atomic_set(&w->complete, 1);
1622 wake_up(&w->wait);
1623}
1624
1625static void do_noflush_start(struct work_struct *ws)
1626{
1627 struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1628 w->tc->requeue_mode = true;
1629 requeue_io(w->tc);
1630 complete_noflush_work(w);
1631}
1632
1633static void do_noflush_stop(struct work_struct *ws)
1634{
1635 struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1636 w->tc->requeue_mode = false;
1637 complete_noflush_work(w);
1638}
1639
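/*
 * Runs fn on the pool's workqueue using an on-stack work item and waits
 * for it to complete, so the requeue_mode transition happens from the
 * pool's workqueue rather than the caller's context.
 */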
1640static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
1641{
1642 struct noflush_work w;
1643
fbcde3d8 1644 INIT_WORK_ONSTACK(&w.worker, fn);
738211f7
JT
1645 w.tc = tc;
1646 atomic_set(&w.complete, 0);
1647 init_waitqueue_head(&w.wait);
1648
1649 queue_work(tc->pool->wq, &w.worker);
1650
1651 wait_event(w.wait, atomic_read(&w.complete));
1652}
1653
1654/*----------------------------------------------------------------*/
1655
e49e5829
JT
1656static enum pool_mode get_pool_mode(struct pool *pool)
1657{
1658 return pool->pf.mode;
1659}
1660
3e1a0699
JT
1661static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
1662{
1663 dm_table_event(pool->ti->table);
1664 DMINFO("%s: switching pool to %s mode",
1665 dm_device_name(pool->pool_md), new_mode);
1666}
1667
8b64e881 1668static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
e49e5829 1669{
cdc2b415 1670 struct pool_c *pt = pool->ti->private;
07f2b6e0
MS
1671 bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
1672 enum pool_mode old_mode = get_pool_mode(pool);
1673
1674 /*
1675 * Never allow the pool to transition to PM_WRITE mode if user
1676 * intervention is required to verify metadata and data consistency.
1677 */
1678 if (new_mode == PM_WRITE && needs_check) {
1679 DMERR("%s: unable to switch pool to write mode until repaired.",
1680 dm_device_name(pool->pool_md));
1681 if (old_mode != new_mode)
1682 new_mode = old_mode;
1683 else
1684 new_mode = PM_READ_ONLY;
1685 }
1686 /*
1687 * If we were in PM_FAIL mode, rollback of metadata failed. We're
1688 * not going to recover without a thin_repair. So we never let the
1689 * pool move out of the old mode.
1690 */
1691 if (old_mode == PM_FAIL)
1692 new_mode = old_mode;
e49e5829 1693
8b64e881 1694 switch (new_mode) {
e49e5829 1695 case PM_FAIL:
8b64e881 1696 if (old_mode != new_mode)
3e1a0699 1697 notify_of_pool_mode_change(pool, "failure");
5383ef3a 1698 dm_pool_metadata_read_only(pool->pmd);
e49e5829
JT
1699 pool->process_bio = process_bio_fail;
1700 pool->process_discard = process_bio_fail;
1701 pool->process_prepared_mapping = process_prepared_mapping_fail;
1702 pool->process_prepared_discard = process_prepared_discard_fail;
3e1a0699
JT
1703
1704 error_retry_list(pool);
e49e5829
JT
1705 break;
1706
1707 case PM_READ_ONLY:
8b64e881 1708 if (old_mode != new_mode)
3e1a0699
JT
1709 notify_of_pool_mode_change(pool, "read-only");
1710 dm_pool_metadata_read_only(pool->pmd);
1711 pool->process_bio = process_bio_read_only;
1712 pool->process_discard = process_bio_success;
1713 pool->process_prepared_mapping = process_prepared_mapping_fail;
1714 pool->process_prepared_discard = process_prepared_discard_passdown;
1715
1716 error_retry_list(pool);
1717 break;
1718
1719 case PM_OUT_OF_DATA_SPACE:
1720 /*
1721 * Ideally we'd never hit this state; the low water mark
1722 * would trigger userland to extend the pool before we
1723 * completely run out of data space. However, many small
1724 * IOs to unprovisioned space can consume data space at an
1725 * alarming rate. Adjust your low water mark if you're
1726 * frequently seeing this mode.
1727 */
1728 if (old_mode != new_mode)
1729 notify_of_pool_mode_change(pool, "out-of-data-space");
1730 pool->process_bio = process_bio_read_only;
1731 pool->process_discard = process_discard;
1732 pool->process_prepared_mapping = process_prepared_mapping;
1733 pool->process_prepared_discard = process_prepared_discard_passdown;
85ad643b
JT
1734
1735 if (!pool->pf.error_if_no_space)
1736 queue_delayed_work(pool->wq, &pool->no_space_timeout, NO_SPACE_TIMEOUT);
e49e5829
JT
1737 break;
1738
1739 case PM_WRITE:
8b64e881 1740 if (old_mode != new_mode)
3e1a0699 1741 notify_of_pool_mode_change(pool, "write");
9b7aaa64 1742 dm_pool_metadata_read_write(pool->pmd);
e49e5829
JT
1743 pool->process_bio = process_bio;
1744 pool->process_discard = process_discard;
1745 pool->process_prepared_mapping = process_prepared_mapping;
1746 pool->process_prepared_discard = process_prepared_discard;
1747 break;
1748 }
8b64e881
MS
1749
1750 pool->pf.mode = new_mode;
cdc2b415
MS
1751 /*
1752 * The pool mode may have changed, sync it so bind_control_target()
1753 * doesn't cause an unexpected mode transition on resume.
1754 */
1755 pt->adjusted_pf.mode = new_mode;
e49e5829
JT
1756}
1757
07f2b6e0 1758static void abort_transaction(struct pool *pool)
b5330655 1759{
07f2b6e0
MS
1760 const char *dev_name = dm_device_name(pool->pool_md);
1761
1762 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1763 if (dm_pool_abort_metadata(pool->pmd)) {
1764 DMERR("%s: failed to abort metadata transaction", dev_name);
1765 set_pool_mode(pool, PM_FAIL);
1766 }
1767
1768 if (dm_pool_metadata_set_needs_check(pool->pmd)) {
1769 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1770 set_pool_mode(pool, PM_FAIL);
1771 }
1772}
399caddf 1773
07f2b6e0
MS
1774static void metadata_operation_failed(struct pool *pool, const char *op, int r)
1775{
b5330655
JT
1776 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1777 dm_device_name(pool->pool_md), op, r);
1778
07f2b6e0 1779 abort_transaction(pool);
b5330655
JT
1780 set_pool_mode(pool, PM_READ_ONLY);
1781}
1782
e49e5829
JT
1783/*----------------------------------------------------------------*/
1784
991d9fa0
JT
1785/*
1786 * Mapping functions.
1787 */
1788
1789/*
1790 * Called only while mapping a thin bio to hand it over to the workqueue.
1791 */
1792static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1793{
1794 unsigned long flags;
1795 struct pool *pool = tc->pool;
1796
c140e1c4
MS
1797 spin_lock_irqsave(&tc->lock, flags);
1798 bio_list_add(&tc->deferred_bio_list, bio);
1799 spin_unlock_irqrestore(&tc->lock, flags);
991d9fa0
JT
1800
1801 wake_worker(pool);
1802}
1803
59c3d2c6 1804static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
eb2aa48d 1805{
59c3d2c6 1806 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
eb2aa48d
JT
1807
1808 h->tc = tc;
1809 h->shared_read_entry = NULL;
e8088073 1810 h->all_io_entry = NULL;
eb2aa48d 1811 h->overwrite_mapping = NULL;
eb2aa48d
JT
1812}
1813
991d9fa0
JT
1814/*
1815 * Non-blocking function called from the thin target's map function.
1816 */
7de3ee57 1817static int thin_bio_map(struct dm_target *ti, struct bio *bio)
991d9fa0
JT
1818{
1819 int r;
1820 struct thin_c *tc = ti->private;
1821 dm_block_t block = get_bio_block(tc, bio);
1822 struct dm_thin_device *td = tc->td;
1823 struct dm_thin_lookup_result result;
025b9685
JT
1824 struct dm_bio_prison_cell cell1, cell2;
1825 struct dm_bio_prison_cell *cell_result;
e8088073 1826 struct dm_cell_key key;
991d9fa0 1827
59c3d2c6 1828 thin_hook_bio(tc, bio);
e49e5829 1829
738211f7
JT
1830 if (tc->requeue_mode) {
1831 bio_endio(bio, DM_ENDIO_REQUEUE);
1832 return DM_MAPIO_SUBMITTED;
1833 }
1834
e49e5829
JT
1835 if (get_pool_mode(tc->pool) == PM_FAIL) {
1836 bio_io_error(bio);
1837 return DM_MAPIO_SUBMITTED;
1838 }
1839
104655fd 1840 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
991d9fa0
JT
1841 thin_defer_bio(tc, bio);
1842 return DM_MAPIO_SUBMITTED;
1843 }
1844
1845 r = dm_thin_find_block(td, block, 0, &result);
1846
1847 /*
1848 * Note that we defer readahead too.
1849 */
1850 switch (r) {
1851 case 0:
1852 if (unlikely(result.shared)) {
1853 /*
1854 * We have a race condition here between the
1855 * result.shared value returned by the lookup and
1856 * snapshot creation, which may cause new
1857 * sharing.
1858 *
 1859 * To avoid this, always quiesce the origin before
1860 * taking the snap. You want to do this anyway to
1861 * ensure a consistent application view
1862 * (i.e. lockfs).
1863 *
1864 * More distant ancestors are irrelevant. The
1865 * shared flag will be set in their case.
1866 */
1867 thin_defer_bio(tc, bio);
e8088073 1868 return DM_MAPIO_SUBMITTED;
991d9fa0 1869 }
e8088073
JT
1870
1871 build_virtual_key(tc->td, block, &key);
025b9685 1872 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
e8088073
JT
1873 return DM_MAPIO_SUBMITTED;
1874
1875 build_data_key(tc->td, result.block, &key);
025b9685
JT
1876 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
1877 cell_defer_no_holder_no_free(tc, &cell1);
e8088073
JT
1878 return DM_MAPIO_SUBMITTED;
1879 }
1880
1881 inc_all_io_entry(tc->pool, bio);
025b9685
JT
1882 cell_defer_no_holder_no_free(tc, &cell2);
1883 cell_defer_no_holder_no_free(tc, &cell1);
e8088073
JT
1884
1885 remap(tc, bio, result.block);
1886 return DM_MAPIO_REMAPPED;
991d9fa0
JT
1887
1888 case -ENODATA:
e49e5829
JT
1889 if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1890 /*
 1891 * This block isn't provisioned, and we have no way
8c0f0e8c 1892 * of provisioning it while the pool is read-only.
e49e5829 1893 */
8c0f0e8c 1894 handle_unserviceable_bio(tc->pool, bio);
2aab3850 1895 return DM_MAPIO_SUBMITTED;
e49e5829
JT
1896 }
1897 /* fall through */
1898
1899 case -EWOULDBLOCK:
991d9fa0
JT
1900 /*
1901 * In future, the failed dm_thin_find_block above could
1902 * provide the hint to load the metadata into cache.
1903 */
991d9fa0 1904 thin_defer_bio(tc, bio);
2aab3850 1905 return DM_MAPIO_SUBMITTED;
e49e5829
JT
1906
1907 default:
1908 /*
1909 * Must always call bio_io_error on failure.
1910 * dm_thin_find_block can fail with -EINVAL if the
1911 * pool is switched to fail-io mode.
1912 */
1913 bio_io_error(bio);
2aab3850 1914 return DM_MAPIO_SUBMITTED;
991d9fa0 1915 }
991d9fa0
JT
1916}
1917
1918static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1919{
991d9fa0 1920 struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
760fe67e 1921 struct request_queue *q;
991d9fa0 1922
760fe67e
MS
1923 if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE)
1924 return 1;
991d9fa0 1925
760fe67e
MS
1926 q = bdev_get_queue(pt->data_dev->bdev);
1927 return bdi_congested(&q->backing_dev_info, bdi_bits);
991d9fa0
JT
1928}
1929
c140e1c4 1930static void requeue_bios(struct pool *pool)
991d9fa0 1931{
c140e1c4
MS
1932 unsigned long flags;
1933 struct thin_c *tc;
1934
1935 rcu_read_lock();
1936 list_for_each_entry_rcu(tc, &pool->active_thins, list) {
1937 spin_lock_irqsave(&tc->lock, flags);
1938 bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
1939 bio_list_init(&tc->retry_on_resume_list);
1940 spin_unlock_irqrestore(&tc->lock, flags);
1941 }
1942 rcu_read_unlock();
991d9fa0
JT
1943}
1944
1945/*----------------------------------------------------------------
1946 * Binding of control targets to a pool object
1947 *--------------------------------------------------------------*/
9bc142dd
MS
1948static bool data_dev_supports_discard(struct pool_c *pt)
1949{
1950 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1951
1952 return q && blk_queue_discard(q);
1953}
1954
58051b94
JT
1955static bool is_factor(sector_t block_size, uint32_t n)
1956{
1957 return !sector_div(block_size, n);
1958}
1959
9bc142dd
MS
1960/*
 1961 * If discard_passdown was enabled, verify that the data device
0424caa1 1962 * supports discards. Disable discard_passdown if not.
9bc142dd 1963 */
0424caa1 1964static void disable_passdown_if_not_supported(struct pool_c *pt)
9bc142dd 1965{
0424caa1
MS
1966 struct pool *pool = pt->pool;
1967 struct block_device *data_bdev = pt->data_dev->bdev;
1968 struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
1969 sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
1970 const char *reason = NULL;
9bc142dd
MS
1971 char buf[BDEVNAME_SIZE];
1972
0424caa1 1973 if (!pt->adjusted_pf.discard_passdown)
9bc142dd
MS
1974 return;
1975
0424caa1
MS
1976 if (!data_dev_supports_discard(pt))
1977 reason = "discard unsupported";
1978
1979 else if (data_limits->max_discard_sectors < pool->sectors_per_block)
1980 reason = "max discard sectors smaller than a block";
9bc142dd 1981
0424caa1
MS
1982 else if (data_limits->discard_granularity > block_size)
1983 reason = "discard granularity larger than a block";
1984
58051b94 1985 else if (!is_factor(block_size, data_limits->discard_granularity))
0424caa1
MS
1986 reason = "discard granularity not a factor of block size";
1987
1988 if (reason) {
1989 DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
1990 pt->adjusted_pf.discard_passdown = false;
1991 }
9bc142dd
MS
1992}
1993
991d9fa0
JT
1994static int bind_control_target(struct pool *pool, struct dm_target *ti)
1995{
1996 struct pool_c *pt = ti->private;
1997
e49e5829 1998 /*
9b7aaa64 1999 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
e49e5829 2000 */
07f2b6e0 2001 enum pool_mode old_mode = get_pool_mode(pool);
0424caa1 2002 enum pool_mode new_mode = pt->adjusted_pf.mode;
e49e5829 2003
8b64e881
MS
2004 /*
2005 * Don't change the pool's mode until set_pool_mode() below.
2006 * Otherwise the pool's process_* function pointers may
2007 * not match the desired pool mode.
2008 */
2009 pt->adjusted_pf.mode = old_mode;
2010
2011 pool->ti = ti;
2012 pool->pf = pt->adjusted_pf;
2013 pool->low_water_blocks = pt->low_water_blocks;
2014
9bc142dd 2015 set_pool_mode(pool, new_mode);
f402693d 2016
991d9fa0
JT
2017 return 0;
2018}
2019
2020static void unbind_control_target(struct pool *pool, struct dm_target *ti)
2021{
2022 if (pool->ti == ti)
2023 pool->ti = NULL;
2024}
2025
2026/*----------------------------------------------------------------
2027 * Pool creation
2028 *--------------------------------------------------------------*/
67e2e2b2
JT
2029/* Initialize pool features. */
2030static void pool_features_init(struct pool_features *pf)
2031{
e49e5829 2032 pf->mode = PM_WRITE;
9bc142dd
MS
2033 pf->zero_new_blocks = true;
2034 pf->discard_enabled = true;
2035 pf->discard_passdown = true;
787a996c 2036 pf->error_if_no_space = false;
67e2e2b2
JT
2037}
2038
991d9fa0
JT
2039static void __pool_destroy(struct pool *pool)
2040{
2041 __pool_table_remove(pool);
2042
2043 if (dm_pool_metadata_close(pool->pmd) < 0)
2044 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2045
44feb387 2046 dm_bio_prison_destroy(pool->prison);
991d9fa0
JT
2047 dm_kcopyd_client_destroy(pool->copier);
2048
2049 if (pool->wq)
2050 destroy_workqueue(pool->wq);
2051
2052 if (pool->next_mapping)
2053 mempool_free(pool->next_mapping, pool->mapping_pool);
2054 mempool_destroy(pool->mapping_pool);
44feb387
MS
2055 dm_deferred_set_destroy(pool->shared_read_ds);
2056 dm_deferred_set_destroy(pool->all_io_ds);
991d9fa0
JT
2057 kfree(pool);
2058}
2059
a24c2569 2060static struct kmem_cache *_new_mapping_cache;
a24c2569 2061
991d9fa0
JT
2062static struct pool *pool_create(struct mapped_device *pool_md,
2063 struct block_device *metadata_dev,
e49e5829
JT
2064 unsigned long block_size,
2065 int read_only, char **error)
991d9fa0
JT
2066{
2067 int r;
2068 void *err_p;
2069 struct pool *pool;
2070 struct dm_pool_metadata *pmd;
e49e5829 2071 bool format_device = read_only ? false : true;
991d9fa0 2072
e49e5829 2073 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
991d9fa0
JT
2074 if (IS_ERR(pmd)) {
2075 *error = "Error creating metadata object";
2076 return (struct pool *)pmd;
2077 }
2078
2079 pool = kmalloc(sizeof(*pool), GFP_KERNEL);
2080 if (!pool) {
2081 *error = "Error allocating memory for pool";
2082 err_p = ERR_PTR(-ENOMEM);
2083 goto bad_pool;
2084 }
2085
2086 pool->pmd = pmd;
2087 pool->sectors_per_block = block_size;
f9a8e0cd
MP
2088 if (block_size & (block_size - 1))
2089 pool->sectors_per_block_shift = -1;
2090 else
2091 pool->sectors_per_block_shift = __ffs(block_size);
991d9fa0 2092 pool->low_water_blocks = 0;
67e2e2b2 2093 pool_features_init(&pool->pf);
44feb387 2094 pool->prison = dm_bio_prison_create(PRISON_CELLS);
991d9fa0
JT
2095 if (!pool->prison) {
2096 *error = "Error creating pool's bio prison";
2097 err_p = ERR_PTR(-ENOMEM);
2098 goto bad_prison;
2099 }
2100
df5d2e90 2101 pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
991d9fa0
JT
2102 if (IS_ERR(pool->copier)) {
2103 r = PTR_ERR(pool->copier);
2104 *error = "Error creating pool's kcopyd client";
2105 err_p = ERR_PTR(r);
2106 goto bad_kcopyd_client;
2107 }
2108
2109 /*
 2110 * Create a single-threaded workqueue that will service all devices
2111 * that use this metadata.
2112 */
2113 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2114 if (!pool->wq) {
2115 *error = "Error creating pool's workqueue";
2116 err_p = ERR_PTR(-ENOMEM);
2117 goto bad_wq;
2118 }
2119
2120 INIT_WORK(&pool->worker, do_worker);
905e51b3 2121 INIT_DELAYED_WORK(&pool->waker, do_waker);
85ad643b 2122 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
991d9fa0 2123 spin_lock_init(&pool->lock);
991d9fa0
JT
2124 bio_list_init(&pool->deferred_flush_bios);
2125 INIT_LIST_HEAD(&pool->prepared_mappings);
104655fd 2126 INIT_LIST_HEAD(&pool->prepared_discards);
c140e1c4 2127 INIT_LIST_HEAD(&pool->active_thins);
88a6621b 2128 pool->low_water_triggered = false;
44feb387
MS
2129
2130 pool->shared_read_ds = dm_deferred_set_create();
2131 if (!pool->shared_read_ds) {
2132 *error = "Error creating pool's shared read deferred set";
2133 err_p = ERR_PTR(-ENOMEM);
2134 goto bad_shared_read_ds;
2135 }
2136
2137 pool->all_io_ds = dm_deferred_set_create();
2138 if (!pool->all_io_ds) {
2139 *error = "Error creating pool's all io deferred set";
2140 err_p = ERR_PTR(-ENOMEM);
2141 goto bad_all_io_ds;
2142 }
991d9fa0
JT
2143
2144 pool->next_mapping = NULL;
a24c2569
MS
2145 pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
2146 _new_mapping_cache);
991d9fa0
JT
2147 if (!pool->mapping_pool) {
2148 *error = "Error creating pool's mapping mempool";
2149 err_p = ERR_PTR(-ENOMEM);
2150 goto bad_mapping_pool;
2151 }
2152
991d9fa0 2153 pool->ref_count = 1;
905e51b3 2154 pool->last_commit_jiffies = jiffies;
991d9fa0
JT
2155 pool->pool_md = pool_md;
2156 pool->md_dev = metadata_dev;
2157 __pool_table_insert(pool);
2158
2159 return pool;
2160
991d9fa0 2161bad_mapping_pool:
44feb387
MS
2162 dm_deferred_set_destroy(pool->all_io_ds);
2163bad_all_io_ds:
2164 dm_deferred_set_destroy(pool->shared_read_ds);
2165bad_shared_read_ds:
991d9fa0
JT
2166 destroy_workqueue(pool->wq);
2167bad_wq:
2168 dm_kcopyd_client_destroy(pool->copier);
2169bad_kcopyd_client:
44feb387 2170 dm_bio_prison_destroy(pool->prison);
991d9fa0
JT
2171bad_prison:
2172 kfree(pool);
2173bad_pool:
2174 if (dm_pool_metadata_close(pmd))
2175 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2176
2177 return err_p;
2178}
2179
2180static void __pool_inc(struct pool *pool)
2181{
2182 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2183 pool->ref_count++;
2184}
2185
2186static void __pool_dec(struct pool *pool)
2187{
2188 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2189 BUG_ON(!pool->ref_count);
2190 if (!--pool->ref_count)
2191 __pool_destroy(pool);
2192}
2193
2194static struct pool *__pool_find(struct mapped_device *pool_md,
2195 struct block_device *metadata_dev,
e49e5829
JT
2196 unsigned long block_size, int read_only,
2197 char **error, int *created)
991d9fa0
JT
2198{
2199 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
2200
2201 if (pool) {
f09996c9
MS
2202 if (pool->pool_md != pool_md) {
2203 *error = "metadata device already in use by a pool";
991d9fa0 2204 return ERR_PTR(-EBUSY);
f09996c9 2205 }
991d9fa0
JT
2206 __pool_inc(pool);
2207
2208 } else {
2209 pool = __pool_table_lookup(pool_md);
2210 if (pool) {
f09996c9
MS
2211 if (pool->md_dev != metadata_dev) {
2212 *error = "different pool cannot replace a pool";
991d9fa0 2213 return ERR_PTR(-EINVAL);
f09996c9 2214 }
991d9fa0
JT
2215 __pool_inc(pool);
2216
67e2e2b2 2217 } else {
e49e5829 2218 pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
67e2e2b2
JT
2219 *created = 1;
2220 }
991d9fa0
JT
2221 }
2222
2223 return pool;
2224}
2225
2226/*----------------------------------------------------------------
2227 * Pool target methods
2228 *--------------------------------------------------------------*/
2229static void pool_dtr(struct dm_target *ti)
2230{
2231 struct pool_c *pt = ti->private;
2232
2233 mutex_lock(&dm_thin_pool_table.mutex);
2234
2235 unbind_control_target(pt->pool, ti);
2236 __pool_dec(pt->pool);
2237 dm_put_device(ti, pt->metadata_dev);
2238 dm_put_device(ti, pt->data_dev);
2239 kfree(pt);
2240
2241 mutex_unlock(&dm_thin_pool_table.mutex);
2242}
2243
991d9fa0
JT
2244static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
2245 struct dm_target *ti)
2246{
2247 int r;
2248 unsigned argc;
2249 const char *arg_name;
2250
2251 static struct dm_arg _args[] = {
74aa45c3 2252 {0, 4, "Invalid number of pool feature arguments"},
991d9fa0
JT
2253 };
2254
2255 /*
2256 * No feature arguments supplied.
2257 */
2258 if (!as->argc)
2259 return 0;
2260
2261 r = dm_read_arg_group(_args, as, &argc, &ti->error);
2262 if (r)
2263 return -EINVAL;
2264
2265 while (argc && !r) {
2266 arg_name = dm_shift_arg(as);
2267 argc--;
2268
e49e5829 2269 if (!strcasecmp(arg_name, "skip_block_zeroing"))
9bc142dd 2270 pf->zero_new_blocks = false;
e49e5829
JT
2271
2272 else if (!strcasecmp(arg_name, "ignore_discard"))
9bc142dd 2273 pf->discard_enabled = false;
e49e5829
JT
2274
2275 else if (!strcasecmp(arg_name, "no_discard_passdown"))
9bc142dd 2276 pf->discard_passdown = false;
991d9fa0 2277
e49e5829
JT
2278 else if (!strcasecmp(arg_name, "read_only"))
2279 pf->mode = PM_READ_ONLY;
2280
787a996c
MS
2281 else if (!strcasecmp(arg_name, "error_if_no_space"))
2282 pf->error_if_no_space = true;
2283
e49e5829
JT
2284 else {
2285 ti->error = "Unrecognised pool feature requested";
2286 r = -EINVAL;
2287 break;
2288 }
991d9fa0
JT
2289 }
2290
2291 return r;
2292}
2293
ac8c3f3d
JT
2294static void metadata_low_callback(void *context)
2295{
2296 struct pool *pool = context;
2297
2298 DMWARN("%s: reached low water mark for metadata device: sending event.",
2299 dm_device_name(pool->pool_md));
2300
2301 dm_table_event(pool->ti->table);
2302}
2303
7d48935e
MS
2304static sector_t get_dev_size(struct block_device *bdev)
2305{
2306 return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
2307}
2308
2309static void warn_if_metadata_device_too_big(struct block_device *bdev)
b17446df 2310{
7d48935e 2311 sector_t metadata_dev_size = get_dev_size(bdev);
b17446df
JT
2312 char buffer[BDEVNAME_SIZE];
2313
7d48935e 2314 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
b17446df
JT
2315 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2316 bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
7d48935e
MS
2317}
2318
2319static sector_t get_metadata_dev_size(struct block_device *bdev)
2320{
2321 sector_t metadata_dev_size = get_dev_size(bdev);
2322
2323 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
2324 metadata_dev_size = THIN_METADATA_MAX_SECTORS;
b17446df
JT
2325
2326 return metadata_dev_size;
2327}
2328
24347e95
JT
2329static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
2330{
2331 sector_t metadata_dev_size = get_metadata_dev_size(bdev);
2332
7d48935e 2333 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
24347e95
JT
2334
2335 return metadata_dev_size;
2336}
2337
ac8c3f3d
JT
2338/*
2339 * When a metadata threshold is crossed a dm event is triggered, and
2340 * userland should respond by growing the metadata device. We could let
2341 * userland set the threshold, like we do with the data threshold, but I'm
2342 * not sure they know enough to do this well.
2343 */
2344static dm_block_t calc_metadata_threshold(struct pool_c *pt)
2345{
2346 /*
2347 * 4M is ample for all ops with the possible exception of thin
2348 * device deletion which is harmless if it fails (just retry the
2349 * delete after you've grown the device).
2350 */
2351 dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
2352 return min((dm_block_t)1024ULL /* 4M */, quarter);
2353}
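/*
 * Editor's note -- a brief worked example of the arithmetic above; the
 * device sizes are hypothetical and the usual 4KiB metadata block size is
 * assumed:
 *
 *   1GiB metadata dev -> 262144 blocks, quarter = 65536, min(1024, 65536) = 1024 blocks (4M)
 *   8MiB metadata dev ->   2048 blocks, quarter =   512, min(1024,   512) =  512 blocks (2M)
 */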
2354
991d9fa0
JT
2355/*
2356 * thin-pool <metadata dev> <data dev>
2357 * <data block size (sectors)>
2358 * <low water mark (blocks)>
2359 * [<#feature args> [<arg>]*]
2360 *
2361 * Optional feature arguments are:
2362 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
67e2e2b2
JT
2363 * ignore_discard: disable discard
2364 * no_discard_passdown: don't pass discards down to the data device
787a996c
MS
 2365 * read_only: don't allow any changes to be made to the pool metadata.
2366 * error_if_no_space: error IOs, instead of queueing, if no space.
991d9fa0
JT
2367 */
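/*
 * Editor's note -- an illustrative full table line as it might be passed
 * to dmsetup; the device paths and the 20GiB (41943040 sector) length are
 * hypothetical, with a 128-sector (64KiB) block size, a low water mark of
 * 1024 blocks and one feature argument:
 *
 *   dmsetup create pool --table \
 *     "0 41943040 thin-pool /dev/mapper/meta /dev/mapper/data 128 1024 1 skip_block_zeroing"
 */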
2368static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2369{
67e2e2b2 2370 int r, pool_created = 0;
991d9fa0
JT
2371 struct pool_c *pt;
2372 struct pool *pool;
2373 struct pool_features pf;
2374 struct dm_arg_set as;
2375 struct dm_dev *data_dev;
2376 unsigned long block_size;
2377 dm_block_t low_water_blocks;
2378 struct dm_dev *metadata_dev;
5d0db96d 2379 fmode_t metadata_mode;
991d9fa0
JT
2380
2381 /*
2382 * FIXME Remove validation from scope of lock.
2383 */
2384 mutex_lock(&dm_thin_pool_table.mutex);
2385
2386 if (argc < 4) {
2387 ti->error = "Invalid argument count";
2388 r = -EINVAL;
2389 goto out_unlock;
2390 }
5d0db96d 2391
991d9fa0
JT
2392 as.argc = argc;
2393 as.argv = argv;
2394
5d0db96d
JT
2395 /*
2396 * Set default pool features.
2397 */
2398 pool_features_init(&pf);
2399
2400 dm_consume_args(&as, 4);
2401 r = parse_pool_features(&as, &pf, ti);
2402 if (r)
2403 goto out_unlock;
2404
2405 metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
2406 r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
991d9fa0
JT
2407 if (r) {
2408 ti->error = "Error opening metadata block device";
2409 goto out_unlock;
2410 }
7d48935e 2411 warn_if_metadata_device_too_big(metadata_dev->bdev);
991d9fa0
JT
2412
2413 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
2414 if (r) {
2415 ti->error = "Error getting data device";
2416 goto out_metadata;
2417 }
2418
2419 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
2420 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2421 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
55f2b8bd 2422 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
991d9fa0
JT
2423 ti->error = "Invalid block size";
2424 r = -EINVAL;
2425 goto out;
2426 }
2427
2428 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
2429 ti->error = "Invalid low water mark";
2430 r = -EINVAL;
2431 goto out;
2432 }
2433
991d9fa0
JT
2434 pt = kzalloc(sizeof(*pt), GFP_KERNEL);
2435 if (!pt) {
2436 r = -ENOMEM;
2437 goto out;
2438 }
2439
2440 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
e49e5829 2441 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
991d9fa0
JT
2442 if (IS_ERR(pool)) {
2443 r = PTR_ERR(pool);
2444 goto out_free_pt;
2445 }
2446
67e2e2b2
JT
2447 /*
2448 * 'pool_created' reflects whether this is the first table load.
2449 * Top level discard support is not allowed to be changed after
2450 * initial load. This would require a pool reload to trigger thin
2451 * device changes.
2452 */
2453 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
2454 ti->error = "Discard support cannot be disabled once enabled";
2455 r = -EINVAL;
2456 goto out_flags_changed;
2457 }
2458
991d9fa0
JT
2459 pt->pool = pool;
2460 pt->ti = ti;
2461 pt->metadata_dev = metadata_dev;
2462 pt->data_dev = data_dev;
2463 pt->low_water_blocks = low_water_blocks;
0424caa1 2464 pt->adjusted_pf = pt->requested_pf = pf;
55a62eef 2465 ti->num_flush_bios = 1;
9bc142dd 2466
67e2e2b2
JT
2467 /*
2468 * Only need to enable discards if the pool should pass
2469 * them down to the data device. The thin device's discard
2470 * processing will cause mappings to be removed from the btree.
2471 */
b60ab990 2472 ti->discard_zeroes_data_unsupported = true;
67e2e2b2 2473 if (pf.discard_enabled && pf.discard_passdown) {
55a62eef 2474 ti->num_discard_bios = 1;
9bc142dd 2475
67e2e2b2
JT
2476 /*
2477 * Setting 'discards_supported' circumvents the normal
2478 * stacking of discard limits (this keeps the pool and
2479 * thin devices' discard limits consistent).
2480 */
0ac55489 2481 ti->discards_supported = true;
67e2e2b2 2482 }
991d9fa0
JT
2483 ti->private = pt;
2484
ac8c3f3d
JT
2485 r = dm_pool_register_metadata_threshold(pt->pool->pmd,
2486 calc_metadata_threshold(pt),
2487 metadata_low_callback,
2488 pool);
2489 if (r)
2490 goto out_free_pt;
2491
991d9fa0
JT
2492 pt->callbacks.congested_fn = pool_is_congested;
2493 dm_table_add_target_callbacks(ti->table, &pt->callbacks);
2494
2495 mutex_unlock(&dm_thin_pool_table.mutex);
2496
2497 return 0;
2498
67e2e2b2
JT
2499out_flags_changed:
2500 __pool_dec(pool);
991d9fa0
JT
2501out_free_pt:
2502 kfree(pt);
2503out:
2504 dm_put_device(ti, data_dev);
2505out_metadata:
2506 dm_put_device(ti, metadata_dev);
2507out_unlock:
2508 mutex_unlock(&dm_thin_pool_table.mutex);
2509
2510 return r;
2511}
2512
7de3ee57 2513static int pool_map(struct dm_target *ti, struct bio *bio)
991d9fa0
JT
2514{
2515 int r;
2516 struct pool_c *pt = ti->private;
2517 struct pool *pool = pt->pool;
2518 unsigned long flags;
2519
2520 /*
2521 * As this is a singleton target, ti->begin is always zero.
2522 */
2523 spin_lock_irqsave(&pool->lock, flags);
2524 bio->bi_bdev = pt->data_dev->bdev;
2525 r = DM_MAPIO_REMAPPED;
2526 spin_unlock_irqrestore(&pool->lock, flags);
2527
2528 return r;
2529}
2530
b17446df 2531static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
991d9fa0
JT
2532{
2533 int r;
2534 struct pool_c *pt = ti->private;
2535 struct pool *pool = pt->pool;
55f2b8bd
MS
2536 sector_t data_size = ti->len;
2537 dm_block_t sb_data_size;
991d9fa0 2538
b17446df 2539 *need_commit = false;
991d9fa0 2540
55f2b8bd
MS
2541 (void) sector_div(data_size, pool->sectors_per_block);
2542
991d9fa0
JT
2543 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2544 if (r) {
4fa5971a
MS
2545 DMERR("%s: failed to retrieve data device size",
2546 dm_device_name(pool->pool_md));
991d9fa0
JT
2547 return r;
2548 }
2549
2550 if (data_size < sb_data_size) {
4fa5971a
MS
2551 DMERR("%s: pool target (%llu blocks) too small: expected %llu",
2552 dm_device_name(pool->pool_md),
55f2b8bd 2553 (unsigned long long)data_size, sb_data_size);
991d9fa0
JT
2554 return -EINVAL;
2555
2556 } else if (data_size > sb_data_size) {
07f2b6e0
MS
2557 if (dm_pool_metadata_needs_check(pool->pmd)) {
2558 DMERR("%s: unable to grow the data device until repaired.",
2559 dm_device_name(pool->pool_md));
2560 return 0;
2561 }
2562
6f7f51d4
MS
2563 if (sb_data_size)
2564 DMINFO("%s: growing the data device from %llu to %llu blocks",
2565 dm_device_name(pool->pool_md),
2566 sb_data_size, (unsigned long long)data_size);
991d9fa0
JT
2567 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2568 if (r) {
b5330655 2569 metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
991d9fa0
JT
2570 return r;
2571 }
2572
b17446df 2573 *need_commit = true;
991d9fa0
JT
2574 }
2575
2576 return 0;
2577}
2578
24347e95
JT
2579static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
2580{
2581 int r;
2582 struct pool_c *pt = ti->private;
2583 struct pool *pool = pt->pool;
2584 dm_block_t metadata_dev_size, sb_metadata_dev_size;
2585
2586 *need_commit = false;
2587
610bba8b 2588 metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
24347e95
JT
2589
2590 r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
2591 if (r) {
4fa5971a
MS
2592 DMERR("%s: failed to retrieve metadata device size",
2593 dm_device_name(pool->pool_md));
24347e95
JT
2594 return r;
2595 }
2596
2597 if (metadata_dev_size < sb_metadata_dev_size) {
4fa5971a
MS
2598 DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
2599 dm_device_name(pool->pool_md),
24347e95
JT
2600 metadata_dev_size, sb_metadata_dev_size);
2601 return -EINVAL;
2602
2603 } else if (metadata_dev_size > sb_metadata_dev_size) {
07f2b6e0
MS
2604 if (dm_pool_metadata_needs_check(pool->pmd)) {
2605 DMERR("%s: unable to grow the metadata device until repaired.",
2606 dm_device_name(pool->pool_md));
2607 return 0;
2608 }
2609
7d48935e 2610 warn_if_metadata_device_too_big(pool->md_dev);
6f7f51d4
MS
2611 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
2612 dm_device_name(pool->pool_md),
2613 sb_metadata_dev_size, metadata_dev_size);
24347e95
JT
2614 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
2615 if (r) {
b5330655 2616 metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
24347e95
JT
2617 return r;
2618 }
2619
2620 *need_commit = true;
2621 }
2622
2623 return 0;
2624}
2625
b17446df
JT
2626/*
 2627 * Retrieves the number of blocks of the data device from
 2628 * the superblock and compares it to the actual device size,
 2629 * resizing the pool's record of the data device if it has grown.
2630 *
2631 * This both copes with opening preallocated data devices in the ctr
2632 * being followed by a resume
2633 * -and-
2634 * calling the resume method individually after userspace has
2635 * grown the data device in reaction to a table event.
2636 */
2637static int pool_preresume(struct dm_target *ti)
2638{
2639 int r;
24347e95 2640 bool need_commit1, need_commit2;
b17446df
JT
2641 struct pool_c *pt = ti->private;
2642 struct pool *pool = pt->pool;
2643
2644 /*
2645 * Take control of the pool object.
2646 */
2647 r = bind_control_target(pool, ti);
2648 if (r)
2649 return r;
2650
2651 r = maybe_resize_data_dev(ti, &need_commit1);
2652 if (r)
2653 return r;
2654
24347e95
JT
2655 r = maybe_resize_metadata_dev(ti, &need_commit2);
2656 if (r)
2657 return r;
2658
2659 if (need_commit1 || need_commit2)
020cc3b5 2660 (void) commit(pool);
b17446df
JT
2661
2662 return 0;
2663}
2664
991d9fa0
JT
2665static void pool_resume(struct dm_target *ti)
2666{
2667 struct pool_c *pt = ti->private;
2668 struct pool *pool = pt->pool;
2669 unsigned long flags;
2670
2671 spin_lock_irqsave(&pool->lock, flags);
88a6621b 2672 pool->low_water_triggered = false;
991d9fa0 2673 spin_unlock_irqrestore(&pool->lock, flags);
c140e1c4 2674 requeue_bios(pool);
991d9fa0 2675
905e51b3 2676 do_waker(&pool->waker.work);
991d9fa0
JT
2677}
2678
2679static void pool_postsuspend(struct dm_target *ti)
2680{
991d9fa0
JT
2681 struct pool_c *pt = ti->private;
2682 struct pool *pool = pt->pool;
2683
905e51b3 2684 cancel_delayed_work(&pool->waker);
85ad643b 2685 cancel_delayed_work(&pool->no_space_timeout);
991d9fa0 2686 flush_workqueue(pool->wq);
020cc3b5 2687 (void) commit(pool);
991d9fa0
JT
2688}
2689
2690static int check_arg_count(unsigned argc, unsigned args_required)
2691{
2692 if (argc != args_required) {
2693 DMWARN("Message received with %u arguments instead of %u.",
2694 argc, args_required);
2695 return -EINVAL;
2696 }
2697
2698 return 0;
2699}
2700
2701static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2702{
2703 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2704 *dev_id <= MAX_DEV_ID)
2705 return 0;
2706
2707 if (warning)
2708 DMWARN("Message received with invalid device id: %s", arg);
2709
2710 return -EINVAL;
2711}
2712
2713static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2714{
2715 dm_thin_id dev_id;
2716 int r;
2717
2718 r = check_arg_count(argc, 2);
2719 if (r)
2720 return r;
2721
2722 r = read_dev_id(argv[1], &dev_id, 1);
2723 if (r)
2724 return r;
2725
2726 r = dm_pool_create_thin(pool->pmd, dev_id);
2727 if (r) {
2728 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2729 argv[1]);
2730 return r;
2731 }
2732
2733 return 0;
2734}
2735
2736static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2737{
2738 dm_thin_id dev_id;
2739 dm_thin_id origin_dev_id;
2740 int r;
2741
2742 r = check_arg_count(argc, 3);
2743 if (r)
2744 return r;
2745
2746 r = read_dev_id(argv[1], &dev_id, 1);
2747 if (r)
2748 return r;
2749
2750 r = read_dev_id(argv[2], &origin_dev_id, 1);
2751 if (r)
2752 return r;
2753
2754 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2755 if (r) {
2756 DMWARN("Creation of new snapshot %s of device %s failed.",
2757 argv[1], argv[2]);
2758 return r;
2759 }
2760
2761 return 0;
2762}
2763
2764static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2765{
2766 dm_thin_id dev_id;
2767 int r;
2768
2769 r = check_arg_count(argc, 2);
2770 if (r)
2771 return r;
2772
2773 r = read_dev_id(argv[1], &dev_id, 1);
2774 if (r)
2775 return r;
2776
2777 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2778 if (r)
2779 DMWARN("Deletion of thin device %s failed.", argv[1]);
2780
2781 return r;
2782}
2783
2784static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2785{
2786 dm_thin_id old_id, new_id;
2787 int r;
2788
2789 r = check_arg_count(argc, 3);
2790 if (r)
2791 return r;
2792
2793 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2794 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2795 return -EINVAL;
2796 }
2797
2798 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2799 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2800 return -EINVAL;
2801 }
2802
2803 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2804 if (r) {
2805 DMWARN("Failed to change transaction id from %s to %s.",
2806 argv[1], argv[2]);
2807 return r;
2808 }
2809
2810 return 0;
2811}
2812
cc8394d8
JT
2813static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2814{
2815 int r;
2816
2817 r = check_arg_count(argc, 1);
2818 if (r)
2819 return r;
2820
020cc3b5 2821 (void) commit(pool);
0d200aef 2822
cc8394d8
JT
2823 r = dm_pool_reserve_metadata_snap(pool->pmd);
2824 if (r)
2825 DMWARN("reserve_metadata_snap message failed.");
2826
2827 return r;
2828}
2829
2830static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2831{
2832 int r;
2833
2834 r = check_arg_count(argc, 1);
2835 if (r)
2836 return r;
2837
2838 r = dm_pool_release_metadata_snap(pool->pmd);
2839 if (r)
2840 DMWARN("release_metadata_snap message failed.");
2841
2842 return r;
2843}
2844
991d9fa0
JT
2845/*
2846 * Messages supported:
2847 * create_thin <dev_id>
2848 * create_snap <dev_id> <origin_id>
2849 * delete <dev_id>
2851 * set_transaction_id <current_trans_id> <new_trans_id>
cc8394d8
JT
2852 * reserve_metadata_snap
2853 * release_metadata_snap
991d9fa0
JT
2854 */
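/*
 * Editor's note -- illustrative dmsetup invocations for these messages;
 * the pool device name and the device ids are hypothetical:
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup message /dev/mapper/pool 0 "delete 1"
 *   dmsetup message /dev/mapper/pool 0 "reserve_metadata_snap"
 */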
2855static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2856{
2857 int r = -EINVAL;
2858 struct pool_c *pt = ti->private;
2859 struct pool *pool = pt->pool;
2860
2861 if (!strcasecmp(argv[0], "create_thin"))
2862 r = process_create_thin_mesg(argc, argv, pool);
2863
2864 else if (!strcasecmp(argv[0], "create_snap"))
2865 r = process_create_snap_mesg(argc, argv, pool);
2866
2867 else if (!strcasecmp(argv[0], "delete"))
2868 r = process_delete_mesg(argc, argv, pool);
2869
2870 else if (!strcasecmp(argv[0], "set_transaction_id"))
2871 r = process_set_transaction_id_mesg(argc, argv, pool);
2872
cc8394d8
JT
2873 else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2874 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2875
2876 else if (!strcasecmp(argv[0], "release_metadata_snap"))
2877 r = process_release_metadata_snap_mesg(argc, argv, pool);
2878
991d9fa0
JT
2879 else
2880 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2881
e49e5829 2882 if (!r)
020cc3b5 2883 (void) commit(pool);
991d9fa0
JT
2884
2885 return r;
2886}
2887
e49e5829
JT
2888static void emit_flags(struct pool_features *pf, char *result,
2889 unsigned sz, unsigned maxlen)
2890{
2891 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
787a996c
MS
2892 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
2893 pf->error_if_no_space;
e49e5829
JT
2894 DMEMIT("%u ", count);
2895
2896 if (!pf->zero_new_blocks)
2897 DMEMIT("skip_block_zeroing ");
2898
2899 if (!pf->discard_enabled)
2900 DMEMIT("ignore_discard ");
2901
2902 if (!pf->discard_passdown)
2903 DMEMIT("no_discard_passdown ");
2904
2905 if (pf->mode == PM_READ_ONLY)
2906 DMEMIT("read_only ");
787a996c
MS
2907
2908 if (pf->error_if_no_space)
2909 DMEMIT("error_if_no_space ");
e49e5829
JT
2910}
2911
991d9fa0
JT
2912/*
2913 * Status line is:
 2914 * <transaction id> <used metadata blocks>/<total metadata blocks>
 2915 * <used data blocks>/<total data blocks> <held metadata root>
2916 */
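/*
 * Editor's note -- an illustrative STATUSTYPE_INFO line with made-up
 * numbers, matching the order of the DMEMIT calls below ('-' means no
 * metadata snapshot is held):
 *
 *   2 145/4161600 5122/5242880 - rw discard_passdown queue_if_no_space
 */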
fd7c092e
MP
2917static void pool_status(struct dm_target *ti, status_type_t type,
2918 unsigned status_flags, char *result, unsigned maxlen)
991d9fa0 2919{
e49e5829 2920 int r;
991d9fa0
JT
2921 unsigned sz = 0;
2922 uint64_t transaction_id;
2923 dm_block_t nr_free_blocks_data;
2924 dm_block_t nr_free_blocks_metadata;
2925 dm_block_t nr_blocks_data;
2926 dm_block_t nr_blocks_metadata;
2927 dm_block_t held_root;
2928 char buf[BDEVNAME_SIZE];
2929 char buf2[BDEVNAME_SIZE];
2930 struct pool_c *pt = ti->private;
2931 struct pool *pool = pt->pool;
2932
2933 switch (type) {
2934 case STATUSTYPE_INFO:
e49e5829
JT
2935 if (get_pool_mode(pool) == PM_FAIL) {
2936 DMEMIT("Fail");
2937 break;
2938 }
2939
1f4e0ff0
AK
2940 /* Commit to ensure statistics aren't out-of-date */
2941 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
020cc3b5 2942 (void) commit(pool);
1f4e0ff0 2943
fd7c092e
MP
2944 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
2945 if (r) {
4fa5971a
MS
2946 DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
2947 dm_device_name(pool->pool_md), r);
fd7c092e
MP
2948 goto err;
2949 }
991d9fa0 2950
fd7c092e
MP
2951 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
2952 if (r) {
4fa5971a
MS
2953 DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
2954 dm_device_name(pool->pool_md), r);
fd7c092e
MP
2955 goto err;
2956 }
991d9fa0
JT
2957
2958 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
fd7c092e 2959 if (r) {
4fa5971a
MS
2960 DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
2961 dm_device_name(pool->pool_md), r);
fd7c092e
MP
2962 goto err;
2963 }
991d9fa0 2964
fd7c092e
MP
2965 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
2966 if (r) {
4fa5971a
MS
2967 DMERR("%s: dm_pool_get_free_block_count returned %d",
2968 dm_device_name(pool->pool_md), r);
fd7c092e
MP
2969 goto err;
2970 }
991d9fa0
JT
2971
2972 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
fd7c092e 2973 if (r) {
4fa5971a
MS
2974 DMERR("%s: dm_pool_get_data_dev_size returned %d",
2975 dm_device_name(pool->pool_md), r);
fd7c092e
MP
2976 goto err;
2977 }
991d9fa0 2978
cc8394d8 2979 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
fd7c092e 2980 if (r) {
4fa5971a
MS
2981 DMERR("%s: dm_pool_get_metadata_snap returned %d",
2982 dm_device_name(pool->pool_md), r);
fd7c092e
MP
2983 goto err;
2984 }
991d9fa0
JT
2985
2986 DMEMIT("%llu %llu/%llu %llu/%llu ",
2987 (unsigned long long)transaction_id,
2988 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2989 (unsigned long long)nr_blocks_metadata,
2990 (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2991 (unsigned long long)nr_blocks_data);
2992
2993 if (held_root)
e49e5829
JT
2994 DMEMIT("%llu ", held_root);
2995 else
2996 DMEMIT("- ");
2997
3e1a0699
JT
2998 if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
2999 DMEMIT("out_of_data_space ");
3000 else if (pool->pf.mode == PM_READ_ONLY)
e49e5829 3001 DMEMIT("ro ");
991d9fa0 3002 else
e49e5829
JT
3003 DMEMIT("rw ");
3004
018debea 3005 if (!pool->pf.discard_enabled)
787a996c 3006 DMEMIT("ignore_discard ");
018debea 3007 else if (pool->pf.discard_passdown)
787a996c
MS
3008 DMEMIT("discard_passdown ");
3009 else
3010 DMEMIT("no_discard_passdown ");
3011
3012 if (pool->pf.error_if_no_space)
3013 DMEMIT("error_if_no_space ");
e49e5829 3014 else
787a996c 3015 DMEMIT("queue_if_no_space ");
991d9fa0
JT
3016
3017 break;
3018
3019 case STATUSTYPE_TABLE:
3020 DMEMIT("%s %s %lu %llu ",
3021 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
3022 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
3023 (unsigned long)pool->sectors_per_block,
3024 (unsigned long long)pt->low_water_blocks);
0424caa1 3025 emit_flags(&pt->requested_pf, result, sz, maxlen);
991d9fa0
JT
3026 break;
3027 }
fd7c092e 3028 return;
991d9fa0 3029
fd7c092e
MP
3030err:
3031 DMEMIT("Error");
991d9fa0
JT
3032}
3033
3034static int pool_iterate_devices(struct dm_target *ti,
3035 iterate_devices_callout_fn fn, void *data)
3036{
3037 struct pool_c *pt = ti->private;
3038
3039 return fn(ti, pt->data_dev, 0, ti->len, data);
3040}
3041
3042static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
3043 struct bio_vec *biovec, int max_size)
3044{
3045 struct pool_c *pt = ti->private;
3046 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
3047
3048 if (!q->merge_bvec_fn)
3049 return max_size;
3050
3051 bvm->bi_bdev = pt->data_dev->bdev;
3052
3053 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3054}
3055
0424caa1 3056static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
104655fd 3057{
0424caa1
MS
3058 struct pool *pool = pt->pool;
3059 struct queue_limits *data_limits;
3060
104655fd
JT
3061 limits->max_discard_sectors = pool->sectors_per_block;
3062
3063 /*
0424caa1 3064 * discard_granularity is just a hint, and not enforced.
104655fd 3065 */
0424caa1
MS
3066 if (pt->adjusted_pf.discard_passdown) {
3067 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
3068 limits->discard_granularity = data_limits->discard_granularity;
f13945d7 3069 } else
0424caa1 3070 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
104655fd
JT
3071}
3072
991d9fa0
JT
3073static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
3074{
3075 struct pool_c *pt = ti->private;
3076 struct pool *pool = pt->pool;
0cc67cd9 3077 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
991d9fa0 3078
0cc67cd9
MS
3079 /*
3080 * If the system-determined stacked limits are compatible with the
 3081 * pool's blocksize (io_opt is a factor), do not override them.
3082 */
3083 if (io_opt_sectors < pool->sectors_per_block ||
3084 do_div(io_opt_sectors, pool->sectors_per_block)) {
3085 blk_limits_io_min(limits, 0);
3086 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
3087 }
0424caa1
MS
3088
3089 /*
3090 * pt->adjusted_pf is a staging area for the actual features to use.
3091 * They get transferred to the live pool in bind_control_target()
3092 * called from pool_preresume().
3093 */
b60ab990
MS
3094 if (!pt->adjusted_pf.discard_enabled) {
3095 /*
 3096 * Must explicitly disallow stacking discard limits, otherwise the
 3097 * block layer will stack them if the pool's data device has support.
 3098 * QUEUE_FLAG_DISCARD wouldn't be set, but there is no way for the
 3099 * user to see that, so make sure to set all discard limits to 0.
3100 */
3101 limits->discard_granularity = 0;
0424caa1 3102 return;
b60ab990 3103 }
0424caa1
MS
3104
3105 disable_passdown_if_not_supported(pt);
3106
3107 set_discard_limits(pt, limits);
991d9fa0
JT
3108}
3109
3110static struct target_type pool_target = {
3111 .name = "thin-pool",
3112 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
3113 DM_TARGET_IMMUTABLE,
67324ea1 3114 .version = {1, 12, 0},
991d9fa0
JT
3115 .module = THIS_MODULE,
3116 .ctr = pool_ctr,
3117 .dtr = pool_dtr,
3118 .map = pool_map,
3119 .postsuspend = pool_postsuspend,
3120 .preresume = pool_preresume,
3121 .resume = pool_resume,
3122 .message = pool_message,
3123 .status = pool_status,
3124 .merge = pool_merge,
3125 .iterate_devices = pool_iterate_devices,
3126 .io_hints = pool_io_hints,
3127};
3128
3129/*----------------------------------------------------------------
3130 * Thin target methods
3131 *--------------------------------------------------------------*/
b10ebd34
JT
3132static void thin_get(struct thin_c *tc)
3133{
3134 atomic_inc(&tc->refcount);
3135}
3136
3137static void thin_put(struct thin_c *tc)
3138{
3139 if (atomic_dec_and_test(&tc->refcount))
3140 complete(&tc->can_destroy);
3141}
3142
991d9fa0
JT
3143static void thin_dtr(struct dm_target *ti)
3144{
3145 struct thin_c *tc = ti->private;
c140e1c4
MS
3146 unsigned long flags;
3147
b10ebd34
JT
3148 thin_put(tc);
3149 wait_for_completion(&tc->can_destroy);
3150
c140e1c4
MS
3151 spin_lock_irqsave(&tc->pool->lock, flags);
3152 list_del_rcu(&tc->list);
3153 spin_unlock_irqrestore(&tc->pool->lock, flags);
3154 synchronize_rcu();
991d9fa0
JT
3155
3156 mutex_lock(&dm_thin_pool_table.mutex);
3157
3158 __pool_dec(tc->pool);
3159 dm_pool_close_thin_device(tc->td);
3160 dm_put_device(ti, tc->pool_dev);
2dd9c257
JT
3161 if (tc->origin_dev)
3162 dm_put_device(ti, tc->origin_dev);
991d9fa0
JT
3163 kfree(tc);
3164
3165 mutex_unlock(&dm_thin_pool_table.mutex);
3166}
3167
3168/*
3169 * Thin target parameters:
3170 *
2dd9c257 3171 * <pool_dev> <dev_id> [origin_dev]
991d9fa0
JT
3172 *
3173 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
3174 * dev_id: the internal device identifier
2dd9c257 3175 * origin_dev: a device external to the pool that should act as the origin
67e2e2b2
JT
3176 *
3177 * If the pool device has discards disabled, they get disabled for the thin
3178 * device as well.
991d9fa0
JT
3179 */
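/*
 * Editor's note -- illustrative table lines; the pool path, external
 * origin path and the 1GiB (2097152 sector) length are hypothetical:
 *
 *   0 2097152 thin /dev/mapper/pool 0
 *   0 2097152 thin /dev/mapper/pool 1 /dev/vg/external-origin
 */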
3180static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3181{
3182 int r;
3183 struct thin_c *tc;
2dd9c257 3184 struct dm_dev *pool_dev, *origin_dev;
991d9fa0 3185 struct mapped_device *pool_md;
5e3283e2 3186 unsigned long flags;
991d9fa0
JT
3187
3188 mutex_lock(&dm_thin_pool_table.mutex);
3189
2dd9c257 3190 if (argc != 2 && argc != 3) {
991d9fa0
JT
3191 ti->error = "Invalid argument count";
3192 r = -EINVAL;
3193 goto out_unlock;
3194 }
3195
3196 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
3197 if (!tc) {
3198 ti->error = "Out of memory";
3199 r = -ENOMEM;
3200 goto out_unlock;
3201 }
c140e1c4
MS
3202 spin_lock_init(&tc->lock);
3203 bio_list_init(&tc->deferred_bio_list);
3204 bio_list_init(&tc->retry_on_resume_list);
67324ea1 3205 tc->sort_bio_list = RB_ROOT;
991d9fa0 3206
2dd9c257
JT
3207 if (argc == 3) {
3208 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
3209 if (r) {
3210 ti->error = "Error opening origin device";
3211 goto bad_origin_dev;
3212 }
3213 tc->origin_dev = origin_dev;
3214 }
3215
991d9fa0
JT
3216 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
3217 if (r) {
3218 ti->error = "Error opening pool device";
3219 goto bad_pool_dev;
3220 }
3221 tc->pool_dev = pool_dev;
3222
3223 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
3224 ti->error = "Invalid device id";
3225 r = -EINVAL;
3226 goto bad_common;
3227 }
3228
3229 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
3230 if (!pool_md) {
3231 ti->error = "Couldn't get pool mapped device";
3232 r = -EINVAL;
3233 goto bad_common;
3234 }
3235
3236 tc->pool = __pool_table_lookup(pool_md);
3237 if (!tc->pool) {
3238 ti->error = "Couldn't find pool object";
3239 r = -EINVAL;
3240 goto bad_pool_lookup;
3241 }
3242 __pool_inc(tc->pool);
3243
e49e5829
JT
3244 if (get_pool_mode(tc->pool) == PM_FAIL) {
 3245 ti->error = "Couldn't open thin device, pool is in fail mode";
1acacc07 3246 r = -EINVAL;
e49e5829
JT
3247 goto bad_thin_open;
3248 }
3249
991d9fa0
JT
3250 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
3251 if (r) {
3252 ti->error = "Couldn't open thin internal device";
3253 goto bad_thin_open;
3254 }
3255
542f9038
MS
3256 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
3257 if (r)
1acacc07 3258 goto bad_target_max_io_len;
542f9038 3259
55a62eef 3260 ti->num_flush_bios = 1;
16ad3d10 3261 ti->flush_supported = true;
59c3d2c6 3262 ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
67e2e2b2
JT
3263
3264 /* In case the pool supports discards, pass them on. */
b60ab990 3265 ti->discard_zeroes_data_unsupported = true;
67e2e2b2 3266 if (tc->pool->pf.discard_enabled) {
0ac55489 3267 ti->discards_supported = true;
55a62eef 3268 ti->num_discard_bios = 1;
55a62eef
AK
3269 /* Discard bios must be split on a block boundary */
3270 ti->split_discard_bios = true;
67e2e2b2 3271 }
991d9fa0
JT
3272
3273 dm_put(pool_md);
3274
3275 mutex_unlock(&dm_thin_pool_table.mutex);
3276
b10ebd34
JT
3277 atomic_set(&tc->refcount, 1);
3278 init_completion(&tc->can_destroy);
3279
5e3283e2 3280 spin_lock_irqsave(&tc->pool->lock, flags);
c140e1c4 3281 list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
5e3283e2 3282 spin_unlock_irqrestore(&tc->pool->lock, flags);
c140e1c4
MS
3283 /*
 3284 * This synchronize_rcu() call is needed here; otherwise we risk a
3285 * wake_worker() call finding no bios to process (because the newly
3286 * added tc isn't yet visible). So this reduces latency since we
3287 * aren't then dependent on the periodic commit to wake_worker().
3288 */
3289 synchronize_rcu();
3290
991d9fa0
JT
3291 return 0;
3292
1acacc07
MS
3293bad_target_max_io_len:
3294 dm_pool_close_thin_device(tc->td);
991d9fa0
JT
3295bad_thin_open:
3296 __pool_dec(tc->pool);
3297bad_pool_lookup:
3298 dm_put(pool_md);
3299bad_common:
3300 dm_put_device(ti, tc->pool_dev);
3301bad_pool_dev:
2dd9c257
JT
3302 if (tc->origin_dev)
3303 dm_put_device(ti, tc->origin_dev);
3304bad_origin_dev:
991d9fa0
JT
3305 kfree(tc);
3306out_unlock:
3307 mutex_unlock(&dm_thin_pool_table.mutex);
3308
3309 return r;
3310}
3311
7de3ee57 3312static int thin_map(struct dm_target *ti, struct bio *bio)
991d9fa0 3313{
4f024f37 3314 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
991d9fa0 3315
7de3ee57 3316 return thin_bio_map(ti, bio);
991d9fa0
JT
3317}
3318
7de3ee57 3319static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
eb2aa48d
JT
3320{
3321 unsigned long flags;
59c3d2c6 3322 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
eb2aa48d 3323 struct list_head work;
a24c2569 3324 struct dm_thin_new_mapping *m, *tmp;
eb2aa48d
JT
3325 struct pool *pool = h->tc->pool;
3326
3327 if (h->shared_read_entry) {
3328 INIT_LIST_HEAD(&work);
44feb387 3329 dm_deferred_entry_dec(h->shared_read_entry, &work);
eb2aa48d
JT
3330
3331 spin_lock_irqsave(&pool->lock, flags);
3332 list_for_each_entry_safe(m, tmp, &work, list) {
3333 list_del(&m->list);
7f214665 3334 m->quiesced = true;
eb2aa48d
JT
3335 __maybe_add_mapping(m);
3336 }
3337 spin_unlock_irqrestore(&pool->lock, flags);
3338 }
3339
104655fd
JT
3340 if (h->all_io_entry) {
3341 INIT_LIST_HEAD(&work);
44feb387 3342 dm_deferred_entry_dec(h->all_io_entry, &work);
563af186
JT
3343 if (!list_empty(&work)) {
3344 spin_lock_irqsave(&pool->lock, flags);
3345 list_for_each_entry_safe(m, tmp, &work, list)
daec338b 3346 list_add_tail(&m->list, &pool->prepared_discards);
563af186
JT
3347 spin_unlock_irqrestore(&pool->lock, flags);
3348 wake_worker(pool);
3349 }
104655fd
JT
3350 }
3351
eb2aa48d
JT
3352 return 0;
3353}
3354
738211f7 3355static void thin_presuspend(struct dm_target *ti)
991d9fa0 3356{
738211f7
JT
3357 struct thin_c *tc = ti->private;
3358
991d9fa0 3359 if (dm_noflush_suspending(ti))
738211f7
JT
3360 noflush_work(tc, do_noflush_start);
3361}
3362
3363static void thin_postsuspend(struct dm_target *ti)
3364{
3365 struct thin_c *tc = ti->private;
3366
3367 /*
3368 * The dm_noflush_suspending flag has been cleared by now, so
3369 * unfortunately we must always run this.
3370 */
3371 noflush_work(tc, do_noflush_stop);
991d9fa0
JT
3372}
3373
3374/*
3375 * <nr mapped sectors> <highest mapped sector>
3376 */
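/*
 * Editor's note -- an illustrative STATUSTYPE_INFO line with made-up
 * numbers (sectors mapped, then the last mapped sector):
 *
 *   524288 1048575
 */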
fd7c092e
MP
3377static void thin_status(struct dm_target *ti, status_type_t type,
3378 unsigned status_flags, char *result, unsigned maxlen)
991d9fa0
JT
3379{
3380 int r;
3381 ssize_t sz = 0;
3382 dm_block_t mapped, highest;
3383 char buf[BDEVNAME_SIZE];
3384 struct thin_c *tc = ti->private;
3385
e49e5829
JT
3386 if (get_pool_mode(tc->pool) == PM_FAIL) {
3387 DMEMIT("Fail");
fd7c092e 3388 return;
e49e5829
JT
3389 }
3390
991d9fa0
JT
3391 if (!tc->td)
3392 DMEMIT("-");
3393 else {
3394 switch (type) {
3395 case STATUSTYPE_INFO:
3396 r = dm_thin_get_mapped_count(tc->td, &mapped);
fd7c092e
MP
3397 if (r) {
3398 DMERR("dm_thin_get_mapped_count returned %d", r);
3399 goto err;
3400 }
991d9fa0
JT
3401
3402 r = dm_thin_get_highest_mapped_block(tc->td, &highest);
fd7c092e
MP
3403 if (r < 0) {
3404 DMERR("dm_thin_get_highest_mapped_block returned %d", r);
3405 goto err;
3406 }
991d9fa0
JT
3407
3408 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
3409 if (r)
3410 DMEMIT("%llu", ((highest + 1) *
3411 tc->pool->sectors_per_block) - 1);
3412 else
3413 DMEMIT("-");
3414 break;
3415
3416 case STATUSTYPE_TABLE:
3417 DMEMIT("%s %lu",
3418 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
3419 (unsigned long) tc->dev_id);
2dd9c257
JT
3420 if (tc->origin_dev)
3421 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
991d9fa0
JT
3422 break;
3423 }
3424 }
3425
fd7c092e
MP
3426 return;
3427
3428err:
3429 DMEMIT("Error");
991d9fa0
JT
3430}
3431
3432static int thin_iterate_devices(struct dm_target *ti,
3433 iterate_devices_callout_fn fn, void *data)
3434{
55f2b8bd 3435 sector_t blocks;
991d9fa0 3436 struct thin_c *tc = ti->private;
55f2b8bd 3437 struct pool *pool = tc->pool;
991d9fa0
JT
3438
3439 /*
3440 * We can't call dm_pool_get_data_dev_size() since that blocks. So
3441 * we follow a more convoluted path through to the pool's target.
3442 */
55f2b8bd 3443 if (!pool->ti)
991d9fa0
JT
3444 return 0; /* nothing is bound */
3445
55f2b8bd
MS
3446 blocks = pool->ti->len;
3447 (void) sector_div(blocks, pool->sectors_per_block);
991d9fa0 3448 if (blocks)
55f2b8bd 3449 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
991d9fa0
JT
3450
3451 return 0;
3452}
3453
991d9fa0
JT
3454static struct target_type thin_target = {
3455 .name = "thin",
67324ea1 3456 .version = {1, 12, 0},
991d9fa0
JT
3457 .module = THIS_MODULE,
3458 .ctr = thin_ctr,
3459 .dtr = thin_dtr,
3460 .map = thin_map,
eb2aa48d 3461 .end_io = thin_endio,
738211f7 3462 .presuspend = thin_presuspend,
991d9fa0
JT
3463 .postsuspend = thin_postsuspend,
3464 .status = thin_status,
3465 .iterate_devices = thin_iterate_devices,
991d9fa0
JT
3466};
3467
3468/*----------------------------------------------------------------*/
3469
3470static int __init dm_thin_init(void)
3471{
3472 int r;
3473
3474 pool_table_init();
3475
3476 r = dm_register_target(&thin_target);
3477 if (r)
3478 return r;
3479
3480 r = dm_register_target(&pool_target);
3481 if (r)
a24c2569
MS
3482 goto bad_pool_target;
3483
3484 r = -ENOMEM;
3485
a24c2569
MS
3486 _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
3487 if (!_new_mapping_cache)
3488 goto bad_new_mapping_cache;
3489
a24c2569
MS
3490 return 0;
3491
a24c2569 3492bad_new_mapping_cache:
a24c2569
MS
3493 dm_unregister_target(&pool_target);
3494bad_pool_target:
3495 dm_unregister_target(&thin_target);
991d9fa0
JT
3496
3497 return r;
3498}
3499
3500static void dm_thin_exit(void)
3501{
3502 dm_unregister_target(&thin_target);
3503 dm_unregister_target(&pool_target);
a24c2569 3504
a24c2569 3505 kmem_cache_destroy(_new_mapping_cache);
991d9fa0
JT
3506}
3507
3508module_init(dm_thin_init);
3509module_exit(dm_thin_exit);
3510
7cab8bf1 3511MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
991d9fa0
JT
3512MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3513MODULE_LICENSE("GPL");