bcache: Refactor request_write()
drivers/md/bcache/request.c
1/*
2 * Main bcache entry point - handle a read or a write request and decide what to
3 * do with it; the make_request functions are called by the block layer.
4 *
5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6 * Copyright 2012 Google, Inc.
7 */
8
9#include "bcache.h"
10#include "btree.h"
11#include "debug.h"
12#include "request.h"
13#include "writeback.h"
14
15#include <linux/cgroup.h>
16#include <linux/module.h>
17#include <linux/hash.h>
18#include <linux/random.h>
19#include "blk-cgroup.h"
20
21#include <trace/events/bcache.h>
22
23#define CUTOFF_CACHE_ADD 95
24#define CUTOFF_CACHE_READA 90
25
26struct kmem_cache *bch_search_cache;
27
28/* Cgroup interface */
29
30#ifdef CONFIG_CGROUP_BCACHE
31static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 };
32
33static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
34{
35 struct cgroup_subsys_state *css;
36 return cgroup &&
37 (css = cgroup_subsys_state(cgroup, bcache_subsys_id))
38 ? container_of(css, struct bch_cgroup, css)
39 : &bcache_default_cgroup;
40}
41
42struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio)
43{
44 struct cgroup_subsys_state *css = bio->bi_css
45 ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id)
46 : task_subsys_state(current, bcache_subsys_id);
47
48 return css
49 ? container_of(css, struct bch_cgroup, css)
50 : &bcache_default_cgroup;
51}
52
53static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
54 struct file *file,
55 char __user *buf, size_t nbytes, loff_t *ppos)
56{
57 char tmp[1024];
58 int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes,
59 cgroup_to_bcache(cgrp)->cache_mode + 1);
60
61 if (len < 0)
62 return len;
63
64 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
65}
66
67static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
68 const char *buf)
69{
 70 int v = bch_read_string_list(buf, bch_cache_modes);
71 if (v < 0)
72 return v;
73
74 cgroup_to_bcache(cgrp)->cache_mode = v - 1;
75 return 0;
76}
77
78static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
79{
80 return cgroup_to_bcache(cgrp)->verify;
81}
82
83static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
84{
85 cgroup_to_bcache(cgrp)->verify = val;
86 return 0;
87}
88
89static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
90{
91 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
92 return atomic_read(&bcachecg->stats.cache_hits);
93}
94
95static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
96{
97 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
98 return atomic_read(&bcachecg->stats.cache_misses);
99}
100
101static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
102 struct cftype *cft)
103{
104 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
105 return atomic_read(&bcachecg->stats.cache_bypass_hits);
106}
107
108static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
109 struct cftype *cft)
110{
111 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
112 return atomic_read(&bcachecg->stats.cache_bypass_misses);
113}
114
115static struct cftype bch_files[] = {
116 {
117 .name = "cache_mode",
118 .read = cache_mode_read,
119 .write_string = cache_mode_write,
120 },
121 {
122 .name = "verify",
123 .read_u64 = bch_verify_read,
124 .write_u64 = bch_verify_write,
125 },
126 {
127 .name = "cache_hits",
128 .read_u64 = bch_cache_hits_read,
129 },
130 {
131 .name = "cache_misses",
132 .read_u64 = bch_cache_misses_read,
133 },
134 {
135 .name = "cache_bypass_hits",
136 .read_u64 = bch_cache_bypass_hits_read,
137 },
138 {
139 .name = "cache_bypass_misses",
140 .read_u64 = bch_cache_bypass_misses_read,
141 },
142 { } /* terminate */
143};
144
145static void init_bch_cgroup(struct bch_cgroup *cg)
146{
147 cg->cache_mode = -1;
148}
149
150static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
151{
152 struct bch_cgroup *cg;
153
154 cg = kzalloc(sizeof(*cg), GFP_KERNEL);
155 if (!cg)
156 return ERR_PTR(-ENOMEM);
157 init_bch_cgroup(cg);
158 return &cg->css;
159}
160
161static void bcachecg_destroy(struct cgroup *cgroup)
162{
163 struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
164 free_css_id(&bcache_subsys, &cg->css);
165 kfree(cg);
166}
167
168struct cgroup_subsys bcache_subsys = {
169 .create = bcachecg_create,
170 .destroy = bcachecg_destroy,
171 .subsys_id = bcache_subsys_id,
172 .name = "bcache",
173 .module = THIS_MODULE,
174};
175EXPORT_SYMBOL_GPL(bcache_subsys);
176#endif
177
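/*
 * Per-bio cache mode and verify settings: a cgroup-assigned value, when set,
 * overrides what is configured on the backing device itself.
 */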
178static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
179{
180#ifdef CONFIG_CGROUP_BCACHE
181 int r = bch_bio_to_cgroup(bio)->cache_mode;
182 if (r >= 0)
183 return r;
184#endif
185 return BDEV_CACHE_MODE(&dc->sb);
186}
187
188static bool verify(struct cached_dev *dc, struct bio *bio)
189{
190#ifdef CONFIG_CGROUP_BCACHE
191 if (bch_bio_to_cgroup(bio)->verify)
192 return true;
193#endif
194 return dc->verify;
195}
196
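/*
 * Checksum the data in @bio and stash the result in the pointer slot after
 * @k's last pointer (the top bit is reserved, hence the mask below).
 */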
197static void bio_csum(struct bio *bio, struct bkey *k)
198{
199 struct bio_vec *bv;
200 uint64_t csum = 0;
201 int i;
202
203 bio_for_each_segment(bv, bio, i) {
204 void *d = kmap(bv->bv_page) + bv->bv_offset;
 205 csum = bch_crc64_update(csum, d, bv->bv_len);
206 kunmap(bv->bv_page);
207 }
208
209 k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
210}
211
212/* Insert data into cache */
213
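/*
 * Instead of inserting the data, emit keys with no pointers covering the
 * bio's range; inserting those keys invalidates whatever the cache held there.
 */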
214static void bio_invalidate(struct closure *cl)
215{
216 struct btree_op *op = container_of(cl, struct btree_op, cl);
217 struct bio *bio = op->cache_bio;
218
219 pr_debug("invalidating %i sectors from %llu",
220 bio_sectors(bio), (uint64_t) bio->bi_sector);
221
222 while (bio_sectors(bio)) {
223 unsigned len = min(bio_sectors(bio), 1U << 14);
224
225 if (bch_keylist_realloc(&op->keys, 0, op->c))
226 goto out;
227
228 bio->bi_sector += len;
229 bio->bi_size -= len << 9;
230
231 bch_keylist_add(&op->keys,
232 &KEY(op->inode, bio->bi_sector, len));
233 }
234
235 op->insert_data_done = true;
236 bio_put(bio);
237out:
238 continue_at(cl, bch_journal, bcache_wq);
239}
240
241struct open_bucket {
242 struct list_head list;
243 struct task_struct *last;
244 unsigned sectors_free;
245 BKEY_PADDED(key);
246};
247
248void bch_open_buckets_free(struct cache_set *c)
249{
250 struct open_bucket *b;
251
252 while (!list_empty(&c->data_buckets)) {
253 b = list_first_entry(&c->data_buckets,
254 struct open_bucket, list);
255 list_del(&b->list);
256 kfree(b);
257 }
258}
259
260int bch_open_buckets_alloc(struct cache_set *c)
261{
262 int i;
263
264 spin_lock_init(&c->data_bucket_lock);
265
266 for (i = 0; i < 6; i++) {
267 struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
268 if (!b)
269 return -ENOMEM;
270
271 list_add(&b->list, &c->data_buckets);
272 }
273
274 return 0;
275}
276
277/*
278 * We keep multiple buckets open for writes, and try to segregate different
279 * write streams for better cache utilization: first we look for a bucket where
280 * the last write to it was sequential with the current write, and failing that
281 * we look for a bucket that was last used by the same task.
282 *
 283 * The idea is that if you've got multiple tasks pulling data into the cache at the
284 * same time, you'll get better cache utilization if you try to segregate their
285 * data and preserve locality.
286 *
 287 * For example, say you've started Firefox at the same time you're copying a
288 * bunch of files. Firefox will likely end up being fairly hot and stay in the
289 * cache awhile, but the data you copied might not be; if you wrote all that
290 * data to the same buckets it'd get invalidated at the same time.
291 *
292 * Both of those tasks will be doing fairly random IO so we can't rely on
293 * detecting sequential IO to segregate their data, but going off of the task
294 * should be a sane heuristic.
295 */
296static struct open_bucket *pick_data_bucket(struct cache_set *c,
297 const struct bkey *search,
298 struct task_struct *task,
299 struct bkey *alloc)
300{
301 struct open_bucket *ret, *ret_task = NULL;
302
303 list_for_each_entry_reverse(ret, &c->data_buckets, list)
304 if (!bkey_cmp(&ret->key, search))
305 goto found;
306 else if (ret->last == task)
307 ret_task = ret;
308
309 ret = ret_task ?: list_first_entry(&c->data_buckets,
310 struct open_bucket, list);
311found:
312 if (!ret->sectors_free && KEY_PTRS(alloc)) {
313 ret->sectors_free = c->sb.bucket_size;
314 bkey_copy(&ret->key, alloc);
315 bkey_init(alloc);
316 }
317
318 if (!ret->sectors_free)
319 ret = NULL;
320
321 return ret;
322}
323
324/*
 325 * Allocates some space in the cache to write to, sets k to point to the newly
 326 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
 327 * end of the newly allocated space).
328 *
 329 * May allocate fewer sectors than @sectors; KEY_SIZE(k) indicates how many
330 * sectors were actually allocated.
331 *
332 * If s->writeback is true, will not fail.
333 */
334static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
335 struct search *s)
336{
337 struct cache_set *c = s->op.c;
338 struct open_bucket *b;
339 BKEY_PADDED(key) alloc;
340 struct closure cl, *w = NULL;
341 unsigned i;
342
343 if (s->writeback) {
344 closure_init_stack(&cl);
345 w = &cl;
346 }
347
348 /*
349 * We might have to allocate a new bucket, which we can't do with a
350 * spinlock held. So if we have to allocate, we drop the lock, allocate
351 * and then retry. KEY_PTRS() indicates whether alloc points to
352 * allocated bucket(s).
353 */
354
355 bkey_init(&alloc.key);
356 spin_lock(&c->data_bucket_lock);
357
358 while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) {
359 unsigned watermark = s->op.write_prio
360 ? WATERMARK_MOVINGGC
361 : WATERMARK_NONE;
362
363 spin_unlock(&c->data_bucket_lock);
364
365 if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w))
366 return false;
367
368 spin_lock(&c->data_bucket_lock);
369 }
370
371 /*
372 * If we had to allocate, we might race and not need to allocate the
 373 * second time we call pick_data_bucket(). If we allocated a bucket but
374 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
375 */
376 if (KEY_PTRS(&alloc.key))
377 __bkey_put(c, &alloc.key);
378
379 for (i = 0; i < KEY_PTRS(&b->key); i++)
380 EBUG_ON(ptr_stale(c, &b->key, i));
381
382 /* Set up the pointer to the space we're allocating: */
383
384 for (i = 0; i < KEY_PTRS(&b->key); i++)
385 k->ptr[i] = b->key.ptr[i];
386
387 sectors = min(sectors, b->sectors_free);
388
389 SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
390 SET_KEY_SIZE(k, sectors);
391 SET_KEY_PTRS(k, KEY_PTRS(&b->key));
392
393 /*
394 * Move b to the end of the lru, and keep track of what this bucket was
395 * last used for:
396 */
397 list_move_tail(&b->list, &c->data_buckets);
398 bkey_copy_key(&b->key, k);
399 b->last = s->task;
400
401 b->sectors_free -= sectors;
402
403 for (i = 0; i < KEY_PTRS(&b->key); i++) {
404 SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
405
406 atomic_long_add(sectors,
407 &PTR_CACHE(c, &b->key, i)->sectors_written);
408 }
409
410 if (b->sectors_free < c->sb.block_size)
411 b->sectors_free = 0;
412
413 /*
414 * k takes refcounts on the buckets it points to until it's inserted
 415 * into the btree, but if we're done with this bucket we just transfer the
 416 * refcount the open bucket already holds.
417 */
418 if (b->sectors_free)
419 for (i = 0; i < KEY_PTRS(&b->key); i++)
420 atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
421
422 spin_unlock(&c->data_bucket_lock);
423 return true;
424}
425
426static void bch_insert_data_error(struct closure *cl)
427{
428 struct btree_op *op = container_of(cl, struct btree_op, cl);
429
430 /*
431 * Our data write just errored, which means we've got a bunch of keys to
 432 * insert that point to data that wasn't successfully written.
433 *
434 * We don't have to insert those keys but we still have to invalidate
435 * that region of the cache - so, if we just strip off all the pointers
436 * from the keys we'll accomplish just that.
437 */
438
 439 struct bkey *src = op->keys.keys, *dst = op->keys.keys;
440
441 while (src != op->keys.top) {
442 struct bkey *n = bkey_next(src);
443
444 SET_KEY_PTRS(src, 0);
 445 memmove(dst, src, bkey_bytes(src));
446
447 dst = bkey_next(dst);
448 src = n;
449 }
450
451 op->keys.top = dst;
452
453 bch_journal(cl);
454}
455
456static void bch_insert_data_endio(struct bio *bio, int error)
457{
458 struct closure *cl = bio->bi_private;
459 struct btree_op *op = container_of(cl, struct btree_op, cl);
460 struct search *s = container_of(op, struct search, op);
461
462 if (error) {
463 /* TODO: We could try to recover from this. */
464 if (s->writeback)
465 s->error = error;
466 else if (s->write)
467 set_closure_fn(cl, bch_insert_data_error, bcache_wq);
468 else
469 set_closure_fn(cl, NULL, NULL);
470 }
471
472 bch_bbio_endio(op->c, bio, error, "writing data to cache");
473}
474
475static void bch_insert_data_loop(struct closure *cl)
476{
477 struct btree_op *op = container_of(cl, struct btree_op, cl);
478 struct search *s = container_of(op, struct search, op);
479 struct bio *bio = op->cache_bio, *n;
480
 481 if (op->bypass)
482 return bio_invalidate(cl);
483
484 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
485 set_gc_sectors(op->c);
486 bch_queue_gc(op->c);
487 }
488
489 /*
490 * Journal writes are marked REQ_FLUSH; if the original write was a
491 * flush, it'll wait on the journal write.
492 */
493 bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA);
494
495 do {
496 unsigned i;
497 struct bkey *k;
498 struct bio_set *split = s->d
499 ? s->d->bio_split : op->c->bio_split;
500
501 /* 1 for the device pointer and 1 for the chksum */
502 if (bch_keylist_realloc(&op->keys,
503 1 + (op->csum ? 1 : 0),
504 op->c))
505 continue_at(cl, bch_journal, bcache_wq);
506
507 k = op->keys.top;
508 bkey_init(k);
509 SET_KEY_INODE(k, op->inode);
510 SET_KEY_OFFSET(k, bio->bi_sector);
511
512 if (!bch_alloc_sectors(k, bio_sectors(bio), s))
513 goto err;
514
515 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
516
517 n->bi_end_io = bch_insert_data_endio;
518 n->bi_private = cl;
519
520 if (s->writeback) {
521 SET_KEY_DIRTY(k, true);
522
523 for (i = 0; i < KEY_PTRS(k); i++)
524 SET_GC_MARK(PTR_BUCKET(op->c, k, i),
525 GC_MARK_DIRTY);
526 }
527
528 SET_KEY_CSUM(k, op->csum);
529 if (KEY_CSUM(k))
530 bio_csum(n, k);
531
 532 trace_bcache_cache_insert(k);
533 bch_keylist_push(&op->keys);
534
535 n->bi_rw |= REQ_WRITE;
536 bch_submit_bbio(n, op->c, k, 0);
537 } while (n != bio);
538
539 op->insert_data_done = true;
540 continue_at(cl, bch_journal, bcache_wq);
541err:
542 /* bch_alloc_sectors() blocks if s->writeback = true */
543 BUG_ON(s->writeback);
544
545 /*
546 * But if it's not a writeback write we'd rather just bail out if
 547 * there aren't any buckets ready to write to - it might take a while and
548 * we might be starving btree writes for gc or something.
549 */
550
551 if (s->write) {
552 /*
553 * Writethrough write: We can't complete the write until we've
554 * updated the index. But we don't want to delay the write while
555 * we wait for buckets to be freed up, so just invalidate the
556 * rest of the write.
557 */
 558 op->bypass = true;
559 return bio_invalidate(cl);
560 } else {
561 /*
562 * From a cache miss, we can just insert the keys for the data
563 * we have written or bail out if we didn't do anything.
564 */
565 op->insert_data_done = true;
566 bio_put(bio);
567
568 if (!bch_keylist_empty(&op->keys))
569 continue_at(cl, bch_journal, bcache_wq);
570 else
571 closure_return(cl);
572 }
573}
574
575/**
576 * bch_insert_data - stick some data in the cache
577 *
578 * This is the starting point for any data to end up in a cache device; it could
579 * be from a normal write, or a writeback write, or a write to a flash only
580 * volume - it's also used by the moving garbage collector to compact data in
581 * mostly empty buckets.
582 *
583 * It first writes the data to the cache, creating a list of keys to be inserted
584 * (if the data had to be fragmented there will be multiple keys); after the
585 * data is written it calls bch_journal, and after the keys have been added to
586 * the next journal write they're inserted into the btree.
587 *
588 * It inserts the data in op->cache_bio; bi_sector is used for the key offset,
589 * and op->inode is used for the key inode.
590 *
591 * If op->bypass is true, instead of inserting the data it invalidates the
592 * region of the cache represented by op->cache_bio and op->inode.
593 */
594void bch_insert_data(struct closure *cl)
595{
596 struct btree_op *op = container_of(cl, struct btree_op, cl);
597
598 bch_keylist_init(&op->keys);
599 bio_get(op->cache_bio);
600 bch_insert_data_loop(cl);
601}
602
603void bch_btree_insert_async(struct closure *cl)
604{
605 struct btree_op *op = container_of(cl, struct btree_op, cl);
606 struct search *s = container_of(op, struct search, op);
607
 608 if (bch_btree_insert(op, op->c, &op->keys)) {
609 s->error = -ENOMEM;
610 op->insert_data_done = true;
611 }
612
613 if (op->insert_data_done) {
614 bch_keylist_free(&op->keys);
615 closure_return(cl);
616 } else
617 continue_at(cl, bch_insert_data_loop, bcache_wq);
618}
619
620/* Common code for the make_request functions */
621
622static void request_endio(struct bio *bio, int error)
623{
624 struct closure *cl = bio->bi_private;
625
626 if (error) {
627 struct search *s = container_of(cl, struct search, cl);
628 s->error = error;
629 /* Only cache read errors are recoverable */
630 s->recoverable = false;
631 }
632
633 bio_put(bio);
634 closure_put(cl);
635}
636
637void bch_cache_read_endio(struct bio *bio, int error)
638{
639 struct bbio *b = container_of(bio, struct bbio, bio);
640 struct closure *cl = bio->bi_private;
641 struct search *s = container_of(cl, struct search, cl);
642
643 /*
644 * If the bucket was reused while our bio was in flight, we might have
645 * read the wrong data. Set s->error but not error so it doesn't get
646 * counted against the cache device, but we'll still reread the data
647 * from the backing device.
648 */
649
650 if (error)
651 s->error = error;
652 else if (ptr_stale(s->op.c, &b->key, 0)) {
653 atomic_long_inc(&s->op.c->cache_read_races);
654 s->error = -EINTR;
655 }
656
657 bch_bbio_endio(s->op.c, bio, error, "reading from cache");
658}
659
660static void bio_complete(struct search *s)
661{
662 if (s->orig_bio) {
663 int cpu, rw = bio_data_dir(s->orig_bio);
664 unsigned long duration = jiffies - s->start_time;
665
666 cpu = part_stat_lock();
667 part_round_stats(cpu, &s->d->disk->part0);
668 part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
669 part_stat_unlock();
670
671 trace_bcache_request_end(s, s->orig_bio);
672 bio_endio(s->orig_bio, s->error);
673 s->orig_bio = NULL;
674 }
675}
676
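/*
 * Copy the original bio into the search's embedded bio so it can be
 * resubmitted (to the cache or the backing device) with our own endio.
 */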
677static void do_bio_hook(struct search *s)
678{
679 struct bio *bio = &s->bio.bio;
680 memcpy(bio, s->orig_bio, sizeof(struct bio));
681
682 bio->bi_end_io = request_endio;
683 bio->bi_private = &s->cl;
684 atomic_set(&bio->bi_cnt, 3);
685}
686
687static void search_free(struct closure *cl)
688{
689 struct search *s = container_of(cl, struct search, cl);
690 bio_complete(s);
691
692 if (s->op.cache_bio)
693 bio_put(s->op.cache_bio);
694
695 if (s->unaligned_bvec)
696 mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
697
698 closure_debug_destroy(cl);
699 mempool_free(s, s->d->c->search);
700}
701
702static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
703{
704 struct bio_vec *bv;
705 struct search *s = mempool_alloc(d->c->search, GFP_NOIO);
706 memset(s, 0, offsetof(struct search, op.keys));
707
708 __closure_init(&s->cl, NULL);
709
710 s->op.inode = d->id;
711 s->op.c = d->c;
712 s->d = d;
713 s->op.lock = -1;
714 s->task = current;
715 s->orig_bio = bio;
716 s->write = (bio->bi_rw & REQ_WRITE) != 0;
 717 s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
718 s->recoverable = 1;
719 s->start_time = jiffies;
720 do_bio_hook(s);
721
722 if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
723 bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
724 memcpy(bv, bio_iovec(bio),
725 sizeof(struct bio_vec) * bio_segments(bio));
726
727 s->bio.bio.bi_io_vec = bv;
728 s->unaligned_bvec = 1;
729 }
730
731 return s;
732}
733
734static void btree_read_async(struct closure *cl)
735{
736 struct btree_op *op = container_of(cl, struct btree_op, cl);
737
738 int ret = btree_root(search_recurse, op->c, op);
739
740 if (ret == -EAGAIN)
741 continue_at(cl, btree_read_async, bcache_wq);
742
743 closure_return(cl);
744}
745
746/* Cached devices */
747
748static void cached_dev_bio_complete(struct closure *cl)
749{
750 struct search *s = container_of(cl, struct search, cl);
751 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
752
753 search_free(cl);
754 cached_dev_put(dc);
755}
756
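/*
 * Returns 0 if the cache set isn't congested; otherwise returns a sector
 * count threshold - the more congested we are, the smaller the returned
 * value, and the more IO check_should_bypass() sends straight to the
 * backing device.
 */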
757unsigned bch_get_congested(struct cache_set *c)
758{
759 int i;
760 long rand;
761
762 if (!c->congested_read_threshold_us &&
763 !c->congested_write_threshold_us)
764 return 0;
765
766 i = (local_clock_us() - c->congested_last_us) / 1024;
767 if (i < 0)
768 return 0;
769
770 i += atomic_read(&c->congested);
771 if (i >= 0)
772 return 0;
773
774 i += CONGESTED_MAX;
775
776 if (i > 0)
777 i = fract_exp_two(i, 6);
778
779 rand = get_random_int();
780 i -= bitmap_weight(&rand, BITS_PER_LONG);
781
782 return i > 0 ? i : 1;
783}
784
785static void add_sequential(struct task_struct *t)
786{
787 ewma_add(t->sequential_io_avg,
788 t->sequential_io, 8, 0);
789
790 t->sequential_io = 0;
791}
792
793static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
794{
795 return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
796}
797
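/*
 * Decide whether to cache this request at all: bypass if the device is
 * detaching or the cache is nearly full, if the cache mode says so, if the
 * IO is misaligned, or if it looks sequential/congested past the cutoffs.
 */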
798static bool check_should_bypass(struct cached_dev *dc, struct search *s)
799{
800 struct cache_set *c = s->op.c;
801 struct bio *bio = &s->bio.bio;
802 unsigned mode = cache_mode(dc, bio);
803 unsigned sectors, congested = bch_get_congested(c);
804
805 if (atomic_read(&dc->disk.detaching) ||
806 c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
807 (bio->bi_rw & REQ_DISCARD))
808 goto skip;
809
810 if (mode == CACHE_MODE_NONE ||
811 (mode == CACHE_MODE_WRITEAROUND &&
812 (bio->bi_rw & REQ_WRITE)))
813 goto skip;
814
815 if (bio->bi_sector & (c->sb.block_size - 1) ||
816 bio_sectors(bio) & (c->sb.block_size - 1)) {
817 pr_debug("skipping unaligned io");
818 goto skip;
819 }
820
821 if (!congested && !dc->sequential_cutoff)
822 goto rescale;
823
824 if (!congested &&
825 mode == CACHE_MODE_WRITEBACK &&
826 (bio->bi_rw & REQ_WRITE) &&
827 (bio->bi_rw & REQ_SYNC))
828 goto rescale;
829
830 if (dc->sequential_merge) {
831 struct io *i;
832
833 spin_lock(&dc->io_lock);
834
835 hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
836 if (i->last == bio->bi_sector &&
837 time_before(jiffies, i->jiffies))
838 goto found;
839
840 i = list_first_entry(&dc->io_lru, struct io, lru);
841
842 add_sequential(s->task);
843 i->sequential = 0;
844found:
845 if (i->sequential + bio->bi_size > i->sequential)
846 i->sequential += bio->bi_size;
847
848 i->last = bio_end_sector(bio);
849 i->jiffies = jiffies + msecs_to_jiffies(5000);
850 s->task->sequential_io = i->sequential;
851
852 hlist_del(&i->hash);
853 hlist_add_head(&i->hash, iohash(dc, i->last));
854 list_move_tail(&i->lru, &dc->io_lru);
855
856 spin_unlock(&dc->io_lock);
857 } else {
858 s->task->sequential_io = bio->bi_size;
859
860 add_sequential(s->task);
861 }
862
863 sectors = max(s->task->sequential_io,
864 s->task->sequential_io_avg) >> 9;
865
866 if (dc->sequential_cutoff &&
867 sectors >= dc->sequential_cutoff >> 9) {
868 trace_bcache_bypass_sequential(s->orig_bio);
869 goto skip;
870 }
871
872 if (congested && sectors >= congested) {
873 trace_bcache_bypass_congested(s->orig_bio);
874 goto skip;
875 }
876
877rescale:
878 bch_rescale_priorities(c, bio_sectors(bio));
879 return false;
880skip:
881 bch_mark_sectors_bypassed(s, bio_sectors(bio));
882 return true;
883}
884
885/* Process reads */
886
887static void cached_dev_read_complete(struct closure *cl)
888{
889 struct search *s = container_of(cl, struct search, cl);
890
891 if (s->op.insert_collision)
892 bch_mark_cache_miss_collision(s);
893
894 if (s->op.cache_bio) {
895 int i;
896 struct bio_vec *bv;
897
898 __bio_for_each_segment(bv, s->op.cache_bio, i, 0)
899 __free_page(bv->bv_page);
900 }
901
902 cached_dev_bio_complete(cl);
903}
904
905static void request_read_error(struct closure *cl)
906{
907 struct search *s = container_of(cl, struct search, cl);
908 struct bio_vec *bv;
909 int i;
910
911 if (s->recoverable) {
912 /* Retry from the backing device: */
913 trace_bcache_read_retry(s->orig_bio);
914
915 s->error = 0;
916 bv = s->bio.bio.bi_io_vec;
917 do_bio_hook(s);
918 s->bio.bio.bi_io_vec = bv;
919
920 if (!s->unaligned_bvec)
921 bio_for_each_segment(bv, s->orig_bio, i)
922 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
923 else
924 memcpy(s->bio.bio.bi_io_vec,
925 bio_iovec(s->orig_bio),
926 sizeof(struct bio_vec) *
927 bio_segments(s->orig_bio));
928
929 /* XXX: invalidate cache */
930
931 closure_bio_submit(&s->bio.bio, &s->cl, s->d);
932 }
933
934 continue_at(cl, cached_dev_read_complete, NULL);
935}
936
937static void request_read_done(struct closure *cl)
938{
939 struct search *s = container_of(cl, struct search, cl);
940 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
941
942 /*
943 * s->cache_bio != NULL implies that we had a cache miss; cache_bio now
944 * contains data ready to be inserted into the cache.
945 *
946 * First, we copy the data we just read from cache_bio's bounce buffers
947 * to the buffers the original bio pointed to:
948 */
949
950 if (s->op.cache_bio) {
951 bio_reset(s->op.cache_bio);
952 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector;
953 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev;
954 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
 955 bch_bio_map(s->op.cache_bio, NULL);
 956
 957 bio_copy_data(s->cache_miss, s->op.cache_bio);
958
959 bio_put(s->cache_miss);
960 s->cache_miss = NULL;
961 }
962
963 if (verify(dc, &s->bio.bio) && s->recoverable)
964 bch_data_verify(s);
965
966 bio_complete(s);
967
968 if (s->op.cache_bio &&
969 !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) {
970 s->op.type = BTREE_REPLACE;
971 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
972 }
973
974 continue_at(cl, cached_dev_read_complete, NULL);
975}
976
977static void request_read_done_bh(struct closure *cl)
978{
979 struct search *s = container_of(cl, struct search, cl);
980 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
981
982 bch_mark_cache_accounting(s, !s->cache_miss, s->op.bypass);
983 trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.bypass);
984
985 if (s->error)
986 continue_at_nobarrier(cl, request_read_error, bcache_wq);
987 else if (s->op.cache_bio || verify(dc, &s->bio.bio))
988 continue_at_nobarrier(cl, request_read_done, bcache_wq);
989 else
990 continue_at_nobarrier(cl, cached_dev_read_complete, NULL);
991}
992
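/*
 * Cache miss on a cached device: read the missing range (plus readahead,
 * when allowed) from the backing device into a bounce bio, if we can
 * allocate one, so request_read_done() can later insert it into the cache.
 */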
993static int cached_dev_cache_miss(struct btree *b, struct search *s,
994 struct bio *bio, unsigned sectors)
995{
996 int ret = 0;
 997 unsigned reada = 0;
998 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
999 struct bio *miss;
1000
 1001 if (s->cache_miss || s->op.bypass) {
1002 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
1003 if (miss == bio)
1004 s->op.lookup_done = true;
1005 goto out_submit;
1006 }
 1007
1008 if (!(bio->bi_rw & REQ_RAHEAD) &&
1009 !(bio->bi_rw & REQ_META) &&
1010 s->op.c->gc_stats.in_use < CUTOFF_CACHE_READA)
1011 reada = min_t(sector_t, dc->readahead >> 9,
1012 bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));
 1013
 1014 s->cache_bio_sectors = min(sectors, bio_sectors(bio) + reada);
 1015
1016 s->op.replace = KEY(s->op.inode, bio->bi_sector +
1017 s->cache_bio_sectors, s->cache_bio_sectors);
1018
1019 ret = bch_btree_insert_check_key(b, &s->op, &s->op.replace);
1020 if (ret)
1021 return ret;
1022
1023 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
1024 if (miss == bio)
1025 s->op.lookup_done = true;
1026 else
1027 /* btree_search_recurse()'s btree iterator is no good anymore */
1028 ret = -EINTR;
 1029
1030 s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT,
1031 DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
1032 dc->disk.bio_split);
1033
1034 if (!s->op.cache_bio)
1035 goto out_submit;
1036
1037 s->op.cache_bio->bi_sector = miss->bi_sector;
1038 s->op.cache_bio->bi_bdev = miss->bi_bdev;
1039 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
1040
1041 s->op.cache_bio->bi_end_io = request_endio;
1042 s->op.cache_bio->bi_private = &s->cl;
1043
 1044 bch_bio_map(s->op.cache_bio, NULL);
 1045 if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
1046 goto out_put;
1047
1048 s->cache_miss = miss;
1049 bio_get(s->op.cache_bio);
1050
cafe5635
KO
1051 closure_bio_submit(s->op.cache_bio, &s->cl, s->d);
1052
1053 return ret;
1054out_put:
1055 bio_put(s->op.cache_bio);
1056 s->op.cache_bio = NULL;
1057out_submit:
1058 miss->bi_end_io = request_endio;
1059 miss->bi_private = &s->cl;
1060 closure_bio_submit(miss, &s->cl, s->d);
1061 return ret;
1062}
1063
1064static void request_read(struct cached_dev *dc, struct search *s)
1065{
1066 struct closure *cl = &s->cl;
1067
 1068 closure_call(&s->op.cl, btree_read_async, NULL, cl);
1069 continue_at(cl, request_read_done_bh, NULL);
1070}
1071
1072/* Process writes */
1073
1074static void cached_dev_write_complete(struct closure *cl)
1075{
1076 struct search *s = container_of(cl, struct search, cl);
1077 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
1078
1079 up_read_non_owner(&dc->writeback_lock);
1080 cached_dev_bio_complete(cl);
1081}
1082
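/*
 * Write to a cached device: depending on overlap with dirty data, the cache
 * mode and discard support, the data is either bypassed (cache invalidated),
 * written back (cache only, marked dirty) or written through (cache and
 * backing device).
 */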
1083static void request_write(struct cached_dev *dc, struct search *s)
1084{
1085 struct closure *cl = &s->cl;
1086 struct bio *bio = &s->bio.bio;
1087 struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0);
1088 struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);
1089
1090 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
1091
 1092 down_read_non_owner(&dc->writeback_lock);
 1093 if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
1094 /*
1095 * We overlap with some dirty data undergoing background
 1096 * writeback, so force this write to writeback
1097 */
1098 s->op.bypass = false;
1099 s->writeback = true;
1100 }
1101
1102 /*
1103 * Discards aren't _required_ to do anything, so skipping if
1104 * check_overlapping returned true is ok
1105 *
1106 * But check_overlapping drops dirty keys for which io hasn't started,
1107 * so we still want to call it.
1108 */
 1109 if (bio->bi_rw & REQ_DISCARD)
 1110 s->op.bypass = true;
 1111
1112 if (should_writeback(dc, s->orig_bio,
1113 cache_mode(dc, bio),
1114 s->op.bypass)) {
1115 s->op.bypass = false;
1116 s->writeback = true;
1117 }
1118
 1119 trace_bcache_write(s->orig_bio, s->writeback, s->op.bypass);
 1120
1121 if (s->op.bypass) {
1122 s->op.cache_bio = s->orig_bio;
1123 bio_get(s->op.cache_bio);
 1124
1125 if (!(bio->bi_rw & REQ_DISCARD) ||
1126 blk_queue_discard(bdev_get_queue(dc->bdev)))
1127 closure_bio_submit(bio, cl, s->d);
1128 } else if (s->writeback) {
 1129 bch_writeback_add(dc);
 1130 s->op.cache_bio = bio;
 1131
 1132 if (bio->bi_rw & REQ_FLUSH) {
 1133 /* Also need to send a flush to the backing device */
 1134 struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
 1135 dc->disk.bio_split);
 1136
1137 flush->bi_rw = WRITE_FLUSH;
1138 flush->bi_bdev = bio->bi_bdev;
1139 flush->bi_end_io = request_endio;
1140 flush->bi_private = cl;
1141
1142 closure_bio_submit(flush, cl, s->d);
 1143 }
1144 } else {
1145 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
1146 dc->disk.bio_split);
1147
1148 closure_bio_submit(bio, cl, s->d);
 1149 }
 1150
1151 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
1152 continue_at(cl, cached_dev_write_complete, NULL);
1153}
1154
1155static void request_nodata(struct cached_dev *dc, struct search *s)
1156{
1157 struct closure *cl = &s->cl;
1158 struct bio *bio = &s->bio.bio;
1159
1160 if (s->op.flush_journal)
1161 bch_journal_meta(s->op.c, cl);
1162
 1163 /* If it's a flush, we send the flush to the backing device too */
1164 closure_bio_submit(bio, cl, s->d);
1165
1166 continue_at(cl, cached_dev_bio_complete, NULL);
1167}
1168
1169/* Cached devices - read & write stuff */
1170
1171static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
1172{
1173 struct search *s;
1174 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1175 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1176 int cpu, rw = bio_data_dir(bio);
1177
1178 cpu = part_stat_lock();
1179 part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1180 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1181 part_stat_unlock();
1182
1183 bio->bi_bdev = dc->bdev;
 1184 bio->bi_sector += dc->sb.data_offset;
1185
1186 if (cached_dev_get(dc)) {
1187 s = search_alloc(bio, d);
1188 trace_bcache_request_start(s, bio);
1189
 1190 if (!bio->bi_size)
 1191 request_nodata(dc, s);
1192 else {
1193 s->op.bypass = check_should_bypass(dc, s);
1194
1195 if (rw)
1196 request_write(dc, s);
1197 else
1198 request_read(dc, s);
1199 }
1200 } else {
1201 if ((bio->bi_rw & REQ_DISCARD) &&
1202 !blk_queue_discard(bdev_get_queue(dc->bdev)))
1203 bio_endio(bio, 0);
1204 else
1205 bch_generic_make_request(bio, &d->bio_split_hook);
1206 }
1207}
1208
1209static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
1210 unsigned int cmd, unsigned long arg)
1211{
1212 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1213 return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
1214}
1215
1216static int cached_dev_congested(void *data, int bits)
1217{
1218 struct bcache_device *d = data;
1219 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1220 struct request_queue *q = bdev_get_queue(dc->bdev);
1221 int ret = 0;
1222
1223 if (bdi_congested(&q->backing_dev_info, bits))
1224 return 1;
1225
1226 if (cached_dev_get(dc)) {
1227 unsigned i;
1228 struct cache *ca;
1229
1230 for_each_cache(ca, d->c, i) {
1231 q = bdev_get_queue(ca->bdev);
1232 ret |= bdi_congested(&q->backing_dev_info, bits);
1233 }
1234
1235 cached_dev_put(dc);
1236 }
1237
1238 return ret;
1239}
1240
1241void bch_cached_dev_request_init(struct cached_dev *dc)
1242{
1243 struct gendisk *g = dc->disk.disk;
1244
1245 g->queue->make_request_fn = cached_dev_make_request;
1246 g->queue->backing_dev_info.congested_fn = cached_dev_congested;
1247 dc->disk.cache_miss = cached_dev_cache_miss;
1248 dc->disk.ioctl = cached_dev_ioctl;
1249}
1250
1251/* Flash backed devices */
1252
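/*
 * Flash-only volumes have no backing device, so a cache miss just means
 * zero filling the relevant part of the bio.
 */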
1253static int flash_dev_cache_miss(struct btree *b, struct search *s,
1254 struct bio *bio, unsigned sectors)
1255{
1256 struct bio_vec *bv;
1257 int i;
1258
1259 /* Zero fill bio */
1260
 1261 bio_for_each_segment(bv, bio, i) {
1262 unsigned j = min(bv->bv_len >> 9, sectors);
1263
1264 void *p = kmap(bv->bv_page);
1265 memset(p + bv->bv_offset, 0, j << 9);
1266 kunmap(bv->bv_page);
1267
 1268 sectors -= j;
1269 }
1270
1271 bio_advance(bio, min(sectors << 9, bio->bi_size));
1272
1273 if (!bio->bi_size)
1274 s->op.lookup_done = true;
1275
1276 return 0;
1277}
1278
1279static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1280{
1281 struct search *s;
1282 struct closure *cl;
1283 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1284 int cpu, rw = bio_data_dir(bio);
1285
1286 cpu = part_stat_lock();
1287 part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1288 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1289 part_stat_unlock();
1290
1291 s = search_alloc(bio, d);
1292 cl = &s->cl;
1293 bio = &s->bio.bio;
1294
1295 trace_bcache_request_start(s, bio);
1296
1297 if (!bio->bi_size) {
1298 if (s->op.flush_journal)
1299 bch_journal_meta(s->op.c, cl);
1300 } else if (rw) {
 1301 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
1302 &KEY(d->id, bio->bi_sector, 0),
1303 &KEY(d->id, bio_end_sector(bio), 0));
 1304
 1305 s->op.bypass = (bio->bi_rw & REQ_DISCARD) != 0;
1306 s->writeback = true;
1307 s->op.cache_bio = bio;
1308
1309 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
1310 } else {
 1311 closure_call(&s->op.cl, btree_read_async, NULL, cl);
1312 }
1313
1314 continue_at(cl, search_free, NULL);
1315}
1316
1317static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
1318 unsigned int cmd, unsigned long arg)
1319{
1320 return -ENOTTY;
1321}
1322
1323static int flash_dev_congested(void *data, int bits)
1324{
1325 struct bcache_device *d = data;
1326 struct request_queue *q;
1327 struct cache *ca;
1328 unsigned i;
1329 int ret = 0;
1330
1331 for_each_cache(ca, d->c, i) {
1332 q = bdev_get_queue(ca->bdev);
1333 ret |= bdi_congested(&q->backing_dev_info, bits);
1334 }
1335
1336 return ret;
1337}
1338
1339void bch_flash_dev_request_init(struct bcache_device *d)
1340{
1341 struct gendisk *g = d->disk;
1342
1343 g->queue->make_request_fn = flash_dev_make_request;
1344 g->queue->backing_dev_info.congested_fn = flash_dev_congested;
1345 d->cache_miss = flash_dev_cache_miss;
1346 d->ioctl = flash_dev_ioctl;
1347}
1348
1349void bch_request_exit(void)
1350{
1351#ifdef CONFIG_CGROUP_BCACHE
1352 cgroup_unload_subsys(&bcache_subsys);
1353#endif
1354 if (bch_search_cache)
1355 kmem_cache_destroy(bch_search_cache);
1356}
1357
1358int __init bch_request_init(void)
1359{
1360 bch_search_cache = KMEM_CACHE(search, 0);
1361 if (!bch_search_cache)
1362 return -ENOMEM;
1363
1364#ifdef CONFIG_CGROUP_BCACHE
1365 cgroup_load_subsys(&bcache_subsys);
1366 init_bch_cgroup(&bcache_default_cgroup);
1367
1368 cgroup_add_cftypes(&bcache_subsys, bch_files);
1369#endif
1370 return 0;
1371}