/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the SSD.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/bcache.txt.
 */
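/*
 * Editorial sketch (not from the original source): the 8 bit gen scheme
 * above means a pointer is dead once its generation no longer matches the
 * bucket's - freeing is just a generation bump. Roughly:
 *
 *	uint8_t bucket_gen;	// incremented to invalidate the bucket
 *	uint8_t ptr_gen;	// copied from the bucket when allocated
 *	bool stale = bucket_gen != ptr_gen;
 *
 * The real check (ptr_stale(), via gen_after()) also handles the counter
 * wrapping around, which is why garbage collection must eventually rewrite
 * stale pointers out of the btree.
 */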
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "request.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <trace/events/bcache.h>
/*
 * Todo:
 * register_bcache: Return errors out to userspace correctly
 *
 * Writeback: don't undirty key until after a cache flush
 *
 * Create an iterator for key pointers
 *
 * On btree write error, mark bucket such that it won't be freed from the cache
 *
 * Check for bad keys in replay
 *
 * Refcount journal entries in journal_replay
 *
 * Finish incremental gc
 *
 * Gc should free old UUIDs, data for invalid UUIDs
 *
 * Provide a way to list backing device UUIDs we have data cached for, and
 * probably how long it's been since we've seen them, and a way to invalidate
 * dirty data for devices that will never be attached again
 *
 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
 * that based on that and how much dirty data we have we can keep writeback
 * from being too high a priority
 *
 * Add a tracepoint or somesuch to watch for writeback starvation
 *
 * When btree depth > 1 and splitting an interior node, we have to make sure
 * alloc_bucket() cannot fail. This should be true but is not completely
 * obvious.
 *
 * Make sure all allocations get charged to the root cgroup
 *
 * If data write is less than hard sector size of ssd, round up offset in open
 * bucket to the next whole sector
 *
 * Also lookup by cgroup in get_open_bucket()
 *
 * Superblock needs to be fleshed out for multiple cache devices
 *
 * Add a sysfs tunable for the number of writeback IOs in flight
 *
 * Add a sysfs tunable for the number of open data buckets
 *
 * IO tracking: Can we track when one process is doing io on behalf of another?
 * IO tracking: Don't use just an average, weigh more recent stuff higher
 *
 * Test module load/unload
 */
static const char * const op_types[] = {
	"insert", "replace"
};

static const char *op_type(struct btree_op *op)
{
	return op_types[op->type];
}
#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
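/*
 * Illustrative note (editorial): PTR_HASH() folds a key's first pointer
 * down to its bucket number plus generation, so two keys hash equal only
 * when they reference the same on-disk node, e.g.:
 *
 *	if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
 *		// b is the cached copy of the node k points to
 *
 * (this is exactly how mca_find() below uses it).
 */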
struct workqueue_struct *bch_gc_wq;
static struct workqueue_struct *btree_io_wq;
void bch_btree_op_init_stack(struct btree_op *op)
{
	memset(op, 0, sizeof(struct btree_op));
	closure_init_stack(&op->cl);
	op->lock = -1;
	bch_keylist_init(&op->keys);
}
/* Btree key manipulation */

static void bkey_put(struct cache_set *c, struct bkey *k, int level)
{
	if ((level && KEY_OFFSET(k)) || !level)
		__bkey_put(c, k);
}
static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{
	uint64_t crc = b->key.ptr[0];
	void *data = (void *) i + 8, *end = end(i);

	crc = bch_crc64_update(crc, data, end - data);
	return crc ^ 0xffffffffffffffffULL;
}
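/*
 * Editorial note: the "+ 8" above skips the bset's own 64 bit csum field,
 * so the checksum covers everything after it; seeding the crc with the
 * node's first pointer ties the checksum to the node's location on disk.
 */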
static void btree_bio_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	struct btree *b = container_of(cl, struct btree, io.cl);

	if (error)
		set_btree_node_io_error(b);

	bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
				 ? "writing btree" : "reading btree");
	closure_put(cl);
}
static void btree_bio_init(struct btree *b)
{
	BUG_ON(b->bio);
	b->bio = bch_bbio_alloc(b->c);

	b->bio->bi_end_io	= btree_bio_endio;
	b->bio->bi_private	= &b->io.cl;
}
void bch_btree_read_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct bset *i = b->sets[0].data;
	struct btree_iter *iter = b->c->fill_iter;
	const char *err = "bad btree header";
	BUG_ON(b->nsets || b->written);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;

	mutex_lock(&b->c->fill_lock);
	iter->used = 0;

	if (btree_node_io_error(b) ||
	    !i->seq)
		goto err;

	for (;
	     b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
	     i = write_block(b)) {
		err = "unsupported bset version";
		if (i->version > BCACHE_BSET_VERSION)
			goto err;

		err = "bad btree header";
		if (b->written + set_blocks(i, b->c) > btree_blocks(b))
			goto err;

		err = "bad magic";
		if (i->magic != bset_magic(b->c))
			goto err;

		err = "bad checksum";
		switch (i->version) {
		case 0:
			if (i->csum != csum_set(i))
				goto err;
			break;
		case BCACHE_BSET_VERSION:
			if (i->csum != btree_csum_set(b, i))
				goto err;
			break;
		}

		err = "empty set";
		if (i != b->sets[0].data && !i->keys)
			goto err;

		bch_btree_iter_push(iter, i->start, end(i));

		b->written += set_blocks(i, b->c);
	}

	err = "corrupted btree";
	for (i = write_block(b);
	     index(i, b) < btree_blocks(b);
	     i = ((void *) i) + block_bytes(b->c))
		if (i->seq == b->sets[0].data->seq)
			goto err;

	bch_btree_sort_and_fix_extents(b, iter);

	i = b->sets[0].data;
	err = "short btree key";
	if (b->sets[0].size &&
	    bkey_cmp(&b->key, &b->sets[0].end) < 0)
		goto err;

	if (b->written < btree_blocks(b))
		bch_bset_init_next(b);
out:
	mutex_unlock(&b->c->fill_lock);

	spin_lock(&b->c->btree_read_time_lock);
	bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
	spin_unlock(&b->c->btree_read_time_lock);

	smp_wmb(); /* read_done is our write lock */
	set_btree_node_read_done(b);

	closure_return(cl);
err:
	set_btree_node_io_error(b);
	bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
			    index(i, b), i->keys);
	goto out;
}
void bch_btree_read(struct btree *b)
{
	BUG_ON(b->nsets || b->written);

	if (!closure_trylock(&b->io.cl, &b->c->cl))
		BUG();

	b->io_start_time = local_clock();

	btree_bio_init(b);
	b->bio->bi_rw	= REQ_META|READ_SYNC;
	b->bio->bi_size	= KEY_SIZE(&b->key) << 9;

	bch_bio_map(b->bio, b->sets[0].data);

	pr_debug("%s", pbtree(b));
	trace_bcache_btree_read(b->bio);
	bch_submit_bbio(b->bio, b->c, &b->key, 0);

	continue_at(&b->io.cl, bch_btree_read_done, system_wq);
}
static void btree_complete_write(struct btree *b, struct btree_write *w)
{
	if (w->prio_blocked &&
	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
		wake_up(&b->c->alloc_wait);

	if (w->journal) {
		atomic_dec_bug(w->journal);
		__closure_wake_up(&b->c->journal.wait);
	}

	if (w->owner)
		closure_put(w->owner);

	w->prio_blocked	= 0;
	w->journal	= NULL;
	w->owner	= NULL;
}
static void __btree_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct btree_write *w = btree_prev_write(b);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;
	btree_complete_write(b, w);

	if (btree_node_dirty(b))
		queue_delayed_work(btree_io_wq, &b->work,
				   msecs_to_jiffies(30000));

	closure_return(cl);
}
static void btree_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct bio_vec *bv;
	int n;

	__bio_for_each_segment(bv, b->bio, n, 0)
		__free_page(bv->bv_page);

	__btree_write_done(cl);
}
static void do_btree_write(struct btree *b)
{
	struct closure *cl = &b->io.cl;
	struct bset *i = b->sets[b->nsets].data;
	BKEY_PADDED(key) k;

	i->version	= BCACHE_BSET_VERSION;
	i->csum		= btree_csum_set(b, i);

	btree_bio_init(b);
	b->bio->bi_rw	= REQ_META|WRITE_SYNC;
	b->bio->bi_size	= set_blocks(i, b->c) * block_bytes(b->c);
	bch_bio_map(b->bio, i);

	bkey_copy(&k.key, &b->key);
	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));

	if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) {
		int j;
		struct bio_vec *bv;
		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));

		bio_for_each_segment(bv, b->bio, j)
			memcpy(page_address(bv->bv_page),
			       base + j * PAGE_SIZE, PAGE_SIZE);

		trace_bcache_btree_write(b->bio);
		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		continue_at(cl, btree_write_done, NULL);
	} else {
		b->bio->bi_vcnt = 0;
		bch_bio_map(b->bio, i);

		trace_bcache_btree_write(b->bio);
		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		closure_sync(cl);
		__btree_write_done(cl);
	}
}
static void __btree_write(struct btree *b)
{
	struct bset *i = b->sets[b->nsets].data;

	BUG_ON(current->bio_list);

	closure_lock(&b->io, &b->c->cl);
	cancel_delayed_work(&b->work);

	clear_bit(BTREE_NODE_dirty,	 &b->flags);
	change_bit(BTREE_NODE_write_idx, &b->flags);

	bch_check_key_order(b, i);
	BUG_ON(b->written && !i->keys);

	do_btree_write(b);

	pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);

	b->written += set_blocks(i, b->c);
	atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);

	bch_btree_sort_lazy(b);

	if (b->written < btree_blocks(b))
		bch_bset_init_next(b);
}
static void btree_write_work(struct work_struct *w)
{
	struct btree *b = container_of(to_delayed_work(w), struct btree, work);

	down_write(&b->lock);

	if (btree_node_dirty(b))
		__btree_write(b);
	up_write(&b->lock);
}
void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
{
	struct bset *i = b->sets[b->nsets].data;
	struct btree_write *w = btree_current_write(b);

	BUG_ON(b->written &&
	       (b->written >= btree_blocks(b) ||
		i->seq != b->sets[0].data->seq ||
		!i->keys));

	if (!btree_node_dirty(b)) {
		set_btree_node_dirty(b);
		queue_delayed_work(btree_io_wq, &b->work,
				   msecs_to_jiffies(30000));
	}

	w->prio_blocked += b->prio_blocked;
	b->prio_blocked = 0;

	if (op && op->journal && !b->level) {
		if (w->journal &&
		    journal_pin_cmp(b->c, w, op)) {
			atomic_dec_bug(w->journal);
			w->journal = NULL;
		}

		if (!w->journal) {
			w->journal = op->journal;
			atomic_inc(w->journal);
		}
	}

	if (current->bio_list)
		return;

	/* Force write if set is too big */
	if (now ||
	    b->level ||
	    set_bytes(i) > PAGE_SIZE - 48) {
		if (op && now) {
			/* Must wait on multiple writes */
			BUG_ON(w->owner);
			w->owner = &op->cl;
			closure_get(&op->cl);
		}

		__btree_write(b);
	}
}
/*
 * Btree in memory cache - allocation/freeing
 * mca -> memory cache
 */

static void mca_reinit(struct btree *b)
{
	unsigned i;

	b->flags	= 0;
	b->written	= 0;
	b->nsets	= 0;

	for (i = 0; i < MAX_BSETS; i++)
		b->sets[i].size = 0;
	/*
	 * Second loop starts at 1 because b->sets[0]->data is the memory we
	 * allocated
	 */
	for (i = 1; i < MAX_BSETS; i++)
		b->sets[i].data = NULL;
}

#define mca_reserve(c)	(((c->root && c->root->level)		\
			  ? c->root->level : 1) * 8 + 16)
#define mca_can_free(c)						\
	max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
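/*
 * Worked example (editorial): with a btree root at level 2,
 * mca_reserve(c) = 2 * 8 + 16 = 32 nodes are held back for splits and gc,
 * and mca_can_free(c) only counts cached nodes beyond that reserve.
 */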
static void mca_data_free(struct btree *b)
{
	struct bset_tree *t = b->sets;
	BUG_ON(!closure_is_unlocked(&b->io.cl));

	if (bset_prev_bytes(b) < PAGE_SIZE)
		kfree(t->prev);
	else
		free_pages((unsigned long) t->prev,
			   get_order(bset_prev_bytes(b)));

	if (bset_tree_bytes(b) < PAGE_SIZE)
		kfree(t->tree);
	else
		free_pages((unsigned long) t->tree,
			   get_order(bset_tree_bytes(b)));

	free_pages((unsigned long) t->data, b->page_order);

	t->prev = NULL;
	t->tree = NULL;
	t->data = NULL;
	list_move(&b->list, &b->c->btree_cache_freed);
	b->c->bucket_cache_used--;
}
static void mca_bucket_free(struct btree *b)
{
	BUG_ON(btree_node_dirty(b));

	b->io_start_time = 0;
	hlist_del_init_rcu(&b->hash);
	list_move(&b->list, &b->c->btree_cache_freeable);
}
static unsigned btree_order(struct bkey *k)
{
	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
}
static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
{
	struct bset_tree *t = b->sets;
	BUG_ON(t->data);

	b->page_order = max_t(unsigned,
			      ilog2(b->c->btree_pages),
			      btree_order(k));

	t->data = (void *) __get_free_pages(gfp, b->page_order);
	if (!t->data)
		goto err;

	t->tree = bset_tree_bytes(b) < PAGE_SIZE
		? kmalloc(bset_tree_bytes(b), gfp)
		: (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
	if (!t->tree)
		goto err;

	t->prev = bset_prev_bytes(b) < PAGE_SIZE
		? kmalloc(bset_prev_bytes(b), gfp)
		: (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
	if (!t->prev)
		goto err;

	list_move(&b->list, &b->c->btree_cache);
	b->c->bucket_cache_used++;
	return;
err:
	mca_data_free(b);
}
static struct btree *mca_bucket_alloc(struct cache_set *c,
				      struct bkey *k, gfp_t gfp)
{
	struct btree *b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	init_rwsem(&b->lock);
	lockdep_set_novalidate_class(&b->lock);
	INIT_LIST_HEAD(&b->list);
	INIT_DELAYED_WORK(&b->work, btree_write_work);
	b->c = c;
	closure_init_unlocked(&b->io);

	mca_data_alloc(b, k, gfp);
	return b;
}
static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
{
	lockdep_assert_held(&b->c->bucket_lock);

	if (!down_write_trylock(&b->lock))
		return -ENOMEM;

	if (b->page_order < min_order) {
		rw_unlock(true, b);
		return -ENOMEM;
	}

	BUG_ON(btree_node_dirty(b) && !b->sets[0].data);

	if (cl && btree_node_dirty(b))
		bch_btree_write(b, true, NULL);

	if (cl)
		closure_wait_event_async(&b->io.wait, cl,
			 atomic_read(&b->io.cl.remaining) == -1);

	if (btree_node_dirty(b) ||
	    !closure_is_unlocked(&b->io.cl) ||
	    work_pending(&b->work.work)) {
		rw_unlock(true, b);
		return -EAGAIN;
	}

	return 0;
}
static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
	struct btree *b, *t;
	unsigned long i, nr = sc->nr_to_scan;

	if (c->shrinker_disabled)
		return 0;

	/*
	 * If nr == 0, we're supposed to return the number of items we have
	 * cached. Not allowed to return -1.
	 */
	if (!nr)
		return mca_can_free(c) * c->btree_pages;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_WAIT)
		mutex_lock(&c->bucket_lock);
	else if (!mutex_trylock(&c->bucket_lock))
		return -1;

	nr /= c->btree_pages;
	nr = min_t(unsigned long, nr, mca_can_free(c));

	i = 0;
	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
		if (!nr)
			break;

		if (++i > 3 &&
		    !mca_reap(b, NULL, 0)) {
			mca_data_free(b);
			rw_unlock(true, b);
			--nr;
		}
	}

	/*
	 * Can happen right when we first start up, before we've read in any
	 * btree nodes
	 */
	if (list_empty(&c->btree_cache))
		goto out;

	for (i = 0; nr && i < c->bucket_cache_used; i++) {
		b = list_first_entry(&c->btree_cache, struct btree, list);
		list_rotate_left(&c->btree_cache);

		if (!b->accessed &&
		    !mca_reap(b, NULL, 0)) {
			mca_bucket_free(b);
			mca_data_free(b);
			rw_unlock(true, b);
			--nr;
		} else
			b->accessed = 0;
	}
out:
	nr = mca_can_free(c) * c->btree_pages;
	mutex_unlock(&c->bucket_lock);
	return nr;
}
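/*
 * Editorial note on units: the VM talks to shrinkers in objects - here
 * that's pages - while reclaim happens per btree node, so nr_to_scan is
 * divided by c->btree_pages above and the count handed back to the VM is
 * scaled back up by the same factor.
 */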
void bch_btree_cache_free(struct cache_set *c)
{
	struct btree *b;
	struct closure cl;
	closure_init_stack(&cl);

	if (c->shrink.list.next)
		unregister_shrinker(&c->shrink);

	mutex_lock(&c->bucket_lock);

#ifdef CONFIG_BCACHE_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &c->btree_cache);
#endif

	list_splice(&c->btree_cache_freeable,
		    &c->btree_cache);

	while (!list_empty(&c->btree_cache)) {
		b = list_first_entry(&c->btree_cache, struct btree, list);

		if (btree_node_dirty(b))
			btree_complete_write(b, btree_current_write(b));
		clear_bit(BTREE_NODE_dirty, &b->flags);

		mca_data_free(b);
	}

	while (!list_empty(&c->btree_cache_freed)) {
		b = list_first_entry(&c->btree_cache_freed,
				     struct btree, list);
		list_del(&b->list);
		cancel_delayed_work_sync(&b->work);
		kfree(b);
	}

	mutex_unlock(&c->bucket_lock);
}
int bch_btree_cache_alloc(struct cache_set *c)
{
	unsigned i;

	/* XXX: doesn't check for errors */

	closure_init_unlocked(&c->gc);

	for (i = 0; i < mca_reserve(c); i++)
		mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

	list_splice_init(&c->btree_cache,
			 &c->btree_cache_freeable);

#ifdef CONFIG_BCACHE_DEBUG
	mutex_init(&c->verify_lock);

	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

	if (c->verify_data &&
	    c->verify_data->sets[0].data)
		list_del_init(&c->verify_data->list);
	else
		c->verify_data = NULL;
#endif

	c->shrink.shrink = bch_mca_shrink;
	c->shrink.seeks = 4;
	c->shrink.batch = c->btree_pages * 2;
	register_shrinker(&c->shrink);

	return 0;
}
/* Btree in memory cache - hash table */

static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
{
	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
}

static struct btree *mca_find(struct cache_set *c, struct bkey *k)
{
	struct btree *b;

	rcu_read_lock();
	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
		if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
			goto out;
	b = NULL;
out:
	rcu_read_unlock();
	return b;
}
static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
				     int level, struct closure *cl)
{
	int ret = -ENOMEM;
	struct btree *i;

	if (!cl)
		return ERR_PTR(-ENOMEM);

	/*
	 * Trying to free up some memory - i.e. reuse some btree nodes - may
	 * require initiating IO to flush the dirty part of the node. If we're
	 * running under generic_make_request(), that IO will never finish and
	 * we would deadlock. Returning -EAGAIN causes the cache lookup code to
	 * punt to workqueue and retry.
	 */
	if (current->bio_list)
		return ERR_PTR(-EAGAIN);

	if (c->try_harder && c->try_harder != cl) {
		closure_wait_event_async(&c->try_wait, cl, !c->try_harder);
		return ERR_PTR(-EAGAIN);
	}

	/* XXX: tracepoint */
	c->try_harder = cl;
	c->try_harder_start = local_clock();
retry:
	list_for_each_entry_reverse(i, &c->btree_cache, list) {
		int r = mca_reap(i, cl, btree_order(k));
		if (!r)
			return i;
		if (r != -ENOMEM)
			ret = r;
	}

	if (ret == -EAGAIN &&
	    closure_blocking(cl)) {
		mutex_unlock(&c->bucket_lock);
		closure_sync(cl);
		mutex_lock(&c->bucket_lock);
		goto retry;
	}

	return ERR_PTR(ret);
}
/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl)
{
	if (c->try_harder == cl) {
		bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
		c->try_harder = NULL;
		__closure_wake_up(&c->try_wait);
	}
}
static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,
			       int level, struct closure *cl)
{
	struct btree *b;

	lockdep_assert_held(&c->bucket_lock);

	if (mca_find(c, k))
		return NULL;

	/* btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b, &c->btree_cache_freeable, list)
		if (!mca_reap(b, NULL, btree_order(k)))
			goto out;

	/* We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &c->btree_cache_freed, list)
		if (!mca_reap(b, NULL, 0)) {
			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
			if (!b->sets[0].data)
				goto err;
			else
				goto out;
		}

	b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
	if (!b)
		goto err;

	BUG_ON(!down_write_trylock(&b->lock));
	if (!b->sets->data)
		goto err;
out:
	BUG_ON(!closure_is_unlocked(&b->io.cl));

	bkey_copy(&b->key, k);
	list_move(&b->list, &c->btree_cache);
	hlist_del_init_rcu(&b->hash);
	hlist_add_head_rcu(&b->hash, mca_hash(c, k));

	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
	b->level	= level;

	mca_reinit(b);

	return b;
err:
	if (b)
		rw_unlock(true, b);

	b = mca_cannibalize(c, k, level, cl);
	if (!IS_ERR(b))
		goto out;

	return b;
}
/**
 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary, it uses the closure embedded in struct btree_op to wait;
 * if that closure is in non blocking mode, will return -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * level and op->lock.
 */
struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
				 int level, struct btree_op *op)
{
	int i = 0;
	bool write = level <= op->lock;
	struct btree *b;

	BUG_ON(level < 0);
retry:
	b = mca_find(c, k);

	if (!b) {
		mutex_lock(&c->bucket_lock);
		b = mca_alloc(c, k, level, &op->cl);
		mutex_unlock(&c->bucket_lock);

		if (!b)
			goto retry;
		if (IS_ERR(b))
			return b;

		bch_btree_read(b);

		if (!write)
			downgrade_write(&b->lock);
	} else {
		rw_lock(write, b, level);
		if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
			rw_unlock(write, b);
			goto retry;
		}
		BUG_ON(b->level != level);
	}

	b->accessed = 1;

	for (; i <= b->nsets && b->sets[i].size; i++) {
		prefetch(b->sets[i].tree);
		prefetch(b->sets[i].data);
	}

	for (; i <= b->nsets; i++)
		prefetch(b->sets[i].data);

	if (!closure_wait_event(&b->io.wait, &op->cl,
				btree_node_read_done(b))) {
		rw_unlock(write, b);
		b = ERR_PTR(-EAGAIN);
	} else if (btree_node_io_error(b)) {
		rw_unlock(write, b);
		b = ERR_PTR(-EIO);
	}

	return b;
}
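/*
 * Example (editorial): with op->lock == 0 only the leaf is taken write
 * locked and interior nodes are read locked; when an insert discovers it
 * must split above that, bch_btree_insert_recurse() raises op->lock and
 * restarts the traversal so the write locks cover the split.
 */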
static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
{
	struct btree *b;

	mutex_lock(&c->bucket_lock);
	b = mca_alloc(c, k, level, NULL);
	mutex_unlock(&c->bucket_lock);

	if (!IS_ERR_OR_NULL(b)) {
		bch_btree_read(b);
		rw_unlock(true, b);
	}
}
static void btree_node_free(struct btree *b, struct btree_op *op)
{
	unsigned i;

	/*
	 * The BUG_ON() in btree_node_get() implies that we must have a write
	 * lock on parent to free or even invalidate a node
	 */
	BUG_ON(op->lock <= b->level);
	BUG_ON(b == b->c->root);
	pr_debug("bucket %s", pbtree(b));

	if (btree_node_dirty(b))
		btree_complete_write(b, btree_current_write(b));
	clear_bit(BTREE_NODE_dirty, &b->flags);

	if (b->prio_blocked &&
	    !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
		wake_up(&b->c->alloc_wait);

	b->prio_blocked = 0;

	cancel_delayed_work(&b->work);

	mutex_lock(&b->c->bucket_lock);

	for (i = 0; i < KEY_PTRS(&b->key); i++) {
		BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));

		bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
			    PTR_BUCKET(b->c, &b->key, i));
	}

	bch_bucket_free(b->c, &b->key);
	mca_bucket_free(b);
	mutex_unlock(&b->c->bucket_lock);
}
struct btree *bch_btree_node_alloc(struct cache_set *c, int level,
				   struct closure *cl)
{
	BKEY_PADDED(key) k;
	struct btree *b = ERR_PTR(-EAGAIN);

	mutex_lock(&c->bucket_lock);
retry:
	if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl))
		goto err;

	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);

	b = mca_alloc(c, &k.key, level, cl);
	if (IS_ERR(b))
		goto err_free;

	if (!b) {
		cache_bug(c,
			"Tried to allocate bucket that was in btree cache");
		__bkey_put(c, &k.key);
		goto retry;
	}

	set_btree_node_read_done(b);
	b->accessed = 1;
	bch_bset_init_next(b);

	mutex_unlock(&c->bucket_lock);
	return b;
err_free:
	bch_bucket_free(c, &k.key);
	__bkey_put(c, &k.key);
err:
	mutex_unlock(&c->bucket_lock);
	return b;
}
static struct btree *btree_node_alloc_replacement(struct btree *b,
						  struct closure *cl)
{
	struct btree *n = bch_btree_node_alloc(b->c, b->level, cl);
	if (!IS_ERR_OR_NULL(n))
		bch_btree_sort_into(b, n);

	return n;
}
/* Garbage collection */

uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
{
	uint8_t stale = 0;
	unsigned i;
	struct bucket *g;

	/*
	 * ptr_invalid() can't return true for the keys that mark btree nodes as
	 * freed, but since ptr_bad() returns true we'll never actually use them
	 * for anything and thus we don't want mark their pointers here
	 */
	if (!bkey_cmp(k, &ZERO_KEY))
		return stale;

	for (i = 0; i < KEY_PTRS(k); i++) {
		if (!ptr_available(c, k, i))
			continue;

		g = PTR_BUCKET(c, k, i);

		if (gen_after(g->gc_gen, PTR_GEN(k, i)))
			g->gc_gen = PTR_GEN(k, i);

		if (ptr_stale(c, k, i)) {
			stale = max(stale, ptr_stale(c, k, i));
			continue;
		}

		cache_bug_on(GC_MARK(g) &&
			     (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
			     c, "inconsistent ptrs: mark = %llu, level = %i",
			     GC_MARK(g), level);

		if (level)
			SET_GC_MARK(g, GC_MARK_METADATA);
		else if (KEY_DIRTY(k))
			SET_GC_MARK(g, GC_MARK_DIRTY);

		/* guard against overflow */
		SET_GC_SECTORS_USED(g, min_t(unsigned,
					     GC_SECTORS_USED(g) + KEY_SIZE(k),
					     (1 << 14) - 1));

		BUG_ON(!GC_SECTORS_USED(g));
	}

	return stale;
}
#define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)
static int btree_gc_mark_node(struct btree *b, unsigned *keys,
			      struct gc_stat *gc)
{
	uint8_t stale = 0;
	unsigned last_dev = -1;
	struct bcache_device *d = NULL;
	struct bkey *k;
	struct btree_iter iter;
	struct bset_tree *t;

	gc->nodes++;

	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
		if (last_dev != KEY_INODE(k)) {
			last_dev = KEY_INODE(k);

			d = KEY_INODE(k) < b->c->nr_uuids
				? b->c->devices[last_dev]
				: NULL;
		}

		stale = max(stale, btree_mark_key(b, k));

		if (bch_ptr_bad(b, k))
			continue;

		*keys += bkey_u64s(k);

		gc->key_bytes += bkey_u64s(k);
		gc->nkeys++;

		gc->data += KEY_SIZE(k);
		if (KEY_DIRTY(k)) {
			gc->dirty += KEY_SIZE(k);
			if (d)
				d->sectors_dirty_gc += KEY_SIZE(k);
		}
	}

	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
		btree_bug_on(t->size &&
			     bset_written(b, t) &&
			     bkey_cmp(&b->key, &t->end) < 0,
			     b, "found short btree key in gc");

	return stale;
}
static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
				    struct btree_op *op)
{
	/*
	 * We block priorities from being written for the duration of garbage
	 * collection, so we can't sleep in btree_alloc() ->
	 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
	 * our closure.
	 */
	struct btree *n = btree_node_alloc_replacement(b, NULL);

	if (!IS_ERR_OR_NULL(n)) {
		swap(b, n);

		memcpy(k->ptr, b->key.ptr,
		       sizeof(uint64_t) * KEY_PTRS(&b->key));

		__bkey_put(b->c, &b->key);
		atomic_inc(&b->c->prio_blocked);
		b->prio_blocked++;

		btree_node_free(n, op);
		up_write(&n->lock);
	}

	return b;
}
/*
 * Leaving this at 2 until we've got incremental garbage collection done; it
 * could be higher (and has been tested with 4) except that garbage collection
 * could take much longer, adversely affecting latency.
 */
#define GC_MERGE_NODES	2U

struct gc_merge_info {
	struct btree	*b;
	struct bkey	*k;
	unsigned	keys;
};
static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
			      struct gc_stat *gc, struct gc_merge_info *r)
{
	unsigned nodes = 0, keys = 0, blocks;
	int i;

	while (nodes < GC_MERGE_NODES && r[nodes].b)
		keys += r[nodes++].keys;

	blocks = btree_default_blocks(b->c) * 2 / 3;

	if (nodes < 2 ||
	    __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
		return;

	for (i = nodes - 1; i >= 0; --i) {
		if (r[i].b->written)
			r[i].b = btree_gc_alloc(r[i].b, r[i].k, op);

		if (r[i].b->written)
			return;
	}

	for (i = nodes - 1; i > 0; --i) {
		struct bset *n1 = r[i].b->sets->data;
		struct bset *n2 = r[i - 1].b->sets->data;
		struct bkey *k, *last = NULL;

		keys = 0;

		if (i == 1) {
			/*
			 * Last node we're not getting rid of - we're getting
			 * rid of the node at r[0]. Have to try and fit all of
			 * the remaining keys into this node; we can't ensure
			 * they will always fit due to rounding and variable
			 * length keys (shouldn't be possible in practice,
			 * though)
			 */
			if (__set_blocks(n1, n1->keys + r->keys,
					 b->c) > btree_blocks(r[i].b))
				return;

			keys = n2->keys;
			last = &r->b->key;
		} else
			for (k = n2->start;
			     k < end(n2);
			     k = bkey_next(k)) {
				if (__set_blocks(n1, n1->keys + keys +
						 bkey_u64s(k), b->c) > blocks)
					break;

				last = k;
				keys += bkey_u64s(k);
			}

		BUG_ON(__set_blocks(n1, n1->keys + keys,
				    b->c) > btree_blocks(r[i].b));

		if (last) {
			bkey_copy_key(&r[i].b->key, last);
			bkey_copy_key(r[i].k, last);
		}

		memcpy(end(n1),
		       n2->start,
		       (void *) node(n2, keys) - (void *) n2->start);

		n1->keys += keys;

		memmove(n2->start,
			node(n2, keys),
			(void *) end(n2) - (void *) node(n2, keys));

		n2->keys -= keys;

		r[i].keys	= n1->keys;
		r[i - 1].keys	= n2->keys;
	}

	btree_node_free(r->b, op);
	up_write(&r->b->lock);

	pr_debug("coalesced %u nodes", nodes);

	gc->nodes--;
	nodes--;

	memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
	memset(&r[nodes], 0, sizeof(struct gc_merge_info));
}
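/*
 * Example (editorial): with GC_MERGE_NODES == 2, keys flow from r[0] into
 * r[1] when the pair fits in roughly 2/3 of the default node size, after
 * which the emptied r[0] node is freed - so a run of mostly-empty leaves
 * gets merged pairwise as gc walks the tree.
 */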
static int btree_gc_recurse(struct btree *b, struct btree_op *op,
			    struct closure *writes, struct gc_stat *gc)
{
	void write(struct btree *r)
	{
		if (!r->written)
			bch_btree_write(r, true, op);
		else if (btree_node_dirty(r)) {
			BUG_ON(btree_current_write(r)->owner);
			btree_current_write(r)->owner = writes;
			closure_get(writes);

			bch_btree_write(r, true, NULL);
		}

		up_write(&r->lock);
	}

	int ret = 0, stale;
	unsigned i;
	struct gc_merge_info r[GC_MERGE_NODES];

	memset(r, 0, sizeof(r));

	while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
		r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op);

		if (IS_ERR(r->b)) {
			ret = PTR_ERR(r->b);
			break;
		}

		r->keys = 0;
		stale = btree_gc_mark_node(r->b, &r->keys, gc);

		if (!b->written &&
		    (r->b->level || stale > 10 ||
		     b->c->gc_always_rewrite))
			r->b = btree_gc_alloc(r->b, r->k, op);

		if (r->b->level)
			ret = btree_gc_recurse(r->b, op, writes, gc);

		if (ret) {
			write(r->b);
			break;
		}

		bkey_copy_key(&b->c->gc_done, r->k);

		if (!b->written)
			btree_gc_coalesce(b, op, gc, r);

		if (r[GC_MERGE_NODES - 1].b)
			write(r[GC_MERGE_NODES - 1].b);

		memmove(&r[1], &r[0],
			sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));

		/* When we've got incremental GC working, we'll want to do
		 * if (should_resched())
		 *	return -EAGAIN;
		 */
		cond_resched();
#if 0
		if (need_resched()) {
			ret = -EAGAIN;
			break;
		}
#endif
	}

	for (i = 1; i < GC_MERGE_NODES && r[i].b; i++)
		write(r[i].b);

	/* Might have freed some children, must remove their keys */
	if (!b->written)
		bch_btree_sort(b);

	return ret;
}
static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
			     struct closure *writes, struct gc_stat *gc)
{
	struct btree *n = NULL;
	unsigned keys = 0;
	int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);

	if (b->level || stale > 10)
		n = btree_node_alloc_replacement(b, NULL);

	if (!IS_ERR_OR_NULL(n))
		swap(b, n);

	if (b->level)
		ret = btree_gc_recurse(b, op, writes, gc);

	if (!b->written || btree_node_dirty(b)) {
		atomic_inc(&b->c->prio_blocked);
		b->prio_blocked++;
		bch_btree_write(b, true, n ? op : NULL);
	}

	if (!IS_ERR_OR_NULL(n)) {
		closure_sync(&op->cl);
		bch_btree_set_root(b);
		btree_node_free(n, op);
		rw_unlock(true, b);
	}

	return ret;
}
static void btree_gc_start(struct cache_set *c)
{
	struct cache *ca;
	struct bucket *b;
	struct bcache_device **d;
	unsigned i;

	if (!c->gc_mark_valid)
		return;

	mutex_lock(&c->bucket_lock);

	c->gc_mark_valid = 0;
	c->gc_done = ZERO_KEY;

	for_each_cache(ca, c, i)
		for_each_bucket(b, ca) {
			b->gc_gen = b->gen;
			if (!atomic_read(&b->pin))
				SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
		}

	for (d = c->devices;
	     d < c->devices + c->nr_uuids;
	     d++)
		if (*d)
			(*d)->sectors_dirty_gc = 0;

	mutex_unlock(&c->bucket_lock);
}
size_t bch_btree_gc_finish(struct cache_set *c)
{
	size_t available = 0;
	struct bucket *b;
	struct cache *ca;
	struct bcache_device **d;
	unsigned i;

	mutex_lock(&c->bucket_lock);

	set_gc_sectors(c);
	c->gc_mark_valid = 1;
	c->need_gc	= 0;

	if (c->root)
		for (i = 0; i < KEY_PTRS(&c->root->key); i++)
			SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
				    GC_MARK_METADATA);

	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
			    GC_MARK_METADATA);

	for_each_cache(ca, c, i) {
		uint64_t *i;

		ca->invalidate_needs_gc = 0;

		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);

		for (i = ca->prio_buckets;
		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);

		for_each_bucket(b, ca) {
			b->last_gc	= b->gc_gen;
			c->need_gc	= max(c->need_gc, bucket_gc_gen(b));

			if (!atomic_read(&b->pin) &&
			    GC_MARK(b) == GC_MARK_RECLAIMABLE) {
				available++;
				if (!GC_SECTORS_USED(b))
					bch_bucket_add_unused(ca, b);
			}
		}
	}

	for (d = c->devices;
	     d < c->devices + c->nr_uuids;
	     d++)
		if (*d) {
			unsigned long last =
				atomic_long_read(&((*d)->sectors_dirty));
			long difference = (*d)->sectors_dirty_gc - last;

			pr_debug("sectors dirty off by %li", difference);

			(*d)->sectors_dirty_last += difference;

			atomic_long_set(&((*d)->sectors_dirty),
					(*d)->sectors_dirty_gc);
		}

	mutex_unlock(&c->bucket_lock);
	return available;
}
static void bch_btree_gc(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
	int ret;
	unsigned long available;
	struct gc_stat stats;
	struct closure writes;
	struct btree_op op;

	uint64_t start_time = local_clock();
	trace_bcache_gc_start(c->sb.set_uuid);
	blktrace_msg_all(c, "Starting gc");

	memset(&stats, 0, sizeof(struct gc_stat));
	closure_init_stack(&writes);
	bch_btree_op_init_stack(&op);
	op.lock = SHRT_MAX;

	btree_gc_start(c);

	ret = btree_root(gc_root, c, &op, &writes, &stats);
	closure_sync(&op.cl);
	closure_sync(&writes);

	if (ret) {
		blktrace_msg_all(c, "Stopped gc");
		pr_warn("gc failed!");

		continue_at(cl, bch_btree_gc, bch_gc_wq);
	}

	/* Possibly wait for new UUIDs or whatever to hit disk */
	bch_journal_meta(c, &op.cl);
	closure_sync(&op.cl);

	available = bch_btree_gc_finish(c);

	bch_time_stats_update(&c->btree_gc_time, start_time);

	stats.key_bytes *= sizeof(uint64_t);
	stats.dirty	<<= 9;
	stats.data	<<= 9;
	stats.in_use	= (c->nbuckets - available) * 100 / c->nbuckets;
	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
	blktrace_msg_all(c, "Finished gc");

	trace_bcache_gc_end(c->sb.set_uuid);
	wake_up(&c->alloc_wait);

	continue_at(cl, bch_moving_gc, bch_gc_wq);
}
void bch_queue_gc(struct cache_set *c)
{
	closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl);
}
/* Initial partial gc */

static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
				   unsigned long **seen)
{
	int ret;
	struct bkey *k;
	struct bucket *g;
	unsigned i;
	struct btree_iter iter;

	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
		for (i = 0; i < KEY_PTRS(k); i++) {
			if (!ptr_available(b->c, k, i))
				continue;

			g = PTR_BUCKET(b->c, k, i);

			if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
						seen[PTR_DEV(k, i)]) ||
			    !ptr_stale(b->c, k, i)) {
				g->gen = PTR_GEN(k, i);

				if (b->level)
					g->prio = BTREE_PRIO;
				else if (g->prio == BTREE_PRIO)
					g->prio = INITIAL_PRIO;
			}
		}

		btree_mark_key(b, k);
	}

	if (b->level) {
		k = bch_next_recurse_key(b, &ZERO_KEY);

		while (k) {
			struct bkey *p = bch_next_recurse_key(b, k);
			if (p)
				btree_node_prefetch(b->c, p, b->level - 1);

			ret = btree(check_recurse, k, b, op, seen);
			if (ret)
				return ret;

			k = p;
		}
	}

	return 0;
}
int bch_btree_check(struct cache_set *c, struct btree_op *op)
{
	int ret = -ENOMEM;
	unsigned i;
	unsigned long *seen[MAX_CACHES_PER_SET];

	memset(seen, 0, sizeof(seen));

	for (i = 0; c->cache[i]; i++) {
		size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
		seen[i] = kmalloc(n, GFP_KERNEL);
		if (!seen[i])
			goto err;

		/* Disables the seen array until prio_read() uses it too */
		memset(seen[i], 0xFF, n);
	}

	ret = btree_root(check_recurse, c, op, seen);
err:
	for (i = 0; i < MAX_CACHES_PER_SET; i++)
		kfree(seen[i]);
	return ret;
}
/* Btree insertion */

static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
{
	struct bset *i = b->sets[b->nsets].data;

	memmove((uint64_t *) where + bkey_u64s(insert),
		where,
		(void *) end(i) - (void *) where);

	i->keys += bkey_u64s(insert);
	bkey_copy(where, insert);
	bch_bset_fix_lookup_table(b, where);
}
static bool fix_overlapping_extents(struct btree *b,
				    struct bkey *insert,
				    struct btree_iter *iter,
				    struct btree_op *op)
{
	void subtract_dirty(struct bkey *k, int sectors)
	{
		struct bcache_device *d = b->c->devices[KEY_INODE(k)];

		if (KEY_DIRTY(k) && d)
			atomic_long_sub(sectors, &d->sectors_dirty);
	}

	unsigned old_size, sectors_found = 0;

	while (1) {
		struct bkey *k = bch_btree_iter_next(iter);
		if (!k ||
		    bkey_cmp(&START_KEY(k), insert) >= 0)
			break;

		if (bkey_cmp(k, &START_KEY(insert)) <= 0)
			continue;

		old_size = KEY_SIZE(k);

		/*
		 * We might overlap with 0 size extents; we can't skip these
		 * because if they're in the set we're inserting to we have to
		 * adjust them so they don't overlap with the key we're
		 * inserting. But we don't want to check them for BTREE_REPLACE
		 * operations.
		 */

		if (op->type == BTREE_REPLACE &&
		    KEY_SIZE(k)) {
			/*
			 * k might have been split since we inserted/found the
			 * key we're replacing
			 */
			unsigned i;
			uint64_t offset = KEY_START(k) -
				KEY_START(&op->replace);

			/* But it must be a subset of the replace key */
			if (KEY_START(k) < KEY_START(&op->replace) ||
			    KEY_OFFSET(k) > KEY_OFFSET(&op->replace))
				goto check_failed;

			/* We didn't find a key that we were supposed to */
			if (KEY_START(k) > KEY_START(insert) + sectors_found)
				goto check_failed;

			if (KEY_PTRS(&op->replace) != KEY_PTRS(k))
				goto check_failed;

			/* skip past gen */
			offset <<= 8;

			BUG_ON(!KEY_PTRS(&op->replace));

			for (i = 0; i < KEY_PTRS(&op->replace); i++)
				if (k->ptr[i] != op->replace.ptr[i] + offset)
					goto check_failed;

			sectors_found = KEY_OFFSET(k) - KEY_START(insert);
		}

		if (bkey_cmp(insert, k) < 0 &&
		    bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
			/*
			 * We overlapped in the middle of an existing key: that
			 * means we have to split the old key. But we have to do
			 * slightly different things depending on whether the
			 * old key has been written out yet.
			 */

			struct bkey *top;

			subtract_dirty(k, KEY_SIZE(insert));

			if (bkey_written(b, k)) {
				/*
				 * We insert a new key to cover the top of the
				 * old key, and the old key is modified in place
				 * to represent the bottom split.
				 *
				 * It's completely arbitrary whether the new key
				 * is the top or the bottom, but it has to match
				 * up with what btree_sort_fixup() does - it
				 * doesn't check for this kind of overlap, it
				 * depends on us inserting a new key for the top
				 * here.
				 */
				top = bch_bset_search(b, &b->sets[b->nsets],
						      insert);
				shift_keys(b, top, k);
			} else {
				BKEY_PADDED(key) temp;
				bkey_copy(&temp.key, k);
				shift_keys(b, k, &temp.key);
				top = bkey_next(k);
			}

			bch_cut_front(insert, top);
			bch_cut_back(&START_KEY(insert), k);
			bch_bset_fix_invalidated_key(b, k);
			return false;
		}

		if (bkey_cmp(insert, k) < 0) {
			bch_cut_front(insert, k);
		} else {
			if (bkey_written(b, k) &&
			    bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
				/*
				 * Completely overwrote, so we don't have to
				 * invalidate the binary search tree
				 */
				bch_cut_front(k, k);
			} else {
				__bch_cut_back(&START_KEY(insert), k);
				bch_bset_fix_invalidated_key(b, k);
			}
		}

		subtract_dirty(k, old_size - KEY_SIZE(k));
	}

check_failed:
	if (op->type == BTREE_REPLACE) {
		if (!sectors_found) {
			op->insert_collision = true;
			return true;
		} else if (sectors_found < KEY_SIZE(insert)) {
			SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
				       (KEY_SIZE(insert) - sectors_found));
			SET_KEY_SIZE(insert, sectors_found);
		}
	}

	return false;
}
*b
, struct btree_op
*op
,
1799 struct bset
*i
= b
->sets
[b
->nsets
].data
;
1800 struct bkey
*m
, *prev
;
1801 const char *status
= "insert";
1803 BUG_ON(bkey_cmp(k
, &b
->key
) > 0);
1804 BUG_ON(b
->level
&& !KEY_PTRS(k
));
1805 BUG_ON(!b
->level
&& !KEY_OFFSET(k
));
1808 struct btree_iter iter
;
1809 struct bkey search
= KEY(KEY_INODE(k
), KEY_START(k
), 0);
1812 * bset_search() returns the first key that is strictly greater
1813 * than the search key - but for back merging, we want to find
1814 * the first key that is greater than or equal to KEY_START(k) -
1815 * unless KEY_START(k) is 0.
1817 if (KEY_OFFSET(&search
))
1818 SET_KEY_OFFSET(&search
, KEY_OFFSET(&search
) - 1);
1821 m
= bch_btree_iter_init(b
, &iter
, &search
);
1823 if (fix_overlapping_extents(b
, k
, &iter
, op
))
1826 while (m
!= end(i
) &&
1827 bkey_cmp(k
, &START_KEY(m
)) > 0)
1828 prev
= m
, m
= bkey_next(m
);
1830 if (key_merging_disabled(b
->c
))
1833 /* prev is in the tree, if we merge we're done */
1834 status
= "back merging";
1836 bch_bkey_try_merge(b
, prev
, k
))
1839 status
= "overwrote front";
1841 KEY_PTRS(m
) == KEY_PTRS(k
) && !KEY_SIZE(m
))
1844 status
= "front merge";
1846 bch_bkey_try_merge(b
, k
, m
))
1849 m
= bch_bset_search(b
, &b
->sets
[b
->nsets
], k
);
1851 insert
: shift_keys(b
, m
, k
);
1852 copy
: bkey_copy(m
, k
);
1854 bch_check_keys(b
, "%s for %s at %s: %s", status
,
1855 op_type(op
), pbtree(b
), pkey(k
));
1856 bch_check_key_order_msg(b
, i
, "%s for %s at %s: %s", status
,
1857 op_type(op
), pbtree(b
), pkey(k
));
1859 if (b
->level
&& !KEY_OFFSET(k
))
1862 pr_debug("%s for %s at %s: %s", status
,
1863 op_type(op
), pbtree(b
), pkey(k
));
bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
{
	bool ret = false;
	struct bkey *k;
	unsigned oldsize = bch_count_data(b);

	while ((k = bch_keylist_pop(&op->keys))) {
		bkey_put(b->c, k, b->level);
		ret |= btree_insert_key(b, op, k);
	}

	BUG_ON(bch_count_data(b) < oldsize);
	return ret;
}
bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
				struct bio *bio)
{
	bool ret = false;
	uint64_t btree_ptr = b->key.ptr[0];
	unsigned long seq = b->seq;
	BKEY_PADDED(k) tmp;

	rw_unlock(false, b);
	rw_lock(true, b, b->level);

	if (b->key.ptr[0] != btree_ptr ||
	    b->seq != seq + 1 ||
	    should_split(b))
		goto out;

	op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio));

	SET_KEY_PTRS(&op->replace, 1);
	get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));

	SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV);

	bkey_copy(&tmp.k, &op->replace);

	BUG_ON(op->type != BTREE_INSERT);
	BUG_ON(!btree_insert_key(b, op, &tmp.k));
	bch_btree_write(b, false, NULL);
	ret = true;
out:
	downgrade_write(&b->lock);
	return ret;
}
static int btree_split(struct btree *b, struct btree_op *op)
{
	bool split, root = b == b->c->root;
	struct btree *n1, *n2 = NULL, *n3 = NULL;
	uint64_t start_time = local_clock();

	if (b->level)
		set_closure_blocking(&op->cl);

	n1 = btree_node_alloc_replacement(b, &op->cl);
	if (IS_ERR(n1))
		goto err;

	split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;

	pr_debug("%ssplitting at %s keys %i", split ? "" : "not ",
		 pbtree(b), n1->sets[0].data->keys);

	if (split) {
		unsigned keys = 0;

		n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
		if (IS_ERR(n2))
			goto err_free1;

		if (root) {
			n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl);
			if (IS_ERR(n3))
				goto err_free2;
		}

		bch_btree_insert_keys(n1, op);

		/* Has to be a linear search because we don't have an auxiliary
		 * search tree yet
		 */

		while (keys < (n1->sets[0].data->keys * 3) / 5)
			keys += bkey_u64s(node(n1->sets[0].data, keys));

		bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
		keys += bkey_u64s(node(n1->sets[0].data, keys));

		n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
		n1->sets[0].data->keys = keys;

		memcpy(n2->sets[0].data->start,
		       end(n1->sets[0].data),
		       n2->sets[0].data->keys * sizeof(uint64_t));

		bkey_copy_key(&n2->key, &b->key);

		bch_keylist_add(&op->keys, &n2->key);
		bch_btree_write(n2, true, op);
		rw_unlock(true, n2);
	} else
		bch_btree_insert_keys(n1, op);

	bch_keylist_add(&op->keys, &n1->key);
	bch_btree_write(n1, true, op);

	if (n3) {
		bkey_copy_key(&n3->key, &MAX_KEY);
		bch_btree_insert_keys(n3, op);
		bch_btree_write(n3, true, op);

		closure_sync(&op->cl);
		bch_btree_set_root(n3);
		rw_unlock(true, n3);
	} else if (root) {
		op->keys.top = op->keys.bottom;
		closure_sync(&op->cl);
		bch_btree_set_root(n1);
	} else {
		unsigned i;

		bkey_copy(op->keys.top, &b->key);
		bkey_copy_key(op->keys.top, &ZERO_KEY);

		for (i = 0; i < KEY_PTRS(&b->key); i++) {
			uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;

			SET_PTR_GEN(op->keys.top, i, g);
		}

		bch_keylist_push(&op->keys);
		closure_sync(&op->cl);
		atomic_inc(&b->c->prio_blocked);
	}

	rw_unlock(true, n1);
	btree_node_free(b, op);

	bch_time_stats_update(&b->c->btree_split_time, start_time);

	return 0;
err_free2:
	__bkey_put(n2->c, &n2->key);
	btree_node_free(n2, op);
	rw_unlock(true, n2);
err_free1:
	__bkey_put(n1->c, &n1->key);
	btree_node_free(n1, op);
	rw_unlock(true, n1);
err:
	if (n3 == ERR_PTR(-EAGAIN) ||
	    n2 == ERR_PTR(-EAGAIN) ||
	    n1 == ERR_PTR(-EAGAIN))
		return -EAGAIN;

	pr_warn("couldn't split");
	return -ENOMEM;
}
static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
				    struct keylist *stack_keys)
{
	if (b->level) {
		int ret;
		struct bkey *insert = op->keys.bottom;
		struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));

		if (!k) {
			btree_bug(b, "no key to recurse on at level %i/%i",
				  b->level, b->c->root->level);

			op->keys.top = op->keys.bottom;
			return -EIO;
		}

		if (bkey_cmp(insert, k) > 0) {
			unsigned i;

			if (op->type == BTREE_REPLACE) {
				__bkey_put(b->c, insert);
				op->keys.top = op->keys.bottom;
				op->insert_collision = true;
				return 0;
			}

			for (i = 0; i < KEY_PTRS(insert); i++)
				atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin);

			bkey_copy(stack_keys->top, insert);

			bch_cut_back(k, insert);
			bch_cut_front(k, stack_keys->top);

			bch_keylist_push(stack_keys);
		}

		ret = btree(insert_recurse, k, b, op, stack_keys);
		if (ret)
			return ret;
	}

	if (!bch_keylist_empty(&op->keys)) {
		if (should_split(b)) {
			if (op->lock <= b->c->root->level) {
				BUG_ON(b->level);
				op->lock = b->c->root->level + 1;
				return -EINTR;
			}
			return btree_split(b, op);
		}

		BUG_ON(write_block(b) != b->sets[b->nsets].data);

		if (bch_btree_insert_keys(b, op))
			bch_btree_write(b, false, op);
	}

	return 0;
}
int bch_btree_insert(struct btree_op *op, struct cache_set *c)
{
	int ret = 0;
	struct keylist stack_keys;

	/*
	 * Don't want to block with the btree locked unless we have to,
	 * otherwise we get deadlocks with try_harder and between split/gc
	 */
	clear_closure_blocking(&op->cl);

	BUG_ON(bch_keylist_empty(&op->keys));
	bch_keylist_copy(&stack_keys, &op->keys);
	bch_keylist_init(&op->keys);

	while (!bch_keylist_empty(&stack_keys) ||
	       !bch_keylist_empty(&op->keys)) {
		if (bch_keylist_empty(&op->keys)) {
			bch_keylist_add(&op->keys,
					bch_keylist_pop(&stack_keys));
			op->lock = 0;
		}

		ret = btree_root(insert_recurse, c, op, &stack_keys);

		if (ret == -EAGAIN) {
			ret = 0;
			closure_sync(&op->cl);
		} else if (ret) {
			struct bkey *k;

			pr_err("error %i trying to insert key for %s",
			       ret, op_type(op));

			while ((k = bch_keylist_pop(&stack_keys) ?:
				    bch_keylist_pop(&op->keys)))
				bkey_put(c, k, 0);
		}
	}

	bch_keylist_free(&stack_keys);

	if (op->journal)
		atomic_dec_bug(op->journal);
	op->journal = NULL;
	return ret;
}
void bch_btree_set_root(struct btree *b)
{
	unsigned i;

	BUG_ON(!b->written);

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);

	mutex_lock(&b->c->bucket_lock);
	list_del_init(&b->list);
	mutex_unlock(&b->c->bucket_lock);

	b->c->root = b;
	__bkey_put(b->c, &b->key);

	bch_journal_meta(b->c, NULL);
	pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0));
}
/* Cache lookup */

static int submit_partial_cache_miss(struct btree *b, struct btree_op *op,
				     struct bkey *k)
{
	struct search *s = container_of(op, struct search, op);
	struct bio *bio = &s->bio.bio;
	int ret = 0;

	while (!ret &&
	       !op->lookup_done) {
		unsigned sectors = INT_MAX;

		if (KEY_INODE(k) == op->inode) {
			if (KEY_START(k) <= bio->bi_sector)
				break;

			sectors = min_t(uint64_t, sectors,
					KEY_START(k) - bio->bi_sector);
		}

		ret = s->d->cache_miss(b, s, bio, sectors);
	}

	return ret;
}
/*
 * Read from a single key, handling the initial cache miss if the key starts in
 * the middle of the bio
 */
static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
				    struct bkey *k)
{
	struct search *s = container_of(op, struct search, op);
	struct bio *bio = &s->bio.bio;
	unsigned ptr;
	struct bio *n;

	int ret = submit_partial_cache_miss(b, op, k);
	if (ret || op->lookup_done)
		return ret;

	/* XXX: figure out best pointer - for multiple cache devices */
	ptr = 0;

	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;

	while (!op->lookup_done &&
	       KEY_INODE(k) == op->inode &&
	       bio->bi_sector < KEY_OFFSET(k)) {
		struct bkey *bio_key;
		sector_t sector = PTR_OFFSET(k, ptr) +
			(bio->bi_sector - KEY_START(k));
		unsigned sectors = min_t(uint64_t, INT_MAX,
					 KEY_OFFSET(k) - bio->bi_sector);

		n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
		if (!n)
			return -EAGAIN;

		if (n == bio)
			op->lookup_done = true;

		bio_key = &container_of(n, struct bbio, bio)->key;

		/*
		 * The bucket we're reading from might be reused while our bio
		 * is in flight, and we could then end up reading the wrong
		 * data.
		 *
		 * We guard against this by checking (in cache_read_endio()) if
		 * the pointer is stale again; if so, we treat it as an error
		 * and reread from the backing device (but we don't pass that
		 * error up anywhere).
		 */

		bch_bkey_copy_single_ptr(bio_key, k, ptr);
		SET_PTR_OFFSET(bio_key, 0, sector);

		n->bi_end_io	= bch_cache_read_endio;
		n->bi_private	= &s->cl;

		trace_bcache_cache_hit(n);
		__bch_submit_bbio(n, b->c);
	}

	return 0;
}
int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
{
	struct search *s = container_of(op, struct search, op);
	struct bio *bio = &s->bio.bio;

	int ret = 0;
	struct bkey *k;
	struct btree_iter iter;
	bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));

	pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode,
		 (uint64_t) bio->bi_sector);

	do {
		k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
		if (!k) {
			/*
			 * b->key would be exactly what we want, except that
			 * pointers to btree nodes have nonzero size - we
			 * wouldn't go far enough
			 */

			ret = submit_partial_cache_miss(b, op,
					&KEY(KEY_INODE(&b->key),
					     KEY_OFFSET(&b->key), 0));
			break;
		}

		ret = b->level
			? btree(search_recurse, k, b, op)
			: submit_partial_cache_hit(b, op, k);
	} while (!ret &&
		 !op->lookup_done);

	return ret;
}
/* Keybuf code */

static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
{
	/* Overlapping keys compare equal */
	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
		return -1;
	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
		return 1;
	return 0;
}

static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
					    struct keybuf_key *r)
{
	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
}
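/*
 * Editorial example: because keybuf_cmp() reports any two overlapping
 * ranges as equal, RB_INSERT() fails for a key that overlaps one already
 * buffered - e.g. buffering 0..10 and then 5..15 keeps only the first -
 * which is what keeps the keybuf free of overlapping keys.
 */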
static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
				   struct keybuf *buf, struct bkey *end)
{
	struct btree_iter iter;
	bch_btree_iter_init(b, &iter, &buf->last_scanned);

	while (!array_freelist_empty(&buf->freelist)) {
		struct bkey *k = bch_btree_iter_next_filter(&iter, b,
							    bch_ptr_bad);

		if (!b->level) {
			if (!k) {
				buf->last_scanned = b->key;
				break;
			}

			buf->last_scanned = *k;
			if (bkey_cmp(&buf->last_scanned, end) >= 0)
				break;

			if (buf->key_predicate(buf, k)) {
				struct keybuf_key *w;

				pr_debug("%s", pkey(k));

				spin_lock(&buf->lock);

				w = array_alloc(&buf->freelist);

				w->private = NULL;
				bkey_copy(&w->key, k);

				if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
					array_free(&buf->freelist, w);

				spin_unlock(&buf->lock);
			}
		} else {
			if (!k)
				break;

			btree(refill_keybuf, k, b, op, buf, end);
			/*
			 * Might get an error here, but can't really do anything
			 * and it'll get logged elsewhere. Just read what we
			 * can.
			 */

			if (bkey_cmp(&buf->last_scanned, end) >= 0)
				break;

			cond_resched();
		}
	}

	return 0;
}
void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
		       struct bkey *end)
{
	struct bkey start = buf->last_scanned;
	struct btree_op op;
	bch_btree_op_init_stack(&op);

	cond_resched();

	btree_root(refill_keybuf, c, &op, buf, end);
	closure_sync(&op.cl);

	pr_debug("found %s keys from %llu:%llu to %llu:%llu",
		 RB_EMPTY_ROOT(&buf->keys) ? "no" :
		 array_freelist_empty(&buf->freelist) ? "some" : "a few",
		 KEY_INODE(&start), KEY_OFFSET(&start),
		 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned));

	spin_lock(&buf->lock);

	if (!RB_EMPTY_ROOT(&buf->keys)) {
		struct keybuf_key *w;
		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
		buf->start	= START_KEY(&w->key);

		w = RB_LAST(&buf->keys, struct keybuf_key, node);
		buf->end	= w->key;
	} else {
		buf->start	= MAX_KEY;
		buf->end	= MAX_KEY;
	}

	spin_unlock(&buf->lock);
}
static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	rb_erase(&w->node, &buf->keys);
	array_free(&buf->freelist, w);
}

void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	spin_lock(&buf->lock);
	__bch_keybuf_del(buf, w);
	spin_unlock(&buf->lock);
}
bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
				  struct bkey *end)
{
	bool ret = false;
	struct keybuf_key *p, *w, s;
	s.key = *start;

	if (bkey_cmp(end, &buf->start) <= 0 ||
	    bkey_cmp(start, &buf->end) >= 0)
		return false;

	spin_lock(&buf->lock);
	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);

	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
		p = w;
		w = RB_NEXT(w, node);

		if (p->private)
			ret = true;
		else
			__bch_keybuf_del(buf, p);
	}

	spin_unlock(&buf->lock);
	return ret;
}
struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
{
	struct keybuf_key *w;
	spin_lock(&buf->lock);

	w = RB_FIRST(&buf->keys, struct keybuf_key, node);

	while (w && w->private)
		w = RB_NEXT(w, node);

	if (w)
		w->private = ERR_PTR(-EINTR);

	spin_unlock(&buf->lock);
	return w;
}
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
					  struct keybuf *buf,
					  struct bkey *end)
{
	struct keybuf_key *ret;

	while (1) {
		ret = bch_keybuf_next(buf);
		if (ret)
			break;

		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
			pr_debug("scan finished");
			break;
		}

		bch_refill_keybuf(c, buf, end);
	}

	return ret;
}
void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn)
{
	buf->key_predicate	= fn;
	buf->last_scanned	= MAX_KEY;
	buf->keys		= RB_ROOT;

	spin_lock_init(&buf->lock);
	array_allocator_init(&buf->freelist);
}
void bch_btree_exit(void)
{
	if (btree_io_wq)
		destroy_workqueue(btree_io_wq);
	if (bch_gc_wq)
		destroy_workqueue(bch_gc_wq);
}

int __init bch_btree_init(void)
{
	if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) ||
	    !(btree_io_wq = create_singlethread_workqueue("bch_btree_io")))
		return -ENOMEM;

	return 0;
}