Commit | Line | Data |
---|---|---|
cafe5635 KO |
1 | /* |
2 | * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> | |
3 | * | |
4 | * Uses a block device as cache for other block devices; optimized for SSDs. | |
5 | * All allocation is done in buckets, which should match the erase block size | |
6 | * of the device. | |
7 | * | |
8 | * Buckets containing cached data are kept on a heap sorted by priority; | |
9 | * bucket priority is increased on cache hit, and periodically all the buckets | |
10 | * on the heap have their priority scaled down. This currently is just used as | |
11 | * an LRU but in the future should allow for more intelligent heuristics. | |
12 | * | |
13 | * Buckets have an 8 bit counter; freeing is accomplished by incrementing the | |
14 | * counter. Garbage collection is used to remove stale pointers. | |
15 | * | |
16 | * Indexing is done via a btree; nodes are not necessarily fully sorted, rather | |
17 | * as keys are inserted we only sort the pages that have not yet been written. | |
18 | * When garbage collection is run, we resort the entire node. | |
19 | * | |
20 | * All configuration is done via sysfs; see Documentation/bcache.txt. | |
21 | */ | |
22 | ||
23 | #include "bcache.h" | |
24 | #include "btree.h" | |
25 | #include "debug.h" | |
65d45231 | 26 | #include "extents.h" |
cafe5635 KO |
27 | |
28 | #include <linux/slab.h> | |
29 | #include <linux/bitops.h> | |
72a44517 | 30 | #include <linux/freezer.h> |
cafe5635 | 31 | #include <linux/hash.h> |
72a44517 | 32 | #include <linux/kthread.h> |
cd953ed0 | 33 | #include <linux/prefetch.h> |
cafe5635 KO |
34 | #include <linux/random.h> |
35 | #include <linux/rcupdate.h> | |
36 | #include <trace/events/bcache.h> | |
37 | ||
38 | /* | |
39 | * Todo: | |
40 | * register_bcache: Return errors out to userspace correctly | |
41 | * | |
42 | * Writeback: don't undirty key until after a cache flush | |
43 | * | |
44 | * Create an iterator for key pointers | |
45 | * | |
46 | * On btree write error, mark bucket such that it won't be freed from the cache | |
47 | * | |
48 | * Journalling: | |
49 | * Check for bad keys in replay | |
50 | * Propagate barriers | |
51 | * Refcount journal entries in journal_replay | |
52 | * | |
53 | * Garbage collection: | |
54 | * Finish incremental gc | |
55 | * Gc should free old UUIDs, data for invalid UUIDs | |
56 | * | |
57 | * Provide a way to list backing device UUIDs we have data cached for, and | |
58 | * probably how long it's been since we've seen them, and a way to invalidate | |
59 | * dirty data for devices that will never be attached again | |
60 | * | |
61 | * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so | |
62 | * that based on that and how much dirty data we have we can keep writeback | |
63 | * from being starved | |
64 | * | |
65 | * Add a tracepoint or somesuch to watch for writeback starvation | |
66 | * | |
67 | * When btree depth > 1 and splitting an interior node, we have to make sure | |
68 | * alloc_bucket() cannot fail. This should be true but is not completely | |
69 | * obvious. | |
70 | * | |
71 | * Make sure all allocations get charged to the root cgroup | |
72 | * | |
73 | * Plugging? | |
74 | * | |
75 | * If data write is less than hard sector size of ssd, round up offset in open | |
76 | * bucket to the next whole sector | |
77 | * | |
78 | * Also lookup by cgroup in get_open_bucket() | |
79 | * | |
80 | * Superblock needs to be fleshed out for multiple cache devices | |
81 | * | |
82 | * Add a sysfs tunable for the number of writeback IOs in flight | |
83 | * | |
84 | * Add a sysfs tunable for the number of open data buckets | |
85 | * | |
86 | * IO tracking: Can we track when one process is doing io on behalf of another? | |
87 | * IO tracking: Don't use just an average, weigh more recent stuff higher | |
88 | * | |
89 | * Test module load/unload | |
90 | */ | |
91 | ||
cafe5635 KO |
/* Tuning constants; their uses are elsewhere in this file (not in view). */
#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72

/* Marks dirty data in a bkey pointer (bit 36 of the ptr word). */
#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

/*
 * Identity of a btree node for the in-memory hash table: the bucket
 * number bits of pointer 0 combined with that pointer's generation.
 */
#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))

/* Workqueue used for delayed background btree node writes (b->work). */
static struct workqueue_struct *btree_io_wq;

/*
 * True when this node's depth requires a write lock: op->lock holds the
 * deepest level at which we should take the lock for writing.
 */
#define insert_lock(s, b)	((b)->level <= (s)->lock)
103 | ||
/*
 * These macros are for recursing down the btree - they handle the details of
 * locking and looking up nodes in the cache for you. They're best treated as
 * mere syntax when reading code that uses them.
 *
 * op->lock determines whether we take a read or a write lock at a given depth.
 * If you've got a read lock and find that you need a write lock (i.e. you're
 * going to have to split), set op->lock and return -EINTR; btree_root() will
 * call you again and you'll have the correct lock.
 */

/**
 * btree - recurse down the btree on a specified key
 * @fn: function to call, which will be passed the child node
 * @key: key to recurse on
 * @b: parent btree node
 * @op: pointer to struct btree_op
 *
 * Looks up and locks the child of @b that @key points to, invokes
 * bch_btree_<fn>() on it, then drops the child's lock.  Evaluates to the
 * callback's return value, or to PTR_ERR() if the child couldn't be read.
 */
#define btree(fn, key, b, op, ...)					\
({									\
	int _r, l = (b)->level - 1;					\
	bool _w = l <= (op)->lock;					\
	struct btree *_child = bch_btree_node_get((b)->c, key, l, _w);	\
	if (!IS_ERR(_child)) {						\
		_child->parent = (b);					\
		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
		rw_unlock(_w, _child);					\
	} else								\
		_r = PTR_ERR(_child);					\
	_r;								\
})
135 | ||
/**
 * btree_root - call a function on the root of the btree
 * @fn: function to call, which will be passed the child node
 * @c: cache set
 * @op: pointer to struct btree_op
 *
 * Locks the root and invokes bch_btree_<fn>() on it, retrying while the
 * callback returns -EINTR (needed a lock upgrade, or is resuming after
 * waiting for the cannibalize reserve on -ENOSPC).
 */
#define btree_root(fn, c, op, ...)					\
({									\
	int _r = -EINTR;						\
	do {								\
		struct btree *_b = (c)->root;				\
		bool _w = insert_lock(op, _b);				\
		rw_lock(_w, _b, _b->level);				\
		/* recheck: root may have changed while we slept */	\
		if (_b == (c)->root &&					\
		    _w == insert_lock(op, _b)) {			\
			_b->parent = NULL;				\
			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
		}							\
		rw_unlock(_w, _b);					\
		if (_r == -EINTR)					\
			schedule();					\
		bch_cannibalize_unlock(c);				\
		if (_r == -ENOSPC) {					\
			wait_event((c)->try_wait,			\
				   !(c)->try_harder);			\
			_r = -EINTR;					\
		}							\
	} while (_r == -EINTR);						\
	\
	finish_wait(&(c)->bucket_wait, &(op)->wait);			\
	_r;								\
})
168 | ||
a85e968e KO |
169 | static inline struct bset *write_block(struct btree *b) |
170 | { | |
171 | return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); | |
172 | } | |
173 | ||
cafe5635 KO |
174 | /* Btree key manipulation */ |
175 | ||
3a3b6a4e | 176 | void bkey_put(struct cache_set *c, struct bkey *k) |
e7c590eb KO |
177 | { |
178 | unsigned i; | |
179 | ||
180 | for (i = 0; i < KEY_PTRS(k); i++) | |
181 | if (ptr_available(c, k, i)) | |
182 | atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); | |
183 | } | |
184 | ||
cafe5635 KO |
185 | /* Btree IO */ |
186 | ||
187 | static uint64_t btree_csum_set(struct btree *b, struct bset *i) | |
188 | { | |
189 | uint64_t crc = b->key.ptr[0]; | |
fafff81c | 190 | void *data = (void *) i + 8, *end = bset_bkey_last(i); |
cafe5635 | 191 | |
169ef1cf | 192 | crc = bch_crc64_update(crc, data, end - data); |
c19ed23a | 193 | return crc ^ 0xffffffffffffffffULL; |
cafe5635 KO |
194 | } |
195 | ||
78b77bf8 | 196 | void bch_btree_node_read_done(struct btree *b) |
cafe5635 | 197 | { |
cafe5635 | 198 | const char *err = "bad btree header"; |
ee811287 | 199 | struct bset *i = btree_bset_first(b); |
57943511 | 200 | struct btree_iter *iter; |
cafe5635 | 201 | |
57943511 KO |
202 | iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); |
203 | iter->size = b->c->sb.bucket_size / b->c->sb.block_size; | |
cafe5635 KO |
204 | iter->used = 0; |
205 | ||
280481d0 | 206 | #ifdef CONFIG_BCACHE_DEBUG |
c052dd9a | 207 | iter->b = &b->keys; |
280481d0 KO |
208 | #endif |
209 | ||
57943511 | 210 | if (!i->seq) |
cafe5635 KO |
211 | goto err; |
212 | ||
213 | for (; | |
a85e968e | 214 | b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; |
cafe5635 KO |
215 | i = write_block(b)) { |
216 | err = "unsupported bset version"; | |
217 | if (i->version > BCACHE_BSET_VERSION) | |
218 | goto err; | |
219 | ||
220 | err = "bad btree header"; | |
ee811287 KO |
221 | if (b->written + set_blocks(i, block_bytes(b->c)) > |
222 | btree_blocks(b)) | |
cafe5635 KO |
223 | goto err; |
224 | ||
225 | err = "bad magic"; | |
81ab4190 | 226 | if (i->magic != bset_magic(&b->c->sb)) |
cafe5635 KO |
227 | goto err; |
228 | ||
229 | err = "bad checksum"; | |
230 | switch (i->version) { | |
231 | case 0: | |
232 | if (i->csum != csum_set(i)) | |
233 | goto err; | |
234 | break; | |
235 | case BCACHE_BSET_VERSION: | |
236 | if (i->csum != btree_csum_set(b, i)) | |
237 | goto err; | |
238 | break; | |
239 | } | |
240 | ||
241 | err = "empty set"; | |
a85e968e | 242 | if (i != b->keys.set[0].data && !i->keys) |
cafe5635 KO |
243 | goto err; |
244 | ||
fafff81c | 245 | bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); |
cafe5635 | 246 | |
ee811287 | 247 | b->written += set_blocks(i, block_bytes(b->c)); |
cafe5635 KO |
248 | } |
249 | ||
250 | err = "corrupted btree"; | |
251 | for (i = write_block(b); | |
a85e968e | 252 | bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); |
cafe5635 | 253 | i = ((void *) i) + block_bytes(b->c)) |
a85e968e | 254 | if (i->seq == b->keys.set[0].data->seq) |
cafe5635 KO |
255 | goto err; |
256 | ||
a85e968e | 257 | bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); |
cafe5635 | 258 | |
a85e968e | 259 | i = b->keys.set[0].data; |
cafe5635 | 260 | err = "short btree key"; |
a85e968e KO |
261 | if (b->keys.set[0].size && |
262 | bkey_cmp(&b->key, &b->keys.set[0].end) < 0) | |
cafe5635 KO |
263 | goto err; |
264 | ||
265 | if (b->written < btree_blocks(b)) | |
a85e968e KO |
266 | bch_bset_init_next(&b->keys, write_block(b), |
267 | bset_magic(&b->c->sb)); | |
cafe5635 | 268 | out: |
57943511 KO |
269 | mempool_free(iter, b->c->fill_iter); |
270 | return; | |
cafe5635 KO |
271 | err: |
272 | set_btree_node_io_error(b); | |
88b9f8c4 | 273 | bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", |
cafe5635 | 274 | err, PTR_BUCKET_NR(b->c, &b->key, 0), |
88b9f8c4 | 275 | bset_block_offset(b, i), i->keys); |
cafe5635 KO |
276 | goto out; |
277 | } | |
278 | ||
57943511 | 279 | static void btree_node_read_endio(struct bio *bio, int error) |
cafe5635 | 280 | { |
57943511 KO |
281 | struct closure *cl = bio->bi_private; |
282 | closure_put(cl); | |
283 | } | |
cafe5635 | 284 | |
78b77bf8 | 285 | static void bch_btree_node_read(struct btree *b) |
57943511 KO |
286 | { |
287 | uint64_t start_time = local_clock(); | |
288 | struct closure cl; | |
289 | struct bio *bio; | |
cafe5635 | 290 | |
c37511b8 KO |
291 | trace_bcache_btree_read(b); |
292 | ||
57943511 | 293 | closure_init_stack(&cl); |
cafe5635 | 294 | |
57943511 KO |
295 | bio = bch_bbio_alloc(b->c); |
296 | bio->bi_rw = REQ_META|READ_SYNC; | |
4f024f37 | 297 | bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9; |
57943511 KO |
298 | bio->bi_end_io = btree_node_read_endio; |
299 | bio->bi_private = &cl; | |
cafe5635 | 300 | |
a85e968e | 301 | bch_bio_map(bio, b->keys.set[0].data); |
cafe5635 | 302 | |
57943511 KO |
303 | bch_submit_bbio(bio, b->c, &b->key, 0); |
304 | closure_sync(&cl); | |
cafe5635 | 305 | |
57943511 KO |
306 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) |
307 | set_btree_node_io_error(b); | |
308 | ||
309 | bch_bbio_free(bio, b->c); | |
310 | ||
311 | if (btree_node_io_error(b)) | |
312 | goto err; | |
313 | ||
314 | bch_btree_node_read_done(b); | |
57943511 | 315 | bch_time_stats_update(&b->c->btree_read_time, start_time); |
57943511 KO |
316 | |
317 | return; | |
318 | err: | |
61cbd250 | 319 | bch_cache_set_error(b->c, "io error reading bucket %zu", |
57943511 | 320 | PTR_BUCKET_NR(b->c, &b->key, 0)); |
cafe5635 KO |
321 | } |
322 | ||
/*
 * Release the resources a completed btree write was holding: the
 * prio_blocked count (waking allocators when it reaches zero) and the
 * journal entry pin, then reset @w for reuse.
 */
static void btree_complete_write(struct btree *b, struct btree_write *w)
{
	if (w->prio_blocked &&
	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
		wake_up_allocators(b->c);

	if (w->journal) {
		atomic_dec_bug(w->journal);
		__closure_wake_up(&b->c->journal.wait);
	}

	w->prio_blocked = 0;
	w->journal = NULL;
}
337 | ||
cb7a583e KO |
338 | static void btree_node_write_unlock(struct closure *cl) |
339 | { | |
340 | struct btree *b = container_of(cl, struct btree, io); | |
341 | ||
342 | up(&b->io_mutex); | |
343 | } | |
344 | ||
/*
 * Common tail of a btree node write: free the bio, release the previous
 * write's journal/prio resources, requeue the delayed write if the node
 * got re-dirtied meanwhile, and drop io_mutex via the closure destructor.
 */
static void __btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io);
	struct btree_write *w = btree_prev_write(b);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;
	btree_complete_write(b, w);

	/* node was re-dirtied while this write was in flight */
	if (btree_node_dirty(b))
		queue_delayed_work(btree_io_wq, &b->work,
				   msecs_to_jiffies(30000));

	closure_return_with_destructor(cl, btree_node_write_unlock);
}
360 | ||
57943511 | 361 | static void btree_node_write_done(struct closure *cl) |
cafe5635 | 362 | { |
cb7a583e | 363 | struct btree *b = container_of(cl, struct btree, io); |
cafe5635 KO |
364 | struct bio_vec *bv; |
365 | int n; | |
366 | ||
7988613b | 367 | bio_for_each_segment_all(bv, b->bio, n) |
cafe5635 KO |
368 | __free_page(bv->bv_page); |
369 | ||
57943511 | 370 | __btree_node_write_done(cl); |
cafe5635 KO |
371 | } |
372 | ||
57943511 KO |
373 | static void btree_node_write_endio(struct bio *bio, int error) |
374 | { | |
375 | struct closure *cl = bio->bi_private; | |
cb7a583e | 376 | struct btree *b = container_of(cl, struct btree, io); |
57943511 KO |
377 | |
378 | if (error) | |
379 | set_btree_node_io_error(b); | |
380 | ||
381 | bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); | |
382 | closure_put(cl); | |
383 | } | |
384 | ||
/*
 * Issue the actual write of the current bset: stamp version + checksum,
 * build a bio, and submit it pointed at this bset's sector offset within
 * the node.  Prefers a bounce-buffer copy (async completion); falls back
 * to writing the bset memory directly and waiting synchronously.
 */
static void do_btree_node_write(struct btree *b)
{
	struct closure *cl = &b->io;
	struct bset *i = btree_bset_last(b);
	BKEY_PADDED(key) k;

	i->version = BCACHE_BSET_VERSION;
	i->csum = btree_csum_set(b, i);

	BUG_ON(b->bio);
	b->bio = bch_bbio_alloc(b->c);

	b->bio->bi_end_io = btree_node_write_endio;
	b->bio->bi_private = cl;
	b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
	b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
	bch_bio_map(b->bio, i);

	/*
	 * If we're appending to a leaf node, we don't technically need FUA -
	 * this write just needs to be persisted before the next journal write,
	 * which will be marked FLUSH|FUA.
	 *
	 * Similarly if we're writing a new btree root - the pointer is going to
	 * be in the next journal entry.
	 *
	 * But if we're writing a new btree node (that isn't a root) or
	 * appending to a non leaf btree node, we need either FUA or a flush
	 * when we write the parent with the new pointer. FUA is cheaper than a
	 * flush, and writes appending to leaf nodes aren't blocking anything so
	 * just make all btree node writes FUA to keep things sane.
	 */

	/* write key = node key shifted to this bset's offset in the bucket */
	bkey_copy(&k.key, &b->key);
	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
		       bset_sector_offset(&b->keys, i));

	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
		/* bounce path: copy each page of the bset into fresh pages */
		int j;
		struct bio_vec *bv;
		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));

		bio_for_each_segment_all(bv, b->bio, j)
			memcpy(page_address(bv->bv_page),
			       base + j * PAGE_SIZE, PAGE_SIZE);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		continue_at(cl, btree_node_write_done, NULL);
	} else {
		/* no memory for a copy: map the bset itself and wait */
		b->bio->bi_vcnt = 0;
		bch_bio_map(b->bio, i);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		closure_sync(cl);
		continue_at_nobarrier(cl, __btree_node_write_done, NULL);
	}
}
444 | ||
/*
 * Write out the current bset of @b.  If @parent is non-NULL the caller's
 * closure waits on the write; otherwise the cache set's closure holds the
 * reference.  Afterwards, advance b->written and start a fresh bset.
 */
void bch_btree_node_write(struct btree *b, struct closure *parent)
{
	struct bset *i = btree_bset_last(b);

	trace_bcache_btree_write(b);

	BUG_ON(current->bio_list);
	BUG_ON(b->written >= btree_blocks(b));
	BUG_ON(b->written && !i->keys);
	BUG_ON(btree_bset_first(b)->seq != i->seq);
	bch_check_keys(&b->keys, "writing");

	/* we're writing now; no need for the delayed background write */
	cancel_delayed_work(&b->work);

	/* If caller isn't waiting for write, parent refcount is cache set */
	down(&b->io_mutex);
	closure_init(&b->io, parent ?: &b->c->cl);

	clear_bit(BTREE_NODE_dirty, &b->flags);
	change_bit(BTREE_NODE_write_idx, &b->flags);

	do_btree_node_write(b);

	atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);

	b->written += set_blocks(i, block_bytes(b->c));

	/* If not a leaf node, always sort */
	if (b->level && b->keys.nsets)
		bch_btree_sort(&b->keys, &b->c->sort);
	else
		bch_btree_sort_lazy(&b->keys, &b->c->sort);

	/*
	 * do verify if there was more than one set initially (i.e. we did a
	 * sort) and we sorted down to a single set:
	 */
	if (i != b->keys.set->data && !b->keys.nsets)
		bch_btree_verify(b);

	/* room left in the node: open a fresh bset for future inserts */
	if (b->written < btree_blocks(b))
		bch_bset_init_next(&b->keys, write_block(b),
				   bset_magic(&b->c->sb));
}
490 | ||
f269af5a KO |
491 | static void bch_btree_node_write_sync(struct btree *b) |
492 | { | |
493 | struct closure cl; | |
494 | ||
495 | closure_init_stack(&cl); | |
496 | bch_btree_node_write(b, &cl); | |
497 | closure_sync(&cl); | |
498 | } | |
499 | ||
57943511 | 500 | static void btree_node_write_work(struct work_struct *w) |
cafe5635 KO |
501 | { |
502 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); | |
503 | ||
57943511 | 504 | rw_lock(true, b, b->level); |
cafe5635 KO |
505 | |
506 | if (btree_node_dirty(b)) | |
57943511 KO |
507 | bch_btree_node_write(b, NULL); |
508 | rw_unlock(true, b); | |
cafe5635 KO |
509 | } |
510 | ||
c18536a7 | 511 | static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) |
cafe5635 | 512 | { |
ee811287 | 513 | struct bset *i = btree_bset_last(b); |
cafe5635 KO |
514 | struct btree_write *w = btree_current_write(b); |
515 | ||
57943511 KO |
516 | BUG_ON(!b->written); |
517 | BUG_ON(!i->keys); | |
cafe5635 | 518 | |
57943511 KO |
519 | if (!btree_node_dirty(b)) |
520 | queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); | |
cafe5635 | 521 | |
57943511 | 522 | set_btree_node_dirty(b); |
cafe5635 | 523 | |
c18536a7 | 524 | if (journal_ref) { |
cafe5635 | 525 | if (w->journal && |
c18536a7 | 526 | journal_pin_cmp(b->c, w->journal, journal_ref)) { |
cafe5635 KO |
527 | atomic_dec_bug(w->journal); |
528 | w->journal = NULL; | |
529 | } | |
530 | ||
531 | if (!w->journal) { | |
c18536a7 | 532 | w->journal = journal_ref; |
cafe5635 KO |
533 | atomic_inc(w->journal); |
534 | } | |
535 | } | |
536 | ||
cafe5635 | 537 | /* Force write if set is too big */ |
57943511 KO |
538 | if (set_bytes(i) > PAGE_SIZE - 48 && |
539 | !current->bio_list) | |
540 | bch_btree_node_write(b, NULL); | |
cafe5635 KO |
541 | } |
542 | ||
/*
 * Btree in memory cache - allocation/freeing
 * mca -> memory cache
 */

/*
 * Nodes kept in reserve so allocation for inserts can always succeed;
 * scaled by the tree depth (root level), minimum of one level.
 */
#define mca_reserve(c)	(((c->root && c->root->level)		\
			  ? c->root->level : 1) * 8 + 16)
/* Cached nodes beyond the reserve, i.e. how many the shrinker may free. */
#define mca_can_free(c)						\
	max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
552 | ||
/*
 * Free a node's key storage (not the struct btree itself) and move it to
 * the freed list.  Caller must hold the node write-locked with no write
 * in flight (io_mutex free).
 */
static void mca_data_free(struct btree *b)
{
	BUG_ON(b->io_mutex.count != 1);

	bch_btree_keys_free(&b->keys);

	b->c->bucket_cache_used--;
	list_move(&b->list, &b->c->btree_cache_freed);
}
562 | ||
/*
 * Detach a clean node from its on-disk identity: clear the key, remove it
 * from the hash table, and put it on the freeable list for reuse.
 */
static void mca_bucket_free(struct btree *b)
{
	BUG_ON(btree_node_dirty(b));

	b->key.ptr[0] = 0;
	hlist_del_init_rcu(&b->hash);
	list_move(&b->list, &b->c->btree_cache_freeable);
}
571 | ||
572 | static unsigned btree_order(struct bkey *k) | |
573 | { | |
574 | return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); | |
575 | } | |
576 | ||
/*
 * Allocate key storage for @b, sized for @k (at least btree_pages worth).
 * On success the node joins the in-use cache list; on failure it goes to
 * the freed list (struct kept, no data).
 */
static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
{
	if (!bch_btree_keys_alloc(&b->keys,
				  max_t(unsigned,
					ilog2(b->c->btree_pages),
					btree_order(k)),
				  gfp)) {
		b->c->bucket_cache_used++;
		list_move(&b->list, &b->c->btree_cache);
	} else {
		list_move(&b->list, &b->c->btree_cache_freed);
	}
}
590 | ||
/*
 * Allocate and initialize a struct btree plus its key storage.
 * Returns NULL if the struct itself can't be allocated; data allocation
 * failure is recorded by mca_data_alloc() (node lands on the freed list).
 */
static struct btree *mca_bucket_alloc(struct cache_set *c,
				      struct bkey *k, gfp_t gfp)
{
	struct btree *b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	init_rwsem(&b->lock);
	/* lock ordering is by tree depth; lockdep can't validate that */
	lockdep_set_novalidate_class(&b->lock);
	INIT_LIST_HEAD(&b->list);
	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
	b->c = c;
	sema_init(&b->io_mutex, 1);

	mca_data_alloc(b, k, gfp);
	return b;
}
608 | ||
e8e1d468 | 609 | static int mca_reap(struct btree *b, unsigned min_order, bool flush) |
cafe5635 | 610 | { |
e8e1d468 KO |
611 | struct closure cl; |
612 | ||
613 | closure_init_stack(&cl); | |
cafe5635 KO |
614 | lockdep_assert_held(&b->c->bucket_lock); |
615 | ||
616 | if (!down_write_trylock(&b->lock)) | |
617 | return -ENOMEM; | |
618 | ||
a85e968e | 619 | BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); |
e8e1d468 | 620 | |
a85e968e | 621 | if (b->keys.page_order < min_order) |
cb7a583e KO |
622 | goto out_unlock; |
623 | ||
624 | if (!flush) { | |
625 | if (btree_node_dirty(b)) | |
626 | goto out_unlock; | |
627 | ||
628 | if (down_trylock(&b->io_mutex)) | |
629 | goto out_unlock; | |
630 | up(&b->io_mutex); | |
cafe5635 KO |
631 | } |
632 | ||
f269af5a KO |
633 | if (btree_node_dirty(b)) |
634 | bch_btree_node_write_sync(b); | |
cafe5635 | 635 | |
e8e1d468 | 636 | /* wait for any in flight btree write */ |
cb7a583e KO |
637 | down(&b->io_mutex); |
638 | up(&b->io_mutex); | |
e8e1d468 | 639 | |
cafe5635 | 640 | return 0; |
cb7a583e KO |
641 | out_unlock: |
642 | rw_unlock(true, b); | |
643 | return -ENOMEM; | |
cafe5635 KO |
644 | } |
645 | ||
7dc19d5a DC |
646 | static unsigned long bch_mca_scan(struct shrinker *shrink, |
647 | struct shrink_control *sc) | |
cafe5635 KO |
648 | { |
649 | struct cache_set *c = container_of(shrink, struct cache_set, shrink); | |
650 | struct btree *b, *t; | |
651 | unsigned long i, nr = sc->nr_to_scan; | |
7dc19d5a | 652 | unsigned long freed = 0; |
cafe5635 KO |
653 | |
654 | if (c->shrinker_disabled) | |
7dc19d5a | 655 | return SHRINK_STOP; |
cafe5635 KO |
656 | |
657 | if (c->try_harder) | |
7dc19d5a | 658 | return SHRINK_STOP; |
cafe5635 KO |
659 | |
660 | /* Return -1 if we can't do anything right now */ | |
a698e08c | 661 | if (sc->gfp_mask & __GFP_IO) |
cafe5635 KO |
662 | mutex_lock(&c->bucket_lock); |
663 | else if (!mutex_trylock(&c->bucket_lock)) | |
664 | return -1; | |
665 | ||
36c9ea98 KO |
666 | /* |
667 | * It's _really_ critical that we don't free too many btree nodes - we | |
668 | * have to always leave ourselves a reserve. The reserve is how we | |
669 | * guarantee that allocating memory for a new btree node can always | |
670 | * succeed, so that inserting keys into the btree can always succeed and | |
671 | * IO can always make forward progress: | |
672 | */ | |
cafe5635 KO |
673 | nr /= c->btree_pages; |
674 | nr = min_t(unsigned long, nr, mca_can_free(c)); | |
675 | ||
676 | i = 0; | |
677 | list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { | |
7dc19d5a | 678 | if (freed >= nr) |
cafe5635 KO |
679 | break; |
680 | ||
681 | if (++i > 3 && | |
e8e1d468 | 682 | !mca_reap(b, 0, false)) { |
cafe5635 KO |
683 | mca_data_free(b); |
684 | rw_unlock(true, b); | |
7dc19d5a | 685 | freed++; |
cafe5635 KO |
686 | } |
687 | } | |
688 | ||
7dc19d5a | 689 | for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { |
b0f32a56 KO |
690 | if (list_empty(&c->btree_cache)) |
691 | goto out; | |
692 | ||
cafe5635 KO |
693 | b = list_first_entry(&c->btree_cache, struct btree, list); |
694 | list_rotate_left(&c->btree_cache); | |
695 | ||
696 | if (!b->accessed && | |
e8e1d468 | 697 | !mca_reap(b, 0, false)) { |
cafe5635 KO |
698 | mca_bucket_free(b); |
699 | mca_data_free(b); | |
700 | rw_unlock(true, b); | |
7dc19d5a | 701 | freed++; |
cafe5635 KO |
702 | } else |
703 | b->accessed = 0; | |
704 | } | |
705 | out: | |
cafe5635 | 706 | mutex_unlock(&c->bucket_lock); |
7dc19d5a DC |
707 | return freed; |
708 | } | |
709 | ||
710 | static unsigned long bch_mca_count(struct shrinker *shrink, | |
711 | struct shrink_control *sc) | |
712 | { | |
713 | struct cache_set *c = container_of(shrink, struct cache_set, shrink); | |
714 | ||
715 | if (c->shrinker_disabled) | |
716 | return 0; | |
717 | ||
718 | if (c->try_harder) | |
719 | return 0; | |
720 | ||
721 | return mca_can_free(c) * c->btree_pages; | |
cafe5635 KO |
722 | } |
723 | ||
/*
 * Tear down the in-memory btree node cache on cache set shutdown:
 * unregister the shrinker, drop every node's data, then free the structs
 * on the freed list.  Dirty nodes have their write state completed
 * without being written (we're shutting down).
 */
void bch_btree_cache_free(struct cache_set *c)
{
	struct btree *b;
	/* NOTE(review): cl appears unused in this function */
	struct closure cl;
	closure_init_stack(&cl);

	if (c->shrink.list.next)
		unregister_shrinker(&c->shrink);

	mutex_lock(&c->bucket_lock);

#ifdef CONFIG_BCACHE_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &c->btree_cache);

	free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
#endif

	/* fold the freeable list in so one loop handles everything */
	list_splice(&c->btree_cache_freeable,
		    &c->btree_cache);

	while (!list_empty(&c->btree_cache)) {
		b = list_first_entry(&c->btree_cache, struct btree, list);

		if (btree_node_dirty(b))
			btree_complete_write(b, btree_current_write(b));
		clear_bit(BTREE_NODE_dirty, &b->flags);

		mca_data_free(b);
	}

	while (!list_empty(&c->btree_cache_freed)) {
		b = list_first_entry(&c->btree_cache_freed,
				     struct btree, list);
		list_del(&b->list);
		cancel_delayed_work_sync(&b->work);
		kfree(b);
	}

	mutex_unlock(&c->bucket_lock);
}
765 | ||
/*
 * Set up the in-memory btree node cache: pre-allocate the reserve of
 * nodes, optionally set up debug verify buffers, and register the
 * shrinker.  Returns 0 or -ENOMEM.
 */
int bch_btree_cache_alloc(struct cache_set *c)
{
	unsigned i;

	for (i = 0; i < mca_reserve(c); i++)
		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
			return -ENOMEM;

	/* reserve nodes start out unattached, hence freeable */
	list_splice_init(&c->btree_cache,
			 &c->btree_cache_freeable);

#ifdef CONFIG_BCACHE_DEBUG
	mutex_init(&c->verify_lock);

	c->verify_ondisk = (void *)
		__get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));

	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

	if (c->verify_data &&
	    c->verify_data->keys.set->data)
		list_del_init(&c->verify_data->list);
	else
		c->verify_data = NULL;
#endif

	c->shrink.count_objects = bch_mca_count;
	c->shrink.scan_objects = bch_mca_scan;
	c->shrink.seeks = 4;
	c->shrink.batch = c->btree_pages * 2;
	register_shrinker(&c->shrink);

	return 0;
}
800 | ||
801 | /* Btree in memory cache - hash table */ | |
802 | ||
803 | static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) | |
804 | { | |
805 | return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; | |
806 | } | |
807 | ||
808 | static struct btree *mca_find(struct cache_set *c, struct bkey *k) | |
809 | { | |
810 | struct btree *b; | |
811 | ||
812 | rcu_read_lock(); | |
813 | hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) | |
814 | if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) | |
815 | goto out; | |
816 | b = NULL; | |
817 | out: | |
818 | rcu_read_unlock(); | |
819 | return b; | |
820 | } | |
821 | ||
e8e1d468 | 822 | static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) |
cafe5635 | 823 | { |
e8e1d468 | 824 | struct btree *b; |
cafe5635 | 825 | |
c37511b8 KO |
826 | trace_bcache_btree_cache_cannibalize(c); |
827 | ||
e8e1d468 KO |
828 | if (!c->try_harder) { |
829 | c->try_harder = current; | |
830 | c->try_harder_start = local_clock(); | |
831 | } else if (c->try_harder != current) | |
832 | return ERR_PTR(-ENOSPC); | |
cafe5635 | 833 | |
e8e1d468 KO |
834 | list_for_each_entry_reverse(b, &c->btree_cache, list) |
835 | if (!mca_reap(b, btree_order(k), false)) | |
836 | return b; | |
cafe5635 | 837 | |
e8e1d468 KO |
838 | list_for_each_entry_reverse(b, &c->btree_cache, list) |
839 | if (!mca_reap(b, btree_order(k), true)) | |
840 | return b; | |
cafe5635 | 841 | |
e8e1d468 | 842 | return ERR_PTR(-ENOMEM); |
cafe5635 KO |
843 | } |
844 | ||
845 | /* | |
846 | * We can only have one thread cannibalizing other cached btree nodes at a time, | |
847 | * or we'll deadlock. We use an open coded mutex to ensure that, which a | |
848 | * cannibalize_bucket() will take. This means every time we unlock the root of | |
849 | * the btree, we need to release this lock if we have it held. | |
850 | */ | |
df8e8970 | 851 | static void bch_cannibalize_unlock(struct cache_set *c) |
cafe5635 | 852 | { |
e8e1d468 | 853 | if (c->try_harder == current) { |
169ef1cf | 854 | bch_time_stats_update(&c->try_harder_time, c->try_harder_start); |
cafe5635 | 855 | c->try_harder = NULL; |
e8e1d468 | 856 | wake_up(&c->try_wait); |
cafe5635 KO |
857 | } |
858 | } | |
859 | ||
e8e1d468 | 860 | static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) |
cafe5635 KO |
861 | { |
862 | struct btree *b; | |
863 | ||
e8e1d468 KO |
864 | BUG_ON(current->bio_list); |
865 | ||
cafe5635 KO |
866 | lockdep_assert_held(&c->bucket_lock); |
867 | ||
868 | if (mca_find(c, k)) | |
869 | return NULL; | |
870 | ||
871 | /* btree_free() doesn't free memory; it sticks the node on the end of | |
872 | * the list. Check if there's any freed nodes there: | |
873 | */ | |
874 | list_for_each_entry(b, &c->btree_cache_freeable, list) | |
e8e1d468 | 875 | if (!mca_reap(b, btree_order(k), false)) |
cafe5635 KO |
876 | goto out; |
877 | ||
878 | /* We never free struct btree itself, just the memory that holds the on | |
879 | * disk node. Check the freed list before allocating a new one: | |
880 | */ | |
881 | list_for_each_entry(b, &c->btree_cache_freed, list) | |
e8e1d468 | 882 | if (!mca_reap(b, 0, false)) { |
cafe5635 | 883 | mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); |
a85e968e | 884 | if (!b->keys.set[0].data) |
cafe5635 KO |
885 | goto err; |
886 | else | |
887 | goto out; | |
888 | } | |
889 | ||
890 | b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); | |
891 | if (!b) | |
892 | goto err; | |
893 | ||
894 | BUG_ON(!down_write_trylock(&b->lock)); | |
a85e968e | 895 | if (!b->keys.set->data) |
cafe5635 KO |
896 | goto err; |
897 | out: | |
cb7a583e | 898 | BUG_ON(b->io_mutex.count != 1); |
cafe5635 KO |
899 | |
900 | bkey_copy(&b->key, k); | |
901 | list_move(&b->list, &c->btree_cache); | |
902 | hlist_del_init_rcu(&b->hash); | |
903 | hlist_add_head_rcu(&b->hash, mca_hash(c, k)); | |
904 | ||
905 | lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); | |
d6fd3b11 | 906 | b->parent = (void *) ~0UL; |
a85e968e KO |
907 | b->flags = 0; |
908 | b->written = 0; | |
909 | b->level = level; | |
cafe5635 | 910 | |
65d45231 | 911 | if (!b->level) |
a85e968e KO |
912 | bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, |
913 | &b->c->expensive_debug_checks); | |
65d45231 | 914 | else |
a85e968e KO |
915 | bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, |
916 | &b->c->expensive_debug_checks); | |
cafe5635 KO |
917 | |
918 | return b; | |
919 | err: | |
920 | if (b) | |
921 | rw_unlock(true, b); | |
922 | ||
e8e1d468 | 923 | b = mca_cannibalize(c, k); |
cafe5635 KO |
924 | if (!IS_ERR(b)) |
925 | goto out; | |
926 | ||
927 | return b; | |
928 | } | |
929 | ||
930 | /** | |
931 | * bch_btree_node_get - find a btree node in the cache and lock it, reading it | |
932 | * in from disk if necessary. | |
933 | * | |
b54d6934 | 934 | * If IO is necessary and running under generic_make_request, returns -EAGAIN. |
cafe5635 KO |
935 | * |
936 | * The btree node will have either a read or a write lock held, depending on | |
937 | * level and op->lock. | |
938 | */ | |
939 | struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, | |
e8e1d468 | 940 | int level, bool write) |
cafe5635 KO |
941 | { |
942 | int i = 0; | |
cafe5635 KO |
943 | struct btree *b; |
944 | ||
945 | BUG_ON(level < 0); | |
946 | retry: | |
947 | b = mca_find(c, k); | |
948 | ||
949 | if (!b) { | |
57943511 KO |
950 | if (current->bio_list) |
951 | return ERR_PTR(-EAGAIN); | |
952 | ||
cafe5635 | 953 | mutex_lock(&c->bucket_lock); |
e8e1d468 | 954 | b = mca_alloc(c, k, level); |
cafe5635 KO |
955 | mutex_unlock(&c->bucket_lock); |
956 | ||
957 | if (!b) | |
958 | goto retry; | |
959 | if (IS_ERR(b)) | |
960 | return b; | |
961 | ||
57943511 | 962 | bch_btree_node_read(b); |
cafe5635 KO |
963 | |
964 | if (!write) | |
965 | downgrade_write(&b->lock); | |
966 | } else { | |
967 | rw_lock(write, b, level); | |
968 | if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { | |
969 | rw_unlock(write, b); | |
970 | goto retry; | |
971 | } | |
972 | BUG_ON(b->level != level); | |
973 | } | |
974 | ||
975 | b->accessed = 1; | |
976 | ||
a85e968e KO |
977 | for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { |
978 | prefetch(b->keys.set[i].tree); | |
979 | prefetch(b->keys.set[i].data); | |
cafe5635 KO |
980 | } |
981 | ||
a85e968e KO |
982 | for (; i <= b->keys.nsets; i++) |
983 | prefetch(b->keys.set[i].data); | |
cafe5635 | 984 | |
57943511 | 985 | if (btree_node_io_error(b)) { |
cafe5635 | 986 | rw_unlock(write, b); |
57943511 KO |
987 | return ERR_PTR(-EIO); |
988 | } | |
989 | ||
990 | BUG_ON(!b->written); | |
cafe5635 KO |
991 | |
992 | return b; | |
993 | } | |
994 | ||
995 | static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) | |
996 | { | |
997 | struct btree *b; | |
998 | ||
999 | mutex_lock(&c->bucket_lock); | |
e8e1d468 | 1000 | b = mca_alloc(c, k, level); |
cafe5635 KO |
1001 | mutex_unlock(&c->bucket_lock); |
1002 | ||
1003 | if (!IS_ERR_OR_NULL(b)) { | |
57943511 | 1004 | bch_btree_node_read(b); |
cafe5635 KO |
1005 | rw_unlock(true, b); |
1006 | } | |
1007 | } | |
1008 | ||
1009 | /* Btree alloc */ | |
1010 | ||
e8e1d468 | 1011 | static void btree_node_free(struct btree *b) |
cafe5635 KO |
1012 | { |
1013 | unsigned i; | |
1014 | ||
c37511b8 KO |
1015 | trace_bcache_btree_node_free(b); |
1016 | ||
cafe5635 | 1017 | BUG_ON(b == b->c->root); |
cafe5635 KO |
1018 | |
1019 | if (btree_node_dirty(b)) | |
1020 | btree_complete_write(b, btree_current_write(b)); | |
1021 | clear_bit(BTREE_NODE_dirty, &b->flags); | |
1022 | ||
cafe5635 KO |
1023 | cancel_delayed_work(&b->work); |
1024 | ||
1025 | mutex_lock(&b->c->bucket_lock); | |
1026 | ||
1027 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | |
1028 | BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); | |
1029 | ||
1030 | bch_inc_gen(PTR_CACHE(b->c, &b->key, i), | |
1031 | PTR_BUCKET(b->c, &b->key, i)); | |
1032 | } | |
1033 | ||
1034 | bch_bucket_free(b->c, &b->key); | |
1035 | mca_bucket_free(b); | |
1036 | mutex_unlock(&b->c->bucket_lock); | |
1037 | } | |
1038 | ||
bc9389ee | 1039 | struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) |
cafe5635 KO |
1040 | { |
1041 | BKEY_PADDED(key) k; | |
1042 | struct btree *b = ERR_PTR(-EAGAIN); | |
1043 | ||
1044 | mutex_lock(&c->bucket_lock); | |
1045 | retry: | |
78365411 | 1046 | if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) |
cafe5635 KO |
1047 | goto err; |
1048 | ||
3a3b6a4e | 1049 | bkey_put(c, &k.key); |
cafe5635 KO |
1050 | SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); |
1051 | ||
e8e1d468 | 1052 | b = mca_alloc(c, &k.key, level); |
cafe5635 KO |
1053 | if (IS_ERR(b)) |
1054 | goto err_free; | |
1055 | ||
1056 | if (!b) { | |
b1a67b0f KO |
1057 | cache_bug(c, |
1058 | "Tried to allocate bucket that was in btree cache"); | |
cafe5635 KO |
1059 | goto retry; |
1060 | } | |
1061 | ||
cafe5635 | 1062 | b->accessed = 1; |
a85e968e | 1063 | bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); |
cafe5635 KO |
1064 | |
1065 | mutex_unlock(&c->bucket_lock); | |
c37511b8 KO |
1066 | |
1067 | trace_bcache_btree_node_alloc(b); | |
cafe5635 KO |
1068 | return b; |
1069 | err_free: | |
1070 | bch_bucket_free(c, &k.key); | |
cafe5635 KO |
1071 | err: |
1072 | mutex_unlock(&c->bucket_lock); | |
c37511b8 KO |
1073 | |
1074 | trace_bcache_btree_node_alloc_fail(b); | |
cafe5635 KO |
1075 | return b; |
1076 | } | |
1077 | ||
bc9389ee | 1078 | static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) |
cafe5635 | 1079 | { |
bc9389ee | 1080 | struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); |
67539e85 | 1081 | if (!IS_ERR_OR_NULL(n)) { |
89ebb4a2 | 1082 | bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); |
67539e85 KO |
1083 | bkey_copy_key(&n->key, &b->key); |
1084 | } | |
cafe5635 KO |
1085 | |
1086 | return n; | |
1087 | } | |
1088 | ||
8835c123 KO |
1089 | static void make_btree_freeing_key(struct btree *b, struct bkey *k) |
1090 | { | |
1091 | unsigned i; | |
1092 | ||
1093 | bkey_copy(k, &b->key); | |
1094 | bkey_copy_key(k, &ZERO_KEY); | |
1095 | ||
1096 | for (i = 0; i < KEY_PTRS(k); i++) { | |
1097 | uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1; | |
1098 | ||
1099 | SET_PTR_GEN(k, i, g); | |
1100 | } | |
1101 | ||
1102 | atomic_inc(&b->c->prio_blocked); | |
1103 | } | |
1104 | ||
78365411 KO |
1105 | static int btree_check_reserve(struct btree *b, struct btree_op *op) |
1106 | { | |
1107 | struct cache_set *c = b->c; | |
1108 | struct cache *ca; | |
1109 | unsigned i, reserve = c->root->level * 2 + 1; | |
1110 | int ret = 0; | |
1111 | ||
1112 | mutex_lock(&c->bucket_lock); | |
1113 | ||
1114 | for_each_cache(ca, c, i) | |
1115 | if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { | |
1116 | if (op) | |
1117 | prepare_to_wait(&c->bucket_wait, &op->wait, | |
1118 | TASK_UNINTERRUPTIBLE); | |
1119 | ret = -EINTR; | |
1120 | break; | |
1121 | } | |
1122 | ||
1123 | mutex_unlock(&c->bucket_lock); | |
1124 | return ret; | |
1125 | } | |
1126 | ||
cafe5635 KO |
1127 | /* Garbage collection */ |
1128 | ||
1129 | uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | |
1130 | { | |
1131 | uint8_t stale = 0; | |
1132 | unsigned i; | |
1133 | struct bucket *g; | |
1134 | ||
1135 | /* | |
1136 | * ptr_invalid() can't return true for the keys that mark btree nodes as | |
1137 | * freed, but since ptr_bad() returns true we'll never actually use them | |
1138 | * for anything and thus we don't want mark their pointers here | |
1139 | */ | |
1140 | if (!bkey_cmp(k, &ZERO_KEY)) | |
1141 | return stale; | |
1142 | ||
1143 | for (i = 0; i < KEY_PTRS(k); i++) { | |
1144 | if (!ptr_available(c, k, i)) | |
1145 | continue; | |
1146 | ||
1147 | g = PTR_BUCKET(c, k, i); | |
1148 | ||
1149 | if (gen_after(g->gc_gen, PTR_GEN(k, i))) | |
1150 | g->gc_gen = PTR_GEN(k, i); | |
1151 | ||
1152 | if (ptr_stale(c, k, i)) { | |
1153 | stale = max(stale, ptr_stale(c, k, i)); | |
1154 | continue; | |
1155 | } | |
1156 | ||
1157 | cache_bug_on(GC_MARK(g) && | |
1158 | (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), | |
1159 | c, "inconsistent ptrs: mark = %llu, level = %i", | |
1160 | GC_MARK(g), level); | |
1161 | ||
1162 | if (level) | |
1163 | SET_GC_MARK(g, GC_MARK_METADATA); | |
1164 | else if (KEY_DIRTY(k)) | |
1165 | SET_GC_MARK(g, GC_MARK_DIRTY); | |
1166 | ||
1167 | /* guard against overflow */ | |
1168 | SET_GC_SECTORS_USED(g, min_t(unsigned, | |
1169 | GC_SECTORS_USED(g) + KEY_SIZE(k), | |
94717447 | 1170 | MAX_GC_SECTORS_USED)); |
cafe5635 KO |
1171 | |
1172 | BUG_ON(!GC_SECTORS_USED(g)); | |
1173 | } | |
1174 | ||
1175 | return stale; | |
1176 | } | |
1177 | ||
1178 | #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) | |
1179 | ||
a1f0358b | 1180 | static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) |
cafe5635 KO |
1181 | { |
1182 | uint8_t stale = 0; | |
a1f0358b | 1183 | unsigned keys = 0, good_keys = 0; |
cafe5635 KO |
1184 | struct bkey *k; |
1185 | struct btree_iter iter; | |
1186 | struct bset_tree *t; | |
1187 | ||
1188 | gc->nodes++; | |
1189 | ||
c052dd9a | 1190 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { |
cafe5635 | 1191 | stale = max(stale, btree_mark_key(b, k)); |
a1f0358b | 1192 | keys++; |
cafe5635 | 1193 | |
a85e968e | 1194 | if (bch_ptr_bad(&b->keys, k)) |
cafe5635 KO |
1195 | continue; |
1196 | ||
cafe5635 KO |
1197 | gc->key_bytes += bkey_u64s(k); |
1198 | gc->nkeys++; | |
a1f0358b | 1199 | good_keys++; |
cafe5635 KO |
1200 | |
1201 | gc->data += KEY_SIZE(k); | |
cafe5635 KO |
1202 | } |
1203 | ||
a85e968e | 1204 | for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) |
cafe5635 | 1205 | btree_bug_on(t->size && |
a85e968e | 1206 | bset_written(&b->keys, t) && |
cafe5635 KO |
1207 | bkey_cmp(&b->key, &t->end) < 0, |
1208 | b, "found short btree key in gc"); | |
1209 | ||
a1f0358b KO |
1210 | if (b->c->gc_always_rewrite) |
1211 | return true; | |
cafe5635 | 1212 | |
a1f0358b KO |
1213 | if (stale > 10) |
1214 | return true; | |
cafe5635 | 1215 | |
a1f0358b KO |
1216 | if ((keys - good_keys) * 2 > keys) |
1217 | return true; | |
cafe5635 | 1218 | |
a1f0358b | 1219 | return false; |
cafe5635 KO |
1220 | } |
1221 | ||
a1f0358b | 1222 | #define GC_MERGE_NODES 4U |
cafe5635 KO |
1223 | |
1224 | struct gc_merge_info { | |
1225 | struct btree *b; | |
cafe5635 KO |
1226 | unsigned keys; |
1227 | }; | |
1228 | ||
a1f0358b KO |
1229 | static int bch_btree_insert_node(struct btree *, struct btree_op *, |
1230 | struct keylist *, atomic_t *, struct bkey *); | |
1231 | ||
1232 | static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | |
1233 | struct keylist *keylist, struct gc_stat *gc, | |
1234 | struct gc_merge_info *r) | |
cafe5635 | 1235 | { |
a1f0358b KO |
1236 | unsigned i, nodes = 0, keys = 0, blocks; |
1237 | struct btree *new_nodes[GC_MERGE_NODES]; | |
b54d6934 | 1238 | struct closure cl; |
a1f0358b | 1239 | struct bkey *k; |
b54d6934 | 1240 | |
a1f0358b | 1241 | memset(new_nodes, 0, sizeof(new_nodes)); |
b54d6934 | 1242 | closure_init_stack(&cl); |
cafe5635 | 1243 | |
a1f0358b | 1244 | while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) |
cafe5635 KO |
1245 | keys += r[nodes++].keys; |
1246 | ||
1247 | blocks = btree_default_blocks(b->c) * 2 / 3; | |
1248 | ||
1249 | if (nodes < 2 || | |
a85e968e | 1250 | __set_blocks(b->keys.set[0].data, keys, |
ee811287 | 1251 | block_bytes(b->c)) > blocks * (nodes - 1)) |
a1f0358b | 1252 | return 0; |
cafe5635 | 1253 | |
a1f0358b | 1254 | for (i = 0; i < nodes; i++) { |
bc9389ee | 1255 | new_nodes[i] = btree_node_alloc_replacement(r[i].b, false); |
a1f0358b KO |
1256 | if (IS_ERR_OR_NULL(new_nodes[i])) |
1257 | goto out_nocoalesce; | |
cafe5635 KO |
1258 | } |
1259 | ||
1260 | for (i = nodes - 1; i > 0; --i) { | |
ee811287 KO |
1261 | struct bset *n1 = btree_bset_first(new_nodes[i]); |
1262 | struct bset *n2 = btree_bset_first(new_nodes[i - 1]); | |
cafe5635 KO |
1263 | struct bkey *k, *last = NULL; |
1264 | ||
1265 | keys = 0; | |
1266 | ||
a1f0358b KO |
1267 | if (i > 1) { |
1268 | for (k = n2->start; | |
fafff81c | 1269 | k < bset_bkey_last(n2); |
a1f0358b KO |
1270 | k = bkey_next(k)) { |
1271 | if (__set_blocks(n1, n1->keys + keys + | |
ee811287 KO |
1272 | bkey_u64s(k), |
1273 | block_bytes(b->c)) > blocks) | |
a1f0358b KO |
1274 | break; |
1275 | ||
1276 | last = k; | |
1277 | keys += bkey_u64s(k); | |
1278 | } | |
1279 | } else { | |
cafe5635 KO |
1280 | /* |
1281 | * Last node we're not getting rid of - we're getting | |
1282 | * rid of the node at r[0]. Have to try and fit all of | |
1283 | * the remaining keys into this node; we can't ensure | |
1284 | * they will always fit due to rounding and variable | |
1285 | * length keys (shouldn't be possible in practice, | |
1286 | * though) | |
1287 | */ | |
a1f0358b | 1288 | if (__set_blocks(n1, n1->keys + n2->keys, |
ee811287 KO |
1289 | block_bytes(b->c)) > |
1290 | btree_blocks(new_nodes[i])) | |
a1f0358b | 1291 | goto out_nocoalesce; |
cafe5635 KO |
1292 | |
1293 | keys = n2->keys; | |
a1f0358b | 1294 | /* Take the key of the node we're getting rid of */ |
cafe5635 | 1295 | last = &r->b->key; |
a1f0358b | 1296 | } |
cafe5635 | 1297 | |
ee811287 KO |
1298 | BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > |
1299 | btree_blocks(new_nodes[i])); | |
cafe5635 | 1300 | |
a1f0358b KO |
1301 | if (last) |
1302 | bkey_copy_key(&new_nodes[i]->key, last); | |
cafe5635 | 1303 | |
fafff81c | 1304 | memcpy(bset_bkey_last(n1), |
cafe5635 | 1305 | n2->start, |
fafff81c | 1306 | (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); |
cafe5635 KO |
1307 | |
1308 | n1->keys += keys; | |
a1f0358b | 1309 | r[i].keys = n1->keys; |
cafe5635 KO |
1310 | |
1311 | memmove(n2->start, | |
fafff81c KO |
1312 | bset_bkey_idx(n2, keys), |
1313 | (void *) bset_bkey_last(n2) - | |
1314 | (void *) bset_bkey_idx(n2, keys)); | |
cafe5635 KO |
1315 | |
1316 | n2->keys -= keys; | |
1317 | ||
085d2a3d KO |
1318 | if (__bch_keylist_realloc(keylist, |
1319 | bkey_u64s(&new_nodes[i]->key))) | |
a1f0358b KO |
1320 | goto out_nocoalesce; |
1321 | ||
1322 | bch_btree_node_write(new_nodes[i], &cl); | |
1323 | bch_keylist_add(keylist, &new_nodes[i]->key); | |
cafe5635 KO |
1324 | } |
1325 | ||
a1f0358b | 1326 | for (i = 0; i < nodes; i++) { |
085d2a3d | 1327 | if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key))) |
a1f0358b | 1328 | goto out_nocoalesce; |
cafe5635 | 1329 | |
a1f0358b KO |
1330 | make_btree_freeing_key(r[i].b, keylist->top); |
1331 | bch_keylist_push(keylist); | |
1332 | } | |
cafe5635 | 1333 | |
a1f0358b | 1334 | /* We emptied out this node */ |
ee811287 | 1335 | BUG_ON(btree_bset_first(new_nodes[0])->keys); |
a1f0358b KO |
1336 | btree_node_free(new_nodes[0]); |
1337 | rw_unlock(true, new_nodes[0]); | |
1338 | ||
1339 | closure_sync(&cl); | |
1340 | ||
1341 | for (i = 0; i < nodes; i++) { | |
1342 | btree_node_free(r[i].b); | |
1343 | rw_unlock(true, r[i].b); | |
1344 | ||
1345 | r[i].b = new_nodes[i]; | |
1346 | } | |
1347 | ||
1348 | bch_btree_insert_node(b, op, keylist, NULL, NULL); | |
1349 | BUG_ON(!bch_keylist_empty(keylist)); | |
1350 | ||
1351 | memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); | |
1352 | r[nodes - 1].b = ERR_PTR(-EINTR); | |
1353 | ||
1354 | trace_bcache_btree_gc_coalesce(nodes); | |
cafe5635 | 1355 | gc->nodes--; |
cafe5635 | 1356 | |
a1f0358b KO |
1357 | /* Invalidated our iterator */ |
1358 | return -EINTR; | |
1359 | ||
1360 | out_nocoalesce: | |
1361 | closure_sync(&cl); | |
1362 | ||
1363 | while ((k = bch_keylist_pop(keylist))) | |
1364 | if (!bkey_cmp(k, &ZERO_KEY)) | |
1365 | atomic_dec(&b->c->prio_blocked); | |
1366 | ||
1367 | for (i = 0; i < nodes; i++) | |
1368 | if (!IS_ERR_OR_NULL(new_nodes[i])) { | |
1369 | btree_node_free(new_nodes[i]); | |
1370 | rw_unlock(true, new_nodes[i]); | |
1371 | } | |
1372 | return 0; | |
cafe5635 KO |
1373 | } |
1374 | ||
a1f0358b | 1375 | static unsigned btree_gc_count_keys(struct btree *b) |
cafe5635 | 1376 | { |
a1f0358b KO |
1377 | struct bkey *k; |
1378 | struct btree_iter iter; | |
1379 | unsigned ret = 0; | |
cafe5635 | 1380 | |
c052dd9a | 1381 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) |
a1f0358b KO |
1382 | ret += bkey_u64s(k); |
1383 | ||
1384 | return ret; | |
1385 | } | |
cafe5635 | 1386 | |
a1f0358b KO |
1387 | static int btree_gc_recurse(struct btree *b, struct btree_op *op, |
1388 | struct closure *writes, struct gc_stat *gc) | |
1389 | { | |
cafe5635 | 1390 | unsigned i; |
a1f0358b KO |
1391 | int ret = 0; |
1392 | bool should_rewrite; | |
1393 | struct btree *n; | |
1394 | struct bkey *k; | |
1395 | struct keylist keys; | |
1396 | struct btree_iter iter; | |
cafe5635 | 1397 | struct gc_merge_info r[GC_MERGE_NODES]; |
a1f0358b | 1398 | struct gc_merge_info *last = r + GC_MERGE_NODES - 1; |
cafe5635 | 1399 | |
a1f0358b | 1400 | bch_keylist_init(&keys); |
c052dd9a | 1401 | bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); |
cafe5635 | 1402 | |
a1f0358b KO |
1403 | for (i = 0; i < GC_MERGE_NODES; i++) |
1404 | r[i].b = ERR_PTR(-EINTR); | |
cafe5635 | 1405 | |
a1f0358b | 1406 | while (1) { |
a85e968e | 1407 | k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); |
a1f0358b KO |
1408 | if (k) { |
1409 | r->b = bch_btree_node_get(b->c, k, b->level - 1, true); | |
1410 | if (IS_ERR(r->b)) { | |
1411 | ret = PTR_ERR(r->b); | |
1412 | break; | |
1413 | } | |
1414 | ||
1415 | r->keys = btree_gc_count_keys(r->b); | |
1416 | ||
1417 | ret = btree_gc_coalesce(b, op, &keys, gc, r); | |
1418 | if (ret) | |
1419 | break; | |
cafe5635 KO |
1420 | } |
1421 | ||
a1f0358b KO |
1422 | if (!last->b) |
1423 | break; | |
cafe5635 | 1424 | |
a1f0358b KO |
1425 | if (!IS_ERR(last->b)) { |
1426 | should_rewrite = btree_gc_mark_node(last->b, gc); | |
78365411 KO |
1427 | if (should_rewrite && |
1428 | !btree_check_reserve(b, NULL)) { | |
bc9389ee KO |
1429 | n = btree_node_alloc_replacement(last->b, |
1430 | false); | |
cafe5635 | 1431 | |
a1f0358b KO |
1432 | if (!IS_ERR_OR_NULL(n)) { |
1433 | bch_btree_node_write_sync(n); | |
1434 | bch_keylist_add(&keys, &n->key); | |
cafe5635 | 1435 | |
a1f0358b KO |
1436 | make_btree_freeing_key(last->b, |
1437 | keys.top); | |
1438 | bch_keylist_push(&keys); | |
1439 | ||
1440 | btree_node_free(last->b); | |
cafe5635 | 1441 | |
a1f0358b KO |
1442 | bch_btree_insert_node(b, op, &keys, |
1443 | NULL, NULL); | |
1444 | BUG_ON(!bch_keylist_empty(&keys)); | |
cafe5635 | 1445 | |
a1f0358b KO |
1446 | rw_unlock(true, last->b); |
1447 | last->b = n; | |
cafe5635 | 1448 | |
a1f0358b KO |
1449 | /* Invalidated our iterator */ |
1450 | ret = -EINTR; | |
1451 | break; | |
1452 | } | |
1453 | } | |
1454 | ||
1455 | if (last->b->level) { | |
1456 | ret = btree_gc_recurse(last->b, op, writes, gc); | |
1457 | if (ret) | |
1458 | break; | |
1459 | } | |
cafe5635 | 1460 | |
a1f0358b KO |
1461 | bkey_copy_key(&b->c->gc_done, &last->b->key); |
1462 | ||
1463 | /* | |
1464 | * Must flush leaf nodes before gc ends, since replace | |
1465 | * operations aren't journalled | |
1466 | */ | |
1467 | if (btree_node_dirty(last->b)) | |
1468 | bch_btree_node_write(last->b, writes); | |
1469 | rw_unlock(true, last->b); | |
1470 | } | |
1471 | ||
1472 | memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); | |
1473 | r->b = NULL; | |
cafe5635 | 1474 | |
cafe5635 KO |
1475 | if (need_resched()) { |
1476 | ret = -EAGAIN; | |
1477 | break; | |
1478 | } | |
cafe5635 KO |
1479 | } |
1480 | ||
a1f0358b KO |
1481 | for (i = 0; i < GC_MERGE_NODES; i++) |
1482 | if (!IS_ERR_OR_NULL(r[i].b)) { | |
1483 | if (btree_node_dirty(r[i].b)) | |
1484 | bch_btree_node_write(r[i].b, writes); | |
1485 | rw_unlock(true, r[i].b); | |
1486 | } | |
cafe5635 | 1487 | |
a1f0358b | 1488 | bch_keylist_free(&keys); |
cafe5635 KO |
1489 | |
1490 | return ret; | |
1491 | } | |
1492 | ||
1493 | static int bch_btree_gc_root(struct btree *b, struct btree_op *op, | |
1494 | struct closure *writes, struct gc_stat *gc) | |
1495 | { | |
1496 | struct btree *n = NULL; | |
a1f0358b KO |
1497 | int ret = 0; |
1498 | bool should_rewrite; | |
cafe5635 | 1499 | |
a1f0358b KO |
1500 | should_rewrite = btree_gc_mark_node(b, gc); |
1501 | if (should_rewrite) { | |
bc9389ee | 1502 | n = btree_node_alloc_replacement(b, false); |
cafe5635 | 1503 | |
a1f0358b KO |
1504 | if (!IS_ERR_OR_NULL(n)) { |
1505 | bch_btree_node_write_sync(n); | |
1506 | bch_btree_set_root(n); | |
1507 | btree_node_free(b); | |
1508 | rw_unlock(true, n); | |
cafe5635 | 1509 | |
a1f0358b KO |
1510 | return -EINTR; |
1511 | } | |
1512 | } | |
cafe5635 | 1513 | |
a1f0358b KO |
1514 | if (b->level) { |
1515 | ret = btree_gc_recurse(b, op, writes, gc); | |
1516 | if (ret) | |
1517 | return ret; | |
cafe5635 KO |
1518 | } |
1519 | ||
a1f0358b KO |
1520 | bkey_copy_key(&b->c->gc_done, &b->key); |
1521 | ||
cafe5635 KO |
1522 | return ret; |
1523 | } | |
1524 | ||
1525 | static void btree_gc_start(struct cache_set *c) | |
1526 | { | |
1527 | struct cache *ca; | |
1528 | struct bucket *b; | |
cafe5635 KO |
1529 | unsigned i; |
1530 | ||
1531 | if (!c->gc_mark_valid) | |
1532 | return; | |
1533 | ||
1534 | mutex_lock(&c->bucket_lock); | |
1535 | ||
1536 | c->gc_mark_valid = 0; | |
1537 | c->gc_done = ZERO_KEY; | |
1538 | ||
1539 | for_each_cache(ca, c, i) | |
1540 | for_each_bucket(b, ca) { | |
1541 | b->gc_gen = b->gen; | |
29ebf465 | 1542 | if (!atomic_read(&b->pin)) { |
cafe5635 | 1543 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); |
29ebf465 KO |
1544 | SET_GC_SECTORS_USED(b, 0); |
1545 | } | |
cafe5635 KO |
1546 | } |
1547 | ||
cafe5635 KO |
1548 | mutex_unlock(&c->bucket_lock); |
1549 | } | |
1550 | ||
1551 | size_t bch_btree_gc_finish(struct cache_set *c) | |
1552 | { | |
1553 | size_t available = 0; | |
1554 | struct bucket *b; | |
1555 | struct cache *ca; | |
cafe5635 KO |
1556 | unsigned i; |
1557 | ||
1558 | mutex_lock(&c->bucket_lock); | |
1559 | ||
1560 | set_gc_sectors(c); | |
1561 | c->gc_mark_valid = 1; | |
1562 | c->need_gc = 0; | |
1563 | ||
1564 | if (c->root) | |
1565 | for (i = 0; i < KEY_PTRS(&c->root->key); i++) | |
1566 | SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), | |
1567 | GC_MARK_METADATA); | |
1568 | ||
1569 | for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) | |
1570 | SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), | |
1571 | GC_MARK_METADATA); | |
1572 | ||
bf0a628a NS |
1573 | /* don't reclaim buckets to which writeback keys point */ |
1574 | rcu_read_lock(); | |
1575 | for (i = 0; i < c->nr_uuids; i++) { | |
1576 | struct bcache_device *d = c->devices[i]; | |
1577 | struct cached_dev *dc; | |
1578 | struct keybuf_key *w, *n; | |
1579 | unsigned j; | |
1580 | ||
1581 | if (!d || UUID_FLASH_ONLY(&c->uuids[i])) | |
1582 | continue; | |
1583 | dc = container_of(d, struct cached_dev, disk); | |
1584 | ||
1585 | spin_lock(&dc->writeback_keys.lock); | |
1586 | rbtree_postorder_for_each_entry_safe(w, n, | |
1587 | &dc->writeback_keys.keys, node) | |
1588 | for (j = 0; j < KEY_PTRS(&w->key); j++) | |
1589 | SET_GC_MARK(PTR_BUCKET(c, &w->key, j), | |
1590 | GC_MARK_DIRTY); | |
1591 | spin_unlock(&dc->writeback_keys.lock); | |
1592 | } | |
1593 | rcu_read_unlock(); | |
1594 | ||
cafe5635 KO |
1595 | for_each_cache(ca, c, i) { |
1596 | uint64_t *i; | |
1597 | ||
1598 | ca->invalidate_needs_gc = 0; | |
1599 | ||
1600 | for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) | |
1601 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); | |
1602 | ||
1603 | for (i = ca->prio_buckets; | |
1604 | i < ca->prio_buckets + prio_buckets(ca) * 2; i++) | |
1605 | SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); | |
1606 | ||
1607 | for_each_bucket(b, ca) { | |
1608 | b->last_gc = b->gc_gen; | |
1609 | c->need_gc = max(c->need_gc, bucket_gc_gen(b)); | |
1610 | ||
1611 | if (!atomic_read(&b->pin) && | |
1612 | GC_MARK(b) == GC_MARK_RECLAIMABLE) { | |
1613 | available++; | |
1614 | if (!GC_SECTORS_USED(b)) | |
1615 | bch_bucket_add_unused(ca, b); | |
1616 | } | |
1617 | } | |
1618 | } | |
1619 | ||
cafe5635 KO |
1620 | mutex_unlock(&c->bucket_lock); |
1621 | return available; | |
1622 | } | |
1623 | ||
72a44517 | 1624 | static void bch_btree_gc(struct cache_set *c) |
cafe5635 | 1625 | { |
cafe5635 KO |
1626 | int ret; |
1627 | unsigned long available; | |
1628 | struct gc_stat stats; | |
1629 | struct closure writes; | |
1630 | struct btree_op op; | |
cafe5635 | 1631 | uint64_t start_time = local_clock(); |
57943511 | 1632 | |
c37511b8 | 1633 | trace_bcache_gc_start(c); |
cafe5635 KO |
1634 | |
1635 | memset(&stats, 0, sizeof(struct gc_stat)); | |
1636 | closure_init_stack(&writes); | |
b54d6934 | 1637 | bch_btree_op_init(&op, SHRT_MAX); |
cafe5635 KO |
1638 | |
1639 | btree_gc_start(c); | |
1640 | ||
a1f0358b KO |
1641 | do { |
1642 | ret = btree_root(gc_root, c, &op, &writes, &stats); | |
1643 | closure_sync(&writes); | |
cafe5635 | 1644 | |
a1f0358b KO |
1645 | if (ret && ret != -EAGAIN) |
1646 | pr_warn("gc failed!"); | |
1647 | } while (ret); | |
cafe5635 KO |
1648 | |
1649 | available = bch_btree_gc_finish(c); | |
57943511 KO |
1650 | wake_up_allocators(c); |
1651 | ||
169ef1cf | 1652 | bch_time_stats_update(&c->btree_gc_time, start_time); |
cafe5635 KO |
1653 | |
1654 | stats.key_bytes *= sizeof(uint64_t); | |
cafe5635 KO |
1655 | stats.data <<= 9; |
1656 | stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; | |
1657 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); | |
cafe5635 | 1658 | |
c37511b8 | 1659 | trace_bcache_gc_end(c); |
cafe5635 | 1660 | |
72a44517 KO |
1661 | bch_moving_gc(c); |
1662 | } | |
1663 | ||
1664 | static int bch_gc_thread(void *arg) | |
1665 | { | |
1666 | struct cache_set *c = arg; | |
a1f0358b KO |
1667 | struct cache *ca; |
1668 | unsigned i; | |
72a44517 KO |
1669 | |
1670 | while (1) { | |
a1f0358b | 1671 | again: |
72a44517 KO |
1672 | bch_btree_gc(c); |
1673 | ||
1674 | set_current_state(TASK_INTERRUPTIBLE); | |
1675 | if (kthread_should_stop()) | |
1676 | break; | |
1677 | ||
a1f0358b KO |
1678 | mutex_lock(&c->bucket_lock); |
1679 | ||
1680 | for_each_cache(ca, c, i) | |
1681 | if (ca->invalidate_needs_gc) { | |
1682 | mutex_unlock(&c->bucket_lock); | |
1683 | set_current_state(TASK_RUNNING); | |
1684 | goto again; | |
1685 | } | |
1686 | ||
1687 | mutex_unlock(&c->bucket_lock); | |
1688 | ||
72a44517 KO |
1689 | try_to_freeze(); |
1690 | schedule(); | |
1691 | } | |
1692 | ||
1693 | return 0; | |
cafe5635 KO |
1694 | } |
1695 | ||
72a44517 | 1696 | int bch_gc_thread_start(struct cache_set *c) |
cafe5635 | 1697 | { |
72a44517 KO |
1698 | c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); |
1699 | if (IS_ERR(c->gc_thread)) | |
1700 | return PTR_ERR(c->gc_thread); | |
1701 | ||
1702 | set_task_state(c->gc_thread, TASK_INTERRUPTIBLE); | |
1703 | return 0; | |
cafe5635 KO |
1704 | } |
1705 | ||
1706 | /* Initial partial gc */ | |
1707 | ||
1708 | static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, | |
1709 | unsigned long **seen) | |
1710 | { | |
50310164 | 1711 | int ret = 0; |
cafe5635 | 1712 | unsigned i; |
50310164 | 1713 | struct bkey *k, *p = NULL; |
cafe5635 KO |
1714 | struct bucket *g; |
1715 | struct btree_iter iter; | |
1716 | ||
c052dd9a | 1717 | for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { |
cafe5635 KO |
1718 | for (i = 0; i < KEY_PTRS(k); i++) { |
1719 | if (!ptr_available(b->c, k, i)) | |
1720 | continue; | |
1721 | ||
1722 | g = PTR_BUCKET(b->c, k, i); | |
1723 | ||
1724 | if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), | |
1725 | seen[PTR_DEV(k, i)]) || | |
1726 | !ptr_stale(b->c, k, i)) { | |
1727 | g->gen = PTR_GEN(k, i); | |
1728 | ||
1729 | if (b->level) | |
1730 | g->prio = BTREE_PRIO; | |
1731 | else if (g->prio == BTREE_PRIO) | |
1732 | g->prio = INITIAL_PRIO; | |
1733 | } | |
1734 | } | |
1735 | ||
1736 | btree_mark_key(b, k); | |
1737 | } | |
1738 | ||
1739 | if (b->level) { | |
c052dd9a | 1740 | bch_btree_iter_init(&b->keys, &iter, NULL); |
cafe5635 | 1741 | |
50310164 | 1742 | do { |
a85e968e KO |
1743 | k = bch_btree_iter_next_filter(&iter, &b->keys, |
1744 | bch_ptr_bad); | |
50310164 KO |
1745 | if (k) |
1746 | btree_node_prefetch(b->c, k, b->level - 1); | |
cafe5635 | 1747 | |
50310164 KO |
1748 | if (p) |
1749 | ret = btree(check_recurse, p, b, op, seen); | |
cafe5635 | 1750 | |
50310164 KO |
1751 | p = k; |
1752 | } while (p && !ret); | |
cafe5635 KO |
1753 | } |
1754 | ||
1755 | return 0; | |
1756 | } | |
1757 | ||
c18536a7 | 1758 | int bch_btree_check(struct cache_set *c) |
cafe5635 KO |
1759 | { |
1760 | int ret = -ENOMEM; | |
1761 | unsigned i; | |
1762 | unsigned long *seen[MAX_CACHES_PER_SET]; | |
c18536a7 | 1763 | struct btree_op op; |
cafe5635 KO |
1764 | |
1765 | memset(seen, 0, sizeof(seen)); | |
b54d6934 | 1766 | bch_btree_op_init(&op, SHRT_MAX); |
cafe5635 KO |
1767 | |
1768 | for (i = 0; c->cache[i]; i++) { | |
1769 | size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); | |
1770 | seen[i] = kmalloc(n, GFP_KERNEL); | |
1771 | if (!seen[i]) | |
1772 | goto err; | |
1773 | ||
1774 | /* Disables the seen array until prio_read() uses it too */ | |
1775 | memset(seen[i], 0xFF, n); | |
1776 | } | |
1777 | ||
c18536a7 | 1778 | ret = btree_root(check_recurse, c, &op, seen); |
cafe5635 KO |
1779 | err: |
1780 | for (i = 0; i < MAX_CACHES_PER_SET; i++) | |
1781 | kfree(seen[i]); | |
1782 | return ret; | |
1783 | } | |
1784 | ||
1785 | /* Btree insertion */ | |
1786 | ||
829a60b9 KO |
1787 | static bool btree_insert_key(struct btree *b, struct bkey *k, |
1788 | struct bkey *replace_key) | |
cafe5635 | 1789 | { |
829a60b9 | 1790 | unsigned status; |
cafe5635 KO |
1791 | |
1792 | BUG_ON(bkey_cmp(k, &b->key) > 0); | |
1fa8455d | 1793 | |
829a60b9 KO |
1794 | status = bch_btree_insert_key(&b->keys, k, replace_key); |
1795 | if (status != BTREE_INSERT_STATUS_NO_INSERT) { | |
1796 | bch_check_keys(&b->keys, "%u for %s", status, | |
1797 | replace_key ? "replace" : "insert"); | |
cafe5635 | 1798 | |
829a60b9 KO |
1799 | trace_bcache_btree_insert_key(b, k, replace_key != NULL, |
1800 | status); | |
1801 | return true; | |
1802 | } else | |
1803 | return false; | |
cafe5635 KO |
1804 | } |
1805 | ||
59158fde KO |
1806 | static size_t insert_u64s_remaining(struct btree *b) |
1807 | { | |
3572324a | 1808 | long ret = bch_btree_keys_u64s_remaining(&b->keys); |
59158fde KO |
1809 | |
1810 | /* | |
1811 | * Might land in the middle of an existing extent and have to split it | |
1812 | */ | |
1813 | if (b->keys.ops->is_extents) | |
1814 | ret -= KEY_MAX_U64S; | |
1815 | ||
1816 | return max(ret, 0L); | |
1817 | } | |
1818 | ||
26c949f8 | 1819 | static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, |
1b207d80 KO |
1820 | struct keylist *insert_keys, |
1821 | struct bkey *replace_key) | |
cafe5635 KO |
1822 | { |
1823 | bool ret = false; | |
dc9d98d6 | 1824 | int oldsize = bch_count_data(&b->keys); |
cafe5635 | 1825 | |
26c949f8 | 1826 | while (!bch_keylist_empty(insert_keys)) { |
c2f95ae2 | 1827 | struct bkey *k = insert_keys->keys; |
26c949f8 | 1828 | |
59158fde | 1829 | if (bkey_u64s(k) > insert_u64s_remaining(b)) |
403b6cde KO |
1830 | break; |
1831 | ||
1832 | if (bkey_cmp(k, &b->key) <= 0) { | |
3a3b6a4e KO |
1833 | if (!b->level) |
1834 | bkey_put(b->c, k); | |
26c949f8 | 1835 | |
829a60b9 | 1836 | ret |= btree_insert_key(b, k, replace_key); |
26c949f8 KO |
1837 | bch_keylist_pop_front(insert_keys); |
1838 | } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { | |
26c949f8 | 1839 | BKEY_PADDED(key) temp; |
c2f95ae2 | 1840 | bkey_copy(&temp.key, insert_keys->keys); |
26c949f8 KO |
1841 | |
1842 | bch_cut_back(&b->key, &temp.key); | |
c2f95ae2 | 1843 | bch_cut_front(&b->key, insert_keys->keys); |
26c949f8 | 1844 | |
829a60b9 | 1845 | ret |= btree_insert_key(b, &temp.key, replace_key); |
26c949f8 KO |
1846 | break; |
1847 | } else { | |
1848 | break; | |
1849 | } | |
cafe5635 KO |
1850 | } |
1851 | ||
829a60b9 KO |
1852 | if (!ret) |
1853 | op->insert_collision = true; | |
1854 | ||
403b6cde KO |
1855 | BUG_ON(!bch_keylist_empty(insert_keys) && b->level); |
1856 | ||
dc9d98d6 | 1857 | BUG_ON(bch_count_data(&b->keys) < oldsize); |
cafe5635 KO |
1858 | return ret; |
1859 | } | |
1860 | ||
26c949f8 KO |
1861 | static int btree_split(struct btree *b, struct btree_op *op, |
1862 | struct keylist *insert_keys, | |
1b207d80 | 1863 | struct bkey *replace_key) |
cafe5635 | 1864 | { |
d6fd3b11 | 1865 | bool split; |
cafe5635 KO |
1866 | struct btree *n1, *n2 = NULL, *n3 = NULL; |
1867 | uint64_t start_time = local_clock(); | |
b54d6934 | 1868 | struct closure cl; |
17e21a9f | 1869 | struct keylist parent_keys; |
b54d6934 KO |
1870 | |
1871 | closure_init_stack(&cl); | |
17e21a9f | 1872 | bch_keylist_init(&parent_keys); |
cafe5635 | 1873 | |
78365411 KO |
1874 | if (!b->level && |
1875 | btree_check_reserve(b, op)) | |
1876 | return -EINTR; | |
1877 | ||
bc9389ee | 1878 | n1 = btree_node_alloc_replacement(b, true); |
cafe5635 KO |
1879 | if (IS_ERR(n1)) |
1880 | goto err; | |
1881 | ||
ee811287 KO |
1882 | split = set_blocks(btree_bset_first(n1), |
1883 | block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; | |
cafe5635 | 1884 | |
cafe5635 KO |
1885 | if (split) { |
1886 | unsigned keys = 0; | |
1887 | ||
ee811287 | 1888 | trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); |
c37511b8 | 1889 | |
bc9389ee | 1890 | n2 = bch_btree_node_alloc(b->c, b->level, true); |
cafe5635 KO |
1891 | if (IS_ERR(n2)) |
1892 | goto err_free1; | |
1893 | ||
d6fd3b11 | 1894 | if (!b->parent) { |
bc9389ee | 1895 | n3 = bch_btree_node_alloc(b->c, b->level + 1, true); |
cafe5635 KO |
1896 | if (IS_ERR(n3)) |
1897 | goto err_free2; | |
1898 | } | |
1899 | ||
1b207d80 | 1900 | bch_btree_insert_keys(n1, op, insert_keys, replace_key); |
cafe5635 | 1901 | |
d6fd3b11 KO |
1902 | /* |
1903 | * Has to be a linear search because we don't have an auxiliary | |
cafe5635 KO |
1904 | * search tree yet |
1905 | */ | |
1906 | ||
ee811287 KO |
1907 | while (keys < (btree_bset_first(n1)->keys * 3) / 5) |
1908 | keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), | |
fafff81c | 1909 | keys)); |
cafe5635 | 1910 | |
fafff81c | 1911 | bkey_copy_key(&n1->key, |
ee811287 KO |
1912 | bset_bkey_idx(btree_bset_first(n1), keys)); |
1913 | keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); | |
cafe5635 | 1914 | |
ee811287 KO |
1915 | btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; |
1916 | btree_bset_first(n1)->keys = keys; | |
cafe5635 | 1917 | |
ee811287 KO |
1918 | memcpy(btree_bset_first(n2)->start, |
1919 | bset_bkey_last(btree_bset_first(n1)), | |
1920 | btree_bset_first(n2)->keys * sizeof(uint64_t)); | |
cafe5635 KO |
1921 | |
1922 | bkey_copy_key(&n2->key, &b->key); | |
1923 | ||
17e21a9f | 1924 | bch_keylist_add(&parent_keys, &n2->key); |
b54d6934 | 1925 | bch_btree_node_write(n2, &cl); |
cafe5635 | 1926 | rw_unlock(true, n2); |
c37511b8 | 1927 | } else { |
ee811287 | 1928 | trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); |
c37511b8 | 1929 | |
1b207d80 | 1930 | bch_btree_insert_keys(n1, op, insert_keys, replace_key); |
c37511b8 | 1931 | } |
cafe5635 | 1932 | |
17e21a9f | 1933 | bch_keylist_add(&parent_keys, &n1->key); |
b54d6934 | 1934 | bch_btree_node_write(n1, &cl); |
cafe5635 KO |
1935 | |
1936 | if (n3) { | |
d6fd3b11 | 1937 | /* Depth increases, make a new root */ |
cafe5635 | 1938 | bkey_copy_key(&n3->key, &MAX_KEY); |
17e21a9f | 1939 | bch_btree_insert_keys(n3, op, &parent_keys, NULL); |
b54d6934 | 1940 | bch_btree_node_write(n3, &cl); |
cafe5635 | 1941 | |
b54d6934 | 1942 | closure_sync(&cl); |
cafe5635 KO |
1943 | bch_btree_set_root(n3); |
1944 | rw_unlock(true, n3); | |
17e21a9f KO |
1945 | |
1946 | btree_node_free(b); | |
d6fd3b11 KO |
1947 | } else if (!b->parent) { |
1948 | /* Root filled up but didn't need to be split */ | |
b54d6934 | 1949 | closure_sync(&cl); |
cafe5635 | 1950 | bch_btree_set_root(n1); |
17e21a9f KO |
1951 | |
1952 | btree_node_free(b); | |
cafe5635 | 1953 | } else { |
17e21a9f | 1954 | /* Split a non root node */ |
b54d6934 | 1955 | closure_sync(&cl); |
17e21a9f KO |
1956 | make_btree_freeing_key(b, parent_keys.top); |
1957 | bch_keylist_push(&parent_keys); | |
1958 | ||
1959 | btree_node_free(b); | |
1960 | ||
1961 | bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); | |
1962 | BUG_ON(!bch_keylist_empty(&parent_keys)); | |
cafe5635 KO |
1963 | } |
1964 | ||
1965 | rw_unlock(true, n1); | |
cafe5635 | 1966 | |
169ef1cf | 1967 | bch_time_stats_update(&b->c->btree_split_time, start_time); |
cafe5635 KO |
1968 | |
1969 | return 0; | |
1970 | err_free2: | |
5f5837d2 | 1971 | bkey_put(b->c, &n2->key); |
e8e1d468 | 1972 | btree_node_free(n2); |
cafe5635 KO |
1973 | rw_unlock(true, n2); |
1974 | err_free1: | |
5f5837d2 | 1975 | bkey_put(b->c, &n1->key); |
e8e1d468 | 1976 | btree_node_free(n1); |
cafe5635 KO |
1977 | rw_unlock(true, n1); |
1978 | err: | |
5f5837d2 KO |
1979 | WARN(1, "bcache: btree split failed"); |
1980 | ||
cafe5635 KO |
1981 | if (n3 == ERR_PTR(-EAGAIN) || |
1982 | n2 == ERR_PTR(-EAGAIN) || | |
1983 | n1 == ERR_PTR(-EAGAIN)) | |
1984 | return -EAGAIN; | |
1985 | ||
cafe5635 KO |
1986 | return -ENOMEM; |
1987 | } | |
1988 | ||
26c949f8 | 1989 | static int bch_btree_insert_node(struct btree *b, struct btree_op *op, |
c18536a7 | 1990 | struct keylist *insert_keys, |
1b207d80 KO |
1991 | atomic_t *journal_ref, |
1992 | struct bkey *replace_key) | |
cafe5635 | 1993 | { |
17e21a9f KO |
1994 | BUG_ON(b->level && replace_key); |
1995 | ||
59158fde | 1996 | if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { |
17e21a9f KO |
1997 | if (current->bio_list) { |
1998 | op->lock = b->c->root->level + 1; | |
1999 | return -EAGAIN; | |
2000 | } else if (op->lock <= b->c->root->level) { | |
2001 | op->lock = b->c->root->level + 1; | |
2002 | return -EINTR; | |
26c949f8 | 2003 | } else { |
17e21a9f | 2004 | /* Invalidated all iterators */ |
3b3e9e50 KO |
2005 | int ret = btree_split(b, op, insert_keys, replace_key); |
2006 | ||
2007 | return bch_keylist_empty(insert_keys) ? | |
2008 | 0 : ret ?: -EINTR; | |
cafe5635 | 2009 | } |
17e21a9f | 2010 | } else { |
ee811287 | 2011 | BUG_ON(write_block(b) != btree_bset_last(b)); |
cafe5635 | 2012 | |
17e21a9f KO |
2013 | if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { |
2014 | if (!b->level) | |
2015 | bch_btree_leaf_dirty(b, journal_ref); | |
2016 | else | |
2017 | bch_btree_node_write_sync(b); | |
2018 | } | |
2019 | ||
2020 | return 0; | |
2021 | } | |
26c949f8 | 2022 | } |
cafe5635 | 2023 | |
e7c590eb KO |
2024 | int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, |
2025 | struct bkey *check_key) | |
2026 | { | |
2027 | int ret = -EINTR; | |
2028 | uint64_t btree_ptr = b->key.ptr[0]; | |
2029 | unsigned long seq = b->seq; | |
2030 | struct keylist insert; | |
2031 | bool upgrade = op->lock == -1; | |
2032 | ||
2033 | bch_keylist_init(&insert); | |
2034 | ||
2035 | if (upgrade) { | |
2036 | rw_unlock(false, b); | |
2037 | rw_lock(true, b, b->level); | |
2038 | ||
2039 | if (b->key.ptr[0] != btree_ptr || | |
2040 | b->seq != seq + 1) | |
2041 | goto out; | |
2042 | } | |
2043 | ||
2044 | SET_KEY_PTRS(check_key, 1); | |
2045 | get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); | |
2046 | ||
2047 | SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); | |
2048 | ||
2049 | bch_keylist_add(&insert, check_key); | |
2050 | ||
1b207d80 | 2051 | ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); |
e7c590eb KO |
2052 | |
2053 | BUG_ON(!ret && !bch_keylist_empty(&insert)); | |
2054 | out: | |
2055 | if (upgrade) | |
2056 | downgrade_write(&b->lock); | |
2057 | return ret; | |
2058 | } | |
2059 | ||
cc7b8819 KO |
2060 | struct btree_insert_op { |
2061 | struct btree_op op; | |
2062 | struct keylist *keys; | |
2063 | atomic_t *journal_ref; | |
2064 | struct bkey *replace_key; | |
2065 | }; | |
cafe5635 | 2066 | |
08239ca2 | 2067 | static int btree_insert_fn(struct btree_op *b_op, struct btree *b) |
cc7b8819 KO |
2068 | { |
2069 | struct btree_insert_op *op = container_of(b_op, | |
2070 | struct btree_insert_op, op); | |
cafe5635 | 2071 | |
cc7b8819 KO |
2072 | int ret = bch_btree_insert_node(b, &op->op, op->keys, |
2073 | op->journal_ref, op->replace_key); | |
2074 | if (ret && !bch_keylist_empty(op->keys)) | |
2075 | return ret; | |
2076 | else | |
2077 | return MAP_DONE; | |
cafe5635 KO |
2078 | } |
2079 | ||
cc7b8819 KO |
2080 | int bch_btree_insert(struct cache_set *c, struct keylist *keys, |
2081 | atomic_t *journal_ref, struct bkey *replace_key) | |
cafe5635 | 2082 | { |
cc7b8819 | 2083 | struct btree_insert_op op; |
cafe5635 | 2084 | int ret = 0; |
cafe5635 | 2085 | |
cc7b8819 | 2086 | BUG_ON(current->bio_list); |
4f3d4014 | 2087 | BUG_ON(bch_keylist_empty(keys)); |
cafe5635 | 2088 | |
cc7b8819 KO |
2089 | bch_btree_op_init(&op.op, 0); |
2090 | op.keys = keys; | |
2091 | op.journal_ref = journal_ref; | |
2092 | op.replace_key = replace_key; | |
cafe5635 | 2093 | |
cc7b8819 KO |
2094 | while (!ret && !bch_keylist_empty(keys)) { |
2095 | op.op.lock = 0; | |
2096 | ret = bch_btree_map_leaf_nodes(&op.op, c, | |
2097 | &START_KEY(keys->keys), | |
2098 | btree_insert_fn); | |
2099 | } | |
cafe5635 | 2100 | |
cc7b8819 KO |
2101 | if (ret) { |
2102 | struct bkey *k; | |
cafe5635 | 2103 | |
cc7b8819 | 2104 | pr_err("error %i", ret); |
cafe5635 | 2105 | |
cc7b8819 | 2106 | while ((k = bch_keylist_pop(keys))) |
3a3b6a4e | 2107 | bkey_put(c, k); |
cc7b8819 KO |
2108 | } else if (op.op.insert_collision) |
2109 | ret = -ESRCH; | |
6054c6d4 | 2110 | |
cafe5635 KO |
2111 | return ret; |
2112 | } | |
2113 | ||
2114 | void bch_btree_set_root(struct btree *b) | |
2115 | { | |
2116 | unsigned i; | |
e49c7c37 KO |
2117 | struct closure cl; |
2118 | ||
2119 | closure_init_stack(&cl); | |
cafe5635 | 2120 | |
c37511b8 KO |
2121 | trace_bcache_btree_set_root(b); |
2122 | ||
cafe5635 KO |
2123 | BUG_ON(!b->written); |
2124 | ||
2125 | for (i = 0; i < KEY_PTRS(&b->key); i++) | |
2126 | BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); | |
2127 | ||
2128 | mutex_lock(&b->c->bucket_lock); | |
2129 | list_del_init(&b->list); | |
2130 | mutex_unlock(&b->c->bucket_lock); | |
2131 | ||
2132 | b->c->root = b; | |
cafe5635 | 2133 | |
e49c7c37 KO |
2134 | bch_journal_meta(b->c, &cl); |
2135 | closure_sync(&cl); | |
cafe5635 KO |
2136 | } |
2137 | ||
48dad8ba KO |
2138 | /* Map across nodes or keys */ |
2139 | ||
2140 | static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, | |
2141 | struct bkey *from, | |
2142 | btree_map_nodes_fn *fn, int flags) | |
2143 | { | |
2144 | int ret = MAP_CONTINUE; | |
2145 | ||
2146 | if (b->level) { | |
2147 | struct bkey *k; | |
2148 | struct btree_iter iter; | |
2149 | ||
c052dd9a | 2150 | bch_btree_iter_init(&b->keys, &iter, from); |
48dad8ba | 2151 | |
a85e968e | 2152 | while ((k = bch_btree_iter_next_filter(&iter, &b->keys, |
48dad8ba KO |
2153 | bch_ptr_bad))) { |
2154 | ret = btree(map_nodes_recurse, k, b, | |
2155 | op, from, fn, flags); | |
2156 | from = NULL; | |
2157 | ||
2158 | if (ret != MAP_CONTINUE) | |
2159 | return ret; | |
2160 | } | |
2161 | } | |
2162 | ||
2163 | if (!b->level || flags == MAP_ALL_NODES) | |
2164 | ret = fn(op, b); | |
2165 | ||
2166 | return ret; | |
2167 | } | |
2168 | ||
2169 | int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, | |
2170 | struct bkey *from, btree_map_nodes_fn *fn, int flags) | |
2171 | { | |
b54d6934 | 2172 | return btree_root(map_nodes_recurse, c, op, from, fn, flags); |
48dad8ba KO |
2173 | } |
2174 | ||
2175 | static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, | |
2176 | struct bkey *from, btree_map_keys_fn *fn, | |
2177 | int flags) | |
2178 | { | |
2179 | int ret = MAP_CONTINUE; | |
2180 | struct bkey *k; | |
2181 | struct btree_iter iter; | |
2182 | ||
c052dd9a | 2183 | bch_btree_iter_init(&b->keys, &iter, from); |
48dad8ba | 2184 | |
a85e968e | 2185 | while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { |
48dad8ba KO |
2186 | ret = !b->level |
2187 | ? fn(op, b, k) | |
2188 | : btree(map_keys_recurse, k, b, op, from, fn, flags); | |
2189 | from = NULL; | |
2190 | ||
2191 | if (ret != MAP_CONTINUE) | |
2192 | return ret; | |
2193 | } | |
2194 | ||
2195 | if (!b->level && (flags & MAP_END_KEY)) | |
2196 | ret = fn(op, b, &KEY(KEY_INODE(&b->key), | |
2197 | KEY_OFFSET(&b->key), 0)); | |
2198 | ||
2199 | return ret; | |
2200 | } | |
2201 | ||
2202 | int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, | |
2203 | struct bkey *from, btree_map_keys_fn *fn, int flags) | |
2204 | { | |
b54d6934 | 2205 | return btree_root(map_keys_recurse, c, op, from, fn, flags); |
48dad8ba KO |
2206 | } |
2207 | ||
cafe5635 KO |
2208 | /* Keybuf code */ |
2209 | ||
2210 | static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) | |
2211 | { | |
2212 | /* Overlapping keys compare equal */ | |
2213 | if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) | |
2214 | return -1; | |
2215 | if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) | |
2216 | return 1; | |
2217 | return 0; | |
2218 | } | |
2219 | ||
2220 | static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, | |
2221 | struct keybuf_key *r) | |
2222 | { | |
2223 | return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); | |
2224 | } | |
2225 | ||
48dad8ba KO |
2226 | struct refill { |
2227 | struct btree_op op; | |
48a915a8 | 2228 | unsigned nr_found; |
48dad8ba KO |
2229 | struct keybuf *buf; |
2230 | struct bkey *end; | |
2231 | keybuf_pred_fn *pred; | |
2232 | }; | |
cafe5635 | 2233 | |
48dad8ba KO |
2234 | static int refill_keybuf_fn(struct btree_op *op, struct btree *b, |
2235 | struct bkey *k) | |
2236 | { | |
2237 | struct refill *refill = container_of(op, struct refill, op); | |
2238 | struct keybuf *buf = refill->buf; | |
2239 | int ret = MAP_CONTINUE; | |
cafe5635 | 2240 | |
48dad8ba KO |
2241 | if (bkey_cmp(k, refill->end) >= 0) { |
2242 | ret = MAP_DONE; | |
2243 | goto out; | |
2244 | } | |
cafe5635 | 2245 | |
48dad8ba KO |
2246 | if (!KEY_SIZE(k)) /* end key */ |
2247 | goto out; | |
cafe5635 | 2248 | |
48dad8ba KO |
2249 | if (refill->pred(buf, k)) { |
2250 | struct keybuf_key *w; | |
cafe5635 | 2251 | |
48dad8ba | 2252 | spin_lock(&buf->lock); |
cafe5635 | 2253 | |
48dad8ba KO |
2254 | w = array_alloc(&buf->freelist); |
2255 | if (!w) { | |
2256 | spin_unlock(&buf->lock); | |
2257 | return MAP_DONE; | |
2258 | } | |
cafe5635 | 2259 | |
48dad8ba KO |
2260 | w->private = NULL; |
2261 | bkey_copy(&w->key, k); | |
cafe5635 | 2262 | |
48dad8ba KO |
2263 | if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) |
2264 | array_free(&buf->freelist, w); | |
48a915a8 KO |
2265 | else |
2266 | refill->nr_found++; | |
cafe5635 | 2267 | |
48dad8ba KO |
2268 | if (array_freelist_empty(&buf->freelist)) |
2269 | ret = MAP_DONE; | |
cafe5635 | 2270 | |
48dad8ba | 2271 | spin_unlock(&buf->lock); |
cafe5635 | 2272 | } |
48dad8ba KO |
2273 | out: |
2274 | buf->last_scanned = *k; | |
2275 | return ret; | |
cafe5635 KO |
2276 | } |
2277 | ||
2278 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, | |
72c27061 | 2279 | struct bkey *end, keybuf_pred_fn *pred) |
cafe5635 KO |
2280 | { |
2281 | struct bkey start = buf->last_scanned; | |
48dad8ba | 2282 | struct refill refill; |
cafe5635 KO |
2283 | |
2284 | cond_resched(); | |
2285 | ||
b54d6934 | 2286 | bch_btree_op_init(&refill.op, -1); |
48a915a8 KO |
2287 | refill.nr_found = 0; |
2288 | refill.buf = buf; | |
2289 | refill.end = end; | |
2290 | refill.pred = pred; | |
48dad8ba KO |
2291 | |
2292 | bch_btree_map_keys(&refill.op, c, &buf->last_scanned, | |
2293 | refill_keybuf_fn, MAP_END_KEY); | |
cafe5635 | 2294 | |
48a915a8 KO |
2295 | trace_bcache_keyscan(refill.nr_found, |
2296 | KEY_INODE(&start), KEY_OFFSET(&start), | |
2297 | KEY_INODE(&buf->last_scanned), | |
2298 | KEY_OFFSET(&buf->last_scanned)); | |
cafe5635 KO |
2299 | |
2300 | spin_lock(&buf->lock); | |
2301 | ||
2302 | if (!RB_EMPTY_ROOT(&buf->keys)) { | |
2303 | struct keybuf_key *w; | |
2304 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); | |
2305 | buf->start = START_KEY(&w->key); | |
2306 | ||
2307 | w = RB_LAST(&buf->keys, struct keybuf_key, node); | |
2308 | buf->end = w->key; | |
2309 | } else { | |
2310 | buf->start = MAX_KEY; | |
2311 | buf->end = MAX_KEY; | |
2312 | } | |
2313 | ||
2314 | spin_unlock(&buf->lock); | |
2315 | } | |
2316 | ||
2317 | static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) | |
2318 | { | |
2319 | rb_erase(&w->node, &buf->keys); | |
2320 | array_free(&buf->freelist, w); | |
2321 | } | |
2322 | ||
2323 | void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) | |
2324 | { | |
2325 | spin_lock(&buf->lock); | |
2326 | __bch_keybuf_del(buf, w); | |
2327 | spin_unlock(&buf->lock); | |
2328 | } | |
2329 | ||
2330 | bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, | |
2331 | struct bkey *end) | |
2332 | { | |
2333 | bool ret = false; | |
2334 | struct keybuf_key *p, *w, s; | |
2335 | s.key = *start; | |
2336 | ||
2337 | if (bkey_cmp(end, &buf->start) <= 0 || | |
2338 | bkey_cmp(start, &buf->end) >= 0) | |
2339 | return false; | |
2340 | ||
2341 | spin_lock(&buf->lock); | |
2342 | w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); | |
2343 | ||
2344 | while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { | |
2345 | p = w; | |
2346 | w = RB_NEXT(w, node); | |
2347 | ||
2348 | if (p->private) | |
2349 | ret = true; | |
2350 | else | |
2351 | __bch_keybuf_del(buf, p); | |
2352 | } | |
2353 | ||
2354 | spin_unlock(&buf->lock); | |
2355 | return ret; | |
2356 | } | |
2357 | ||
2358 | struct keybuf_key *bch_keybuf_next(struct keybuf *buf) | |
2359 | { | |
2360 | struct keybuf_key *w; | |
2361 | spin_lock(&buf->lock); | |
2362 | ||
2363 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); | |
2364 | ||
2365 | while (w && w->private) | |
2366 | w = RB_NEXT(w, node); | |
2367 | ||
2368 | if (w) | |
2369 | w->private = ERR_PTR(-EINTR); | |
2370 | ||
2371 | spin_unlock(&buf->lock); | |
2372 | return w; | |
2373 | } | |
2374 | ||
2375 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, | |
48dad8ba KO |
2376 | struct keybuf *buf, |
2377 | struct bkey *end, | |
2378 | keybuf_pred_fn *pred) | |
cafe5635 KO |
2379 | { |
2380 | struct keybuf_key *ret; | |
2381 | ||
2382 | while (1) { | |
2383 | ret = bch_keybuf_next(buf); | |
2384 | if (ret) | |
2385 | break; | |
2386 | ||
2387 | if (bkey_cmp(&buf->last_scanned, end) >= 0) { | |
2388 | pr_debug("scan finished"); | |
2389 | break; | |
2390 | } | |
2391 | ||
72c27061 | 2392 | bch_refill_keybuf(c, buf, end, pred); |
cafe5635 KO |
2393 | } |
2394 | ||
2395 | return ret; | |
2396 | } | |
2397 | ||
72c27061 | 2398 | void bch_keybuf_init(struct keybuf *buf) |
cafe5635 | 2399 | { |
cafe5635 KO |
2400 | buf->last_scanned = MAX_KEY; |
2401 | buf->keys = RB_ROOT; | |
2402 | ||
2403 | spin_lock_init(&buf->lock); | |
2404 | array_allocator_init(&buf->freelist); | |
2405 | } | |
2406 | ||
2407 | void bch_btree_exit(void) | |
2408 | { | |
2409 | if (btree_io_wq) | |
2410 | destroy_workqueue(btree_io_wq); | |
cafe5635 KO |
2411 | } |
2412 | ||
2413 | int __init bch_btree_init(void) | |
2414 | { | |
72a44517 KO |
2415 | btree_io_wq = create_singlethread_workqueue("bch_btree_io"); |
2416 | if (!btree_io_wq) | |
cafe5635 KO |
2417 | return -ENOMEM; |
2418 | ||
2419 | return 0; | |
2420 | } |