Commit | Line | Data |
---|---|---|
cafe5635 KO |
1 | /* |
2 | * background writeback - scan btree for dirty data and write it to the backing | |
3 | * device | |
4 | * | |
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | |
6 | * Copyright 2012 Google, Inc. | |
7 | */ | |
8 | ||
9 | #include "bcache.h" | |
10 | #include "btree.h" | |
11 | #include "debug.h" | |
279afbad | 12 | #include "writeback.h" |
cafe5635 | 13 | |
c37511b8 KO |
14 | #include <trace/events/bcache.h> |
15 | ||
cafe5635 KO |
/*
 * Workqueue the writeback closure steps are punted to (see the continue_at()
 * calls in this file); created in bch_writeback_init() and destroyed in
 * bch_writeback_exit().
 */
static struct workqueue_struct *dirty_wq;

/* Forward declaration: refill_dirty() continues into read_dirty(). */
static void read_dirty(struct closure *);
19 | ||
/*
 * Per-key writeback state. Allocated in read_dirty(), hung off the keybuf
 * entry through keybuf_key->private, and freed by dirty_io_destructor().
 */
struct dirty_io {
	/* Closure stepping through read_dirty_submit/write_dirty/write_dirty_finish */
	struct closure cl;
	/* Cached device whose dirty data is being written back */
	struct cached_dev *dc;
	/*
	 * Used first for the read from the cache, then re-initialised for the
	 * write to the backing device. read_dirty() allocates room for the
	 * bio_vec array right behind this struct and dirty_init() points
	 * bi_io_vec at bio.bi_inline_vecs, so this presumably has to remain
	 * the last member - TODO confirm against struct bio's layout.
	 */
	struct bio bio;
};
25 | ||
26 | /* Rate limiting */ | |
27 | ||
/*
 * Recompute dc->writeback_rate.rate from how far this device's dirty sector
 * count is from its target, publish the intermediate values in dc, and re-arm
 * the periodic update work.
 *
 * Called from update_writeback_rate() with dc->writeback_lock held for read.
 */
static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
	/* writeback_percent of the whole cache, in sectors */
	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	/*
	 * This device's slice of the cache-wide target: scaled by its backing
	 * device size relative to c->cached_dev_sectors.
	 */
	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
				   c->cached_dev_sectors);

	/* PD controller */

	int change = 0;
	int64_t error;
	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	/* Change in dirty sectors since the previous invocation */
	int64_t derivative = dirty - dc->disk.sectors_dirty_last;

	dc->disk.sectors_dirty_last = dirty;

	/* Weight the D term, but keep its magnitude within the dirty count */
	derivative *= dc->writeback_rate_d_term;
	derivative = clamp(derivative, -dirty, dirty);

	/*
	 * Fold into dc->disk.sectors_dirty_derivative via ewma_add()
	 * (presumably an exponentially weighted moving average with weight
	 * writeback_rate_d_smooth - confirm against the helper).
	 */
	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
			      dc->writeback_rate_d_smooth, 0);

	/* Avoid divide by zero */
	if (!target)
		goto out;

	/* Relative error as fixed point with 8 fractional bits */
	error = div64_s64((dirty + derivative - target) << 8, target);

	/* rate * error with the << 8 undone, divided by the inverse P gain */
	change = div_s64((dc->writeback_rate.rate * error) >> 8,
			 dc->writeback_rate_p_term_inverse);

	/*
	 * Don't increase writeback rate if the device isn't keeping up, i.e.
	 * the ratelimit's next timestamp is already more than 10ms in the past.
	 */
	if (change > 0 &&
	    time_after64(local_clock(),
			 dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
		change = 0;

	/* Keep the resulting rate within [1, NSEC_PER_MSEC] */
	dc->writeback_rate.rate =
		clamp_t(int64_t, dc->writeback_rate.rate + change,
			1, NSEC_PER_MSEC);
out:
	/* Record the controller state in dc (also on the !target path) */
	dc->writeback_rate_derivative = derivative;
	dc->writeback_rate_change = change;
	dc->writeback_rate_target = target;

	/* Run again in writeback_rate_update_seconds */
	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}
79 | ||
80 | static void update_writeback_rate(struct work_struct *work) | |
81 | { | |
82 | struct cached_dev *dc = container_of(to_delayed_work(work), | |
83 | struct cached_dev, | |
84 | writeback_rate_update); | |
85 | ||
86 | down_read(&dc->writeback_lock); | |
87 | ||
88 | if (atomic_read(&dc->has_dirty) && | |
89 | dc->writeback_percent) | |
90 | __update_writeback_rate(dc); | |
91 | ||
92 | up_read(&dc->writeback_lock); | |
93 | } | |
94 | ||
95 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) | |
96 | { | |
c2a4f318 KO |
97 | uint64_t ret; |
98 | ||
cafe5635 KO |
99 | if (atomic_read(&dc->disk.detaching) || |
100 | !dc->writeback_percent) | |
101 | return 0; | |
102 | ||
c2a4f318 KO |
103 | ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); |
104 | ||
105 | return min_t(uint64_t, ret, HZ); | |
cafe5635 KO |
106 | } |
107 | ||
108 | /* Background writeback */ | |
109 | ||
/* Keybuf predicate: select every key that is flagged dirty; @buf is unused. */
static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	if (KEY_DIRTY(k))
		return true;

	return false;
}
114 | ||
72c27061 KO |
115 | static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) |
116 | { | |
117 | uint64_t stripe; | |
118 | unsigned nr_sectors = KEY_SIZE(k); | |
119 | struct cached_dev *dc = container_of(buf, struct cached_dev, | |
120 | writeback_keys); | |
121 | unsigned stripe_size = 1 << dc->disk.stripe_size_bits; | |
122 | ||
123 | if (!KEY_DIRTY(k)) | |
124 | return false; | |
125 | ||
126 | stripe = KEY_START(k) >> dc->disk.stripe_size_bits; | |
127 | while (1) { | |
128 | if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != | |
129 | stripe_size) | |
130 | return false; | |
131 | ||
132 | if (nr_sectors <= stripe_size) | |
133 | return true; | |
134 | ||
135 | nr_sectors -= stripe_size; | |
136 | stripe++; | |
137 | } | |
138 | } | |
139 | ||
cafe5635 KO |
140 | static void dirty_init(struct keybuf_key *w) |
141 | { | |
142 | struct dirty_io *io = w->private; | |
143 | struct bio *bio = &io->bio; | |
144 | ||
145 | bio_init(bio); | |
146 | if (!io->dc->writeback_percent) | |
147 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | |
148 | ||
149 | bio->bi_size = KEY_SIZE(&w->key) << 9; | |
150 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); | |
151 | bio->bi_private = w; | |
152 | bio->bi_io_vec = bio->bi_inline_vecs; | |
169ef1cf | 153 | bch_bio_map(bio, NULL); |
cafe5635 KO |
154 | } |
155 | ||
/*
 * Closure step: top up dc->writeback_keys with dirty keys belonging to this
 * device, then hand over to read_dirty() on dirty_wq.
 *
 * NOTE(review): nothing follows closure_return(cl) in the early-exit branches,
 * so the code relies on that macro (and continue_at()) leaving the function -
 * confirm against closure.h.
 */
static void refill_dirty(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev,
					     writeback.cl);
	struct keybuf *buf = &dc->writeback_keys;
	bool searched_from_start = false;
	/* Upper bound of the scan: MAX_KEY with the inode set to this device */
	struct bkey end = MAX_KEY;
	SET_KEY_INODE(&end, dc->disk.id);

	/* Writeback is switched off and we are not detaching: nothing to do */
	if (!atomic_read(&dc->disk.detaching) &&
	    !dc->writeback_running)
		closure_return(cl);

	down_write(&dc->writeback_lock);

	/* No dirty data left: mark the backing device clean and stop */
	if (!atomic_read(&dc->has_dirty)) {
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
		bch_write_bdev_super(dc, NULL);

		up_write(&dc->writeback_lock);
		closure_return(cl);
	}

	/* Previous scan reached the end: wrap around to this device's first key */
	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
		buf->last_scanned = KEY(dc->disk.id, 0, 0);
		searched_from_start = true;
	}

	if (dc->partial_stripes_expensive) {
		uint64_t i;

		/*
		 * Only restrict the scan to full stripes if at least one
		 * stripe is completely dirty; otherwise fall back to the
		 * plain dirty predicate.
		 */
		for (i = 0; i < dc->disk.nr_stripes; i++)
			if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
			    1 << dc->disk.stripe_size_bits)
				goto full_stripes;

		goto normal_refill;
full_stripes:
		bch_refill_keybuf(dc->disk.c, buf, &end,
				  dirty_full_stripe_pred);
	} else {
normal_refill:
		bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
	}

	if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
		/* Searched the entire btree - delay awhile */

		/*
		 * A complete pass found nothing: clear has_dirty and drop a
		 * reference (presumably the one bch_writeback_add() took via
		 * atomic_inc(&dc->count) - confirm).
		 */
		if (RB_EMPTY_ROOT(&buf->keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
		}

		if (!atomic_read(&dc->disk.detaching))
			closure_delay(&dc->writeback, dc->writeback_delay * HZ);
	}

	up_write(&dc->writeback_lock);

	bch_ratelimit_reset(&dc->writeback_rate);

	/* Punt to workqueue only so we don't recurse and blow the stack */
	continue_at(cl, read_dirty, dirty_wq);
}
220 | ||
221 | void bch_writeback_queue(struct cached_dev *dc) | |
222 | { | |
223 | if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { | |
224 | if (!atomic_read(&dc->disk.detaching)) | |
225 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); | |
226 | ||
227 | continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); | |
228 | } | |
229 | } | |
230 | ||
279afbad | 231 | void bch_writeback_add(struct cached_dev *dc) |
cafe5635 | 232 | { |
cafe5635 KO |
233 | if (!atomic_read(&dc->has_dirty) && |
234 | !atomic_xchg(&dc->has_dirty, 1)) { | |
235 | atomic_inc(&dc->count); | |
236 | ||
237 | if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { | |
238 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); | |
239 | /* XXX: should do this synchronously */ | |
240 | bch_write_bdev_super(dc, NULL); | |
241 | } | |
242 | ||
243 | bch_writeback_queue(dc); | |
244 | ||
245 | if (dc->writeback_percent) | |
246 | schedule_delayed_work(&dc->writeback_rate_update, | |
247 | dc->writeback_rate_update_seconds * HZ); | |
248 | } | |
249 | } | |
250 | ||
279afbad KO |
251 | void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, |
252 | uint64_t offset, int nr_sectors) | |
253 | { | |
254 | struct bcache_device *d = c->devices[inode]; | |
255 | unsigned stripe_size, stripe_offset; | |
256 | uint64_t stripe; | |
257 | ||
258 | if (!d) | |
259 | return; | |
260 | ||
261 | stripe_size = 1 << d->stripe_size_bits; | |
262 | stripe = offset >> d->stripe_size_bits; | |
263 | stripe_offset = offset & (stripe_size - 1); | |
264 | ||
265 | while (nr_sectors) { | |
266 | int s = min_t(unsigned, abs(nr_sectors), | |
267 | stripe_size - stripe_offset); | |
268 | ||
269 | if (nr_sectors < 0) | |
270 | s = -s; | |
271 | ||
272 | atomic_add(s, d->stripe_sectors_dirty + stripe); | |
273 | nr_sectors -= s; | |
274 | stripe_offset = 0; | |
275 | stripe++; | |
276 | } | |
277 | } | |
278 | ||
cafe5635 KO |
279 | /* Background writeback - IO loop */ |
280 | ||
281 | static void dirty_io_destructor(struct closure *cl) | |
282 | { | |
283 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | |
284 | kfree(io); | |
285 | } | |
286 | ||
/*
 * Final closure step for one key, run after the write to the backing device.
 *
 * Frees the data pages, and if no IO error was flagged, replaces the dirty key
 * in the btree with a clean copy. Then drops the key from the writeback
 * keybuf, releases the in_flight slot, and frees the dirty_io through
 * dirty_io_destructor().
 */
static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;
	struct bio_vec *bv;
	int i;

	/* Pages were allocated with bio_alloc_pages() in read_dirty() */
	bio_for_each_segment_all(bv, &io->bio, i)
		__free_page(bv->bv_page);

	/*
	 * This is kind of a dumb way of signalling errors: dirty_endio()
	 * clears the dirty bit on the keybuf copy when an IO failed, so a key
	 * that is still dirty here was read and written successfully.
	 */
	if (KEY_DIRTY(&w->key)) {
		/* NOTE: shadows the int i used by the page loop above */
		unsigned i;
		struct btree_op op;
		bch_btree_op_init_stack(&op);

		/* Replace the (still dirty) original key ... */
		op.type = BTREE_REPLACE;
		bkey_copy(&op.replace, &w->key);

		/* ... with the same key marked clean */
		SET_KEY_DIRTY(&w->key, false);
		bch_keylist_add(&op.keys, &w->key);

		/* Pin every bucket the key points into before inserting */
		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

		bch_btree_insert(&op, dc->disk.c);
		closure_sync(&op.cl);

		if (op.insert_collision)
			trace_bcache_writeback_collision(&w->key);

		/* A collision counts as a failed key, anything else as done */
		atomic_long_inc(op.insert_collision
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	/* Pairs with down(&dc->in_flight) in read_dirty() */
	up(&dc->in_flight);

	closure_return_with_destructor(cl, dirty_io_destructor);
}
329 | ||
330 | static void dirty_endio(struct bio *bio, int error) | |
331 | { | |
332 | struct keybuf_key *w = bio->bi_private; | |
333 | struct dirty_io *io = w->private; | |
334 | ||
335 | if (error) | |
336 | SET_KEY_DIRTY(&w->key, false); | |
337 | ||
338 | closure_put(&io->cl); | |
339 | } | |
340 | ||
341 | static void write_dirty(struct closure *cl) | |
342 | { | |
343 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | |
344 | struct keybuf_key *w = io->bio.bi_private; | |
345 | ||
346 | dirty_init(w); | |
347 | io->bio.bi_rw = WRITE; | |
348 | io->bio.bi_sector = KEY_START(&w->key); | |
349 | io->bio.bi_bdev = io->dc->bdev; | |
350 | io->bio.bi_end_io = dirty_endio; | |
351 | ||
cafe5635 KO |
352 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
353 | ||
c2a4f318 | 354 | continue_at(cl, write_dirty_finish, system_wq); |
cafe5635 KO |
355 | } |
356 | ||
357 | static void read_dirty_endio(struct bio *bio, int error) | |
358 | { | |
359 | struct keybuf_key *w = bio->bi_private; | |
360 | struct dirty_io *io = w->private; | |
361 | ||
362 | bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), | |
363 | error, "reading dirty data from cache"); | |
364 | ||
365 | dirty_endio(bio, error); | |
366 | } | |
367 | ||
368 | static void read_dirty_submit(struct closure *cl) | |
369 | { | |
370 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | |
371 | ||
cafe5635 KO |
372 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
373 | ||
c2a4f318 | 374 | continue_at(cl, write_dirty, system_wq); |
cafe5635 KO |
375 | } |
376 | ||
/*
 * Closure step: drain dc->writeback_keys, starting one read-then-write chain
 * (read_dirty_submit -> write_dirty -> write_dirty_finish) per key, paced by
 * writeback_delay() and bounded by the in_flight semaphore. Afterwards goes
 * back to refill_dirty().
 */
static void read_dirty(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev,
					     writeback.cl);
	unsigned delay = writeback_delay(dc, 0);
	struct keybuf_key *w;
	struct dirty_io *io;

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	while (1) {
		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

		/*
		 * Sleep for the pending delay, except when this key starts
		 * exactly where the previous one ended and the delay is at
		 * most 50ms.
		 */
		if (delay > 0 &&
		    (KEY_START(&w->key) != dc->last_read ||
		     jiffies_to_msecs(delay) > 50))
			delay = schedule_timeout_uninterruptible(delay);

		/* Remember where this key ends for the contiguity check above */
		dc->last_read = KEY_OFFSET(&w->key);

		/* dirty_io plus one bio_vec per page covered by the key */
		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private = io;
		io->dc = dc;

		/* Read the data from the cache device behind pointer 0 */
		dirty_init(w);
		io->bio.bi_sector = PTR_OFFSET(&w->key, 0);
		io->bio.bi_bdev = PTR_CACHE(dc->disk.c,
					    &w->key, 0)->bdev;
		io->bio.bi_rw = READ;
		io->bio.bi_end_io = read_dirty_endio;

		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
			goto err_free;

		trace_bcache_writeback(&w->key);

		/* Released by up(&dc->in_flight) in write_dirty_finish() */
		down(&dc->in_flight);
		closure_call(&io->cl, read_dirty_submit, NULL, cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));
	}

	/*
	 * Error paths only: this block is never entered from above, only via
	 * the goto labels inside it. Both paths drop the key from the keybuf
	 * and then fall through to the refill below.
	 */
	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	/*
	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
	 * freed) before refilling again
	 */
	continue_at(cl, refill_dirty, dirty_wq);
}
444 | ||
444fc0b6 KO |
445 | /* Init */ |
446 | ||
/*
 * Walk btree node @b from the first key of dc's inode and add the size of
 * every dirty key found in leaf nodes to the device's per-stripe dirty
 * counters. Interior nodes are descended through the btree() macro, which
 * re-enters this function for the child. Always returns 0.
 *
 * NOTE(review): the btree() macro may refer to the locals b/op by name, so
 * they should not be renamed without checking its definition.
 */
static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
					struct cached_dev *dc)
{
	struct bkey *k;
	struct btree_iter iter;

	bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
	while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad)))
		if (!b->level) {
			/* Leaf: stop once we are past this device's keys */
			if (KEY_INODE(k) > dc->disk.id)
				break;

			if (KEY_DIRTY(k))
				bcache_dev_sectors_dirty_add(b->c, dc->disk.id,
							     KEY_START(k),
							     KEY_SIZE(k));
		} else {
			/* Interior: recurse into the child before the cutoff check */
			btree(sectors_dirty_init, k, b, op, dc);
			if (KEY_INODE(k) > dc->disk.id)
				break;

			cond_resched();
		}

	return 0;
}
473 | ||
/*
 * Initialise @dc's dirty sector accounting by running
 * bch_btree_sectors_dirty_init() from the root of the cache set's btree with
 * an on-stack btree_op.
 */
void bch_sectors_dirty_init(struct cached_dev *dc)
{
	struct btree_op op;

	bch_btree_op_init_stack(&op);
	btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
}
481 | ||
f59fce84 | 482 | void bch_cached_dev_writeback_init(struct cached_dev *dc) |
cafe5635 | 483 | { |
c2a4f318 | 484 | sema_init(&dc->in_flight, 64); |
cafe5635 KO |
485 | closure_init_unlocked(&dc->writeback); |
486 | init_rwsem(&dc->writeback_lock); | |
487 | ||
72c27061 | 488 | bch_keybuf_init(&dc->writeback_keys); |
cafe5635 KO |
489 | |
490 | dc->writeback_metadata = true; | |
491 | dc->writeback_running = true; | |
492 | dc->writeback_percent = 10; | |
493 | dc->writeback_delay = 30; | |
494 | dc->writeback_rate.rate = 1024; | |
495 | ||
496 | dc->writeback_rate_update_seconds = 30; | |
497 | dc->writeback_rate_d_term = 16; | |
498 | dc->writeback_rate_p_term_inverse = 64; | |
499 | dc->writeback_rate_d_smooth = 8; | |
500 | ||
501 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); | |
502 | schedule_delayed_work(&dc->writeback_rate_update, | |
503 | dc->writeback_rate_update_seconds * HZ); | |
504 | } | |
505 | ||
506 | void bch_writeback_exit(void) | |
507 | { | |
508 | if (dirty_wq) | |
509 | destroy_workqueue(dirty_wq); | |
510 | } | |
511 | ||
512 | int __init bch_writeback_init(void) | |
513 | { | |
c2a4f318 | 514 | dirty_wq = create_workqueue("bcache_writeback"); |
cafe5635 KO |
515 | if (!dirty_wq) |
516 | return -ENOMEM; | |
517 | ||
518 | return 0; | |
519 | } |