/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <trace/events/bcache.h>

/* Rate limiting */

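/*
 * Editorial note on the function below: __update_writeback_rate() is a
 * proportional-differential controller.  The target is the cache's
 * writeback_percent share of cache_sectors, scaled by this backing device's
 * fraction of cached_dev_sectors.  The proportional term scales the current
 * rate by the relative error between the dirty sector count and that target;
 * the differential term follows how fast the dirty count is changing, damped
 * by an exponentially weighted moving average.  The result is clamped to
 * [1, NSEC_PER_MSEC] and the intermediate values are saved in the
 * writeback_rate_* fields so the computation can be inspected.
 */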
static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
				   c->cached_dev_sectors);

	/* PD controller */

	int change = 0;
	int64_t error;
	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	int64_t derivative = dirty - dc->disk.sectors_dirty_last;

	dc->disk.sectors_dirty_last = dirty;

	derivative *= dc->writeback_rate_d_term;
	derivative = clamp(derivative, -dirty, dirty);

	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
			      dc->writeback_rate_d_smooth, 0);

	/* Avoid divide by zero */
	if (!target)
		goto out;

	error = div64_s64((dirty + derivative - target) << 8, target);

	change = div_s64((dc->writeback_rate.rate * error) >> 8,
			 dc->writeback_rate_p_term_inverse);

	/* Don't increase writeback rate if the device isn't keeping up */
	if (change > 0 &&
	    time_after64(local_clock(),
			 dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
		change = 0;

	dc->writeback_rate.rate =
		clamp_t(int64_t, dc->writeback_rate.rate + change,
			1, NSEC_PER_MSEC);
out:
	dc->writeback_rate_derivative = derivative;
	dc->writeback_rate_change = change;
	dc->writeback_rate_target = target;
}

static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);

	down_read(&dc->writeback_lock);

	if (atomic_read(&dc->has_dirty) &&
	    dc->writeback_percent)
		__update_writeback_rate(dc);

	up_read(&dc->writeback_lock);

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

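/*
 * Editorial note: writeback_delay() converts the number of sectors just
 * issued into a delay in jiffies via bch_next_delay() on dc->writeback_rate.
 * It returns 0, i.e. write back at full speed, while the device is being
 * detached or when writeback_percent is 0, and the returned delay is capped
 * at HZ jiffies (about one second) per call.
 */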
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
	uint64_t ret;

	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
	    !dc->writeback_percent)
		return 0;

	ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);

	return min_t(uint64_t, ret, HZ);
}

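/*
 * Editorial note: a struct dirty_io tracks one dirty extent through the
 * writeback pipeline: read_dirty() reads it from the cache device,
 * write_dirty() writes it to the backing device, and write_dirty_finish()
 * clears the dirty bit in the btree.  The embedded closure sequences those
 * steps, and the bio with its inline bvecs is reused for both the read and
 * the write (dirty_init() is called again before the write).
 */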
struct dirty_io {
	struct closure cl;
	struct cached_dev *dc;
	struct bio bio;
};

static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio);
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	bio->bi_size = KEY_SIZE(&w->key) << 9;
	bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
	bio->bi_private = w;
	bio->bi_io_vec = bio->bi_inline_vecs;
	bch_bio_map(bio, NULL);
}

static void dirty_io_destructor(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	kfree(io);
}

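/*
 * Editorial note: write_dirty_finish() runs once the write to the backing
 * device has completed.  It frees the pages backing the bio and, if the key
 * is still marked dirty, reinserts it with the dirty bit cleared, passing
 * the original key as the replace key to bch_btree_insert().  If something
 * else updated that extent in the meantime the insert fails and is counted
 * as a writeback collision instead of clobbering the newer data.  Finally it
 * frees the keybuf slot and releases the in_flight semaphore.
 */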
static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;
	struct bio_vec *bv;
	int i;

	bio_for_each_segment_all(bv, &io->bio, i)
		__free_page(bv->bv_page);

	/* This is kind of a dumb way of signalling errors. */
	if (KEY_DIRTY(&w->key)) {
		int ret;
		unsigned i;
		struct keylist keys;

		bch_keylist_init(&keys);

		bkey_copy(keys.top, &w->key);
		SET_KEY_DIRTY(keys.top, false);
		bch_keylist_push(&keys);

		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

		ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);

		if (ret)
			trace_bcache_writeback_collision(&w->key);

		atomic_long_inc(ret
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	up(&dc->in_flight);

	closure_return_with_destructor(cl, dirty_io_destructor);
}

static void dirty_endio(struct bio *bio, int error)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	if (error)
		SET_KEY_DIRTY(&w->key, false);

	closure_put(&io->cl);
}

static void write_dirty(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;

	dirty_init(w);
	io->bio.bi_rw = WRITE;
	io->bio.bi_sector = KEY_START(&w->key);
	io->bio.bi_bdev = io->dc->bdev;
	io->bio.bi_end_io = dirty_endio;

	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty_finish, system_wq);
}

static void read_dirty_endio(struct bio *bio, int error)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
			    error, "reading dirty data from cache");

	dirty_endio(bio, error);
}

static void read_dirty_submit(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty, system_wq);
}

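/*
 * Editorial note: read_dirty() is the per-pass work loop.  It pulls dirty
 * keys out of dc->writeback_keys, allocates a dirty_io with enough inline
 * bvecs to cover the key, and kicks off the read from the cache device.
 * Throttling comes from two places: writeback_delay() spaces out the IO
 * according to the current rate (the sleep is skipped while extents stay
 * contiguous and less than 50ms of delay has accumulated), and the
 * dc->in_flight semaphore bounds the number of writeback IOs outstanding.
 */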
static void read_dirty(struct cached_dev *dc)
{
	unsigned delay = 0;
	struct keybuf_key *w;
	struct dirty_io *io;
	struct closure cl;

	closure_init_stack(&cl);

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	while (!kthread_should_stop()) {
		try_to_freeze();

		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

		if (KEY_START(&w->key) != dc->last_read ||
		    jiffies_to_msecs(delay) > 50)
			while (!kthread_should_stop() && delay)
				delay = schedule_timeout_interruptible(delay);

		dc->last_read = KEY_OFFSET(&w->key);

		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private = io;
		io->dc = dc;

		dirty_init(w);
		io->bio.bi_sector = PTR_OFFSET(&w->key, 0);
		io->bio.bi_bdev = PTR_CACHE(dc->disk.c,
					    &w->key, 0)->bdev;
		io->bio.bi_rw = READ;
		io->bio.bi_end_io = read_dirty_endio;

		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
			goto err_free;

		trace_bcache_writeback(&w->key);

		down(&dc->in_flight);
		closure_call(&io->cl, read_dirty_submit, NULL, &cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));
	}

	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	/*
	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
	 * freed) before refilling again
	 */
	closure_sync(&cl);
}

/* Scan for dirty data */

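/*
 * Editorial note: dirty data is accounted per stripe of the backing device.
 * bcache_dev_sectors_dirty_add() walks the affected stripes, adjusting
 * stripe_sectors_dirty by the (possibly negative) sector count and keeping
 * the full_dirty_stripes bitmap in sync: a stripe's bit is set only while
 * every sector in it is dirty.  refill_full_stripes() below uses that bitmap
 * to find whole stripes that can be written back in one go.
 */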
void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
				  uint64_t offset, int nr_sectors)
{
	struct bcache_device *d = c->devices[inode];
	unsigned stripe_offset, stripe, sectors_dirty;

	if (!d)
		return;

	stripe = offset_to_stripe(d, offset);
	stripe_offset = offset & (d->stripe_size - 1);

	while (nr_sectors) {
		int s = min_t(unsigned, abs(nr_sectors),
			      d->stripe_size - stripe_offset);

		if (nr_sectors < 0)
			s = -s;

		if (stripe >= d->nr_stripes)
			return;

		sectors_dirty = atomic_add_return(s,
					d->stripe_sectors_dirty + stripe);
		if (sectors_dirty == d->stripe_size)
			set_bit(stripe, d->full_dirty_stripes);
		else
			clear_bit(stripe, d->full_dirty_stripes);

		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}

static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	return KEY_DIRTY(k);
}

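/*
 * Editorial note: on backing devices where partial stripe writes are
 * expensive (typically parity RAID setups), refill_full_stripes() fills the
 * keybuf only from stripes whose every sector is dirty, scanning the
 * full_dirty_stripes bitmap from last_scanned and wrapping around at most
 * once, so writeback prefers IO that rewrites complete stripes.
 */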
static void refill_full_stripes(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	unsigned start_stripe, stripe, next_stripe;
	bool wrapped = false;

	stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));

	if (stripe >= dc->disk.nr_stripes)
		stripe = 0;

	start_stripe = stripe;

	while (1) {
		stripe = find_next_bit(dc->disk.full_dirty_stripes,
				       dc->disk.nr_stripes, stripe);

		if (stripe == dc->disk.nr_stripes)
			goto next;

		next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
						 dc->disk.nr_stripes, stripe);

		buf->last_scanned = KEY(dc->disk.id,
					stripe * dc->disk.stripe_size, 0);

		bch_refill_keybuf(dc->disk.c, buf,
				  &KEY(dc->disk.id,
				       next_stripe * dc->disk.stripe_size, 0),
				  dirty_pred);

		if (array_freelist_empty(&buf->freelist))
			return;

		stripe = next_stripe;
next:
		if (wrapped && stripe > start_stripe)
			return;

		if (stripe == dc->disk.nr_stripes) {
			stripe = 0;
			wrapped = true;
		}
	}
}

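/*
 * Editorial note: refill_dirty() repopulates the writeback keybuf by
 * scanning the btree for this device's dirty keys, continuing from wherever
 * the previous scan stopped.  It returns true only when the scan started at
 * offset 0 and reached the end of the device's keyspace, i.e. the whole
 * index has been searched since the last refill.
 */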
static bool refill_dirty(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
	bool searched_from_start = false;

	if (dc->partial_stripes_expensive) {
		refill_full_stripes(dc);
		if (array_freelist_empty(&buf->freelist))
			return false;
	}

	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
		buf->last_scanned = KEY(dc->disk.id, 0, 0);
		searched_from_start = true;
	}

	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);

	return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start;
}

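/*
 * Editorial note: bch_writeback_thread() is the per-cached-device writeback
 * kthread.  Each iteration it sleeps while there is no dirty data (or while
 * writeback is disabled and the device is not detaching), refills the
 * keybuf, resets the rate limiter, and writes dirty data back via
 * read_dirty().  Once a full index scan turns up no dirty keys it clears
 * has_dirty, drops its reference, and marks the backing device clean in its
 * superblock; after any full scan it idles for writeback_delay seconds
 * before scanning again.
 */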
static int bch_writeback_thread(void *arg)
{
	struct cached_dev *dc = arg;
	bool searched_full_index;

	while (!kthread_should_stop()) {
		down_write(&dc->writeback_lock);
		if (!atomic_read(&dc->has_dirty) ||
		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
		     !dc->writeback_running)) {
			up_write(&dc->writeback_lock);
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop())
				return 0;

			try_to_freeze();
			schedule();
			continue;
		}

		searched_full_index = refill_dirty(dc);

		if (searched_full_index &&
		    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
			SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
			bch_write_bdev_super(dc, NULL);
		}

		up_write(&dc->writeback_lock);

		bch_ratelimit_reset(&dc->writeback_rate);
		read_dirty(dc);

		if (searched_full_index) {
			unsigned delay = dc->writeback_delay * HZ;

			while (delay &&
			       !kthread_should_stop() &&
			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
				delay = schedule_timeout_interruptible(delay);
		}
	}

	return 0;
}

/* Init */

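/*
 * Editorial note: when a cached device comes up, the in-memory dirty
 * counters have to be rebuilt from the btree.  bch_sectors_dirty_init()
 * (its call site is not shown in this file) maps sectors_dirty_init_fn()
 * over every key belonging to the device's inode and feeds each dirty
 * extent to bcache_dev_sectors_dirty_add().
 */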
struct sectors_dirty_init {
	struct btree_op op;
	unsigned inode;
};

static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
				 struct bkey *k)
{
	struct sectors_dirty_init *op = container_of(_op,
					struct sectors_dirty_init, op);
	if (KEY_INODE(k) > op->inode)
		return MAP_DONE;

	if (KEY_DIRTY(k))
		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
					     KEY_START(k), KEY_SIZE(k));

	return MAP_CONTINUE;
}

void bch_sectors_dirty_init(struct cached_dev *dc)
{
	struct sectors_dirty_init op;

	bch_btree_op_init(&op.op, -1);
	op.inode = dc->disk.id;

	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
			   sectors_dirty_init_fn, 0);
}

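/*
 * Editorial note: bch_cached_dev_writeback_init() sets the defaults visible
 * below: a 10% dirty target, a 30 second rate-update interval, an initial
 * writeback_rate.rate of 1024, and at most 64 writeback IOs in flight.  The
 * writeback kthread is created with kthread_create() rather than
 * kthread_run() and left in TASK_INTERRUPTIBLE, so presumably the caller
 * wakes it once the device is fully set up.
 */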
int bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	sema_init(&dc->in_flight, 64);
	init_rwsem(&dc->writeback_lock);
	bch_keybuf_init(&dc->writeback_keys);

	dc->writeback_metadata = true;
	dc->writeback_running = true;
	dc->writeback_percent = 10;
	dc->writeback_delay = 30;
	dc->writeback_rate.rate = 1024;

	dc->writeback_rate_update_seconds = 30;
	dc->writeback_rate_d_term = 16;
	dc->writeback_rate_p_term_inverse = 64;
	dc->writeback_rate_d_smooth = 8;

	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
					      "bcache_writeback");
	if (IS_ERR(dc->writeback_thread))
		return PTR_ERR(dc->writeback_thread);

	set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE);

	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);

	return 0;
}