raid5: log reclaim support
drivers/md/raid5-cache.c
1/*
2 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 */
14#include <linux/kernel.h>
15#include <linux/wait.h>
16#include <linux/blkdev.h>
17#include <linux/slab.h>
18#include <linux/raid/md_p.h>
19#include <linux/crc32.h>
20#include <linux/random.h>
21#include "md.h"
22#include "raid5.h"
23
24/*
25 * Metadata and data are stored on disk in 4k units (blocks), regardless of
26 * the underlying hardware sector size. This only works with PAGE_SIZE == 4096
27 */
28#define BLOCK_SECTORS (8)
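/* a block is BLOCK_SECTORS * 512 = 4096 bytes, i.e. exactly one page */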
29
30/*
31 * Reclaim runs once reclaimable space reaches 1/4 of the disk size or 10G,
32 * whichever is smaller. This prevents recovery from scanning a very long log
33 */
34#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
35#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
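/*
 * RECLAIM_MAX_FREE_SPACE is 10 * 1024 * 1024 * 2 sectors of 512 bytes, i.e.
 * 10GiB; RECLAIM_MAX_FREE_SPACE_SHIFT divides the device size by 4. The
 * smaller of the two values becomes log->max_free_space in r5l_load_log()
 */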
36
37struct r5l_log {
38 struct md_rdev *rdev;
39
40 u32 uuid_checksum;
41
42 sector_t device_size; /* log device size, rounded down to
43 * BLOCK_SECTORS */
44 sector_t max_free_space; /* reclaim runs once reclaimable space
45 * reaches this size */
46
47 sector_t last_checkpoint; /* log tail. where recovery scan
48 * starts from */
49 u64 last_cp_seq; /* log tail sequence */
50
51 sector_t log_start; /* log head. where new data appends */
52 u64 seq; /* log head sequence */
53
54 struct mutex io_mutex;
55 struct r5l_io_unit *current_io; /* current io_unit accepting new data */
56
57 spinlock_t io_list_lock;
58 struct list_head running_ios; /* io_units which are still running,
59 * and have not yet been completely
60 * written to the log */
61 struct list_head io_end_ios; /* io_units which have been completely
62 * written to the log but not yet written
63 * to the RAID */
64 struct list_head stripe_end_ios;/* io_units which have been completely
65 * written to the RAID but have not yet
66 * been considered for updating super */
67
68 struct kmem_cache *io_kc;
69
70 struct md_thread *reclaim_thread;
71 unsigned long reclaim_target; /* amount of space that needs to be
72 * reclaimed. if it's 0, reclaim only the
73 * space used by io_units which are
74 * already in IO_UNIT_STRIPE_END state
75 * (i.e. reclaim doesn't wait for a
76 * specific io_unit to switch to
77 * IO_UNIT_STRIPE_END state) */
78
79 struct list_head no_space_stripes; /* pending stripes, log has no space */
80 spinlock_t no_space_stripes_lock;
81};
82
83/*
84 * An IO range starts at a metadata block and ends at the next metadata
85 * block. The io_unit's metadata block tracks the data/parity that follow it.
86 * The io_unit is written to the log disk with normal writes: since we always
87 * flush the log disk before starting to move data to the raid disks, there is
88 * no need to write the io_unit with FLUSH/FUA
89 */
90struct r5l_io_unit {
91 struct r5l_log *log;
92
93 struct page *meta_page; /* store meta block */
94 int meta_offset; /* current offset in meta_page */
95
96 struct bio_list bios;
97 atomic_t pending_io; /* pending bios not written to log yet */
98 struct bio *current_bio;/* current_bio accepting new data */
99
100 atomic_t pending_stripe;/* how many stripes not flushed to raid */
101 u64 seq; /* seq number of the metablock */
102 sector_t log_start; /* where the io_unit starts */
103 sector_t log_end; /* where the io_unit ends */
104 struct list_head log_sibling; /* log->running_ios */
105 struct list_head stripe_list; /* stripes added to the io_unit */
106
107 int state;
108 wait_queue_head_t wait_state;
109};
110
111/* r5l_io_unit state */
112enum r5l_io_unit_state {
113 IO_UNIT_RUNNING = 0, /* accepting new IO */
114 IO_UNIT_IO_START = 1, /* io_unit bios have started writing to
115 * the log, no new bios are accepted */
116 IO_UNIT_IO_END = 2, /* io_unit bios finished writing to the log */
117 IO_UNIT_STRIPE_START = 3, /* stripes of the io_unit are being flushed to raid */
118 IO_UNIT_STRIPE_END = 4, /* stripe data finished writing to raid */
119};
120
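/*
 * The log device is used as a circular buffer: r5l_ring_add() advances a
 * sector with wrap-around at log->device_size, and r5l_ring_distance()
 * returns the forward distance from start to end around that ring
 */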
121static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
122{
123 start += inc;
124 if (start >= log->device_size)
125 start = start - log->device_size;
126 return start;
127}
128
129static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
130 sector_t end)
131{
132 if (end >= start)
133 return end - start;
134 else
135 return end + log->device_size - start;
136}
137
138static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
139{
140 sector_t used_size;
141
142 used_size = r5l_ring_distance(log, log->last_checkpoint,
143 log->log_start);
144
145 return log->device_size > used_size + size;
146}
147
148static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
149{
150 struct r5l_io_unit *io;
151 /* We can't handle memory allocation failures so far */
152 gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;
153
154 io = kmem_cache_zalloc(log->io_kc, gfp);
155 io->log = log;
156 io->meta_page = alloc_page(gfp | __GFP_ZERO);
157
158 bio_list_init(&io->bios);
159 INIT_LIST_HEAD(&io->log_sibling);
160 INIT_LIST_HEAD(&io->stripe_list);
161 io->state = IO_UNIT_RUNNING;
162 init_waitqueue_head(&io->wait_state);
163 return io;
164}
165
166static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
167{
168 __free_page(io->meta_page);
169 kmem_cache_free(log->io_kc, io);
170}
171
172static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
173 enum r5l_io_unit_state state)
174{
175 struct r5l_io_unit *io;
176
177 while (!list_empty(from)) {
178 io = list_first_entry(from, struct r5l_io_unit, log_sibling);
179 /* don't change list order */
180 if (io->state >= state)
181 list_move_tail(&io->log_sibling, to);
182 else
183 break;
184 }
185}
186
187/*
188 * We don't want too many io_units residing in the stripe_end_ios list, which
189 * would waste a lot of memory. So we try to remove some, but we must keep at
190 * least 2 io_units: the superblock must point to a valid meta, and if that is
191 * the last meta, recovery has less to scan
192 */
193static void r5l_compress_stripe_end_list(struct r5l_log *log)
194{
195 struct r5l_io_unit *first, *last, *io;
196
197 first = list_first_entry(&log->stripe_end_ios,
198 struct r5l_io_unit, log_sibling);
199 last = list_last_entry(&log->stripe_end_ios,
200 struct r5l_io_unit, log_sibling);
201 if (first == last)
202 return;
203 list_del(&first->log_sibling);
204 list_del(&last->log_sibling);
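 /*
 * Merge every io_unit between 'first' and 'last' into 'first': extend
 * first->log_end over the freed range and free the merged io_units,
 * then re-add 'first' and 'last' as the only list entries
 */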
205 while (!list_empty(&log->stripe_end_ios)) {
206 io = list_first_entry(&log->stripe_end_ios,
207 struct r5l_io_unit, log_sibling);
208 list_del(&io->log_sibling);
209 first->log_end = io->log_end;
210 r5l_free_io_unit(log, io);
211 }
212 list_add_tail(&first->log_sibling, &log->stripe_end_ios);
213 list_add_tail(&last->log_sibling, &log->stripe_end_ios);
214}
215
216static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
217static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
218 enum r5l_io_unit_state state)
219{
220 struct r5l_log *log = io->log;
221
222 if (WARN_ON(io->state >= state))
223 return;
224 io->state = state;
225 if (state == IO_UNIT_IO_END)
226 r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
227 IO_UNIT_IO_END);
228 if (state == IO_UNIT_STRIPE_END) {
229 struct r5l_io_unit *last;
230 sector_t reclaimable_space;
231
232 r5l_move_io_unit_list(&log->io_end_ios, &log->stripe_end_ios,
233 IO_UNIT_STRIPE_END);
234
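 /*
 * Everything from the log tail (last_checkpoint) up to the end of the
 * newest IO_UNIT_STRIPE_END io_unit is now reclaimable; wake the
 * reclaim thread once that region reaches max_free_space
 */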
235 last = list_last_entry(&log->stripe_end_ios,
236 struct r5l_io_unit, log_sibling);
237 reclaimable_space = r5l_ring_distance(log, log->last_checkpoint,
238 last->log_end);
239 if (reclaimable_space >= log->max_free_space)
240 r5l_wake_reclaim(log, 0);
241
242 r5l_compress_stripe_end_list(log);
243 }
244 wake_up(&io->wait_state);
245}
246
247static void r5l_set_io_unit_state(struct r5l_io_unit *io,
248 enum r5l_io_unit_state state)
249{
250 struct r5l_log *log = io->log;
251 unsigned long flags;
252
253 spin_lock_irqsave(&log->io_list_lock, flags);
254 __r5l_set_io_unit_state(io, state);
255 spin_unlock_irqrestore(&log->io_list_lock, flags);
256}
257
258/* XXX: totally ignores I/O errors */
259static void r5l_log_endio(struct bio *bio)
260{
261 struct r5l_io_unit *io = bio->bi_private;
262 struct r5l_log *log = io->log;
263
264 bio_put(bio);
265
266 if (!atomic_dec_and_test(&io->pending_io))
267 return;
268
269 r5l_set_io_unit_state(io, IO_UNIT_IO_END);
270 md_wakeup_thread(log->rdev->mddev->thread);
271}
272
273static void r5l_submit_current_io(struct r5l_log *log)
274{
275 struct r5l_io_unit *io = log->current_io;
276 struct r5l_meta_block *block;
277 struct bio *bio;
278 u32 crc;
279
280 if (!io)
281 return;
282
283 block = page_address(io->meta_page);
284 block->meta_size = cpu_to_le32(io->meta_offset);
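 /*
 * The meta page was zero-filled at allocation and block->checksum has not
 * been set yet, so the crc below is computed over a zero checksum field,
 * matching how r5l_load_log() verifies it
 */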
285 crc = crc32_le(log->uuid_checksum, (void *)block, PAGE_SIZE);
286 block->checksum = cpu_to_le32(crc);
287
288 log->current_io = NULL;
289 r5l_set_io_unit_state(io, IO_UNIT_IO_START);
290
291 while ((bio = bio_list_pop(&io->bios))) {
292 /* all IO must start from rdev->data_offset */
293 bio->bi_iter.bi_sector += log->rdev->data_offset;
294 submit_bio(WRITE, bio);
295 }
296}
297
298static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
299{
300 struct r5l_io_unit *io;
301 struct r5l_meta_block *block;
302 struct bio *bio;
303
304 io = r5l_alloc_io_unit(log);
305
306 block = page_address(io->meta_page);
307 block->magic = cpu_to_le32(R5LOG_MAGIC);
308 block->version = R5LOG_VERSION;
309 block->seq = cpu_to_le64(log->seq);
310 block->position = cpu_to_le64(log->log_start);
311
312 io->log_start = log->log_start;
313 io->meta_offset = sizeof(struct r5l_meta_block);
314 io->seq = log->seq;
315
316 bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
317 io->current_bio = bio;
318 bio->bi_rw = WRITE;
319 bio->bi_bdev = log->rdev->bdev;
320 bio->bi_iter.bi_sector = log->log_start;
321 bio_add_page(bio, io->meta_page, PAGE_SIZE, 0);
322 bio->bi_end_io = r5l_log_endio;
323 bio->bi_private = io;
324
325 bio_list_add(&io->bios, bio);
326 atomic_inc(&io->pending_io);
327
328 log->seq++;
329 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
330 io->log_end = log->log_start;
331 /* current bio hit disk end */
332 if (log->log_start == 0)
333 io->current_bio = NULL;
334
335 spin_lock_irq(&log->io_list_lock);
336 list_add_tail(&io->log_sibling, &log->running_ios);
337 spin_unlock_irq(&log->io_list_lock);
338
339 return io;
340}
341
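/*
 * Make sure log->current_io has room in its meta page for a payload
 * descriptor of payload_size bytes; if not, submit the current io_unit
 * and start a new one
 */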
342static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
343{
344 struct r5l_io_unit *io;
345
346 io = log->current_io;
347 if (io && io->meta_offset + payload_size > PAGE_SIZE)
348 r5l_submit_current_io(log);
349 io = log->current_io;
350 if (io)
351 return 0;
352
353 log->current_io = r5l_new_meta(log);
354 return 0;
355}
356
357static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
358 sector_t location,
359 u32 checksum1, u32 checksum2,
360 bool checksum2_valid)
361{
362 struct r5l_io_unit *io = log->current_io;
363 struct r5l_payload_data_parity *payload;
364
365 payload = page_address(io->meta_page) + io->meta_offset;
366 payload->header.type = cpu_to_le16(type);
367 payload->header.flags = cpu_to_le16(0);
368 payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
369 (PAGE_SHIFT - 9));
370 payload->location = cpu_to_le64(location);
371 payload->checksum[0] = cpu_to_le32(checksum1);
372 if (checksum2_valid)
373 payload->checksum[1] = cpu_to_le32(checksum2);
374
375 io->meta_offset += sizeof(struct r5l_payload_data_parity) +
376 sizeof(__le32) * (1 + !!checksum2_valid);
377}
378
379static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
380{
381 struct r5l_io_unit *io = log->current_io;
382
383alloc_bio:
384 if (!io->current_bio) {
385 struct bio *bio;
386
387 bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
388 bio->bi_rw = WRITE;
389 bio->bi_bdev = log->rdev->bdev;
390 bio->bi_iter.bi_sector = log->log_start;
391 bio->bi_end_io = r5l_log_endio;
392 bio->bi_private = io;
393 bio_list_add(&io->bios, bio);
394 atomic_inc(&io->pending_io);
395 io->current_bio = bio;
396 }
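 /*
 * bio_add_page() fails once the current bio cannot take another page;
 * drop it and jump back to allocate a fresh bio for this page
 */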
397 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) {
398 io->current_bio = NULL;
399 goto alloc_bio;
400 }
401 log->log_start = r5l_ring_add(log, log->log_start,
402 BLOCK_SECTORS);
403 /* current bio hit disk end */
404 if (log->log_start == 0)
405 io->current_bio = NULL;
406
407 io->log_end = log->log_start;
408}
409
410static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
411 int data_pages, int parity_pages)
412{
413 int i;
414 int meta_size;
415 struct r5l_io_unit *io;
416
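 /*
 * Each data page gets its own payload descriptor with one checksum; the
 * parity pages share a single descriptor carrying one checksum per
 * parity page
 */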
417 meta_size =
418 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
419 * data_pages) +
420 sizeof(struct r5l_payload_data_parity) +
421 sizeof(__le32) * parity_pages;
422
423 r5l_get_meta(log, meta_size);
424 io = log->current_io;
425
426 for (i = 0; i < sh->disks; i++) {
427 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
428 continue;
429 if (i == sh->pd_idx || i == sh->qd_idx)
430 continue;
431 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
432 raid5_compute_blocknr(sh, i, 0),
433 sh->dev[i].log_checksum, 0, false);
434 r5l_append_payload_page(log, sh->dev[i].page);
435 }
436
437 if (sh->qd_idx >= 0) {
438 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
439 sh->sector, sh->dev[sh->pd_idx].log_checksum,
440 sh->dev[sh->qd_idx].log_checksum, true);
441 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
442 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
443 } else {
444 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
445 sh->sector, sh->dev[sh->pd_idx].log_checksum,
446 0, false);
447 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
448 }
449
450 list_add_tail(&sh->log_list, &io->stripe_list);
451 atomic_inc(&io->pending_stripe);
452 sh->log_io = io;
453}
454
455/*
456 * This runs in raid5d. Reclaim could in turn wait for raid5d (when it flushes
457 * data from the log to the raid disks), so we must not wait for reclaim here
458 */
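/*
 * Returning -EAGAIN tells the caller to skip the log and handle the stripe
 * through the normal raid5 write path; 0 means the stripe was queued to the
 * log (or parked on no_space_stripes until reclaim frees space)
 */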
459int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
460{
461 int write_disks = 0;
462 int data_pages, parity_pages;
463 int meta_size;
464 int reserve;
465 int i;
466
467 if (!log)
468 return -EAGAIN;
469 /* Don't support stripe batch */
470 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
471 test_bit(STRIPE_SYNCING, &sh->state)) {
472 /* the stripe has been written to the log, start writing it to the raid disks */
473 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
474 return -EAGAIN;
475 }
476
477 for (i = 0; i < sh->disks; i++) {
478 void *addr;
479
480 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
481 continue;
482 write_disks++;
484 /* checksum was already calculated in the last run */
484 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
485 continue;
486 addr = kmap_atomic(sh->dev[i].page);
487 sh->dev[i].log_checksum = crc32_le(log->uuid_checksum,
488 addr, PAGE_SIZE);
489 kunmap_atomic(addr);
490 }
491 parity_pages = 1 + !!(sh->qd_idx >= 0);
492 data_pages = write_disks - parity_pages;
493
494 meta_size =
495 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
496 * data_pages) +
497 sizeof(struct r5l_payload_data_parity) +
498 sizeof(__le32) * parity_pages;
499 /* Doesn't work with very big raid arrays */
500 if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
501 return -EINVAL;
502
503 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
504 atomic_inc(&sh->count);
505
506 mutex_lock(&log->io_mutex);
507 /* meta + data */
508 reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
509 if (r5l_has_free_space(log, reserve))
510 r5l_log_stripe(log, sh, data_pages, parity_pages);
511 else {
512 spin_lock(&log->no_space_stripes_lock);
513 list_add_tail(&sh->log_list, &log->no_space_stripes);
514 spin_unlock(&log->no_space_stripes_lock);
515
516 r5l_wake_reclaim(log, reserve);
517 }
518 mutex_unlock(&log->io_mutex);
519
520 return 0;
521}
522
523void r5l_write_stripe_run(struct r5l_log *log)
524{
525 if (!log)
526 return;
527 mutex_lock(&log->io_mutex);
528 r5l_submit_current_io(log);
529 mutex_unlock(&log->io_mutex);
530}
531
532/* This will run after log space is reclaimed */
533static void r5l_run_no_space_stripes(struct r5l_log *log)
534{
535 struct stripe_head *sh;
536
537 spin_lock(&log->no_space_stripes_lock);
538 while (!list_empty(&log->no_space_stripes)) {
539 sh = list_first_entry(&log->no_space_stripes,
540 struct stripe_head, log_list);
541 list_del_init(&sh->log_list);
542 set_bit(STRIPE_HANDLE, &sh->state);
543 raid5_release_stripe(sh);
544 }
545 spin_unlock(&log->no_space_stripes_lock);
546}
547
548void r5l_stripe_write_finished(struct stripe_head *sh)
549{
550 struct r5l_io_unit *io;
551
552 /* Don't support stripe batch */
553 io = sh->log_io;
554 if (!io)
555 return;
556 sh->log_io = NULL;
557
558 if (atomic_dec_and_test(&io->pending_stripe))
559 r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
560}
561
562/*
563 * Start dispatching IO to the raid disks.
564 * The log consists of io_units, each headed by a meta block. A broken meta in
565 * the middle of the log would stop recovery from finding the meta blocks after
566 * it, so if an operation requires a meta to be persistent in the log, every
567 * meta before it must be persistent in the log too. A case is:
568 *
569 * stripe data/parity is in the log and we start writing the stripe to the raid
570 * disks. The data/parity must be persistent in the log before that write.
571 *
572 * The solution is to strictly maintain io_unit list order: the stripes of an
573 * io_unit are written to the raid disks only when every earlier io_unit
574 * already has its data/parity in the log.
575 */
576void r5l_flush_stripe_to_raid(struct r5l_log *log)
577{
578 struct r5l_io_unit *io;
579 struct stripe_head *sh;
580 bool run_stripe;
581
582 if (!log)
583 return;
584 spin_lock_irq(&log->io_list_lock);
585 run_stripe = !list_empty(&log->io_end_ios);
586 spin_unlock_irq(&log->io_list_lock);
587
588 if (!run_stripe)
589 return;
590
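 /*
 * Flush the log device so every io_unit in io_end_ios is on stable
 * storage before its stripes are handed back to raid5d for writing to
 * the raid disks
 */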
591 blkdev_issue_flush(log->rdev->bdev, GFP_NOIO, NULL);
592
593 spin_lock_irq(&log->io_list_lock);
594 list_for_each_entry(io, &log->io_end_ios, log_sibling) {
595 if (io->state >= IO_UNIT_STRIPE_START)
596 continue;
597 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_START);
598
599 while (!list_empty(&io->stripe_list)) {
600 sh = list_first_entry(&io->stripe_list,
601 struct stripe_head, log_list);
602 list_del_init(&sh->log_list);
603 set_bit(STRIPE_HANDLE, &sh->state);
604 raid5_release_stripe(sh);
605 }
606 }
607 spin_unlock_irq(&log->io_list_lock);
608}
609
610static void r5l_kick_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
611{
612 /* the log thread will write the io unit */
613 wait_event(io->wait_state, io->state >= IO_UNIT_IO_END);
614 if (io->state < IO_UNIT_STRIPE_START)
615 r5l_flush_stripe_to_raid(log);
616 wait_event(io->wait_state, io->state >= IO_UNIT_STRIPE_END);
617}
618
619static void r5l_write_super(struct r5l_log *log, sector_t cp);
620static void r5l_do_reclaim(struct r5l_log *log)
621{
622 struct r5l_io_unit *io, *last;
623 LIST_HEAD(list);
624 sector_t free = 0;
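 /*
 * Atomically take the requested amount and reset it to 0, so concurrent
 * r5l_wake_reclaim() callers can post a new target
 */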
625 sector_t reclaim_target = xchg(&log->reclaim_target, 0);
626
627 spin_lock_irq(&log->io_list_lock);
628 /*
629 * Move the reclaimable io_units to the reclaim list without changing
630 * their order. Reclaimable and unreclaimable io_units can be mixed in
631 * the list, and we must not reuse the space of an unreclaimable io_unit
632 */
633 while (1) {
634 while (!list_empty(&log->stripe_end_ios)) {
635 io = list_first_entry(&log->stripe_end_ios,
636 struct r5l_io_unit, log_sibling);
637 list_move_tail(&io->log_sibling, &list);
638 free += r5l_ring_distance(log, io->log_start,
639 io->log_end);
640 }
641
642 if (free >= reclaim_target ||
643 (list_empty(&log->running_ios) &&
644 list_empty(&log->io_end_ios) &&
645 list_empty(&log->stripe_end_ios)))
646 break;
647
648 /* The waiting below mostly happens when we shut down the raid */
649 if (!list_empty(&log->io_end_ios)) {
650 io = list_first_entry(&log->io_end_ios,
651 struct r5l_io_unit, log_sibling);
652 spin_unlock_irq(&log->io_list_lock);
653 /* nobody else can delete the io, we are safe */
654 r5l_kick_io_unit(log, io);
655 spin_lock_irq(&log->io_list_lock);
656 continue;
657 }
658
659 if (!list_empty(&log->running_ios)) {
660 io = list_first_entry(&log->running_ios,
661 struct r5l_io_unit, log_sibling);
662 spin_unlock_irq(&log->io_list_lock);
663 /* nobody else can delete the io, we are safe */
664 r5l_kick_io_unit(log, io);
665 spin_lock_irq(&log->io_list_lock);
666 continue;
667 }
668 }
669 spin_unlock_irq(&log->io_list_lock);
670
671 if (list_empty(&list))
672 return;
673
674 /* the super always points to the last valid meta */
675 last = list_last_entry(&list, struct r5l_io_unit, log_sibling);
676 /*
677 * write_super will flush the cache of each raid disk. We must write the
678 * super here, because the log area might be reused soon and we don't
679 * want to confuse recovery
680 */
681 r5l_write_super(log, last->log_start);
682
683 mutex_lock(&log->io_mutex);
684 log->last_checkpoint = last->log_start;
685 log->last_cp_seq = last->seq;
686 mutex_unlock(&log->io_mutex);
687 r5l_run_no_space_stripes(log);
688
689 while (!list_empty(&list)) {
690 io = list_first_entry(&list, struct r5l_io_unit, log_sibling);
691 list_del(&io->log_sibling);
692 r5l_free_io_unit(log, io);
693 }
694}
695
696static void r5l_reclaim_thread(struct md_thread *thread)
697{
698 struct mddev *mddev = thread->mddev;
699 struct r5conf *conf = mddev->private;
700 struct r5l_log *log = conf->log;
701
702 if (!log)
703 return;
704 r5l_do_reclaim(log);
705}
706
707static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
708{
709 unsigned long target;
710 unsigned long new = (unsigned long)space; /* overflow in theory */
711
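 /* never shrink the pending target; a smaller request is already covered */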
712 do {
713 target = log->reclaim_target;
714 if (new < target)
715 return;
716 } while (cmpxchg(&log->reclaim_target, target, new) != target);
717 md_wakeup_thread(log->reclaim_thread);
718}
719
720static int r5l_recovery_log(struct r5l_log *log)
721{
722 /* fake recovery */
723 log->seq = log->last_cp_seq + 1;
724 log->log_start = r5l_ring_add(log, log->last_checkpoint, BLOCK_SECTORS);
725 return 0;
726}
727
728static void r5l_write_super(struct r5l_log *log, sector_t cp)
729{
730 struct mddev *mddev = log->rdev->mddev;
731
732 log->rdev->journal_tail = cp;
733 set_bit(MD_CHANGE_DEVS, &mddev->flags);
734}
735
736static int r5l_load_log(struct r5l_log *log)
737{
738 struct md_rdev *rdev = log->rdev;
739 struct page *page;
740 struct r5l_meta_block *mb;
741 sector_t cp = log->rdev->journal_tail;
742 u32 stored_crc, expected_crc;
743 bool create_super = false;
744 int ret;
745
746 /* Make sure it's valid */
747 if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
748 cp = 0;
749 page = alloc_page(GFP_KERNEL);
750 if (!page)
751 return -ENOMEM;
752
753 if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
754 ret = -EIO;
755 goto ioerr;
756 }
757 mb = page_address(page);
758
759 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
760 mb->version != R5LOG_VERSION) {
761 create_super = true;
762 goto create;
763 }
764 stored_crc = le32_to_cpu(mb->checksum);
765 mb->checksum = 0;
766 expected_crc = crc32_le(log->uuid_checksum, (void *)mb, PAGE_SIZE);
767 if (stored_crc != expected_crc) {
768 create_super = true;
769 goto create;
770 }
771 if (le64_to_cpu(mb->position) != cp) {
772 create_super = true;
773 goto create;
774 }
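 /*
 * A missing, corrupt or mismatched meta block at journal_tail means the
 * log cannot be trusted: start a fresh log at sector 0 with a random
 * sequence number and record that in the super
 */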
775create:
776 if (create_super) {
777 log->last_cp_seq = prandom_u32();
778 cp = 0;
779 /*
780 * Make sure the super points to the correct address. The log might
781 * get data very soon. If the super doesn't hold the correct log
782 * tail address, recovery can't find the log
783 */
784 r5l_write_super(log, cp);
785 } else
786 log->last_cp_seq = le64_to_cpu(mb->seq);
787
788 log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
789 log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
790 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
791 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
792 log->last_checkpoint = cp;
793
794 __free_page(page);
795
796 return r5l_recovery_log(log);
797ioerr:
798 __free_page(page);
799 return ret;
800}
801
802int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
803{
804 struct r5l_log *log;
805
806 if (PAGE_SIZE != 4096)
807 return -EINVAL;
808 log = kzalloc(sizeof(*log), GFP_KERNEL);
809 if (!log)
810 return -ENOMEM;
811 log->rdev = rdev;
812
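 /*
 * Seed all log checksums with a crc of the array uuid, so blocks left
 * over from a different array fail the checksum check
 */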
813 log->uuid_checksum = crc32_le(~0, (void *)rdev->mddev->uuid,
814 sizeof(rdev->mddev->uuid));
815
816 mutex_init(&log->io_mutex);
817
818 spin_lock_init(&log->io_list_lock);
819 INIT_LIST_HEAD(&log->running_ios);
820 INIT_LIST_HEAD(&log->io_end_ios);
821 INIT_LIST_HEAD(&log->stripe_end_ios);
822
823 log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
824 if (!log->io_kc)
825 goto io_kc;
826
827 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
828 log->rdev->mddev, "reclaim");
829 if (!log->reclaim_thread)
830 goto reclaim_thread;
831
832 INIT_LIST_HEAD(&log->no_space_stripes);
833 spin_lock_init(&log->no_space_stripes_lock);
834
835 if (r5l_load_log(log))
836 goto error;
837
838 conf->log = log;
839 return 0;
840error:
841 md_unregister_thread(&log->reclaim_thread);
842reclaim_thread:
843 kmem_cache_destroy(log->io_kc);
844io_kc:
845 kfree(log);
846 return -EINVAL;
847}
848
849void r5l_exit_log(struct r5l_log *log)
850{
851 /*
852 * at this point all stripes are finished, so every io_unit is at
853 * least in IO_UNIT_STRIPE_END state
854 */
855 r5l_wake_reclaim(log, -1L);
856 md_unregister_thread(&log->reclaim_thread);
857 r5l_do_reclaim(log);
858 /*
859 * force a super update: r5l_do_reclaim might have updated the super,
860 * and mddev->thread is already stopped
861 */
862 md_update_sb(log->rdev->mddev, 1);
863
864 kmem_cache_destroy(log->io_kc);
865 kfree(log);
866}