/*
   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */
26 #include <linux/module.h>
27 #include <linux/drbd.h>
28 #include <linux/sched.h>
29 #include <linux/wait.h>
31 #include <linux/memcontrol.h>
32 #include <linux/mm_inline.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/string.h>
36 #include <linux/scatterlist.h>
41 static int w_make_ov_request(struct drbd_work
*w
, int cancel
);
/* endio handlers:
 *   drbd_md_io_complete (defined here)
 *   drbd_request_endio (defined here)
 *   drbd_peer_request_endio (defined here)
 *   bm_async_io_complete (defined in drbd_bitmap.c)
 *
 * For all these callbacks, note the following:
 * The callbacks will be called in irq context by the IDE drivers,
 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
 * Try to get the locking right :)
 *
 */
58 /* About the global_state_lock
59 Each state transition on an device holds a read lock. In case we have
60 to evaluate the sync after dependencies, we grab a write lock, because
61 we need stable states on all devices for that. */
62 rwlock_t global_state_lock
;
64 /* used for synchronous meta data and bitmap IO
65 * submitted by drbd_md_sync_page_io()
67 void drbd_md_io_complete(struct bio
*bio
, int error
)
69 struct drbd_md_io
*md_io
;
71 md_io
= (struct drbd_md_io
*)bio
->bi_private
;
74 complete(&md_io
->event
);
77 /* reads on behalf of the partner,
78 * "submitted" by the receiver
80 void drbd_endio_read_sec_final(struct drbd_peer_request
*peer_req
) __releases(local
)
82 unsigned long flags
= 0;
83 struct drbd_conf
*mdev
= peer_req
->w
.mdev
;
85 spin_lock_irqsave(&mdev
->tconn
->req_lock
, flags
);
86 mdev
->read_cnt
+= peer_req
->i
.size
>> 9;
87 list_del(&peer_req
->w
.list
);
88 if (list_empty(&mdev
->read_ee
))
89 wake_up(&mdev
->ee_wait
);
90 if (test_bit(__EE_WAS_ERROR
, &peer_req
->flags
))
91 __drbd_chk_io_error(mdev
, false);
92 spin_unlock_irqrestore(&mdev
->tconn
->req_lock
, flags
);
94 drbd_queue_work(&mdev
->tconn
->data
.work
, &peer_req
->w
);
98 /* writes on behalf of the partner, or resync writes,
99 * "submitted" by the receiver, final stage. */
100 static void drbd_endio_write_sec_final(struct drbd_peer_request
*peer_req
) __releases(local
)
102 unsigned long flags
= 0;
103 struct drbd_conf
*mdev
= peer_req
->w
.mdev
;
104 struct drbd_interval i
;
107 int do_al_complete_io
;
109 /* after we moved peer_req to done_ee,
110 * we may no longer access it,
111 * it may be freed/reused already!
112 * (as soon as we release the req_lock) */
114 do_al_complete_io
= peer_req
->flags
& EE_CALL_AL_COMPLETE_IO
;
115 block_id
= peer_req
->block_id
;
117 spin_lock_irqsave(&mdev
->tconn
->req_lock
, flags
);
118 mdev
->writ_cnt
+= peer_req
->i
.size
>> 9;
119 list_del(&peer_req
->w
.list
); /* has been on active_ee or sync_ee */
120 list_add_tail(&peer_req
->w
.list
, &mdev
->done_ee
);
123 * Do not remove from the write_requests tree here: we did not send the
124 * Ack yet and did not wake possibly waiting conflicting requests.
125 * Removed from the tree from "drbd_process_done_ee" within the
126 * appropriate w.cb (e_end_block/e_end_resync_block) or from
127 * _drbd_clear_done_ee.
130 do_wake
= list_empty(block_id
== ID_SYNCER
? &mdev
->sync_ee
: &mdev
->active_ee
);
132 if (test_bit(__EE_WAS_ERROR
, &peer_req
->flags
))
133 __drbd_chk_io_error(mdev
, false);
134 spin_unlock_irqrestore(&mdev
->tconn
->req_lock
, flags
);
136 if (block_id
== ID_SYNCER
)
137 drbd_rs_complete_io(mdev
, i
.sector
);
140 wake_up(&mdev
->ee_wait
);
142 if (do_al_complete_io
)
143 drbd_al_complete_io(mdev
, &i
);
145 wake_asender(mdev
->tconn
);
149 /* writes on behalf of the partner, or resync writes,
150 * "submitted" by the receiver.
152 void drbd_peer_request_endio(struct bio
*bio
, int error
)
154 struct drbd_peer_request
*peer_req
= bio
->bi_private
;
155 struct drbd_conf
*mdev
= peer_req
->w
.mdev
;
156 int uptodate
= bio_flagged(bio
, BIO_UPTODATE
);
157 int is_write
= bio_data_dir(bio
) == WRITE
;
159 if (error
&& __ratelimit(&drbd_ratelimit_state
))
160 dev_warn(DEV
, "%s: error=%d s=%llus\n",
161 is_write
? "write" : "read", error
,
162 (unsigned long long)peer_req
->i
.sector
);
163 if (!error
&& !uptodate
) {
164 if (__ratelimit(&drbd_ratelimit_state
))
165 dev_warn(DEV
, "%s: setting error to -EIO s=%llus\n",
166 is_write
? "write" : "read",
167 (unsigned long long)peer_req
->i
.sector
);
168 /* strange behavior of some lower level drivers...
169 * fail the request by clearing the uptodate flag,
170 * but do not return any error?! */
175 set_bit(__EE_WAS_ERROR
, &peer_req
->flags
);
177 bio_put(bio
); /* no need for the bio anymore */
178 if (atomic_dec_and_test(&peer_req
->pending_bios
)) {
180 drbd_endio_write_sec_final(peer_req
);
182 drbd_endio_read_sec_final(peer_req
);
186 /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
188 void drbd_request_endio(struct bio
*bio
, int error
)
191 struct drbd_request
*req
= bio
->bi_private
;
192 struct drbd_conf
*mdev
= req
->w
.mdev
;
193 struct bio_and_error m
;
194 enum drbd_req_event what
;
195 int uptodate
= bio_flagged(bio
, BIO_UPTODATE
);
197 if (!error
&& !uptodate
) {
198 dev_warn(DEV
, "p %s: setting error to -EIO\n",
199 bio_data_dir(bio
) == WRITE
? "write" : "read");
200 /* strange behavior of some lower level drivers...
201 * fail the request by clearing the uptodate flag,
202 * but do not return any error?! */
206 /* to avoid recursion in __req_mod */
207 if (unlikely(error
)) {
208 what
= (bio_data_dir(bio
) == WRITE
)
209 ? WRITE_COMPLETED_WITH_ERROR
210 : (bio_rw(bio
) == READ
)
211 ? READ_COMPLETED_WITH_ERROR
212 : READ_AHEAD_COMPLETED_WITH_ERROR
;
216 bio_put(req
->private_bio
);
217 req
->private_bio
= ERR_PTR(error
);
219 /* not req_mod(), we need irqsave here! */
220 spin_lock_irqsave(&mdev
->tconn
->req_lock
, flags
);
221 __req_mod(req
, what
, &m
);
222 spin_unlock_irqrestore(&mdev
->tconn
->req_lock
, flags
);
225 complete_master_bio(mdev
, &m
);
228 int w_read_retry_remote(struct drbd_work
*w
, int cancel
)
230 struct drbd_request
*req
= container_of(w
, struct drbd_request
, w
);
231 struct drbd_conf
*mdev
= w
->mdev
;
233 /* We should not detach for read io-error,
234 * but try to WRITE the P_DATA_REPLY to the failed location,
235 * to give the disk the chance to relocate that block */
237 spin_lock_irq(&mdev
->tconn
->req_lock
);
238 if (cancel
|| mdev
->state
.pdsk
!= D_UP_TO_DATE
) {
239 _req_mod(req
, READ_RETRY_REMOTE_CANCELED
);
240 spin_unlock_irq(&mdev
->tconn
->req_lock
);
243 spin_unlock_irq(&mdev
->tconn
->req_lock
);
245 return w_send_read_req(w
, 0);
248 void drbd_csum_ee(struct drbd_conf
*mdev
, struct crypto_hash
*tfm
,
249 struct drbd_peer_request
*peer_req
, void *digest
)
251 struct hash_desc desc
;
252 struct scatterlist sg
;
253 struct page
*page
= peer_req
->pages
;
260 sg_init_table(&sg
, 1);
261 crypto_hash_init(&desc
);
263 while ((tmp
= page_chain_next(page
))) {
264 /* all but the last page will be fully used */
265 sg_set_page(&sg
, page
, PAGE_SIZE
, 0);
266 crypto_hash_update(&desc
, &sg
, sg
.length
);
269 /* and now the last, possibly only partially used page */
270 len
= peer_req
->i
.size
& (PAGE_SIZE
- 1);
271 sg_set_page(&sg
, page
, len
?: PAGE_SIZE
, 0);
272 crypto_hash_update(&desc
, &sg
, sg
.length
);
273 crypto_hash_final(&desc
, digest
);
276 void drbd_csum_bio(struct drbd_conf
*mdev
, struct crypto_hash
*tfm
, struct bio
*bio
, void *digest
)
278 struct hash_desc desc
;
279 struct scatterlist sg
;
280 struct bio_vec
*bvec
;
286 sg_init_table(&sg
, 1);
287 crypto_hash_init(&desc
);
289 __bio_for_each_segment(bvec
, bio
, i
, 0) {
290 sg_set_page(&sg
, bvec
->bv_page
, bvec
->bv_len
, bvec
->bv_offset
);
291 crypto_hash_update(&desc
, &sg
, sg
.length
);
293 crypto_hash_final(&desc
, digest
);
296 /* MAYBE merge common code with w_e_end_ov_req */
297 static int w_e_send_csum(struct drbd_work
*w
, int cancel
)
299 struct drbd_peer_request
*peer_req
= container_of(w
, struct drbd_peer_request
, w
);
300 struct drbd_conf
*mdev
= w
->mdev
;
305 if (unlikely(cancel
))
308 if (unlikely((peer_req
->flags
& EE_WAS_ERROR
) != 0))
311 digest_size
= crypto_hash_digestsize(mdev
->tconn
->csums_tfm
);
312 digest
= kmalloc(digest_size
, GFP_NOIO
);
314 sector_t sector
= peer_req
->i
.sector
;
315 unsigned int size
= peer_req
->i
.size
;
316 drbd_csum_ee(mdev
, mdev
->tconn
->csums_tfm
, peer_req
, digest
);
317 /* Free peer_req and pages before send.
318 * In case we block on congestion, we could otherwise run into
319 * some distributed deadlock, if the other side blocks on
320 * congestion as well, because our receiver blocks in
321 * drbd_pp_alloc due to pp_in_use > max_buffers. */
322 drbd_free_ee(mdev
, peer_req
);
324 inc_rs_pending(mdev
);
325 err
= drbd_send_drequest_csum(mdev
, sector
, size
,
330 dev_err(DEV
, "kmalloc() of digest failed.\n");
336 drbd_free_ee(mdev
, peer_req
);
339 dev_err(DEV
, "drbd_send_drequest(..., csum) failed\n");
343 #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
345 static int read_for_csum(struct drbd_conf
*mdev
, sector_t sector
, int size
)
347 struct drbd_peer_request
*peer_req
;
352 if (drbd_rs_should_slow_down(mdev
, sector
))
355 /* GFP_TRY, because if there is no memory available right now, this may
356 * be rescheduled for later. It is "only" background resync, after all. */
357 peer_req
= drbd_alloc_ee(mdev
, ID_SYNCER
/* unused */, sector
, size
, GFP_TRY
);
361 peer_req
->w
.cb
= w_e_send_csum
;
362 spin_lock_irq(&mdev
->tconn
->req_lock
);
363 list_add(&peer_req
->w
.list
, &mdev
->read_ee
);
364 spin_unlock_irq(&mdev
->tconn
->req_lock
);
366 atomic_add(size
>> 9, &mdev
->rs_sect_ev
);
367 if (drbd_submit_peer_request(mdev
, peer_req
, READ
, DRBD_FAULT_RS_RD
) == 0)
370 /* If it failed because of ENOMEM, retry should help. If it failed
371 * because bio_add_page failed (probably broken lower level driver),
372 * retry may or may not help.
373 * If it does not, you may need to force disconnect. */
374 spin_lock_irq(&mdev
->tconn
->req_lock
);
375 list_del(&peer_req
->w
.list
);
376 spin_unlock_irq(&mdev
->tconn
->req_lock
);
378 drbd_free_ee(mdev
, peer_req
);
384 int w_resync_timer(struct drbd_work
*w
, int cancel
)
386 struct drbd_conf
*mdev
= w
->mdev
;
387 switch (mdev
->state
.conn
) {
389 w_make_ov_request(w
, cancel
);
392 w_make_resync_request(w
, cancel
);
399 void resync_timer_fn(unsigned long data
)
401 struct drbd_conf
*mdev
= (struct drbd_conf
*) data
;
403 if (list_empty(&mdev
->resync_work
.list
))
404 drbd_queue_work(&mdev
->tconn
->data
.work
, &mdev
->resync_work
);
407 static void fifo_set(struct fifo_buffer
*fb
, int value
)
411 for (i
= 0; i
< fb
->size
; i
++)
412 fb
->values
[i
] = value
;
415 static int fifo_push(struct fifo_buffer
*fb
, int value
)
419 ov
= fb
->values
[fb
->head_index
];
420 fb
->values
[fb
->head_index
++] = value
;
422 if (fb
->head_index
>= fb
->size
)
428 static void fifo_add_val(struct fifo_buffer
*fb
, int value
)
432 for (i
= 0; i
< fb
->size
; i
++)
433 fb
->values
[i
] += value
;
436 static int drbd_rs_controller(struct drbd_conf
*mdev
)
438 unsigned int sect_in
; /* Number of sectors that came in since the last turn */
439 unsigned int want
; /* The number of sectors we want in the proxy */
440 int req_sect
; /* Number of sectors to request in this turn */
441 int correction
; /* Number of sectors more we need in the proxy*/
442 int cps
; /* correction per invocation of drbd_rs_controller() */
443 int steps
; /* Number of time steps to plan ahead */
447 sect_in
= atomic_xchg(&mdev
->rs_sect_in
, 0); /* Number of sectors that came in */
448 mdev
->rs_in_flight
-= sect_in
;
450 spin_lock(&mdev
->peer_seq_lock
); /* get an atomic view on mdev->rs_plan_s */
452 steps
= mdev
->rs_plan_s
.size
; /* (mdev->ldev->dc.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
454 if (mdev
->rs_in_flight
+ sect_in
== 0) { /* At start of resync */
455 want
= ((mdev
->ldev
->dc
.resync_rate
* 2 * SLEEP_TIME
) / HZ
) * steps
;
456 } else { /* normal path */
457 want
= mdev
->ldev
->dc
.c_fill_target
? mdev
->ldev
->dc
.c_fill_target
:
458 sect_in
* mdev
->ldev
->dc
.c_delay_target
* HZ
/ (SLEEP_TIME
* 10);
461 correction
= want
- mdev
->rs_in_flight
- mdev
->rs_planed
;
464 cps
= correction
/ steps
;
465 fifo_add_val(&mdev
->rs_plan_s
, cps
);
466 mdev
->rs_planed
+= cps
* steps
;
468 /* What we do in this step */
469 curr_corr
= fifo_push(&mdev
->rs_plan_s
, 0);
470 spin_unlock(&mdev
->peer_seq_lock
);
471 mdev
->rs_planed
-= curr_corr
;
473 req_sect
= sect_in
+ curr_corr
;
477 max_sect
= (mdev
->ldev
->dc
.c_max_rate
* 2 * SLEEP_TIME
) / HZ
;
478 if (req_sect
> max_sect
)
482 dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
483 sect_in, mdev->rs_in_flight, want, correction,
484 steps, cps, mdev->rs_planed, curr_corr, req_sect);
490 static int drbd_rs_number_requests(struct drbd_conf
*mdev
)
493 if (mdev
->rs_plan_s
.size
) { /* mdev->ldev->dc.c_plan_ahead */
494 number
= drbd_rs_controller(mdev
) >> (BM_BLOCK_SHIFT
- 9);
495 mdev
->c_sync_rate
= number
* HZ
* (BM_BLOCK_SIZE
/ 1024) / SLEEP_TIME
;
497 mdev
->c_sync_rate
= mdev
->ldev
->dc
.resync_rate
;
498 number
= SLEEP_TIME
* mdev
->c_sync_rate
/ ((BM_BLOCK_SIZE
/ 1024) * HZ
);
501 /* ignore the amount of pending requests, the resync controller should
502 * throttle down to incoming reply rate soon enough anyways. */
506 int w_make_resync_request(struct drbd_work
*w
, int cancel
)
508 struct drbd_conf
*mdev
= w
->mdev
;
511 const sector_t capacity
= drbd_get_capacity(mdev
->this_bdev
);
513 int number
, rollback_i
, size
;
514 int align
, queued
, sndbuf
;
517 if (unlikely(cancel
))
520 if (mdev
->rs_total
== 0) {
522 drbd_resync_finished(mdev
);
526 if (!get_ldev(mdev
)) {
527 /* Since we only need to access mdev->rsync a
528 get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
529 to continue resync with a broken disk makes no sense at
531 dev_err(DEV
, "Disk broke down during resync!\n");
535 max_bio_size
= queue_max_hw_sectors(mdev
->rq_queue
) << 9;
536 number
= drbd_rs_number_requests(mdev
);
540 for (i
= 0; i
< number
; i
++) {
541 /* Stop generating RS requests, when half of the send buffer is filled */
542 mutex_lock(&mdev
->tconn
->data
.mutex
);
543 if (mdev
->tconn
->data
.socket
) {
544 queued
= mdev
->tconn
->data
.socket
->sk
->sk_wmem_queued
;
545 sndbuf
= mdev
->tconn
->data
.socket
->sk
->sk_sndbuf
;
550 mutex_unlock(&mdev
->tconn
->data
.mutex
);
551 if (queued
> sndbuf
/ 2)
555 size
= BM_BLOCK_SIZE
;
556 bit
= drbd_bm_find_next(mdev
, mdev
->bm_resync_fo
);
558 if (bit
== DRBD_END_OF_BITMAP
) {
559 mdev
->bm_resync_fo
= drbd_bm_bits(mdev
);
564 sector
= BM_BIT_TO_SECT(bit
);
566 if (drbd_rs_should_slow_down(mdev
, sector
) ||
567 drbd_try_rs_begin_io(mdev
, sector
)) {
568 mdev
->bm_resync_fo
= bit
;
571 mdev
->bm_resync_fo
= bit
+ 1;
573 if (unlikely(drbd_bm_test_bit(mdev
, bit
) == 0)) {
574 drbd_rs_complete_io(mdev
, sector
);
578 #if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
579 /* try to find some adjacent bits.
580 * we stop if we have already the maximum req size.
582 * Additionally always align bigger requests, in order to
583 * be prepared for all stripe sizes of software RAIDs.
588 if (size
+ BM_BLOCK_SIZE
> max_bio_size
)
591 /* Be always aligned */
592 if (sector
& ((1<<(align
+3))-1))
595 /* do not cross extent boundaries */
596 if (((bit
+1) & BM_BLOCKS_PER_BM_EXT_MASK
) == 0)
598 /* now, is it actually dirty, after all?
599 * caution, drbd_bm_test_bit is tri-state for some
600 * obscure reason; ( b == 0 ) would get the out-of-band
601 * only accidentally right because of the "oddly sized"
602 * adjustment below */
603 if (drbd_bm_test_bit(mdev
, bit
+1) != 1)
606 size
+= BM_BLOCK_SIZE
;
607 if ((BM_BLOCK_SIZE
<< align
) <= size
)
611 /* if we merged some,
612 * reset the offset to start the next drbd_bm_find_next from */
613 if (size
> BM_BLOCK_SIZE
)
614 mdev
->bm_resync_fo
= bit
+ 1;
617 /* adjust very last sectors, in case we are oddly sized */
618 if (sector
+ (size
>>9) > capacity
)
619 size
= (capacity
-sector
)<<9;
620 if (mdev
->tconn
->agreed_pro_version
>= 89 && mdev
->tconn
->csums_tfm
) {
621 switch (read_for_csum(mdev
, sector
, size
)) {
622 case -EIO
: /* Disk failure */
625 case -EAGAIN
: /* allocation failed, or ldev busy */
626 drbd_rs_complete_io(mdev
, sector
);
627 mdev
->bm_resync_fo
= BM_SECT_TO_BIT(sector
);
639 inc_rs_pending(mdev
);
640 err
= drbd_send_drequest(mdev
, P_RS_DATA_REQUEST
,
641 sector
, size
, ID_SYNCER
);
643 dev_err(DEV
, "drbd_send_drequest() failed, aborting...\n");
644 dec_rs_pending(mdev
);
651 if (mdev
->bm_resync_fo
>= drbd_bm_bits(mdev
)) {
652 /* last syncer _request_ was sent,
653 * but the P_RS_DATA_REPLY not yet received. sync will end (and
654 * next sync group will resume), as soon as we receive the last
655 * resync data block, and the last bit is cleared.
656 * until then resync "work" is "inactive" ...
663 mdev
->rs_in_flight
+= (i
<< (BM_BLOCK_SHIFT
- 9));
664 mod_timer(&mdev
->resync_timer
, jiffies
+ SLEEP_TIME
);
669 static int w_make_ov_request(struct drbd_work
*w
, int cancel
)
671 struct drbd_conf
*mdev
= w
->mdev
;
674 const sector_t capacity
= drbd_get_capacity(mdev
->this_bdev
);
676 if (unlikely(cancel
))
679 number
= drbd_rs_number_requests(mdev
);
681 sector
= mdev
->ov_position
;
682 for (i
= 0; i
< number
; i
++) {
683 if (sector
>= capacity
) {
687 size
= BM_BLOCK_SIZE
;
689 if (drbd_rs_should_slow_down(mdev
, sector
) ||
690 drbd_try_rs_begin_io(mdev
, sector
)) {
691 mdev
->ov_position
= sector
;
695 if (sector
+ (size
>>9) > capacity
)
696 size
= (capacity
-sector
)<<9;
698 inc_rs_pending(mdev
);
699 if (drbd_send_ov_request(mdev
, sector
, size
)) {
700 dec_rs_pending(mdev
);
703 sector
+= BM_SECT_PER_BIT
;
705 mdev
->ov_position
= sector
;
708 mdev
->rs_in_flight
+= (i
<< (BM_BLOCK_SHIFT
- 9));
709 mod_timer(&mdev
->resync_timer
, jiffies
+ SLEEP_TIME
);
713 int w_ov_finished(struct drbd_work
*w
, int cancel
)
715 struct drbd_conf
*mdev
= w
->mdev
;
717 ov_out_of_sync_print(mdev
);
718 drbd_resync_finished(mdev
);
723 static int w_resync_finished(struct drbd_work
*w
, int cancel
)
725 struct drbd_conf
*mdev
= w
->mdev
;
728 drbd_resync_finished(mdev
);
733 static void ping_peer(struct drbd_conf
*mdev
)
735 struct drbd_tconn
*tconn
= mdev
->tconn
;
737 clear_bit(GOT_PING_ACK
, &tconn
->flags
);
739 wait_event(tconn
->ping_wait
,
740 test_bit(GOT_PING_ACK
, &tconn
->flags
) || mdev
->state
.conn
< C_CONNECTED
);
743 int drbd_resync_finished(struct drbd_conf
*mdev
)
745 unsigned long db
, dt
, dbdt
;
747 union drbd_state os
, ns
;
749 char *khelper_cmd
= NULL
;
752 /* Remove all elements from the resync LRU. Since future actions
753 * might set bits in the (main) bitmap, then the entries in the
754 * resync LRU would be wrong. */
755 if (drbd_rs_del_all(mdev
)) {
756 /* In case this is not possible now, most probably because
757 * there are P_RS_DATA_REPLY Packets lingering on the worker's
758 * queue (or even the read operations for those packets
759 * is not finished by now). Retry in 100ms. */
761 schedule_timeout_interruptible(HZ
/ 10);
762 w
= kmalloc(sizeof(struct drbd_work
), GFP_ATOMIC
);
764 w
->cb
= w_resync_finished
;
765 drbd_queue_work(&mdev
->tconn
->data
.work
, w
);
768 dev_err(DEV
, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
771 dt
= (jiffies
- mdev
->rs_start
- mdev
->rs_paused
) / HZ
;
775 dbdt
= Bit2KB(db
/dt
);
776 mdev
->rs_paused
/= HZ
;
783 spin_lock_irq(&mdev
->tconn
->req_lock
);
784 os
= drbd_read_state(mdev
);
786 verify_done
= (os
.conn
== C_VERIFY_S
|| os
.conn
== C_VERIFY_T
);
788 /* This protects us against multiple calls (that can happen in the presence
789 of application IO), and against connectivity loss just before we arrive here. */
790 if (os
.conn
<= C_CONNECTED
)
794 ns
.conn
= C_CONNECTED
;
796 dev_info(DEV
, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
797 verify_done
? "Online verify " : "Resync",
798 dt
+ mdev
->rs_paused
, mdev
->rs_paused
, dbdt
);
800 n_oos
= drbd_bm_total_weight(mdev
);
802 if (os
.conn
== C_VERIFY_S
|| os
.conn
== C_VERIFY_T
) {
804 dev_alert(DEV
, "Online verify found %lu %dk block out of sync!\n",
806 khelper_cmd
= "out-of-sync";
809 D_ASSERT((n_oos
- mdev
->rs_failed
) == 0);
811 if (os
.conn
== C_SYNC_TARGET
|| os
.conn
== C_PAUSED_SYNC_T
)
812 khelper_cmd
= "after-resync-target";
814 if (mdev
->tconn
->csums_tfm
&& mdev
->rs_total
) {
815 const unsigned long s
= mdev
->rs_same_csum
;
816 const unsigned long t
= mdev
->rs_total
;
819 (t
< 100000) ? ((s
*100)/t
) : (s
/(t
/100));
820 dev_info(DEV
, "%u %% had equal checksums, eliminated: %luK; "
821 "transferred %luK total %luK\n",
823 Bit2KB(mdev
->rs_same_csum
),
824 Bit2KB(mdev
->rs_total
- mdev
->rs_same_csum
),
825 Bit2KB(mdev
->rs_total
));
829 if (mdev
->rs_failed
) {
830 dev_info(DEV
, " %lu failed blocks\n", mdev
->rs_failed
);
832 if (os
.conn
== C_SYNC_TARGET
|| os
.conn
== C_PAUSED_SYNC_T
) {
833 ns
.disk
= D_INCONSISTENT
;
834 ns
.pdsk
= D_UP_TO_DATE
;
836 ns
.disk
= D_UP_TO_DATE
;
837 ns
.pdsk
= D_INCONSISTENT
;
840 ns
.disk
= D_UP_TO_DATE
;
841 ns
.pdsk
= D_UP_TO_DATE
;
843 if (os
.conn
== C_SYNC_TARGET
|| os
.conn
== C_PAUSED_SYNC_T
) {
846 for (i
= UI_BITMAP
; i
<= UI_HISTORY_END
; i
++)
847 _drbd_uuid_set(mdev
, i
, mdev
->p_uuid
[i
]);
848 drbd_uuid_set(mdev
, UI_BITMAP
, mdev
->ldev
->md
.uuid
[UI_CURRENT
]);
849 _drbd_uuid_set(mdev
, UI_CURRENT
, mdev
->p_uuid
[UI_CURRENT
]);
851 dev_err(DEV
, "mdev->p_uuid is NULL! BUG\n");
855 if (!(os
.conn
== C_VERIFY_S
|| os
.conn
== C_VERIFY_T
)) {
856 /* for verify runs, we don't update uuids here,
857 * so there would be nothing to report. */
858 drbd_uuid_set_bm(mdev
, 0UL);
859 drbd_print_uuids(mdev
, "updated UUIDs");
861 /* Now the two UUID sets are equal, update what we
862 * know of the peer. */
864 for (i
= UI_CURRENT
; i
<= UI_HISTORY_END
; i
++)
865 mdev
->p_uuid
[i
] = mdev
->ldev
->md
.uuid
[i
];
870 _drbd_set_state(mdev
, ns
, CS_VERBOSE
, NULL
);
872 spin_unlock_irq(&mdev
->tconn
->req_lock
);
879 mdev
->ov_start_sector
= 0;
884 drbd_khelper(mdev
, khelper_cmd
);
890 static void move_to_net_ee_or_free(struct drbd_conf
*mdev
, struct drbd_peer_request
*peer_req
)
892 if (drbd_ee_has_active_page(peer_req
)) {
893 /* This might happen if sendpage() has not finished */
894 int i
= (peer_req
->i
.size
+ PAGE_SIZE
-1) >> PAGE_SHIFT
;
895 atomic_add(i
, &mdev
->pp_in_use_by_net
);
896 atomic_sub(i
, &mdev
->pp_in_use
);
897 spin_lock_irq(&mdev
->tconn
->req_lock
);
898 list_add_tail(&peer_req
->w
.list
, &mdev
->net_ee
);
899 spin_unlock_irq(&mdev
->tconn
->req_lock
);
900 wake_up(&drbd_pp_wait
);
902 drbd_free_ee(mdev
, peer_req
);
906 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
907 * @mdev: DRBD device.
909 * @cancel: The connection will be closed anyways
911 int w_e_end_data_req(struct drbd_work
*w
, int cancel
)
913 struct drbd_peer_request
*peer_req
= container_of(w
, struct drbd_peer_request
, w
);
914 struct drbd_conf
*mdev
= w
->mdev
;
917 if (unlikely(cancel
)) {
918 drbd_free_ee(mdev
, peer_req
);
923 if (likely((peer_req
->flags
& EE_WAS_ERROR
) == 0)) {
924 err
= drbd_send_block(mdev
, P_DATA_REPLY
, peer_req
);
926 if (__ratelimit(&drbd_ratelimit_state
))
927 dev_err(DEV
, "Sending NegDReply. sector=%llus.\n",
928 (unsigned long long)peer_req
->i
.sector
);
930 err
= drbd_send_ack(mdev
, P_NEG_DREPLY
, peer_req
);
935 move_to_net_ee_or_free(mdev
, peer_req
);
938 dev_err(DEV
, "drbd_send_block() failed\n");
943 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
944 * @mdev: DRBD device.
946 * @cancel: The connection will be closed anyways
948 int w_e_end_rsdata_req(struct drbd_work
*w
, int cancel
)
950 struct drbd_peer_request
*peer_req
= container_of(w
, struct drbd_peer_request
, w
);
951 struct drbd_conf
*mdev
= w
->mdev
;
954 if (unlikely(cancel
)) {
955 drbd_free_ee(mdev
, peer_req
);
960 if (get_ldev_if_state(mdev
, D_FAILED
)) {
961 drbd_rs_complete_io(mdev
, peer_req
->i
.sector
);
965 if (mdev
->state
.conn
== C_AHEAD
) {
966 err
= drbd_send_ack(mdev
, P_RS_CANCEL
, peer_req
);
967 } else if (likely((peer_req
->flags
& EE_WAS_ERROR
) == 0)) {
968 if (likely(mdev
->state
.pdsk
>= D_INCONSISTENT
)) {
969 inc_rs_pending(mdev
);
970 err
= drbd_send_block(mdev
, P_RS_DATA_REPLY
, peer_req
);
972 if (__ratelimit(&drbd_ratelimit_state
))
973 dev_err(DEV
, "Not sending RSDataReply, "
974 "partner DISKLESS!\n");
978 if (__ratelimit(&drbd_ratelimit_state
))
979 dev_err(DEV
, "Sending NegRSDReply. sector %llus.\n",
980 (unsigned long long)peer_req
->i
.sector
);
982 err
= drbd_send_ack(mdev
, P_NEG_RS_DREPLY
, peer_req
);
984 /* update resync data with failure */
985 drbd_rs_failed_io(mdev
, peer_req
->i
.sector
, peer_req
->i
.size
);
990 move_to_net_ee_or_free(mdev
, peer_req
);
993 dev_err(DEV
, "drbd_send_block() failed\n");
997 int w_e_end_csum_rs_req(struct drbd_work
*w
, int cancel
)
999 struct drbd_peer_request
*peer_req
= container_of(w
, struct drbd_peer_request
, w
);
1000 struct drbd_conf
*mdev
= w
->mdev
;
1001 struct digest_info
*di
;
1003 void *digest
= NULL
;
1006 if (unlikely(cancel
)) {
1007 drbd_free_ee(mdev
, peer_req
);
1012 if (get_ldev(mdev
)) {
1013 drbd_rs_complete_io(mdev
, peer_req
->i
.sector
);
1017 di
= peer_req
->digest
;
1019 if (likely((peer_req
->flags
& EE_WAS_ERROR
) == 0)) {
1020 /* quick hack to try to avoid a race against reconfiguration.
1021 * a real fix would be much more involved,
1022 * introducing more locking mechanisms */
1023 if (mdev
->tconn
->csums_tfm
) {
1024 digest_size
= crypto_hash_digestsize(mdev
->tconn
->csums_tfm
);
1025 D_ASSERT(digest_size
== di
->digest_size
);
1026 digest
= kmalloc(digest_size
, GFP_NOIO
);
1029 drbd_csum_ee(mdev
, mdev
->tconn
->csums_tfm
, peer_req
, digest
);
1030 eq
= !memcmp(digest
, di
->digest
, digest_size
);
1035 drbd_set_in_sync(mdev
, peer_req
->i
.sector
, peer_req
->i
.size
);
1036 /* rs_same_csums unit is BM_BLOCK_SIZE */
1037 mdev
->rs_same_csum
+= peer_req
->i
.size
>> BM_BLOCK_SHIFT
;
1038 err
= drbd_send_ack(mdev
, P_RS_IS_IN_SYNC
, peer_req
);
1040 inc_rs_pending(mdev
);
1041 peer_req
->block_id
= ID_SYNCER
; /* By setting block_id, digest pointer becomes invalid! */
1042 peer_req
->flags
&= ~EE_HAS_DIGEST
; /* This peer request no longer has a digest pointer */
1044 err
= drbd_send_block(mdev
, P_RS_DATA_REPLY
, peer_req
);
1047 err
= drbd_send_ack(mdev
, P_NEG_RS_DREPLY
, peer_req
);
1048 if (__ratelimit(&drbd_ratelimit_state
))
1049 dev_err(DEV
, "Sending NegDReply. I guess it gets messy.\n");
1053 move_to_net_ee_or_free(mdev
, peer_req
);
1056 dev_err(DEV
, "drbd_send_block/ack() failed\n");
1060 int w_e_end_ov_req(struct drbd_work
*w
, int cancel
)
1062 struct drbd_peer_request
*peer_req
= container_of(w
, struct drbd_peer_request
, w
);
1063 struct drbd_conf
*mdev
= w
->mdev
;
1064 sector_t sector
= peer_req
->i
.sector
;
1065 unsigned int size
= peer_req
->i
.size
;
1070 if (unlikely(cancel
))
1073 digest_size
= crypto_hash_digestsize(mdev
->tconn
->verify_tfm
);
1074 digest
= kmalloc(digest_size
, GFP_NOIO
);
1076 err
= 1; /* terminate the connection in case the allocation failed */
1080 if (likely(!(peer_req
->flags
& EE_WAS_ERROR
)))
1081 drbd_csum_ee(mdev
, mdev
->tconn
->verify_tfm
, peer_req
, digest
);
1083 memset(digest
, 0, digest_size
);
1085 /* Free e and pages before send.
1086 * In case we block on congestion, we could otherwise run into
1087 * some distributed deadlock, if the other side blocks on
1088 * congestion as well, because our receiver blocks in
1089 * drbd_pp_alloc due to pp_in_use > max_buffers. */
1090 drbd_free_ee(mdev
, peer_req
);
1092 inc_rs_pending(mdev
);
1093 err
= drbd_send_drequest_csum(mdev
, sector
, size
, digest
, digest_size
, P_OV_REPLY
);
1095 dec_rs_pending(mdev
);
1100 drbd_free_ee(mdev
, peer_req
);
1105 void drbd_ov_out_of_sync_found(struct drbd_conf
*mdev
, sector_t sector
, int size
)
1107 if (mdev
->ov_last_oos_start
+ mdev
->ov_last_oos_size
== sector
) {
1108 mdev
->ov_last_oos_size
+= size
>>9;
1110 mdev
->ov_last_oos_start
= sector
;
1111 mdev
->ov_last_oos_size
= size
>>9;
1113 drbd_set_out_of_sync(mdev
, sector
, size
);
1116 int w_e_end_ov_reply(struct drbd_work
*w
, int cancel
)
1118 struct drbd_peer_request
*peer_req
= container_of(w
, struct drbd_peer_request
, w
);
1119 struct drbd_conf
*mdev
= w
->mdev
;
1120 struct digest_info
*di
;
1122 sector_t sector
= peer_req
->i
.sector
;
1123 unsigned int size
= peer_req
->i
.size
;
1127 if (unlikely(cancel
)) {
1128 drbd_free_ee(mdev
, peer_req
);
1133 /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
1134 * the resync lru has been cleaned up already */
1135 if (get_ldev(mdev
)) {
1136 drbd_rs_complete_io(mdev
, peer_req
->i
.sector
);
1140 di
= peer_req
->digest
;
1142 if (likely((peer_req
->flags
& EE_WAS_ERROR
) == 0)) {
1143 digest_size
= crypto_hash_digestsize(mdev
->tconn
->verify_tfm
);
1144 digest
= kmalloc(digest_size
, GFP_NOIO
);
1146 drbd_csum_ee(mdev
, mdev
->tconn
->verify_tfm
, peer_req
, digest
);
1148 D_ASSERT(digest_size
== di
->digest_size
);
1149 eq
= !memcmp(digest
, di
->digest
, digest_size
);
1154 /* Free peer_req and pages before send.
1155 * In case we block on congestion, we could otherwise run into
1156 * some distributed deadlock, if the other side blocks on
1157 * congestion as well, because our receiver blocks in
1158 * drbd_pp_alloc due to pp_in_use > max_buffers. */
1159 drbd_free_ee(mdev
, peer_req
);
1161 drbd_ov_out_of_sync_found(mdev
, sector
, size
);
1163 ov_out_of_sync_print(mdev
);
1165 err
= drbd_send_ack_ex(mdev
, P_OV_RESULT
, sector
, size
,
1166 eq
? ID_IN_SYNC
: ID_OUT_OF_SYNC
);
1172 /* let's advance progress step marks only for every other megabyte */
1173 if ((mdev
->ov_left
& 0x200) == 0x200)
1174 drbd_advance_rs_marks(mdev
, mdev
->ov_left
);
1176 if (mdev
->ov_left
== 0) {
1177 ov_out_of_sync_print(mdev
);
1178 drbd_resync_finished(mdev
);
1184 int w_prev_work_done(struct drbd_work
*w
, int cancel
)
1186 struct drbd_wq_barrier
*b
= container_of(w
, struct drbd_wq_barrier
, w
);
1192 int w_send_barrier(struct drbd_work
*w
, int cancel
)
1194 struct drbd_socket
*sock
;
1195 struct drbd_tl_epoch
*b
= container_of(w
, struct drbd_tl_epoch
, w
);
1196 struct drbd_conf
*mdev
= w
->mdev
;
1197 struct p_barrier
*p
;
1199 /* really avoid racing with tl_clear. w.cb may have been referenced
1200 * just before it was reassigned and re-queued, so double check that.
1201 * actually, this race was harmless, since we only try to send the
1202 * barrier packet here, and otherwise do nothing with the object.
1203 * but compare with the head of w_clear_epoch */
1204 spin_lock_irq(&mdev
->tconn
->req_lock
);
1205 if (w
->cb
!= w_send_barrier
|| mdev
->state
.conn
< C_CONNECTED
)
1207 spin_unlock_irq(&mdev
->tconn
->req_lock
);
1211 sock
= &mdev
->tconn
->data
;
1212 p
= drbd_prepare_command(mdev
, sock
);
1215 p
->barrier
= b
->br_number
;
1216 /* inc_ap_pending was done where this was queued.
1217 * dec_ap_pending will be done in got_BarrierAck
1218 * or (on connection loss) in w_clear_epoch. */
1219 return drbd_send_command(mdev
, sock
, P_BARRIER
, sizeof(*p
), NULL
, 0);
1222 int w_send_write_hint(struct drbd_work
*w
, int cancel
)
1224 struct drbd_conf
*mdev
= w
->mdev
;
1225 struct drbd_socket
*sock
;
1229 sock
= &mdev
->tconn
->data
;
1230 if (!drbd_prepare_command(mdev
, sock
))
1232 return drbd_send_command(mdev
, sock
, P_UNPLUG_REMOTE
, sizeof(struct p_header
), NULL
, 0);
1235 int w_send_out_of_sync(struct drbd_work
*w
, int cancel
)
1237 struct drbd_request
*req
= container_of(w
, struct drbd_request
, w
);
1238 struct drbd_conf
*mdev
= w
->mdev
;
1241 if (unlikely(cancel
)) {
1242 req_mod(req
, SEND_CANCELED
);
1246 err
= drbd_send_out_of_sync(mdev
, req
);
1247 req_mod(req
, OOS_HANDED_TO_NETWORK
);
1253 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
1254 * @mdev: DRBD device.
1256 * @cancel: The connection will be closed anyways
1258 int w_send_dblock(struct drbd_work
*w
, int cancel
)
1260 struct drbd_request
*req
= container_of(w
, struct drbd_request
, w
);
1261 struct drbd_conf
*mdev
= w
->mdev
;
1264 if (unlikely(cancel
)) {
1265 req_mod(req
, SEND_CANCELED
);
1269 err
= drbd_send_dblock(mdev
, req
);
1270 req_mod(req
, err
? SEND_FAILED
: HANDED_OVER_TO_NETWORK
);
1276 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
1277 * @mdev: DRBD device.
1279 * @cancel: The connection will be closed anyways
1281 int w_send_read_req(struct drbd_work
*w
, int cancel
)
1283 struct drbd_request
*req
= container_of(w
, struct drbd_request
, w
);
1284 struct drbd_conf
*mdev
= w
->mdev
;
1287 if (unlikely(cancel
)) {
1288 req_mod(req
, SEND_CANCELED
);
1292 err
= drbd_send_drequest(mdev
, P_DATA_REQUEST
, req
->i
.sector
, req
->i
.size
,
1293 (unsigned long)req
);
1295 req_mod(req
, err
? SEND_FAILED
: HANDED_OVER_TO_NETWORK
);
1300 int w_restart_disk_io(struct drbd_work
*w
, int cancel
)
1302 struct drbd_request
*req
= container_of(w
, struct drbd_request
, w
);
1303 struct drbd_conf
*mdev
= w
->mdev
;
1305 if (bio_data_dir(req
->master_bio
) == WRITE
&& req
->rq_state
& RQ_IN_ACT_LOG
)
1306 drbd_al_begin_io(mdev
, &req
->i
);
1307 /* Calling drbd_al_begin_io() out of the worker might deadlocks
1308 theoretically. Practically it can not deadlock, since this is
1309 only used when unfreezing IOs. All the extents of the requests
1310 that made it into the TL are already active */
1312 drbd_req_make_private_bio(req
, req
->master_bio
);
1313 req
->private_bio
->bi_bdev
= mdev
->ldev
->backing_bdev
;
1314 generic_make_request(req
->private_bio
);
1319 static int _drbd_may_sync_now(struct drbd_conf
*mdev
)
1321 struct drbd_conf
*odev
= mdev
;
1326 if (odev
->ldev
->dc
.resync_after
== -1)
1328 odev
= minor_to_mdev(odev
->ldev
->dc
.resync_after
);
1331 if ((odev
->state
.conn
>= C_SYNC_SOURCE
&&
1332 odev
->state
.conn
<= C_PAUSED_SYNC_T
) ||
1333 odev
->state
.aftr_isp
|| odev
->state
.peer_isp
||
1334 odev
->state
.user_isp
)
1340 * _drbd_pause_after() - Pause resync on all devices that may not resync now
1341 * @mdev: DRBD device.
1343 * Called from process context only (admin command and after_state_ch).
1345 static int _drbd_pause_after(struct drbd_conf
*mdev
)
1347 struct drbd_conf
*odev
;
1350 idr_for_each_entry(&minors
, odev
, i
) {
1351 if (odev
->state
.conn
== C_STANDALONE
&& odev
->state
.disk
== D_DISKLESS
)
1353 if (!_drbd_may_sync_now(odev
))
1354 rv
|= (__drbd_set_state(_NS(odev
, aftr_isp
, 1), CS_HARD
, NULL
)
1355 != SS_NOTHING_TO_DO
);
1362 * _drbd_resume_next() - Resume resync on all devices that may resync now
1363 * @mdev: DRBD device.
1365 * Called from process context only (admin command and worker).
1367 static int _drbd_resume_next(struct drbd_conf
*mdev
)
1369 struct drbd_conf
*odev
;
1372 idr_for_each_entry(&minors
, odev
, i
) {
1373 if (odev
->state
.conn
== C_STANDALONE
&& odev
->state
.disk
== D_DISKLESS
)
1375 if (odev
->state
.aftr_isp
) {
1376 if (_drbd_may_sync_now(odev
))
1377 rv
|= (__drbd_set_state(_NS(odev
, aftr_isp
, 0),
1379 != SS_NOTHING_TO_DO
) ;
1385 void resume_next_sg(struct drbd_conf
*mdev
)
1387 write_lock_irq(&global_state_lock
);
1388 _drbd_resume_next(mdev
);
1389 write_unlock_irq(&global_state_lock
);
1392 void suspend_other_sg(struct drbd_conf
*mdev
)
1394 write_lock_irq(&global_state_lock
);
1395 _drbd_pause_after(mdev
);
1396 write_unlock_irq(&global_state_lock
);
1399 static int sync_after_error(struct drbd_conf
*mdev
, int o_minor
)
1401 struct drbd_conf
*odev
;
1405 if (o_minor
< -1 || minor_to_mdev(o_minor
) == NULL
)
1406 return ERR_SYNC_AFTER
;
1408 /* check for loops */
1409 odev
= minor_to_mdev(o_minor
);
1412 return ERR_SYNC_AFTER_CYCLE
;
1414 /* dependency chain ends here, no cycles. */
1415 if (odev
->ldev
->dc
.resync_after
== -1)
1418 /* follow the dependency chain */
1419 odev
= minor_to_mdev(odev
->ldev
->dc
.resync_after
);
1423 int drbd_alter_sa(struct drbd_conf
*mdev
, int na
)
1428 write_lock_irq(&global_state_lock
);
1429 retcode
= sync_after_error(mdev
, na
);
1430 if (retcode
== NO_ERROR
) {
1431 mdev
->ldev
->dc
.resync_after
= na
;
1433 changes
= _drbd_pause_after(mdev
);
1434 changes
|= _drbd_resume_next(mdev
);
1437 write_unlock_irq(&global_state_lock
);
1441 void drbd_rs_controller_reset(struct drbd_conf
*mdev
)
1443 atomic_set(&mdev
->rs_sect_in
, 0);
1444 atomic_set(&mdev
->rs_sect_ev
, 0);
1445 mdev
->rs_in_flight
= 0;
1446 mdev
->rs_planed
= 0;
1447 spin_lock(&mdev
->peer_seq_lock
);
1448 fifo_set(&mdev
->rs_plan_s
, 0);
1449 spin_unlock(&mdev
->peer_seq_lock
);
1452 void start_resync_timer_fn(unsigned long data
)
1454 struct drbd_conf
*mdev
= (struct drbd_conf
*) data
;
1456 drbd_queue_work(&mdev
->tconn
->data
.work
, &mdev
->start_resync_work
);
1459 int w_start_resync(struct drbd_work
*w
, int cancel
)
1461 struct drbd_conf
*mdev
= w
->mdev
;
1463 if (atomic_read(&mdev
->unacked_cnt
) || atomic_read(&mdev
->rs_pending_cnt
)) {
1464 dev_warn(DEV
, "w_start_resync later...\n");
1465 mdev
->start_resync_timer
.expires
= jiffies
+ HZ
/10;
1466 add_timer(&mdev
->start_resync_timer
);
1470 drbd_start_resync(mdev
, C_SYNC_SOURCE
);
1471 clear_bit(AHEAD_TO_SYNC_SOURCE
, &mdev
->current_epoch
->flags
);
1476 * drbd_start_resync() - Start the resync process
1477 * @mdev: DRBD device.
1478 * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
1480 * This function might bring you directly into one of the
1481 * C_PAUSED_SYNC_* states.
1483 void drbd_start_resync(struct drbd_conf
*mdev
, enum drbd_conns side
)
1485 union drbd_state ns
;
1488 if (mdev
->state
.conn
>= C_SYNC_SOURCE
&& mdev
->state
.conn
< C_AHEAD
) {
1489 dev_err(DEV
, "Resync already running!\n");
1493 if (mdev
->state
.conn
< C_AHEAD
) {
1494 /* In case a previous resync run was aborted by an IO error/detach on the peer. */
1495 drbd_rs_cancel_all(mdev
);
1496 /* This should be done when we abort the resync. We definitely do not
1497 want to have this for connections going back and forth between
1498 Ahead/Behind and SyncSource/SyncTarget */
1501 if (!test_bit(B_RS_H_DONE
, &mdev
->flags
)) {
1502 if (side
== C_SYNC_TARGET
) {
1503 /* Since application IO was locked out during C_WF_BITMAP_T and
1504 C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
1505 we check that we might make the data inconsistent. */
1506 r
= drbd_khelper(mdev
, "before-resync-target");
1507 r
= (r
>> 8) & 0xff;
1509 dev_info(DEV
, "before-resync-target handler returned %d, "
1510 "dropping connection.\n", r
);
1511 conn_request_state(mdev
->tconn
, NS(conn
, C_DISCONNECTING
), CS_HARD
);
1514 } else /* C_SYNC_SOURCE */ {
1515 r
= drbd_khelper(mdev
, "before-resync-source");
1516 r
= (r
>> 8) & 0xff;
1519 dev_info(DEV
, "before-resync-source handler returned %d, "
1520 "ignoring. Old userland tools?", r
);
1522 dev_info(DEV
, "before-resync-source handler returned %d, "
1523 "dropping connection.\n", r
);
1524 conn_request_state(mdev
->tconn
, NS(conn
, C_DISCONNECTING
), CS_HARD
);
1531 if (current
== mdev
->tconn
->worker
.task
) {
1532 /* The worker should not sleep waiting for state_mutex,
1533 that can take long */
1534 if (!mutex_trylock(mdev
->state_mutex
)) {
1535 set_bit(B_RS_H_DONE
, &mdev
->flags
);
1536 mdev
->start_resync_timer
.expires
= jiffies
+ HZ
/5;
1537 add_timer(&mdev
->start_resync_timer
);
1541 mutex_lock(mdev
->state_mutex
);
1543 clear_bit(B_RS_H_DONE
, &mdev
->flags
);
1545 if (!get_ldev_if_state(mdev
, D_NEGOTIATING
)) {
1546 mutex_unlock(mdev
->state_mutex
);
1550 write_lock_irq(&global_state_lock
);
1551 ns
= drbd_read_state(mdev
);
1553 ns
.aftr_isp
= !_drbd_may_sync_now(mdev
);
1557 if (side
== C_SYNC_TARGET
)
1558 ns
.disk
= D_INCONSISTENT
;
1559 else /* side == C_SYNC_SOURCE */
1560 ns
.pdsk
= D_INCONSISTENT
;
1562 r
= __drbd_set_state(mdev
, ns
, CS_VERBOSE
, NULL
);
1563 ns
= drbd_read_state(mdev
);
1565 if (ns
.conn
< C_CONNECTED
)
1566 r
= SS_UNKNOWN_ERROR
;
1568 if (r
== SS_SUCCESS
) {
1569 unsigned long tw
= drbd_bm_total_weight(mdev
);
1570 unsigned long now
= jiffies
;
1573 mdev
->rs_failed
= 0;
1574 mdev
->rs_paused
= 0;
1575 mdev
->rs_same_csum
= 0;
1576 mdev
->rs_last_events
= 0;
1577 mdev
->rs_last_sect_ev
= 0;
1578 mdev
->rs_total
= tw
;
1579 mdev
->rs_start
= now
;
1580 for (i
= 0; i
< DRBD_SYNC_MARKS
; i
++) {
1581 mdev
->rs_mark_left
[i
] = tw
;
1582 mdev
->rs_mark_time
[i
] = now
;
1584 _drbd_pause_after(mdev
);
1586 write_unlock_irq(&global_state_lock
);
1588 if (r
== SS_SUCCESS
) {
1589 dev_info(DEV
, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
1590 drbd_conn_str(ns
.conn
),
1591 (unsigned long) mdev
->rs_total
<< (BM_BLOCK_SHIFT
-10),
1592 (unsigned long) mdev
->rs_total
);
1593 if (side
== C_SYNC_TARGET
)
1594 mdev
->bm_resync_fo
= 0;
1596 /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
1597 * with w_send_oos, or the sync target will get confused as to
1598 * how much bits to resync. We cannot do that always, because for an
1599 * empty resync and protocol < 95, we need to do it here, as we call
1600 * drbd_resync_finished from here in that case.
1601 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
1602 * and from after_state_ch otherwise. */
1603 if (side
== C_SYNC_SOURCE
&& mdev
->tconn
->agreed_pro_version
< 96)
1604 drbd_gen_and_send_sync_uuid(mdev
);
1606 if (mdev
->tconn
->agreed_pro_version
< 95 && mdev
->rs_total
== 0) {
1607 /* This still has a race (about when exactly the peers
1608 * detect connection loss) that can lead to a full sync
1609 * on next handshake. In 8.3.9 we fixed this with explicit
1610 * resync-finished notifications, but the fix
1611 * introduces a protocol change. Sleeping for some
1612 * time longer than the ping interval + timeout on the
1613 * SyncSource, to give the SyncTarget the chance to
1614 * detect connection loss, then waiting for a ping
1615 * response (implicit in drbd_resync_finished) reduces
1616 * the race considerably, but does not solve it. */
1617 if (side
== C_SYNC_SOURCE
)
1618 schedule_timeout_interruptible(
1619 mdev
->tconn
->net_conf
->ping_int
* HZ
+
1620 mdev
->tconn
->net_conf
->ping_timeo
*HZ
/9);
1621 drbd_resync_finished(mdev
);
1624 drbd_rs_controller_reset(mdev
);
1625 /* ns.conn may already be != mdev->state.conn,
1626 * we may have been paused in between, or become paused until
1627 * the timer triggers.
1628 * No matter, that is handled in resync_timer_fn() */
1629 if (ns
.conn
== C_SYNC_TARGET
)
1630 mod_timer(&mdev
->resync_timer
, jiffies
);
1635 mutex_unlock(mdev
->state_mutex
);
1638 int drbd_worker(struct drbd_thread
*thi
)
1640 struct drbd_tconn
*tconn
= thi
->tconn
;
1641 struct drbd_work
*w
= NULL
;
1642 struct drbd_conf
*mdev
;
1643 LIST_HEAD(work_list
);
1646 while (get_t_state(thi
) == RUNNING
) {
1647 drbd_thread_current_set_cpu(thi
);
1649 if (down_trylock(&tconn
->data
.work
.s
)) {
1650 mutex_lock(&tconn
->data
.mutex
);
1651 if (tconn
->data
.socket
&& !tconn
->net_conf
->no_cork
)
1652 drbd_tcp_uncork(tconn
->data
.socket
);
1653 mutex_unlock(&tconn
->data
.mutex
);
1655 intr
= down_interruptible(&tconn
->data
.work
.s
);
1657 mutex_lock(&tconn
->data
.mutex
);
1658 if (tconn
->data
.socket
&& !tconn
->net_conf
->no_cork
)
1659 drbd_tcp_cork(tconn
->data
.socket
);
1660 mutex_unlock(&tconn
->data
.mutex
);
1664 flush_signals(current
);
1665 if (get_t_state(thi
) == RUNNING
) {
1666 conn_warn(tconn
, "Worker got an unexpected signal\n");
1672 if (get_t_state(thi
) != RUNNING
)
1674 /* With this break, we have done a down() but not consumed
1675 the entry from the list. The cleanup code takes care of
1679 spin_lock_irq(&tconn
->data
.work
.q_lock
);
1680 if (list_empty(&tconn
->data
.work
.q
)) {
1681 /* something terribly wrong in our logic.
1682 * we were able to down() the semaphore,
1683 * but the list is empty... doh.
1685 * what is the best thing to do now?
1686 * try again from scratch, restarting the receiver,
1687 * asender, whatnot? could break even more ugly,
1688 * e.g. when we are primary, but no good local data.
1690 * I'll try to get away just starting over this loop.
1692 conn_warn(tconn
, "Work list unexpectedly empty\n");
1693 spin_unlock_irq(&tconn
->data
.work
.q_lock
);
1696 w
= list_entry(tconn
->data
.work
.q
.next
, struct drbd_work
, list
);
1697 list_del_init(&w
->list
);
1698 spin_unlock_irq(&tconn
->data
.work
.q_lock
);
1700 if (w
->cb(w
, tconn
->cstate
< C_WF_REPORT_PARAMS
)) {
1701 /* dev_warn(DEV, "worker: a callback failed! \n"); */
1702 if (tconn
->cstate
>= C_WF_REPORT_PARAMS
)
1703 conn_request_state(tconn
, NS(conn
, C_NETWORK_FAILURE
), CS_HARD
);
1707 spin_lock_irq(&tconn
->data
.work
.q_lock
);
1708 while (!list_empty(&tconn
->data
.work
.q
)) {
1709 list_splice_init(&tconn
->data
.work
.q
, &work_list
);
1710 spin_unlock_irq(&tconn
->data
.work
.q_lock
);
1712 while (!list_empty(&work_list
)) {
1713 w
= list_entry(work_list
.next
, struct drbd_work
, list
);
1714 list_del_init(&w
->list
);
1718 spin_lock_irq(&tconn
->data
.work
.q_lock
);
1720 sema_init(&tconn
->data
.work
.s
, 0);
1721 /* DANGEROUS race: if someone did queue his work within the spinlock,
1722 * but up() ed outside the spinlock, we could get an up() on the
1723 * semaphore without corresponding list entry.
1726 spin_unlock_irq(&tconn
->data
.work
.q_lock
);
1728 drbd_thread_stop(&tconn
->receiver
);
1729 idr_for_each_entry(&tconn
->volumes
, mdev
, vnr
) {
1730 D_ASSERT(mdev
->state
.disk
== D_DISKLESS
&& mdev
->state
.conn
== C_STANDALONE
);
1731 /* _drbd_set_state only uses stop_nowait.
1732 * wait here for the exiting receiver. */
1733 drbd_mdev_cleanup(mdev
);
1735 clear_bit(OBJECT_DYING
, &tconn
->flags
);
1736 clear_bit(CONFIG_PENDING
, &tconn
->flags
);
1737 wake_up(&tconn
->ping_wait
);