/*
   drbd_worker.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */
#include <linux/module.h>
#include <linux/drbd.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/scatterlist.h>

#include "drbd_int.h"
#include "drbd_req.h"
static int w_make_ov_request(struct drbd_work *w, int cancel);
/* endio handlers:
 *   drbd_md_io_complete (defined here)
 *   drbd_request_endio (defined here)
 *   drbd_peer_request_endio (defined here)
 *   bm_async_io_complete (defined in drbd_bitmap.c)
 *
 * For all these callbacks, note the following:
 * The callbacks will be called in irq context by the IDE drivers,
 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
 * Try to get the locking right :)
 */
/* About the global_state_lock
   Each state transition on a device holds a read lock. In case we have
   to evaluate the resync after dependencies, we grab a write lock, because
   we need stable states on all devices for that.  */
rwlock_t global_state_lock;
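
/* Illustrative sketch (not part of the original source): the intended usage
 * pattern of global_state_lock as described above.  A state transition on a
 * single device takes the lock shared, while evaluating the resync-after
 * dependency chain (as resume_next_sg()/suspend_other_sg() further below do)
 * takes it exclusively, so all device states are stable during the walk:
 *
 *	read_lock_irq(&global_state_lock);	// one device changes state
 *	...
 *	read_unlock_irq(&global_state_lock);
 *
 *	write_lock_irq(&global_state_lock);	// need stable states everywhere
 *	_drbd_resume_next(mdev);
 *	write_unlock_irq(&global_state_lock);
 *
 * The exact call sites of the read-side locking live in the state handling
 * code, not in this file.
 */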
/* used for synchronous meta data and bitmap IO
 * submitted by drbd_md_sync_page_io()
 */
void drbd_md_io_complete(struct bio *bio, int error)
{
	struct drbd_md_io *md_io;
	struct drbd_conf *mdev;

	md_io = (struct drbd_md_io *)bio->bi_private;
	mdev = container_of(md_io, struct drbd_conf, md_io);

	md_io->error = error;
	md_io->done = 1;

	wake_up(&mdev->misc_wait);

	drbd_md_put_buffer(mdev);
}
/* reads on behalf of the partner,
 * "submitted" by the receiver
 */
void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_conf *mdev = peer_req->w.mdev;

	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
	mdev->read_cnt += peer_req->i.size >> 9;
	list_del(&peer_req->w.list);
	if (list_empty(&mdev->read_ee))
		wake_up(&mdev->ee_wait);
	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
		__drbd_chk_io_error(mdev, false);
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);

	drbd_queue_work(&mdev->tconn->data.work, &peer_req->w);
	put_ldev(mdev);
}
/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver, final stage.  */
static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_conf *mdev = peer_req->w.mdev;
	struct drbd_interval i;
	int do_wake;
	u64 block_id;
	int do_al_complete_io;

	/* after we moved peer_req to done_ee,
	 * we may no longer access it,
	 * it may be freed/reused already!
	 * (as soon as we release the req_lock) */
	i = peer_req->i;
	do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
	block_id = peer_req->block_id;

	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
	mdev->writ_cnt += peer_req->i.size >> 9;
	list_del(&peer_req->w.list); /* has been on active_ee or sync_ee */
	list_add_tail(&peer_req->w.list, &mdev->done_ee);

	/*
	 * Do not remove from the write_requests tree here: we did not send the
	 * Ack yet and did not wake possibly waiting conflicting requests.
	 * Removed from the tree from "drbd_process_done_ee" within the
	 * appropriate w.cb (e_end_block/e_end_resync_block) or from
	 * _drbd_clear_done_ee.
	 */

	do_wake = list_empty(block_id == ID_SYNCER ? &mdev->sync_ee : &mdev->active_ee);

	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
		__drbd_chk_io_error(mdev, false);
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);

	if (block_id == ID_SYNCER)
		drbd_rs_complete_io(mdev, i.sector);

	if (do_wake)
		wake_up(&mdev->ee_wait);

	if (do_al_complete_io)
		drbd_al_complete_io(mdev, &i);

	wake_asender(mdev->tconn);
	put_ldev(mdev);
}
/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver.
 */
void drbd_peer_request_endio(struct bio *bio, int error)
{
	struct drbd_peer_request *peer_req = bio->bi_private;
	struct drbd_conf *mdev = peer_req->w.mdev;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);
	int is_write = bio_data_dir(bio) == WRITE;

	if (error && __ratelimit(&drbd_ratelimit_state))
		dev_warn(DEV, "%s: error=%d s=%llus\n",
				is_write ? "write" : "read", error,
				(unsigned long long)peer_req->i.sector);
	if (!error && !uptodate) {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
					is_write ? "write" : "read",
					(unsigned long long)peer_req->i.sector);
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	if (error)
		set_bit(__EE_WAS_ERROR, &peer_req->flags);

	bio_put(bio); /* no need for the bio anymore */
	if (atomic_dec_and_test(&peer_req->pending_bios)) {
		if (is_write)
			drbd_endio_write_sec_final(peer_req);
		else
			drbd_endio_read_sec_final(peer_req);
	}
}
/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
 */
void drbd_request_endio(struct bio *bio, int error)
{
	unsigned long flags;
	struct drbd_request *req = bio->bi_private;
	struct drbd_conf *mdev = req->w.mdev;
	struct bio_and_error m;
	enum drbd_req_event what;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);

	if (!error && !uptodate) {
		dev_warn(DEV, "p %s: setting error to -EIO\n",
			 bio_data_dir(bio) == WRITE ? "write" : "read");
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	/* to avoid recursion in __req_mod */
	if (unlikely(error)) {
		what = (bio_data_dir(bio) == WRITE)
			? WRITE_COMPLETED_WITH_ERROR
			: (bio_rw(bio) == READ)
			  ? READ_COMPLETED_WITH_ERROR
			  : READ_AHEAD_COMPLETED_WITH_ERROR;
	} else
		what = COMPLETED_OK;

	bio_put(req->private_bio);
	req->private_bio = ERR_PTR(error);

	/* not req_mod(), we need irqsave here! */
	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
	__req_mod(req, what, &m);
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
	put_ldev(mdev);

	if (m.bio)
		complete_master_bio(mdev, &m);
}
int w_read_retry_remote(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_conf *mdev = w->mdev;

	/* We should not detach for read io-error,
	 * but try to WRITE the P_DATA_REPLY to the failed location,
	 * to give the disk the chance to relocate that block */

	spin_lock_irq(&mdev->tconn->req_lock);
	if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
		_req_mod(req, READ_RETRY_REMOTE_CANCELED);
		spin_unlock_irq(&mdev->tconn->req_lock);
		return 0;
	}
	spin_unlock_irq(&mdev->tconn->req_lock);

	return w_send_read_req(w, 0);
}
void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm,
		  struct drbd_peer_request *peer_req, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct page *page = peer_req->pages;
	struct page *tmp;
	unsigned len;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	while ((tmp = page_chain_next(page))) {
		/* all but the last page will be fully used */
		sg_set_page(&sg, page, PAGE_SIZE, 0);
		crypto_hash_update(&desc, &sg, sg.length);
		page = tmp;
	}
	/* and now the last, possibly only partially used page */
	len = peer_req->i.size & (PAGE_SIZE - 1);
	sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
	crypto_hash_update(&desc, &sg, sg.length);
	crypto_hash_final(&desc, digest);
}
void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct bio_vec *bvec;
	int i;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	__bio_for_each_segment(bvec, bio, i, 0) {
		sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
		crypto_hash_update(&desc, &sg, sg.length);
	}
	crypto_hash_final(&desc, digest);
}
/* MAYBE merge common code with w_e_end_ov_req */
static int w_e_send_csum(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_conf *mdev = w->mdev;
	int digest_size;
	void *digest;
	int err = 0;

	if (unlikely(cancel))
		goto out;

	if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
		goto out;

	digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm);
	digest = kmalloc(digest_size, GFP_NOIO);
	if (digest) {
		sector_t sector = peer_req->i.sector;
		unsigned int size = peer_req->i.size;
		drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest);
		/* Free peer_req and pages before send.
		 * In case we block on congestion, we could otherwise run into
		 * some distributed deadlock, if the other side blocks on
		 * congestion as well, because our receiver blocks in
		 * drbd_alloc_pages due to pp_in_use > max_buffers. */
		drbd_free_peer_req(mdev, peer_req);
		peer_req = NULL;
		inc_rs_pending(mdev);
		err = drbd_send_drequest_csum(mdev, sector, size,
					      digest, digest_size,
					      P_CSUM_RS_REQUEST);
		kfree(digest);
	} else {
		dev_err(DEV, "kmalloc() of digest failed.\n");
		err = -ENOMEM;
	}

out:
	if (peer_req)
		drbd_free_peer_req(mdev, peer_req);

	if (err)
		dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
	return err;
}
#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)

static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
{
	struct drbd_peer_request *peer_req;

	if (!get_ldev(mdev))
		return -EIO;

	if (drbd_rs_should_slow_down(mdev, sector))
		goto defer;

	/* GFP_TRY, because if there is no memory available right now, this may
	 * be rescheduled for later. It is "only" background resync, after all. */
	peer_req = drbd_alloc_peer_req(mdev, ID_SYNCER /* unused */, sector,
				       size, GFP_TRY);
	if (!peer_req)
		goto defer;

	peer_req->w.cb = w_e_send_csum;
	spin_lock_irq(&mdev->tconn->req_lock);
	list_add(&peer_req->w.list, &mdev->read_ee);
	spin_unlock_irq(&mdev->tconn->req_lock);

	atomic_add(size >> 9, &mdev->rs_sect_ev);
	if (drbd_submit_peer_request(mdev, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
		return 0;

	/* If it failed because of ENOMEM, retry should help.  If it failed
	 * because bio_add_page failed (probably broken lower level driver),
	 * retry may or may not help.
	 * If it does not, you may need to force disconnect. */
	spin_lock_irq(&mdev->tconn->req_lock);
	list_del(&peer_req->w.list);
	spin_unlock_irq(&mdev->tconn->req_lock);

	drbd_free_peer_req(mdev, peer_req);
defer:
	put_ldev(mdev);
	return -EAGAIN;
}
int w_resync_timer(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;
	switch (mdev->state.conn) {
	case C_VERIFY_S:
		w_make_ov_request(w, cancel);
		break;
	case C_SYNC_TARGET:
		w_make_resync_request(w, cancel);
		break;
	}

	return 0;
}
void resync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	if (list_empty(&mdev->resync_work.list))
		drbd_queue_work(&mdev->tconn->data.work, &mdev->resync_work);
}
static void fifo_set(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] = value;
}

static int fifo_push(struct fifo_buffer *fb, int value)
{
	int ov;

	ov = fb->values[fb->head_index];
	fb->values[fb->head_index++] = value;

	if (fb->head_index >= fb->size)
		fb->head_index = 0;

	return ov;
}

static void fifo_add_val(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] += value;
}

struct fifo_buffer *fifo_alloc(int fifo_size)
{
	struct fifo_buffer *fb;

	fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_KERNEL);
	if (!fb)
		return NULL;

	fb->size = fifo_size;

	return fb;
}
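
/* Illustrative note (not from the original source): the fifo_buffer above is
 * used by the resync controller as a ring buffer of planned per-step
 * corrections.  fifo_push() returns the value whose turn has come (the slot
 * at head_index) and stores the new value in its place; fifo_add_val()
 * spreads a correction evenly over all planned steps.  For example, with
 * size == 4 and values {3, 0, 0, 0}, fifo_add_val(fb, 2) yields {5, 2, 2, 2},
 * and a subsequent fifo_push(fb, 0) returns 5, leaves {0, 2, 2, 2} and
 * advances head_index by one.
 */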
static int drbd_rs_controller(struct drbd_conf *mdev)
{
	struct disk_conf *dc;
	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
	unsigned int want;     /* The number of sectors we want in the proxy */
	int req_sect; /* Number of sectors to request in this turn */
	int correction; /* Number of sectors more we need in the proxy*/
	int cps; /* correction per invocation of drbd_rs_controller() */
	int steps; /* Number of time steps to plan ahead */
	int curr_corr;
	int max_sect;
	struct fifo_buffer *plan;

	sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
	mdev->rs_in_flight -= sect_in;

	dc = rcu_dereference(mdev->ldev->disk_conf);
	plan = rcu_dereference(mdev->rs_plan_s);

	steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */

	if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
		want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
	} else { /* normal path */
		want = dc->c_fill_target ? dc->c_fill_target :
			sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
	}

	correction = want - mdev->rs_in_flight - plan->total;

	/* Plan ahead */
	cps = correction / steps;
	fifo_add_val(plan, cps);
	plan->total += cps * steps;

	/* What we do in this step */
	curr_corr = fifo_push(plan, 0);
	plan->total -= curr_corr;

	req_sect = sect_in + curr_corr;
	if (req_sect < 0)
		req_sect = 0;

	max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
	if (req_sect > max_sect)
		req_sect = max_sect;

	/*
	dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
		 sect_in, mdev->rs_in_flight, want, correction,
		 steps, cps, mdev->rs_planed, curr_corr, req_sect);
	*/

	return req_sect;
}
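
/* Illustrative example (not from the original source), assuming SLEEP_TIME is
 * HZ/10 (100ms) and the plan covers steps == 10 controller invocations:
 * with c_fill_target == 0, c_delay_target == 10 (i.e. one second) and
 * sect_in == 2000 sectors having come in since the last turn,
 * want = 2000 * 10 * HZ / (SLEEP_TIME * 10) = 20000 sectors "in flight".
 * If rs_in_flight plus the already planned corrections only cover 14000
 * sectors, correction == 6000 and cps == 600 gets added to each of the 10
 * planned steps; this turn then requests req_sect = sect_in + curr_corr
 * sectors, capped at max_sect = c_max_rate * 2 * SLEEP_TIME / HZ.
 */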
static int drbd_rs_number_requests(struct drbd_conf *mdev)
{
	int number;

	rcu_read_lock();
	if (rcu_dereference(mdev->rs_plan_s)->size) {
		number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
		mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
	} else {
		mdev->c_sync_rate = rcu_dereference(mdev->ldev->disk_conf)->resync_rate;
		number = SLEEP_TIME * mdev->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
	}
	rcu_read_unlock();

	/* ignore the amount of pending requests, the resync controller should
	 * throttle down to incoming reply rate soon enough anyways. */
	return number;
}
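
/* Illustrative unit check (not from the original source), assuming
 * SLEEP_TIME == HZ/10 and BM_BLOCK_SIZE == 4096: c_sync_rate is kept in
 * KiB/s, so with a fixed resync_rate of 10240 KiB/s the else branch yields
 * number = (HZ/10) * 10240 / (4 * HZ) = 256 bitmap-block (4 KiB) requests per
 * 100ms timer tick, i.e. 10 MiB/s worth of resync requests.  The controller
 * branch goes the other way: it converts the sectors returned by
 * drbd_rs_controller() into requests (>> (BM_BLOCK_SHIFT - 9)) and derives an
 * effective KiB/s rate from that for reporting purposes.
 */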
int w_make_resync_request(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;
	unsigned long bit;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
	int max_bio_size;
	int number, rollback_i, size;
	int align, queued, sndbuf;
	int i = 0;

	if (unlikely(cancel))
		return 0;

	if (mdev->rs_total == 0) {
		/* empty resync? */
		drbd_resync_finished(mdev);
		return 0;
	}

	if (!get_ldev(mdev)) {
		/* Since we only need to access mdev->rsync a
		   get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
		   to continue resync with a broken disk makes no sense at
		   all */
		dev_err(DEV, "Disk broke down during resync!\n");
		return 0;
	}

	max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
	number = drbd_rs_number_requests(mdev);

	for (i = 0; i < number; i++) {
		/* Stop generating RS requests, when half of the send buffer is filled */
		mutex_lock(&mdev->tconn->data.mutex);
		if (mdev->tconn->data.socket) {
			queued = mdev->tconn->data.socket->sk->sk_wmem_queued;
			sndbuf = mdev->tconn->data.socket->sk->sk_sndbuf;
		} else {
			queued = 1;
			sndbuf = 0;
		}
		mutex_unlock(&mdev->tconn->data.mutex);
		if (queued > sndbuf / 2)
			goto requeue;

next_sector:
		size = BM_BLOCK_SIZE;
		bit  = drbd_bm_find_next(mdev, mdev->bm_resync_fo);

		if (bit == DRBD_END_OF_BITMAP) {
			mdev->bm_resync_fo = drbd_bm_bits(mdev);
			put_ldev(mdev);
			return 0;
		}

		sector = BM_BIT_TO_SECT(bit);

		if (drbd_rs_should_slow_down(mdev, sector) ||
		    drbd_try_rs_begin_io(mdev, sector)) {
			mdev->bm_resync_fo = bit;
			goto requeue;
		}
		mdev->bm_resync_fo = bit + 1;

		if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
			drbd_rs_complete_io(mdev, sector);
			goto next_sector;
		}

#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
		/* try to find some adjacent bits.
		 * we stop if we have already the maximum req size.
		 *
		 * Additionally always align bigger requests, in order to
		 * be prepared for all stripe sizes of software RAIDs.
		 */
		align = 1;
		rollback_i = i;
		for (;;) {
			if (size + BM_BLOCK_SIZE > max_bio_size)
				break;

			/* Be always aligned */
			if (sector & ((1<<(align+3))-1))
				break;

			/* do not cross extent boundaries */
			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
				break;
			/* now, is it actually dirty, after all?
			 * caution, drbd_bm_test_bit is tri-state for some
			 * obscure reason; ( b == 0 ) would get the out-of-band
			 * only accidentally right because of the "oddly sized"
			 * adjustment below */
			if (drbd_bm_test_bit(mdev, bit+1) != 1)
				break;
			bit++;
			size += BM_BLOCK_SIZE;
			if ((BM_BLOCK_SIZE << align) <= size)
				align++;
			i++;
		}
		/* if we merged some,
		 * reset the offset to start the next drbd_bm_find_next from */
		if (size > BM_BLOCK_SIZE)
			mdev->bm_resync_fo = bit + 1;
#endif

		/* adjust very last sectors, in case we are oddly sized */
		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;
		if (mdev->tconn->agreed_pro_version >= 89 && mdev->tconn->csums_tfm) {
			switch (read_for_csum(mdev, sector, size)) {
			case -EIO: /* Disk failure */
				put_ldev(mdev);
				return -EIO;
			case -EAGAIN: /* allocation failed, or ldev busy */
				drbd_rs_complete_io(mdev, sector);
				mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
				i = rollback_i;
				goto requeue;
			case 0:
				/* everything ok */
				break;
			default:
				BUG();
			}
		} else {
			int err;

			inc_rs_pending(mdev);
			err = drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
						 sector, size, ID_SYNCER);
			if (err) {
				dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
				dec_rs_pending(mdev);
				put_ldev(mdev);
				return err;
			}
		}
	}

	if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
		/* last syncer _request_ was sent,
		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
		 * next sync group will resume), as soon as we receive the last
		 * resync data block, and the last bit is cleared.
		 * until then resync "work" is "inactive" ...
		 */
		put_ldev(mdev);
		return 0;
	}

 requeue:
	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	put_ldev(mdev);
	return 0;
}
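
/* Illustrative example (not from the original source) of the request merging
 * above, assuming BM_BLOCK_SIZE == 4096 (one bitmap bit per 4 KiB, 8 sectors):
 * starting at a suitably aligned bit with several adjacent dirty bits, the
 * inner loop grows "size" in 4 KiB steps and bumps "align" each time the
 * request reaches 8 KiB, 16 KiB, ..., so larger requests are only built when
 * the start sector is aligned for the next power-of-two size.  Growth stops
 * at max_bio_size, at a bitmap extent boundary (BM_BLOCKS_PER_BM_EXT_MASK),
 * or at the first clean bit; the resulting request is finally clipped against
 * the device capacity to handle an oddly sized last block.
 */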
static int w_make_ov_request(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;
	int number, i, size;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);

	if (unlikely(cancel))
		return 1;

	number = drbd_rs_number_requests(mdev);

	sector = mdev->ov_position;
	for (i = 0; i < number; i++) {
		if (sector >= capacity)
			return 1;

		size = BM_BLOCK_SIZE;

		if (drbd_rs_should_slow_down(mdev, sector) ||
		    drbd_try_rs_begin_io(mdev, sector)) {
			mdev->ov_position = sector;
			goto requeue;
		}

		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;

		inc_rs_pending(mdev);
		if (drbd_send_ov_request(mdev, sector, size)) {
			dec_rs_pending(mdev);
			return 0;
		}
		sector += BM_SECT_PER_BIT;
	}
	mdev->ov_position = sector;

 requeue:
	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	return 1;
}
int w_ov_finished(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;
	kfree(w);
	ov_out_of_sync_print(mdev);
	drbd_resync_finished(mdev);

	return 0;
}

static int w_resync_finished(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;
	kfree(w);

	drbd_resync_finished(mdev);

	return 0;
}
static void ping_peer(struct drbd_conf *mdev)
{
	struct drbd_tconn *tconn = mdev->tconn;

	clear_bit(GOT_PING_ACK, &tconn->flags);
	request_ping(tconn);
	wait_event(tconn->ping_wait,
		   test_bit(GOT_PING_ACK, &tconn->flags) || mdev->state.conn < C_CONNECTED);
}
int drbd_resync_finished(struct drbd_conf *mdev)
{
	unsigned long db, dt, dbdt;
	unsigned long n_oos;
	union drbd_state os, ns;
	struct drbd_work *w;
	char *khelper_cmd = NULL;
	int verify_done = 0;

	/* Remove all elements from the resync LRU. Since future actions
	 * might set bits in the (main) bitmap, then the entries in the
	 * resync LRU would be wrong. */
	if (drbd_rs_del_all(mdev)) {
		/* In case this is not possible now, most probably because
		 * there are P_RS_DATA_REPLY Packets lingering on the worker's
		 * queue (or even the read operations for those packets
		 * is not finished by now).   Retry in 100ms. */

		schedule_timeout_interruptible(HZ / 10);
		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
		if (w) {
			w->cb = w_resync_finished;
			drbd_queue_work(&mdev->tconn->data.work, w);
			return 1;
		}
		dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
	}

	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
	if (dt <= 0)
		dt = 1;
	db = mdev->rs_total;
	dbdt = Bit2KB(db/dt);
	mdev->rs_paused /= HZ;

	if (!get_ldev(mdev))
		goto out;

	ping_peer(mdev);

	spin_lock_irq(&mdev->tconn->req_lock);
	os = drbd_read_state(mdev);

	verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);

	/* This protects us against multiple calls (that can happen in the presence
	   of application IO), and against connectivity loss just before we arrive here. */
	if (os.conn <= C_CONNECTED)
		goto out_unlock;

	ns = os;
	ns.conn = C_CONNECTED;

	dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
	     verify_done ? "Online verify " : "Resync",
	     dt + mdev->rs_paused, mdev->rs_paused, dbdt);

	n_oos = drbd_bm_total_weight(mdev);

	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
		if (n_oos) {
			dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
			      n_oos, Bit2KB(1));
			khelper_cmd = "out-of-sync";
		}
	} else {
		D_ASSERT((n_oos - mdev->rs_failed) == 0);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
			khelper_cmd = "after-resync-target";

		if (mdev->tconn->csums_tfm && mdev->rs_total) {
			const unsigned long s = mdev->rs_same_csum;
			const unsigned long t = mdev->rs_total;
			const int ratio =
				(t == 0)     ? 0 :
				(t < 100000) ? ((s*100)/t) : (s/(t/100));
			dev_info(DEV, "%u %% had equal checksums, eliminated: %luK; "
			     "transferred %luK total %luK\n",
			     ratio,
			     Bit2KB(mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total - mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total));
		}
	}

	if (mdev->rs_failed) {
		dev_info(DEV, "            %lu failed blocks\n", mdev->rs_failed);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			ns.disk = D_INCONSISTENT;
			ns.pdsk = D_UP_TO_DATE;
		} else {
			ns.disk = D_UP_TO_DATE;
			ns.pdsk = D_INCONSISTENT;
		}
	} else {
		ns.disk = D_UP_TO_DATE;
		ns.pdsk = D_UP_TO_DATE;

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			if (mdev->p_uuid) {
				int i;
				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
					_drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
				drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
				_drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
			} else {
				dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
			}
		}

		if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
			/* for verify runs, we don't update uuids here,
			 * so there would be nothing to report. */
			drbd_uuid_set_bm(mdev, 0UL);
			drbd_print_uuids(mdev, "updated UUIDs");
			if (mdev->p_uuid) {
				/* Now the two UUID sets are equal, update what we
				 * know of the peer. */
				int i;
				for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
					mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
			}
		}
	}

	_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
out_unlock:
	spin_unlock_irq(&mdev->tconn->req_lock);
	put_ldev(mdev);
out:
	mdev->rs_total  = 0;
	mdev->rs_failed = 0;
	mdev->rs_paused = 0;
	if (verify_done)
		mdev->ov_start_sector = 0;

	drbd_md_sync(mdev);

	if (khelper_cmd)
		drbd_khelper(mdev, khelper_cmd);

	return 1;
}
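
/* Illustrative example (not from the original source) of the statistics
 * computed above, assuming Bit2KB() converts bitmap bits (4 KiB each) to KiB:
 * a resync of rs_total == 262144 bits (1 GiB) that took dt == 100 seconds of
 * unpaused time yields db/dt == 2621 bits/s and dbdt == Bit2KB(2621) ==
 * 10484 KiB/s, which is what the "done (total ... K/sec)" message reports.
 */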
static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_peer_request *peer_req)
{
	if (drbd_peer_req_has_active_page(peer_req)) {
		/* This might happen if sendpage() has not finished */
		int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
		atomic_add(i, &mdev->pp_in_use_by_net);
		atomic_sub(i, &mdev->pp_in_use);
		spin_lock_irq(&mdev->tconn->req_lock);
		list_add_tail(&peer_req->w.list, &mdev->net_ee);
		spin_unlock_irq(&mdev->tconn->req_lock);
		wake_up(&drbd_pp_wait);
	} else
		drbd_free_peer_req(mdev, peer_req);
}
/**
 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_e_end_data_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_conf *mdev = w->mdev;
	int err;

	if (unlikely(cancel)) {
		drbd_free_peer_req(mdev, peer_req);
		dec_unacked(mdev);
		return 0;
	}

	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		err = drbd_send_block(mdev, P_DATA_REPLY, peer_req);
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
			    (unsigned long long)peer_req->i.sector);

		err = drbd_send_ack(mdev, P_NEG_DREPLY, peer_req);
	}

	dec_unacked(mdev);

	move_to_net_ee_or_free(mdev, peer_req);

	if (err)
		dev_err(DEV, "drbd_send_block() failed\n");
	return err;
}
/**
 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_conf *mdev = w->mdev;
	int err;

	if (unlikely(cancel)) {
		drbd_free_peer_req(mdev, peer_req);
		dec_unacked(mdev);
		return 0;
	}

	if (get_ldev_if_state(mdev, D_FAILED)) {
		drbd_rs_complete_io(mdev, peer_req->i.sector);
		put_ldev(mdev);
	}

	if (mdev->state.conn == C_AHEAD) {
		err = drbd_send_ack(mdev, P_RS_CANCEL, peer_req);
	} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
			inc_rs_pending(mdev);
			err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req);
		} else {
			if (__ratelimit(&drbd_ratelimit_state))
				dev_err(DEV, "Not sending RSDataReply, "
				    "partner DISKLESS!\n");
			err = 0;
		}
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
			    (unsigned long long)peer_req->i.sector);

		err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req);

		/* update resync data with failure */
		drbd_rs_failed_io(mdev, peer_req->i.sector, peer_req->i.size);
	}

	dec_unacked(mdev);

	move_to_net_ee_or_free(mdev, peer_req);

	if (err)
		dev_err(DEV, "drbd_send_block() failed\n");
	return err;
}
int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_conf *mdev = w->mdev;
	struct digest_info *di;
	int digest_size;
	void *digest = NULL;
	int err, eq = 0;

	if (unlikely(cancel)) {
		drbd_free_peer_req(mdev, peer_req);
		dec_unacked(mdev);
		return 0;
	}

	if (get_ldev(mdev)) {
		drbd_rs_complete_io(mdev, peer_req->i.sector);
		put_ldev(mdev);
	}

	di = peer_req->digest;

	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		/* quick hack to try to avoid a race against reconfiguration.
		 * a real fix would be much more involved,
		 * introducing more locking mechanisms */
		if (mdev->tconn->csums_tfm) {
			digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm);
			D_ASSERT(digest_size == di->digest_size);
			digest = kmalloc(digest_size, GFP_NOIO);
		}
		if (digest) {
			drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}

		if (eq) {
			drbd_set_in_sync(mdev, peer_req->i.sector, peer_req->i.size);
			/* rs_same_csums unit is BM_BLOCK_SIZE */
			mdev->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
			err = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, peer_req);
		} else {
			inc_rs_pending(mdev);
			peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
			peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
			kfree(di);
			err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req);
		}
	} else {
		err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req);
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
	}

	dec_unacked(mdev);
	move_to_net_ee_or_free(mdev, peer_req);

	if (err)
		dev_err(DEV, "drbd_send_block/ack() failed\n");
	return err;
}
int w_e_end_ov_req(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_conf *mdev = w->mdev;
	sector_t sector = peer_req->i.sector;
	unsigned int size = peer_req->i.size;
	int digest_size;
	void *digest;
	int err = 0;

	if (unlikely(cancel))
		goto out;

	digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm);
	digest = kmalloc(digest_size, GFP_NOIO);
	if (!digest) {
		err = 1;	/* terminate the connection in case the allocation failed */
		goto out;
	}

	if (likely(!(peer_req->flags & EE_WAS_ERROR)))
		drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest);
	else
		memset(digest, 0, digest_size);

	/* Free e and pages before send.
	 * In case we block on congestion, we could otherwise run into
	 * some distributed deadlock, if the other side blocks on
	 * congestion as well, because our receiver blocks in
	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
	drbd_free_peer_req(mdev, peer_req);
	peer_req = NULL;
	inc_rs_pending(mdev);
	err = drbd_send_drequest_csum(mdev, sector, size, digest, digest_size, P_OV_REPLY);
	if (err)
		dec_rs_pending(mdev);
	kfree(digest);

out:
	if (peer_req)
		drbd_free_peer_req(mdev, peer_req);
	dec_unacked(mdev);
	return err;
}
void drbd_ov_out_of_sync_found(struct drbd_conf *mdev, sector_t sector, int size)
{
	if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
		mdev->ov_last_oos_size += size>>9;
	} else {
		mdev->ov_last_oos_start = sector;
		mdev->ov_last_oos_size = size>>9;
	}
	drbd_set_out_of_sync(mdev, sector, size);
}
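
/* Illustrative example (not from the original source): ov_last_oos_* track
 * the most recent out-of-sync range in sectors.  If a 4 KiB block (8 sectors)
 * at sector 1000 was just recorded (start 1000, size 8) and the next bad
 * block is reported at sector 1008, the ranges are contiguous
 * (1000 + 8 == 1008) and the existing range simply grows to 16 sectors;
 * a bad block at sector 2000 would instead start a new range.
 * ov_out_of_sync_print() later reports each completed range as one message.
 */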
int w_e_end_ov_reply(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_conf *mdev = w->mdev;
	struct digest_info *di;
	void *digest;
	sector_t sector = peer_req->i.sector;
	unsigned int size = peer_req->i.size;
	int digest_size;
	int err, eq = 0;

	if (unlikely(cancel)) {
		drbd_free_peer_req(mdev, peer_req);
		dec_unacked(mdev);
		return 0;
	}

	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
	 * the resync lru has been cleaned up already */
	if (get_ldev(mdev)) {
		drbd_rs_complete_io(mdev, peer_req->i.sector);
		put_ldev(mdev);
	}

	di = peer_req->digest;

	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm);
		digest = kmalloc(digest_size, GFP_NOIO);
		if (digest) {
			drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest);

			D_ASSERT(digest_size == di->digest_size);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}
	}

	/* Free peer_req and pages before send.
	 * In case we block on congestion, we could otherwise run into
	 * some distributed deadlock, if the other side blocks on
	 * congestion as well, because our receiver blocks in
	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
	drbd_free_peer_req(mdev, peer_req);
	if (!eq)
		drbd_ov_out_of_sync_found(mdev, sector, size);
	else
		ov_out_of_sync_print(mdev);

	err = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
			       eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);

	dec_unacked(mdev);

	--mdev->ov_left;

	/* let's advance progress step marks only for every other megabyte */
	if ((mdev->ov_left & 0x200) == 0x200)
		drbd_advance_rs_marks(mdev, mdev->ov_left);

	if (mdev->ov_left == 0) {
		ov_out_of_sync_print(mdev);
		drbd_resync_finished(mdev);
	}

	return err;
}
int w_prev_work_done(struct drbd_work *w, int cancel)
{
	struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);

	complete(&b->done);
	return 0;
}
int w_send_barrier(struct drbd_work *w, int cancel)
{
	struct drbd_socket *sock;
	struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
	struct drbd_conf *mdev = w->mdev;
	struct p_barrier *p;

	/* really avoid racing with tl_clear.  w.cb may have been referenced
	 * just before it was reassigned and re-queued, so double check that.
	 * actually, this race was harmless, since we only try to send the
	 * barrier packet here, and otherwise do nothing with the object.
	 * but compare with the head of w_clear_epoch */
	spin_lock_irq(&mdev->tconn->req_lock);
	if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
		cancel = 1;
	spin_unlock_irq(&mdev->tconn->req_lock);
	if (cancel)
		return 0;

	sock = &mdev->tconn->data;
	p = drbd_prepare_command(mdev, sock);
	if (!p)
		return -EIO;
	p->barrier = b->br_number;
	/* inc_ap_pending was done where this was queued.
	 * dec_ap_pending will be done in got_BarrierAck
	 * or (on connection loss) in w_clear_epoch.  */
	return drbd_send_command(mdev, sock, P_BARRIER, sizeof(*p), NULL, 0);
}
int w_send_write_hint(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;
	struct drbd_socket *sock;

	if (cancel)
		return 0;
	sock = &mdev->tconn->data;
	if (!drbd_prepare_command(mdev, sock))
		return -EIO;
	return drbd_send_command(mdev, sock, P_UNPLUG_REMOTE, 0, NULL, 0);
}
int w_send_out_of_sync(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_conf *mdev = w->mdev;
	int err;

	if (unlikely(cancel)) {
		req_mod(req, SEND_CANCELED);
		return 0;
	}

	err = drbd_send_out_of_sync(mdev, req);
	req_mod(req, OOS_HANDED_TO_NETWORK);

	return err;
}
/**
 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_send_dblock(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_conf *mdev = w->mdev;
	int err;

	if (unlikely(cancel)) {
		req_mod(req, SEND_CANCELED);
		return 0;
	}

	err = drbd_send_dblock(mdev, req);
	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);

	return err;
}
/**
 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
int w_send_read_req(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_conf *mdev = w->mdev;
	int err;

	if (unlikely(cancel)) {
		req_mod(req, SEND_CANCELED);
		return 0;
	}

	err = drbd_send_drequest(mdev, P_DATA_REQUEST, req->i.sector, req->i.size,
				 (unsigned long)req);

	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);

	return err;
}
int w_restart_disk_io(struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	struct drbd_conf *mdev = w->mdev;

	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
		drbd_al_begin_io(mdev, &req->i);
	/* Calling drbd_al_begin_io() out of the worker might deadlock
	   theoretically. Practically it can not deadlock, since this is
	   only used when unfreezing IOs. All the extents of the requests
	   that made it into the TL are already active */

	drbd_req_make_private_bio(req, req->master_bio);
	req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
	generic_make_request(req->private_bio);

	return 0;
}
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
	struct drbd_conf *odev = mdev;
	int resync_after;

	while (1) {
		if (!odev->ldev)
			return 1;
		rcu_read_lock();
		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
		rcu_read_unlock();
		if (resync_after == -1)
			return 1;
		odev = minor_to_mdev(resync_after);
		if (!odev)
			return 1;
		if ((odev->state.conn >= C_SYNC_SOURCE &&
		     odev->state.conn <= C_PAUSED_SYNC_T) ||
		    odev->state.aftr_isp || odev->state.peer_isp ||
		    odev->state.user_isp)
			return 0;
	}
}
/**
 * _drbd_pause_after() - Pause resync on all devices that may not resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and after_state_ch).
 */
static int _drbd_pause_after(struct drbd_conf *mdev)
{
	struct drbd_conf *odev;
	int i, rv = 0;

	rcu_read_lock();
	idr_for_each_entry(&minors, odev, i) {
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (!_drbd_may_sync_now(odev))
			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
			       != SS_NOTHING_TO_DO);
	}
	rcu_read_unlock();

	return rv;
}
/**
 * _drbd_resume_next() - Resume resync on all devices that may resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and worker).
 */
static int _drbd_resume_next(struct drbd_conf *mdev)
{
	struct drbd_conf *odev;
	int i, rv = 0;

	rcu_read_lock();
	idr_for_each_entry(&minors, odev, i) {
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (odev->state.aftr_isp) {
			if (_drbd_may_sync_now(odev))
				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
							CS_HARD, NULL)
				       != SS_NOTHING_TO_DO) ;
		}
	}
	rcu_read_unlock();

	return rv;
}
void resume_next_sg(struct drbd_conf *mdev)
{
	write_lock_irq(&global_state_lock);
	_drbd_resume_next(mdev);
	write_unlock_irq(&global_state_lock);
}

void suspend_other_sg(struct drbd_conf *mdev)
{
	write_lock_irq(&global_state_lock);
	_drbd_pause_after(mdev);
	write_unlock_irq(&global_state_lock);
}
/* caller must hold global_state_lock */
enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)
{
	struct drbd_conf *odev;
	int resync_after;

	if (o_minor == -1)
		return NO_ERROR;
	if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
		return ERR_RESYNC_AFTER;

	/* check for loops */
	odev = minor_to_mdev(o_minor);
	while (1) {
		if (odev == mdev)
			return ERR_RESYNC_AFTER_CYCLE;

		rcu_read_lock();
		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
		rcu_read_unlock();

		/* dependency chain ends here, no cycles. */
		if (resync_after == -1)
			return NO_ERROR;

		/* follow the dependency chain */
		odev = minor_to_mdev(resync_after);
	}
}
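
/* Illustrative example (not from the original source): with three minors where
 * drbd2 has resync-after = 1 and drbd1 has resync-after = 0, asking to set
 * resync-after = 2 on drbd0 walks the chain 2 -> 1 -> 0, hits odev == mdev and
 * returns ERR_RESYNC_AFTER_CYCLE; setting resync-after = -1 anywhere simply
 * ends the chain and is always valid.
 */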
/* caller must hold global_state_lock */
void drbd_resync_after_changed(struct drbd_conf *mdev)
{
	int changes;

	do {
		changes  = _drbd_pause_after(mdev);
		changes |= _drbd_resume_next(mdev);
	} while (changes);
}
void drbd_rs_controller_reset(struct drbd_conf *mdev)
{
	struct fifo_buffer *plan;

	atomic_set(&mdev->rs_sect_in, 0);
	atomic_set(&mdev->rs_sect_ev, 0);
	mdev->rs_in_flight = 0;

	/* Updating the RCU protected object in place is necessary since
	   this function gets called from atomic context.
	   It is valid since all other updates also lead to a completely
	   empty fifo */
	rcu_read_lock();
	plan = rcu_dereference(mdev->rs_plan_s);
	plan->total = 0;
	fifo_set(plan, 0);
	rcu_read_unlock();
}
void start_resync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	drbd_queue_work(&mdev->tconn->data.work, &mdev->start_resync_work);
}
int w_start_resync(struct drbd_work *w, int cancel)
{
	struct drbd_conf *mdev = w->mdev;

	if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
		dev_warn(DEV, "w_start_resync later...\n");
		mdev->start_resync_timer.expires = jiffies + HZ/10;
		add_timer(&mdev->start_resync_timer);
		return 0;
	}

	drbd_start_resync(mdev, C_SYNC_SOURCE);
	clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags);
	return 0;
}
/**
 * drbd_start_resync() - Start the resync process
 * @mdev:	DRBD device.
 * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
 *
 * This function might bring you directly into one of the
 * C_PAUSED_SYNC_* states.
 */
void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
{
	union drbd_state ns;
	int r;

	if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn < C_AHEAD) {
		dev_err(DEV, "Resync already running!\n");
		return;
	}

	if (mdev->state.conn < C_AHEAD) {
		/* In case a previous resync run was aborted by an IO error/detach on the peer. */
		drbd_rs_cancel_all(mdev);
		/* This should be done when we abort the resync. We definitely do not
		   want to have this for connections going back and forth between
		   Ahead/Behind and SyncSource/SyncTarget */
	}

	if (!test_bit(B_RS_H_DONE, &mdev->flags)) {
		if (side == C_SYNC_TARGET) {
			/* Since application IO was locked out during C_WF_BITMAP_T and
			   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
			   we check that we might make the data inconsistent. */
			r = drbd_khelper(mdev, "before-resync-target");
			r = (r >> 8) & 0xff;
			if (r > 0) {
				dev_info(DEV, "before-resync-target handler returned %d, "
					 "dropping connection.\n", r);
				conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
				return;
			}
		} else /* C_SYNC_SOURCE */ {
			r = drbd_khelper(mdev, "before-resync-source");
			r = (r >> 8) & 0xff;
			if (r > 0) {
				if (r == 3) {
					dev_info(DEV, "before-resync-source handler returned %d, "
						 "ignoring. Old userland tools?", r);
				} else {
					dev_info(DEV, "before-resync-source handler returned %d, "
						 "dropping connection.\n", r);
					conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
					return;
				}
			}
		}
	}

	if (current == mdev->tconn->worker.task) {
		/* The worker should not sleep waiting for state_mutex,
		   that can take long */
		if (!mutex_trylock(mdev->state_mutex)) {
			set_bit(B_RS_H_DONE, &mdev->flags);
			mdev->start_resync_timer.expires = jiffies + HZ/5;
			add_timer(&mdev->start_resync_timer);
			return;
		}
	} else {
		mutex_lock(mdev->state_mutex);
	}
	clear_bit(B_RS_H_DONE, &mdev->flags);

	if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
		mutex_unlock(mdev->state_mutex);
		return;
	}

	write_lock_irq(&global_state_lock);
	ns = drbd_read_state(mdev);

	ns.aftr_isp = !_drbd_may_sync_now(mdev);

	ns.conn = side;

	if (side == C_SYNC_TARGET)
		ns.disk = D_INCONSISTENT;
	else /* side == C_SYNC_SOURCE */
		ns.pdsk = D_INCONSISTENT;

	r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
	ns = drbd_read_state(mdev);

	if (ns.conn < C_CONNECTED)
		r = SS_UNKNOWN_ERROR;

	if (r == SS_SUCCESS) {
		unsigned long tw = drbd_bm_total_weight(mdev);
		unsigned long now = jiffies;
		int i;

		mdev->rs_failed    = 0;
		mdev->rs_paused    = 0;
		mdev->rs_same_csum = 0;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->rs_total     = tw;
		mdev->rs_start     = now;
		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = tw;
			mdev->rs_mark_time[i] = now;
		}
		_drbd_pause_after(mdev);
	}
	write_unlock_irq(&global_state_lock);

	if (r == SS_SUCCESS) {
		dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
		     drbd_conn_str(ns.conn),
		     (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
		     (unsigned long) mdev->rs_total);
		if (side == C_SYNC_TARGET)
			mdev->bm_resync_fo = 0;

		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
		 * with w_send_oos, or the sync target will get confused as to
		 * how much bits to resync.  We cannot do that always, because for an
		 * empty resync and protocol < 95, we need to do it here, as we call
		 * drbd_resync_finished from here in that case.
		 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
		 * and from after_state_ch otherwise. */
		if (side == C_SYNC_SOURCE && mdev->tconn->agreed_pro_version < 96)
			drbd_gen_and_send_sync_uuid(mdev);

		if (mdev->tconn->agreed_pro_version < 95 && mdev->rs_total == 0) {
			/* This still has a race (about when exactly the peers
			 * detect connection loss) that can lead to a full sync
			 * on next handshake. In 8.3.9 we fixed this with explicit
			 * resync-finished notifications, but the fix
			 * introduces a protocol change.  Sleeping for some
			 * time longer than the ping interval + timeout on the
			 * SyncSource, to give the SyncTarget the chance to
			 * detect connection loss, then waiting for a ping
			 * response (implicit in drbd_resync_finished) reduces
			 * the race considerably, but does not solve it. */
			if (side == C_SYNC_SOURCE) {
				struct net_conf *nc;
				int timeo;

				rcu_read_lock();
				nc = rcu_dereference(mdev->tconn->net_conf);
				timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
				rcu_read_unlock();
				schedule_timeout_interruptible(timeo);
			}
			drbd_resync_finished(mdev);
		}

		drbd_rs_controller_reset(mdev);
		/* ns.conn may already be != mdev->state.conn,
		 * we may have been paused in between, or become paused until
		 * the timer triggers.
		 * No matter, that is handled in resync_timer_fn() */
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);

		drbd_md_sync(mdev);
	}
	put_ldev(mdev);
	mutex_unlock(mdev->state_mutex);
}
int drbd_worker(struct drbd_thread *thi)
{
	struct drbd_tconn *tconn = thi->tconn;
	struct drbd_work *w = NULL;
	struct drbd_conf *mdev;
	struct net_conf *nc;
	LIST_HEAD(work_list);
	int vnr, intr = 0;
	int cork;

	while (get_t_state(thi) == RUNNING) {
		drbd_thread_current_set_cpu(thi);

		if (down_trylock(&tconn->data.work.s)) {
			mutex_lock(&tconn->data.mutex);

			rcu_read_lock();
			nc = rcu_dereference(tconn->net_conf);
			cork = nc ? nc->tcp_cork : 0;
			rcu_read_unlock();

			if (tconn->data.socket && cork)
				drbd_tcp_uncork(tconn->data.socket);
			mutex_unlock(&tconn->data.mutex);

			intr = down_interruptible(&tconn->data.work.s);

			mutex_lock(&tconn->data.mutex);
			if (tconn->data.socket  && cork)
				drbd_tcp_cork(tconn->data.socket);
			mutex_unlock(&tconn->data.mutex);
		}

		if (intr) {
			flush_signals(current);
			if (get_t_state(thi) == RUNNING) {
				conn_warn(tconn, "Worker got an unexpected signal\n");
				continue;
			}
			break;
		}

		if (get_t_state(thi) != RUNNING)
			break;
		/* With this break, we have done a down() but not consumed
		   the entry from the list. The cleanup code takes care of
		   this...   */

		spin_lock_irq(&tconn->data.work.q_lock);
		if (list_empty(&tconn->data.work.q)) {
			/* something terribly wrong in our logic.
			 * we were able to down() the semaphore,
			 * but the list is empty... doh.
			 *
			 * what is the best thing to do now?
			 * try again from scratch, restarting the receiver,
			 * asender, whatnot? could break even more ugly,
			 * e.g. when we are primary, but no good local data.
			 *
			 * I'll try to get away just starting over this loop.
			 */
			conn_warn(tconn, "Work list unexpectedly empty\n");
			spin_unlock_irq(&tconn->data.work.q_lock);
			continue;
		}
		w = list_entry(tconn->data.work.q.next, struct drbd_work, list);
		list_del_init(&w->list);
		spin_unlock_irq(&tconn->data.work.q_lock);

		if (w->cb(w, tconn->cstate < C_WF_REPORT_PARAMS)) {
			/* dev_warn(DEV, "worker: a callback failed! \n"); */
			if (tconn->cstate >= C_WF_REPORT_PARAMS)
				conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
		}
	}

	spin_lock_irq(&tconn->data.work.q_lock);
	while (!list_empty(&tconn->data.work.q)) {
		list_splice_init(&tconn->data.work.q, &work_list);
		spin_unlock_irq(&tconn->data.work.q_lock);

		while (!list_empty(&work_list)) {
			w = list_entry(work_list.next, struct drbd_work, list);
			list_del_init(&w->list);
			w->cb(w, 1);
		}

		spin_lock_irq(&tconn->data.work.q_lock);
	}
	sema_init(&tconn->data.work.s, 0);
	/* DANGEROUS race: if someone did queue his work within the spinlock,
	 * but up() ed outside the spinlock, we could get an up() on the
	 * semaphore without corresponding list entry.
	 */
	spin_unlock_irq(&tconn->data.work.q_lock);

	rcu_read_lock();
	idr_for_each_entry(&tconn->volumes, mdev, vnr) {
		D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
		kref_get(&mdev->kref);
		rcu_read_unlock();
		drbd_mdev_cleanup(mdev);
		kref_put(&mdev->kref, &drbd_minor_destroy);
		rcu_read_lock();
	}
	rcu_read_unlock();

	return 0;
}