/*
   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */
#include <linux/module.h>
#include <linux/drbd.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/wait.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/scatterlist.h>

#include "drbd_int.h"
#include "drbd_req.h"
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
/* more endio handlers:
   atodb_endio in drbd_actlog.c
   drbd_bm_async_io_complete in drbd_bitmap.c

 * For all these callbacks, note the following:
 * The callbacks will be called in irq context by the IDE drivers,
 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
 * Try to get the locking right :)
 */
/* About the global_state_lock
   Each state transition on a device holds a read lock. In case we have
   to evaluate the sync-after dependencies, we grab a write lock, because
   we need stable states on all devices for that.  */
rwlock_t global_state_lock;
/* used for synchronous meta data and bitmap IO
 * submitted by drbd_md_sync_page_io()
 */
void drbd_md_io_complete(struct bio *bio, int error)
{
	struct drbd_md_io *md_io;

	md_io = (struct drbd_md_io *)bio->bi_private;
	md_io->error = error;

	complete(&md_io->event);
}
/* reads on behalf of the partner,
 * "submitted" by the receiver
 */
void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_conf *mdev = e->mdev;

	D_ASSERT(e->block_id != ID_VACANT);

	spin_lock_irqsave(&mdev->req_lock, flags);
	mdev->read_cnt += e->size >> 9;
	list_del(&e->w.list);
	if (list_empty(&mdev->read_ee))
		wake_up(&mdev->ee_wait);
	if (test_bit(__EE_WAS_ERROR, &e->flags))
		__drbd_chk_io_error(mdev, FALSE);
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	drbd_queue_work(&mdev->data.work, &e->w);
	put_ldev(mdev);
}
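
/* A failed barrier request is one that was flagged EE_IS_BARRIER, completed
 * with an error, and has not yet been resubmitted without the barrier flag. */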
static int is_failed_barrier(int ee_flags)
{
	return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
			== (EE_IS_BARRIER|EE_WAS_ERROR);
}
/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver, final stage.  */
static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_conf *mdev = e->mdev;
	sector_t e_sector;
	int do_wake;
	int is_syncer_req;
	int do_al_complete_io;

	/* if this is a failed barrier request, disable use of barriers,
	 * and schedule for resubmission */
	if (is_failed_barrier(e->flags)) {
		drbd_bump_write_ordering(mdev, WO_bdev_flush);
		spin_lock_irqsave(&mdev->req_lock, flags);
		list_del(&e->w.list);
		e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
		e->w.cb = w_e_reissue;
		/* put_ldev actually happens below, once we come here again. */
		spin_unlock_irqrestore(&mdev->req_lock, flags);
		drbd_queue_work(&mdev->data.work, &e->w);
		return;
	}

	D_ASSERT(e->block_id != ID_VACANT);

	/* after we moved e to done_ee,
	 * we may no longer access it,
	 * it may be freed/reused already!
	 * (as soon as we release the req_lock) */
	e_sector = e->sector;
	do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
	is_syncer_req = is_syncer_block_id(e->block_id);

	spin_lock_irqsave(&mdev->req_lock, flags);
	mdev->writ_cnt += e->size >> 9;
	list_del(&e->w.list); /* has been on active_ee or sync_ee */
	list_add_tail(&e->w.list, &mdev->done_ee);

	/* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
	 * neither did we wake possibly waiting conflicting requests.
	 * done from "drbd_process_done_ee" within the appropriate w.cb
	 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */

	do_wake = is_syncer_req
		? list_empty(&mdev->sync_ee)
		: list_empty(&mdev->active_ee);

	if (test_bit(__EE_WAS_ERROR, &e->flags))
		__drbd_chk_io_error(mdev, FALSE);
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (is_syncer_req)
		drbd_rs_complete_io(mdev, e_sector);

	if (do_wake)
		wake_up(&mdev->ee_wait);

	if (do_al_complete_io)
		drbd_al_complete_io(mdev, e_sector);

	put_ldev(mdev);
}
/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver.
 */
void drbd_endio_sec(struct bio *bio, int error)
{
	struct drbd_epoch_entry *e = bio->bi_private;
	struct drbd_conf *mdev = e->mdev;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);
	int is_write = bio_data_dir(bio) == WRITE;

	if (error)
		dev_warn(DEV, "%s: error=%d s=%llus\n",
				is_write ? "write" : "read", error,
				(unsigned long long)e->sector);
	if (!error && !uptodate) {
		dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
				is_write ? "write" : "read",
				(unsigned long long)e->sector);
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	if (error)
		set_bit(__EE_WAS_ERROR, &e->flags);

	bio_put(bio); /* no need for the bio anymore */
	if (atomic_dec_and_test(&e->pending_bios)) {
		if (is_write)
			drbd_endio_write_sec_final(e);
		else
			drbd_endio_read_sec_final(e);
	}
}
/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
 */
void drbd_endio_pri(struct bio *bio, int error)
{
	struct drbd_request *req = bio->bi_private;
	struct drbd_conf *mdev = req->mdev;
	enum drbd_req_event what;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);

	if (!error && !uptodate) {
		dev_warn(DEV, "p %s: setting error to -EIO\n",
			 bio_data_dir(bio) == WRITE ? "write" : "read");
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	/* to avoid recursion in __req_mod */
	if (unlikely(error)) {
		what = (bio_data_dir(bio) == WRITE)
			? write_completed_with_error
			: (bio_rw(bio) == READ)
			  ? read_completed_with_error
			  : read_ahead_completed_with_error;
	} else
		what = completed_ok;

	bio_put(req->private_bio);
	req->private_bio = ERR_PTR(error);

	req_mod(req, what);
}
int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);

	/* We should not detach for read io-error,
	 * but try to WRITE the P_DATA_REPLY to the failed location,
	 * to give the disk the chance to relocate that block */

	spin_lock_irq(&mdev->req_lock);
	if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
		_req_mod(req, read_retry_remote_canceled);
		spin_unlock_irq(&mdev->req_lock);
		return 1;
	}
	spin_unlock_irq(&mdev->req_lock);

	return w_send_read_req(mdev, w, 0);
}
int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	ERR_IF(cancel) return 1;
	dev_err(DEV, "resync inactive, but callback triggered??\n");
	return 1; /* Simply ignore this! */
}
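
/* Hash the payload of an epoch entry: feed every page of its page chain into
 * the given transform, handling the possibly partial last page. */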
void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct page *page = e->pages;
	struct page *tmp;
	unsigned len;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	while ((tmp = page_chain_next(page))) {
		/* all but the last page will be fully used */
		sg_set_page(&sg, page, PAGE_SIZE, 0);
		crypto_hash_update(&desc, &sg, sg.length);
		page = tmp;
	}
	/* and now the last, possibly only partially used page */
	len = e->size & (PAGE_SIZE - 1);
	sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
	crypto_hash_update(&desc, &sg, sg.length);
	crypto_hash_final(&desc, digest);
}
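
/* Same as drbd_csum_ee(), but for a request bio: hash every bio_vec segment. */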
void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct bio_vec *bvec;
	int i;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	__bio_for_each_segment(bvec, bio, i, 0) {
		sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
		crypto_hash_update(&desc, &sg, sg.length);
	}
	crypto_hash_final(&desc, digest);
}
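
/* Worker callback: checksum the block that was read in by read_for_csum()
 * and send the digest to the peer as a checksum based resync request. */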
static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int digest_size;
	void *digest;
	int ok = 1;

	D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		return 1;
	}

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		digest_size = crypto_hash_digestsize(mdev->csums_tfm);
		digest = kmalloc(digest_size, GFP_NOIO);
		if (digest) {
			drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);

			inc_rs_pending(mdev);
			ok = drbd_send_drequest_csum(mdev,
						     e->sector, e->size,
						     digest, digest_size,
						     P_CSUM_RS_REQUEST);
			kfree(digest);
		} else {
			dev_err(DEV, "kmalloc() of digest failed.\n");
			ok = 0;
		}
	} else
		ok = 0;

	drbd_free_ee(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
	return ok;
}
#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
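
/* Schedule a local read of the given area so that its checksum can be sent
 * to the peer instead of the full data block (checksum based resync). */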
static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
{
	struct drbd_epoch_entry *e;

	if (!get_ldev(mdev))
		return -EIO;

	if (drbd_rs_should_slow_down(mdev))
		goto defer;

	/* GFP_TRY, because if there is no memory available right now, this may
	 * be rescheduled for later. It is "only" background resync, after all. */
	e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
	if (!e)
		goto defer;

	e->w.cb = w_e_send_csum;
	spin_lock_irq(&mdev->req_lock);
	list_add(&e->w.list, &mdev->read_ee);
	spin_unlock_irq(&mdev->req_lock);

	atomic_add(size >> 9, &mdev->rs_sect_ev);
	if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
		return 0;

	drbd_free_ee(mdev, e);
defer:
	put_ldev(mdev);
	return -EAGAIN;
}
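
/* Timer callback: pick the worker callback that matches the current
 * connection state and queue mdev->resync_work to the worker. */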
void resync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;
	int queue;

	queue = 1;
	switch (mdev->state.conn) {
	case C_VERIFY_S:
		mdev->resync_work.cb = w_make_ov_request;
		break;
	case C_SYNC_TARGET:
		mdev->resync_work.cb = w_make_resync_request;
		break;
	default:
		queue = 0;
		mdev->resync_work.cb = w_resync_inactive;
	}

	/* harmless race: list_empty outside data.work.q_lock */
	if (list_empty(&mdev->resync_work.list) && queue)
		drbd_queue_work(&mdev->data.work, &mdev->resync_work);
}
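
/* Small ring buffer helpers for the resync rate controller's plan:
 * fifo_set() overwrites all slots, fifo_add_val() adds a value to all slots,
 * fifo_push() enqueues a new value and returns the one it replaces. */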
static void fifo_set(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] = value;
}
static int fifo_push(struct fifo_buffer *fb, int value)
{
	int ov;

	ov = fb->values[fb->head_index];
	fb->values[fb->head_index++] = value;

	if (fb->head_index >= fb->size)
		fb->head_index = 0;

	return ov;
}
static void fifo_add_val(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] += value;
}
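
/* Dynamic resync speed controller: based on the sectors that came in since
 * the last invocation and the configured fill/delay targets, compute how
 * many sectors to request during the next SLEEP_TIME interval. */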
int drbd_rs_controller(struct drbd_conf *mdev)
{
	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
	unsigned int want;     /* The number of sectors we want in the proxy */
	int req_sect; /* Number of sectors to request in this turn */
	int correction; /* Number of sectors more we need in the proxy*/
	int cps; /* correction per invocation of drbd_rs_controller() */
	int steps; /* Number of time steps to plan ahead */
	int curr_corr;
	int max_sect;

	sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
	mdev->rs_in_flight -= sect_in;

	spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */

	steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */

	if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
		want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
	} else { /* normal path */
		want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
			sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
	}

	correction = want - mdev->rs_in_flight - mdev->rs_planed;

	/* Plan ahead */
	cps = correction / steps;
	fifo_add_val(&mdev->rs_plan_s, cps);
	mdev->rs_planed += cps * steps;

	/* What we do in this step */
	curr_corr = fifo_push(&mdev->rs_plan_s, 0);
	spin_unlock(&mdev->peer_seq_lock);
	mdev->rs_planed -= curr_corr;

	req_sect = sect_in + curr_corr;
	if (req_sect < 0)
		req_sect = 0;

	max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
	if (req_sect > max_sect)
		req_sect = max_sect;

	/*
	dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
		 sect_in, mdev->rs_in_flight, want, correction,
		 steps, cps, mdev->rs_planed, curr_corr, req_sect);
	*/

	return req_sect;
}
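
/* Worker callback of the SyncTarget: pick the next dirty bits from the
 * bitmap, merge adjacent ones into larger requests, throttle against the
 * peer's receive buffer and the local disk, and send resync requests. */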
int w_make_resync_request(struct drbd_conf *mdev,
		struct drbd_work *w, int cancel)
{
	unsigned long bit;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
	int max_segment_size;
	int number, rollback_i, size, pe, mx;
	int align, queued, sndbuf;
	int i = 0;

	if (unlikely(cancel))
		return 1;

	if (unlikely(mdev->state.conn < C_CONNECTED)) {
		dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
		return 0;
	}

	if (mdev->state.conn != C_SYNC_TARGET)
		dev_err(DEV, "%s in w_make_resync_request\n",
			drbd_conn_str(mdev->state.conn));

	if (!get_ldev(mdev)) {
		/* Since we only need to access mdev->rsync a
		   get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
		   to continue resync with a broken disk makes no sense at
		   all */
		dev_err(DEV, "Disk broke down during resync!\n");
		mdev->resync_work.cb = w_resync_inactive;
		return 1;
	}

	/* starting with drbd 8.3.8, we can handle multi-bio EEs,
	 * if it should be necessary */
	max_segment_size = mdev->agreed_pro_version < 94 ?
		queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;

	if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
		number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
		mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
	} else {
		mdev->c_sync_rate = mdev->sync_conf.rate;
		number = SLEEP_TIME * mdev->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
	}

	/* Throttle resync on lower level disk activity, which may also be
	 * caused by application IO on Primary/SyncTarget.
	 * Keep this after the call to drbd_rs_controller, as that assumes
	 * to be called as precisely as possible every SLEEP_TIME,
	 * and would be confused otherwise. */
	if (drbd_rs_should_slow_down(mdev))
		goto requeue;

	mutex_lock(&mdev->data.mutex);
	if (mdev->data.socket)
		mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
	else
		mx = 1;
	mutex_unlock(&mdev->data.mutex);

	/* For resync rates >160MB/sec, allow more pending RS requests */
	if (number > mx)
		mx = number;

	/* Limit the number of pending RS requests to no more than the peer's receive buffer */
	pe = atomic_read(&mdev->rs_pending_cnt);
	if ((pe + number) > mx)
		number = mx - pe;

	for (i = 0; i < number; i++) {
		/* Stop generating RS requests, when half of the send buffer is filled */
		mutex_lock(&mdev->data.mutex);
		if (mdev->data.socket) {
			queued = mdev->data.socket->sk->sk_wmem_queued;
			sndbuf = mdev->data.socket->sk->sk_sndbuf;
		} else {
			queued = 1;
			sndbuf = 0;
		}
		mutex_unlock(&mdev->data.mutex);
		if (queued > sndbuf / 2)
			goto requeue;

next_sector:
		size = BM_BLOCK_SIZE;
		bit  = drbd_bm_find_next(mdev, mdev->bm_resync_fo);

		if (bit == -1UL) {
			mdev->bm_resync_fo = drbd_bm_bits(mdev);
			mdev->resync_work.cb = w_resync_inactive;
			put_ldev(mdev);
			return 1;
		}

		sector = BM_BIT_TO_SECT(bit);

		if (drbd_try_rs_begin_io(mdev, sector)) {
			mdev->bm_resync_fo = bit;
			goto requeue;
		}
		mdev->bm_resync_fo = bit + 1;

		if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
			drbd_rs_complete_io(mdev, sector);
			goto next_sector;
		}

#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
		/* try to find some adjacent bits.
		 * we stop if we have already the maximum req size.
		 *
		 * Additionally always align bigger requests, in order to
		 * be prepared for all stripe sizes of software RAIDs.
		 */
		align = 1;
		rollback_i = i;
		for (;;) {
			if (size + BM_BLOCK_SIZE > max_segment_size)
				break;

			/* Be always aligned */
			if (sector & ((1<<(align+3))-1))
				break;

			/* do not cross extent boundaries */
			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
				break;
			/* now, is it actually dirty, after all?
			 * caution, drbd_bm_test_bit is tri-state for some
			 * obscure reason; ( b == 0 ) would get the out-of-band
			 * only accidentally right because of the "oddly sized"
			 * adjustment below */
			if (drbd_bm_test_bit(mdev, bit+1) != 1)
				break;
			bit++;
			size += BM_BLOCK_SIZE;
			if ((BM_BLOCK_SIZE << align) <= size)
				align++;
			i++;
		}
		/* if we merged some,
		 * reset the offset to start the next drbd_bm_find_next from */
		if (size > BM_BLOCK_SIZE)
			mdev->bm_resync_fo = bit + 1;
#endif

		/* adjust very last sectors, in case we are oddly sized */
		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;
		if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
			switch (read_for_csum(mdev, sector, size)) {
			case -EIO: /* Disk failure */
				put_ldev(mdev);
				return 0;
			case -EAGAIN: /* allocation failed, or ldev busy */
				drbd_rs_complete_io(mdev, sector);
				mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
				i = rollback_i;
				goto requeue;
			case 0:
				/* everything ok */
				break;
			default:
				BUG();
			}
		} else {
			inc_rs_pending(mdev);
			if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
					       sector, size, ID_SYNCER)) {
				dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
				dec_rs_pending(mdev);
				put_ldev(mdev);
				return 0;
			}
		}
	}

	if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
		/* last syncer _request_ was sent,
		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
		 * next sync group will resume), as soon as we receive the last
		 * resync data block, and the last bit is cleared.
		 * until then resync "work" is "inactive" ...
		 */
		mdev->resync_work.cb = w_resync_inactive;
		put_ldev(mdev);
		return 1;
	}

 requeue:
	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	put_ldev(mdev);
	return 1;
}
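
/* Worker callback of the online-verify source: send P_OV_REQUEST packets
 * for the next chunk of sectors, limited by the configured sync rate. */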
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	int number, i, size;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);

	if (unlikely(cancel))
		return 1;

	if (unlikely(mdev->state.conn < C_CONNECTED)) {
		dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
		return 0;
	}

	number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
	if (atomic_read(&mdev->rs_pending_cnt) > number)
		goto requeue;

	number -= atomic_read(&mdev->rs_pending_cnt);

	sector = mdev->ov_position;
	for (i = 0; i < number; i++) {
		if (sector >= capacity) {
			mdev->resync_work.cb = w_resync_inactive;
			return 1;
		}

		size = BM_BLOCK_SIZE;

		if (drbd_try_rs_begin_io(mdev, sector)) {
			mdev->ov_position = sector;
			goto requeue;
		}

		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;

		inc_rs_pending(mdev);
		if (!drbd_send_ov_request(mdev, sector, size)) {
			dec_rs_pending(mdev);
			return 0;
		}
		sector += BM_SECT_PER_BIT;
	}
	mdev->ov_position = sector;

 requeue:
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	return 1;
}
int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	kfree(w);
	drbd_resync_finished(mdev);

	return 1;
}
static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	kfree(w);
	drbd_resync_finished(mdev);

	return 1;
}
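
/* Called when the last resync/online-verify block has been processed:
 * print statistics, update disk and peer-disk states and the UUID set,
 * and run the configured helper, if any. */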
int drbd_resync_finished(struct drbd_conf *mdev)
{
	unsigned long db, dt, dbdt;
	unsigned long n_oos;
	union drbd_state os, ns;
	struct drbd_work *w;
	char *khelper_cmd = NULL;

	/* Remove all elements from the resync LRU. Since future actions
	 * might set bits in the (main) bitmap, then the entries in the
	 * resync LRU would be wrong. */
	if (drbd_rs_del_all(mdev)) {
		/* In case this is not possible now, most probably because
		 * there are P_RS_DATA_REPLY Packets lingering on the worker's
		 * queue (or even the read operations for those packets
		 * is not finished by now).   Retry in 100ms. */

		__set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ / 10);
		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
		if (w) {
			w->cb = w_resync_finished;
			drbd_queue_work(&mdev->data.work, w);
			return 1;
		}
		dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
	}

	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
	if (dt <= 0)
		dt = 1;
	db = mdev->rs_total;
	dbdt = Bit2KB(db/dt);
	mdev->rs_paused /= HZ;

	if (!get_ldev(mdev))
		goto out;

	spin_lock_irq(&mdev->req_lock);
	os = mdev->state;

	/* This protects us against multiple calls (that can happen in the presence
	   of application IO), and against connectivity loss just before we arrive here. */
	if (os.conn <= C_CONNECTED)
		goto out_unlock;

	ns = os;
	ns.conn = C_CONNECTED;

	dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
	     (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
	     "Online verify " : "Resync",
	     dt + mdev->rs_paused, mdev->rs_paused, dbdt);

	n_oos = drbd_bm_total_weight(mdev);

	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
		if (n_oos) {
			dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
			      n_oos, Bit2KB(1));
			khelper_cmd = "out-of-sync";
		}
	} else {
		D_ASSERT((n_oos - mdev->rs_failed) == 0);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
			khelper_cmd = "after-resync-target";

		if (mdev->csums_tfm && mdev->rs_total) {
			const unsigned long s = mdev->rs_same_csum;
			const unsigned long t = mdev->rs_total;
			const int ratio =
				(t == 0)     ? 0 :
				(t < 100000) ? ((s*100)/t) : (s/(t/100));
			dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
			     "transferred %luK total %luK\n",
			     ratio,
			     Bit2KB(mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total - mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total));
		}
	}

	if (mdev->rs_failed) {
		dev_info(DEV, "            %lu failed blocks\n", mdev->rs_failed);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			ns.disk = D_INCONSISTENT;
			ns.pdsk = D_UP_TO_DATE;
		} else {
			ns.disk = D_UP_TO_DATE;
			ns.pdsk = D_INCONSISTENT;
		}
	} else {
		ns.disk = D_UP_TO_DATE;
		ns.pdsk = D_UP_TO_DATE;

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			if (mdev->p_uuid) {
				int i;
				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
					_drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
				drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
				_drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
			} else {
				dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
			}
		}

		drbd_uuid_set_bm(mdev, 0UL);

		if (mdev->p_uuid) {
			/* Now the two UUID sets are equal, update what we
			 * know of the peer. */
			int i;
			for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
				mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
		}
	}

	_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
out_unlock:
	spin_unlock_irq(&mdev->req_lock);
	put_ldev(mdev);
out:
	mdev->rs_total  = 0;
	mdev->rs_failed = 0;
	mdev->rs_paused = 0;
	mdev->ov_start_sector = 0;

	if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
		dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
	}

	if (khelper_cmd)
		drbd_khelper(mdev, khelper_cmd);

	return 1;
}
static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
{
	if (drbd_ee_has_active_page(e)) {
		/* This might happen if sendpage() has not finished */
		int i = DIV_ROUND_UP(e->size, PAGE_SIZE);
		atomic_add(i, &mdev->pp_in_use_by_net);
		atomic_sub(i, &mdev->pp_in_use);
		spin_lock_irq(&mdev->req_lock);
		list_add_tail(&e->w.list, &mdev->net_ee);
		spin_unlock_irq(&mdev->req_lock);
		wake_up(&drbd_pp_wait);
	} else
		drbd_free_ee(mdev, e);
}
/**
 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
 * @mdev:	DRBD device.
 * @cancel:	The connection will be closed anyways
 */
int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int ok;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		dec_unacked(mdev);
		return 1;
	}

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		ok = drbd_send_block(mdev, P_DATA_REPLY, e);
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
			    (unsigned long long)e->sector);

		ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
	}

	dec_unacked(mdev);

	move_to_net_ee_or_free(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_block() failed\n");
	return ok;
}
/**
 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
 * @mdev:	DRBD device.
 * @cancel:	The connection will be closed anyways
 */
int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int ok;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		dec_unacked(mdev);
		return 1;
	}

	if (get_ldev_if_state(mdev, D_FAILED)) {
		drbd_rs_complete_io(mdev, e->sector);
		put_ldev(mdev);
	}

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
			inc_rs_pending(mdev);
			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
		} else {
			if (__ratelimit(&drbd_ratelimit_state))
				dev_err(DEV, "Not sending RSDataReply, "
				    "partner DISKLESS!\n");
			ok = 1;
		}
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
			    (unsigned long long)e->sector);

		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);

		/* update resync data with failure */
		drbd_rs_failed_io(mdev, e->sector, e->size);
	}

	dec_unacked(mdev);

	move_to_net_ee_or_free(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_block() failed\n");
	return ok;
}
int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	struct digest_info *di;
	int digest_size;
	void *digest = NULL;
	int ok, eq = 0;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		dec_unacked(mdev);
		return 1;
	}

	if (get_ldev(mdev)) {
		drbd_rs_complete_io(mdev, e->sector);
		put_ldev(mdev);
	}

	di = e->digest;

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		/* quick hack to try to avoid a race against reconfiguration.
		 * a real fix would be much more involved,
		 * introducing more locking mechanisms */
		if (mdev->csums_tfm) {
			digest_size = crypto_hash_digestsize(mdev->csums_tfm);
			D_ASSERT(digest_size == di->digest_size);
			digest = kmalloc(digest_size, GFP_NOIO);
		}
		if (digest) {
			drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}

		if (eq) {
			drbd_set_in_sync(mdev, e->sector, e->size);
			/* rs_same_csums unit is BM_BLOCK_SIZE */
			mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
			ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
		} else {
			inc_rs_pending(mdev);
			e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
			e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */
			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
		}
	} else {
		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
	}

	dec_unacked(mdev);

	move_to_net_ee_or_free(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_block/ack() failed\n");
	return ok;
}
int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int digest_size;
	void *digest;
	int ok = 1;

	if (unlikely(cancel))
		goto out;

	if (unlikely((e->flags & EE_WAS_ERROR) != 0))
		goto out;

	digest_size = crypto_hash_digestsize(mdev->verify_tfm);
	/* FIXME if this allocation fails, online verify will not terminate! */
	digest = kmalloc(digest_size, GFP_NOIO);
	if (digest) {
		drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
		inc_rs_pending(mdev);
		ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
					     digest, digest_size, P_OV_REPLY);
		if (!ok)
			dec_rs_pending(mdev);
		kfree(digest);
	}

out:
	drbd_free_ee(mdev, e);

	dec_unacked(mdev);

	return ok;
}
void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
{
	if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
		mdev->ov_last_oos_size += size>>9;
	} else {
		mdev->ov_last_oos_start = sector;
		mdev->ov_last_oos_size = size>>9;
	}
	drbd_set_out_of_sync(mdev, sector, size);
	set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
}
int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	struct digest_info *di;
	int digest_size;
	void *digest;
	int ok, eq = 0;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		dec_unacked(mdev);
		return 1;
	}

	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
	 * the resync lru has been cleaned up already */
	if (get_ldev(mdev)) {
		drbd_rs_complete_io(mdev, e->sector);
		put_ldev(mdev);
	}

	di = e->digest;

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		digest_size = crypto_hash_digestsize(mdev->verify_tfm);
		digest = kmalloc(digest_size, GFP_NOIO);
		if (digest) {
			drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);

			D_ASSERT(digest_size == di->digest_size);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}
	} else {
		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
	}

	dec_unacked(mdev);

	if (!eq)
		drbd_ov_oos_found(mdev, e->sector, e->size);

	ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
			      eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);

	drbd_free_ee(mdev, e);

	if (--mdev->ov_left == 0) {
		drbd_resync_finished(mdev);
	}

	return ok;
}
int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
	complete(&b->done);
	return 1;
}
int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
	struct p_barrier *p = &mdev->data.sbuf.barrier;
	int ok = 1;

	/* really avoid racing with tl_clear.  w.cb may have been referenced
	 * just before it was reassigned and re-queued, so double check that.
	 * actually, this race was harmless, since we only try to send the
	 * barrier packet here, and otherwise do nothing with the object.
	 * but compare with the head of w_clear_epoch */
	spin_lock_irq(&mdev->req_lock);
	if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
		cancel = 1;
	spin_unlock_irq(&mdev->req_lock);
	if (cancel)
		return 1;

	if (!drbd_get_data_sock(mdev))
		return 0;
	p->barrier = b->br_number;
	/* inc_ap_pending was done where this was queued.
	 * dec_ap_pending will be done in got_BarrierAck
	 * or (on connection loss) in w_clear_epoch.  */
	ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
				(struct p_header80 *)p, sizeof(*p), 0);
	drbd_put_data_sock(mdev);

	return ok;
}
int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	if (cancel)
		return 1;
	return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
}
/**
 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
 * @mdev:	DRBD device.
 * @cancel:	The connection will be closed anyways
 */
int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	int ok;

	if (unlikely(cancel)) {
		req_mod(req, send_canceled);
		return 1;
	}

	ok = drbd_send_dblock(mdev, req);
	req_mod(req, ok ? handed_over_to_network : send_failed);

	return ok;
}
/**
 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
 * @mdev:	DRBD device.
 * @cancel:	The connection will be closed anyways
 */
int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	int ok;

	if (unlikely(cancel)) {
		req_mod(req, send_canceled);
		return 1;
	}

	ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
				(unsigned long)req);

	if (!ok) {
		/* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
		 * so this is probably redundant */
		if (mdev->state.conn >= C_CONNECTED)
			drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
	}
	req_mod(req, ok ? handed_over_to_network : send_failed);

	return ok;
}
int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);

	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
		drbd_al_begin_io(mdev, req->sector);
	/* Calling drbd_al_begin_io() out of the worker might deadlock
	   theoretically. Practically it cannot deadlock, since this is
	   only used when unfreezing IOs. All the extents of the requests
	   that made it into the TL are already active */

	drbd_req_make_private_bio(req, req->master_bio);
	req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
	generic_make_request(req->private_bio);

	return 1;
}
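
/* Walk the sync-after dependency chain: return 0 if any device this one
 * depends on is currently resyncing or paused, 1 if it may resync now. */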
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
	struct drbd_conf *odev = mdev;

	while (1) {
		if (odev->sync_conf.after == -1)
			return 1;
		odev = minor_to_mdev(odev->sync_conf.after);
		ERR_IF(!odev) return 1;
		if ((odev->state.conn >= C_SYNC_SOURCE &&
		     odev->state.conn <= C_PAUSED_SYNC_T) ||
		    odev->state.aftr_isp || odev->state.peer_isp ||
		    odev->state.user_isp)
			return 0;
	}
}
/**
 * _drbd_pause_after() - Pause resync on all devices that may not resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and after_state_ch).
 */
static int _drbd_pause_after(struct drbd_conf *mdev)
{
	struct drbd_conf *odev;
	int i, rv = 0;

	for (i = 0; i < minor_count; i++) {
		odev = minor_to_mdev(i);
		if (!odev)
			continue;
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (!_drbd_may_sync_now(odev))
			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
			       != SS_NOTHING_TO_DO);
	}

	return rv;
}
/**
 * _drbd_resume_next() - Resume resync on all devices that may resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and worker).
 */
static int _drbd_resume_next(struct drbd_conf *mdev)
{
	struct drbd_conf *odev;
	int i, rv = 0;

	for (i = 0; i < minor_count; i++) {
		odev = minor_to_mdev(i);
		if (!odev)
			continue;
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (odev->state.aftr_isp) {
			if (_drbd_may_sync_now(odev))
				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
							CS_HARD, NULL)
				       != SS_NOTHING_TO_DO) ;
		}
	}
	return rv;
}
void resume_next_sg(struct drbd_conf *mdev)
{
	write_lock_irq(&global_state_lock);
	_drbd_resume_next(mdev);
	write_unlock_irq(&global_state_lock);
}
void suspend_other_sg(struct drbd_conf *mdev)
{
	write_lock_irq(&global_state_lock);
	_drbd_pause_after(mdev);
	write_unlock_irq(&global_state_lock);
}
static int sync_after_error(struct drbd_conf *mdev, int o_minor)
{
	struct drbd_conf *odev;

	if (o_minor == -1)
		return NO_ERROR;
	if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
		return ERR_SYNC_AFTER;

	/* check for loops */
	odev = minor_to_mdev(o_minor);
	while (1) {
		if (odev == mdev)
			return ERR_SYNC_AFTER_CYCLE;

		/* dependency chain ends here, no cycles. */
		if (odev->sync_conf.after == -1)
			return NO_ERROR;

		/* follow the dependency chain */
		odev = minor_to_mdev(odev->sync_conf.after);
	}
}
int drbd_alter_sa(struct drbd_conf *mdev, int na)
{
	int changes;
	int retcode;

	write_lock_irq(&global_state_lock);
	retcode = sync_after_error(mdev, na);
	if (retcode == NO_ERROR) {
		mdev->sync_conf.after = na;
		do {
			changes  = _drbd_pause_after(mdev);
			changes |= _drbd_resume_next(mdev);
		} while (changes);
	}
	write_unlock_irq(&global_state_lock);
	return retcode;
}
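
/* Wait until the peer answers a ping (GOT_PING_ACK) or the connection
 * is lost; used before finishing a zero-sized resync. */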
static void ping_peer(struct drbd_conf *mdev)
{
	clear_bit(GOT_PING_ACK, &mdev->flags);
	request_ping(mdev);
	wait_event(mdev->misc_wait,
		   test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
}
/**
 * drbd_start_resync() - Start the resync process
 * @mdev:	DRBD device.
 * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
 *
 * This function might bring you directly into one of the
 * C_PAUSED_SYNC_* states.
 */
void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
{
	union drbd_state ns;
	int r;

	if (mdev->state.conn >= C_SYNC_SOURCE) {
		dev_err(DEV, "Resync already running!\n");
		return;
	}

	/* In case a previous resync run was aborted by an IO error/detach on the peer. */
	drbd_rs_cancel_all(mdev);

	if (side == C_SYNC_TARGET) {
		/* Since application IO was locked out during C_WF_BITMAP_T and
		   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
		   we check that we might make the data inconsistent. */
		r = drbd_khelper(mdev, "before-resync-target");
		r = (r >> 8) & 0xff;
		if (r > 0) {
			dev_info(DEV, "before-resync-target handler returned %d, "
			     "dropping connection.\n", r);
			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
			return;
		}
	}

	drbd_state_lock(mdev);

	if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
		drbd_state_unlock(mdev);
		return;
	}

	if (side == C_SYNC_TARGET) {
		mdev->bm_resync_fo = 0;
	} else /* side == C_SYNC_SOURCE */ {
		u64 uuid;

		get_random_bytes(&uuid, sizeof(u64));
		drbd_uuid_set(mdev, UI_BITMAP, uuid);
		drbd_send_sync_uuid(mdev, uuid);

		D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
	}

	write_lock_irq(&global_state_lock);
	ns = mdev->state;

	ns.aftr_isp = !_drbd_may_sync_now(mdev);

	ns.conn = side;

	if (side == C_SYNC_TARGET)
		ns.disk = D_INCONSISTENT;
	else /* side == C_SYNC_SOURCE */
		ns.pdsk = D_INCONSISTENT;

	r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
	ns = mdev->state;

	if (ns.conn < C_CONNECTED)
		r = SS_UNKNOWN_ERROR;

	if (r == SS_SUCCESS) {
		unsigned long tw = drbd_bm_total_weight(mdev);
		unsigned long now = jiffies;
		int i;

		mdev->rs_failed    = 0;
		mdev->rs_paused    = 0;
		mdev->rs_same_csum = 0;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->rs_total     = tw;
		mdev->rs_start     = now;
		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = tw;
			mdev->rs_mark_time[i] = now;
		}
		_drbd_pause_after(mdev);
	}
	write_unlock_irq(&global_state_lock);
	put_ldev(mdev);

	if (r == SS_SUCCESS) {
		dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
		     drbd_conn_str(ns.conn),
		     (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
		     (unsigned long) mdev->rs_total);

		if (mdev->rs_total == 0) {
			/* Peer still reachable? Beware of failing before-resync-target handlers! */
			ping_peer(mdev);
			drbd_resync_finished(mdev);
		}

		atomic_set(&mdev->rs_sect_in, 0);
		atomic_set(&mdev->rs_sect_ev, 0);
		mdev->rs_in_flight = 0;
		mdev->rs_planed = 0;
		spin_lock(&mdev->peer_seq_lock);
		fifo_set(&mdev->rs_plan_s, 0);
		spin_unlock(&mdev->peer_seq_lock);
		/* ns.conn may already be != mdev->state.conn,
		 * we may have been paused in between, or become paused until
		 * the timer triggers.
		 * No matter, that is handled in resync_timer_fn() */
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}
	drbd_state_unlock(mdev);
}
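
/* Main loop of the per-device worker thread: wait on the work queue
 * semaphore, cork/uncork the data socket around idle periods, run the
 * queued callbacks, and clean up remaining work items on shutdown. */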
int drbd_worker(struct drbd_thread *thi)
{
	struct drbd_conf *mdev = thi->mdev;
	struct drbd_work *w = NULL;
	LIST_HEAD(work_list);
	int intr = 0, i = 0;

	sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));

	while (get_t_state(thi) == Running) {
		drbd_thread_current_set_cpu(mdev);

		if (down_trylock(&mdev->data.work.s)) {
			mutex_lock(&mdev->data.mutex);
			if (mdev->data.socket && !mdev->net_conf->no_cork)
				drbd_tcp_uncork(mdev->data.socket);
			mutex_unlock(&mdev->data.mutex);

			intr = down_interruptible(&mdev->data.work.s);

			mutex_lock(&mdev->data.mutex);
			if (mdev->data.socket  && !mdev->net_conf->no_cork)
				drbd_tcp_cork(mdev->data.socket);
			mutex_unlock(&mdev->data.mutex);
		}

		if (intr) {
			D_ASSERT(intr == -EINTR);
			flush_signals(current);
			ERR_IF (get_t_state(thi) == Running)
				continue;
			break;
		}

		if (get_t_state(thi) != Running)
			break;
		/* With this break, we have done a down() but not consumed
		   the entry from the list. The cleanup code takes care of
		   this...   */

		w = NULL;
		spin_lock_irq(&mdev->data.work.q_lock);
		ERR_IF(list_empty(&mdev->data.work.q)) {
			/* something terribly wrong in our logic.
			 * we were able to down() the semaphore,
			 * but the list is empty... doh.
			 *
			 * what is the best thing to do now?
			 * try again from scratch, restarting the receiver,
			 * asender, whatnot? could break even more ugly,
			 * e.g. when we are primary, but no good local data.
			 *
			 * I'll try to get away just starting over this loop.
			 */
			spin_unlock_irq(&mdev->data.work.q_lock);
			continue;
		}
		w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
		list_del_init(&w->list);
		spin_unlock_irq(&mdev->data.work.q_lock);

		if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
			/* dev_warn(DEV, "worker: a callback failed! \n"); */
			if (mdev->state.conn >= C_CONNECTED)
				drbd_force_state(mdev,
						NS(conn, C_NETWORK_FAILURE));
		}
	}
	D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
	D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));

	spin_lock_irq(&mdev->data.work.q_lock);
	while (!list_empty(&mdev->data.work.q)) {
		list_splice_init(&mdev->data.work.q, &work_list);
		spin_unlock_irq(&mdev->data.work.q_lock);

		while (!list_empty(&work_list)) {
			w = list_entry(work_list.next, struct drbd_work, list);
			list_del_init(&w->list);
			w->cb(mdev, w, 1);
			i++; /* dead debugging code */
		}

		spin_lock_irq(&mdev->data.work.q_lock);
	}
	sema_init(&mdev->data.work.s, 0);
	/* DANGEROUS race: if someone did queue his work within the spinlock,
	 * but up() ed outside the spinlock, we could get an up() on the
	 * semaphore without corresponding list entry.
	 * So don't do that.
	 */
	spin_unlock_irq(&mdev->data.work.q_lock);

	D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
	/* _drbd_set_state only uses stop_nowait.
	 * wait here for the Exiting receiver. */
	drbd_thread_stop(&mdev->receiver);
	drbd_mdev_cleanup(mdev);

	dev_info(DEV, "worker terminated\n");

	clear_bit(DEVICE_DYING, &mdev->flags);
	clear_bit(CONFIG_PENDING, &mdev->flags);
	wake_up(&mdev->state_wait);

	return 0;
}