/*
   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include <linux/module.h>
#include <linux/drbd.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/wait.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/scatterlist.h>
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
/* more endio handlers:
 *   atodb_endio in drbd_actlog.c
 *   drbd_bm_async_io_complete in drbd_bitmap.c
 *
 * For all these callbacks, note the following:
 * The callbacks will be called in irq context by the IDE drivers,
 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
 * Try to get the locking right :)
 */
/* About the global_state_lock
   Each state transition on a device holds a read lock. In case we have
   to evaluate the sync-after dependencies, we grab a write lock, because
   we need stable states on all devices for that.  */
rwlock_t global_state_lock;
/* used for synchronous meta data and bitmap IO
 * submitted by drbd_md_sync_page_io()
 */
void drbd_md_io_complete(struct bio *bio, int error)
{
	struct drbd_md_io *md_io;

	md_io = (struct drbd_md_io *)bio->bi_private;
	md_io->error = error;

	complete(&md_io->event);
}
/* reads on behalf of the partner,
 * "submitted" by the receiver
 */
void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_conf *mdev = e->mdev;

	D_ASSERT(e->block_id != ID_VACANT);

	spin_lock_irqsave(&mdev->req_lock, flags);
	mdev->read_cnt += e->size >> 9;
	list_del(&e->w.list);
	if (list_empty(&mdev->read_ee))
		wake_up(&mdev->ee_wait);
	if (test_bit(__EE_WAS_ERROR, &e->flags))
		__drbd_chk_io_error(mdev, FALSE);
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	drbd_queue_work(&mdev->data.work, &e->w);
	put_ldev(mdev);
}
static int is_failed_barrier(int ee_flags)
{
	return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
			== (EE_IS_BARRIER|EE_WAS_ERROR);
}
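/* A note on the check in is_failed_barrier(): the mask keeps only the three
 * flag bits that matter here, and the comparison is true exactly when
 * EE_IS_BARRIER and EE_WAS_ERROR are set while EE_RESUBMITTED is still
 * clear.  In other words, a barrier write that failed is recognized (and
 * scheduled for resubmission) only once; after EE_RESUBMITTED has been set,
 * the same entry no longer matches. */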
/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver, final stage.  */
static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
{
	unsigned long flags = 0;
	struct drbd_conf *mdev = e->mdev;
	sector_t e_sector;
	int do_wake;
	int is_syncer_req;
	int do_al_complete_io;

	/* if this is a failed barrier request, disable use of barriers,
	 * and schedule for resubmission */
	if (is_failed_barrier(e->flags)) {
		drbd_bump_write_ordering(mdev, WO_bdev_flush);
		spin_lock_irqsave(&mdev->req_lock, flags);
		list_del(&e->w.list);
		e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
		e->w.cb = w_e_reissue;
		/* put_ldev actually happens below, once we come here again. */
		spin_unlock_irqrestore(&mdev->req_lock, flags);
		drbd_queue_work(&mdev->data.work, &e->w);
		return;
	}

	D_ASSERT(e->block_id != ID_VACANT);

	/* after we moved e to done_ee,
	 * we may no longer access it,
	 * it may be freed/reused already!
	 * (as soon as we release the req_lock) */
	e_sector = e->sector;
	do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
	is_syncer_req = is_syncer_block_id(e->block_id);

	spin_lock_irqsave(&mdev->req_lock, flags);
	mdev->writ_cnt += e->size >> 9;
	list_del(&e->w.list); /* has been on active_ee or sync_ee */
	list_add_tail(&e->w.list, &mdev->done_ee);

	/* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
	 * neither did we wake possibly waiting conflicting requests.
	 * done from "drbd_process_done_ee" within the appropriate w.cb
	 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */

	do_wake = is_syncer_req
		? list_empty(&mdev->sync_ee)
		: list_empty(&mdev->active_ee);

	if (test_bit(__EE_WAS_ERROR, &e->flags))
		__drbd_chk_io_error(mdev, FALSE);
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (is_syncer_req)
		drbd_rs_complete_io(mdev, e_sector);

	if (do_wake)
		wake_up(&mdev->ee_wait);

	if (do_al_complete_io)
		drbd_al_complete_io(mdev, e_sector);

	put_ldev(mdev);
}
/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver.
 */
void drbd_endio_sec(struct bio *bio, int error)
{
	struct drbd_epoch_entry *e = bio->bi_private;
	struct drbd_conf *mdev = e->mdev;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);
	int is_write = bio_data_dir(bio) == WRITE;

	if (error)
		dev_warn(DEV, "%s: error=%d s=%llus\n",
				is_write ? "write" : "read", error,
				(unsigned long long)e->sector);
	if (!error && !uptodate) {
		dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
				is_write ? "write" : "read",
				(unsigned long long)e->sector);
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	if (error)
		set_bit(__EE_WAS_ERROR, &e->flags);

	bio_put(bio); /* no need for the bio anymore */
	if (atomic_dec_and_test(&e->pending_bios)) {
		if (is_write)
			drbd_endio_write_sec_final(e);
		else
			drbd_endio_read_sec_final(e);
	}
}
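/* Context for drbd_endio_sec(): one epoch entry may have been split into
 * several bios (multi-bio epoch entries).  Every bio completion decrements
 * e->pending_bios; any failure is only recorded in __EE_WAS_ERROR.  The
 * final per-entry handler (write or read flavour) therefore runs exactly
 * once, when the last of those bios completes. */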
/* read, read-ahead or write requests on R_PRIMARY coming from drbd_make_request
 */
void drbd_endio_pri(struct bio *bio, int error)
{
	unsigned long flags;
	struct drbd_request *req = bio->bi_private;
	struct drbd_conf *mdev = req->mdev;
	struct bio_and_error m;
	enum drbd_req_event what;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);

	if (!error && !uptodate) {
		dev_warn(DEV, "p %s: setting error to -EIO\n",
			 bio_data_dir(bio) == WRITE ? "write" : "read");
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	/* to avoid recursion in __req_mod */
	if (unlikely(error)) {
		what = (bio_data_dir(bio) == WRITE)
			? write_completed_with_error
			: (bio_rw(bio) == READ)
			  ? read_completed_with_error
			  : read_ahead_completed_with_error;
	} else
		what = completed_ok;

	bio_put(req->private_bio);
	req->private_bio = ERR_PTR(error);

	spin_lock_irqsave(&mdev->req_lock, flags);
	__req_mod(req, what, &m);
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (m.bio)
		complete_master_bio(mdev, &m);
}
int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);

	/* We should not detach for read io-error,
	 * but try to WRITE the P_DATA_REPLY to the failed location,
	 * to give the disk the chance to relocate that block */

	spin_lock_irq(&mdev->req_lock);
	if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
		_req_mod(req, read_retry_remote_canceled);
		spin_unlock_irq(&mdev->req_lock);
		return 1;
	}
	spin_unlock_irq(&mdev->req_lock);

	return w_send_read_req(mdev, w, 0);
}
int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	ERR_IF(cancel) return 1;
	dev_err(DEV, "resync inactive, but callback triggered??\n");
	return 1; /* Simply ignore this! */
}
void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct page *page = e->pages;
	struct page *tmp;
	unsigned len;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	while ((tmp = page_chain_next(page))) {
		/* all but the last page will be fully used */
		sg_set_page(&sg, page, PAGE_SIZE, 0);
		crypto_hash_update(&desc, &sg, sg.length);
		page = tmp;
	}
	/* and now the last, possibly only partially used page */
	len = e->size & (PAGE_SIZE - 1);
	sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
	crypto_hash_update(&desc, &sg, sg.length);
	crypto_hash_final(&desc, digest);
}
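/* How drbd_csum_ee() walks the data: an epoch entry's payload lives in a
 * chain of pages (e->pages, linked via page_chain_next()).  Every page but
 * the last is hashed in full; the last page is hashed for the remaining
 * e->size & (PAGE_SIZE - 1) bytes, where the "len ?: PAGE_SIZE" idiom falls
 * back to a full page when e->size is an exact multiple of PAGE_SIZE. */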
void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
{
	struct hash_desc desc;
	struct scatterlist sg;
	struct bio_vec *bvec;
	int i;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	__bio_for_each_segment(bvec, bio, i, 0) {
		sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
		crypto_hash_update(&desc, &sg, sg.length);
	}
	crypto_hash_final(&desc, digest);
}
static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int digest_size;
	void *digest;
	int ok = 1;

	D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		return 1;
	}

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		digest_size = crypto_hash_digestsize(mdev->csums_tfm);
		digest = kmalloc(digest_size, GFP_NOIO);
		if (digest) {
			drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);

			inc_rs_pending(mdev);
			ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
						     digest, digest_size,
						     P_CSUM_RS_REQUEST);
			kfree(digest);
		} else {
			dev_err(DEV, "kmalloc() of digest failed.\n");
			ok = 0;
		}
	}

	drbd_free_ee(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
	return ok;
}
#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)

static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
{
	struct drbd_epoch_entry *e;

	if (!get_ldev(mdev))
		return -EIO;

	/* GFP_TRY, because if there is no memory available right now, this may
	 * be rescheduled for later. It is "only" background resync, after all. */
	e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
	if (!e)
		goto defer;

	e->w.cb = w_e_send_csum;
	spin_lock_irq(&mdev->req_lock);
	list_add(&e->w.list, &mdev->read_ee);
	spin_unlock_irq(&mdev->req_lock);

	if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
		return 0;

	drbd_free_ee(mdev, e);
defer:
	put_ldev(mdev);
	return -EAGAIN;
}
void resync_timer_fn(unsigned long data)
{
	unsigned long flags;
	struct drbd_conf *mdev = (struct drbd_conf *) data;
	int queue;

	spin_lock_irqsave(&mdev->req_lock, flags);

	if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
		queue = 1;
		if (mdev->state.conn == C_VERIFY_S)
			mdev->resync_work.cb = w_make_ov_request;
		else
			mdev->resync_work.cb = w_make_resync_request;
	} else {
		queue = 0;
		mdev->resync_work.cb = w_resync_inactive;
	}

	spin_unlock_irqrestore(&mdev->req_lock, flags);

	/* harmless race: list_empty outside data.work.q_lock */
	if (list_empty(&mdev->resync_work.list) && queue)
		drbd_queue_work(&mdev->data.work, &mdev->resync_work);
}
static void fifo_set(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] = value;
}
static int fifo_push(struct fifo_buffer *fb, int value)
{
	int ov;

	ov = fb->values[fb->head_index];
	fb->values[fb->head_index++] = value;

	if (fb->head_index >= fb->size) /* wrap around */
		fb->head_index = 0;

	return ov;
}
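/* fifo_push() treats the plan buffer as a fixed-size ring: it returns the
 * value stored in the oldest slot (head_index), overwrites that slot with
 * the new value, and wraps head_index back to 0 at the end of the array.
 * Each slot corresponds to one upcoming controller step (one SLEEP_TIME
 * interval), so the value popped here is the correction planned for the
 * current interval. */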
static void fifo_add_val(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] += value;
}
int drbd_rs_controller(struct drbd_conf *mdev)
{
	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
	unsigned int want;     /* The number of sectors we want in the proxy */
	int req_sect;    /* Number of sectors to request in this turn */
	int correction;  /* Number of sectors more we need in the proxy */
	int cps;         /* correction per invocation of drbd_rs_controller() */
	int steps;       /* Number of time steps to plan ahead */
	int curr_corr;
	int max_sect;

	sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
	mdev->rs_in_flight -= sect_in;

	spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */

	steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */

	if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
		want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
	} else { /* normal path */
		want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
			sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
	}

	correction = want - mdev->rs_in_flight - mdev->rs_planed;

	/* Plan ahead */
	cps = correction / steps;
	fifo_add_val(&mdev->rs_plan_s, cps);
	mdev->rs_planed += cps * steps;

	/* What we do in this step */
	curr_corr = fifo_push(&mdev->rs_plan_s, 0);
	spin_unlock(&mdev->peer_seq_lock);
	mdev->rs_planed -= curr_corr;

	req_sect = sect_in + curr_corr;

	max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
	if (req_sect > max_sect)
		req_sect = max_sect;

	/*
	dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
		 sect_in, mdev->rs_in_flight, want, correction,
		 steps, cps, mdev->rs_planed, curr_corr, req_sect);
	*/

	return req_sect;
}
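/* Rough sketch of what the controller above computes, with illustrative
 * numbers (not taken from this file): assume SLEEP_TIME is HZ/10 (100 ms),
 * the plan has 20 slots, c_delay_target = 10 (i.e. 1 second), no
 * c_fill_target, and 2000 sectors were acknowledged in the last interval
 * (sect_in = 2000).  Then want = sect_in * c_delay_target * HZ /
 * (SLEEP_TIME * 10) = 2000 * 10 = 20000 sectors, i.e. keep about one
 * second's worth of resync data in flight.  The difference between "want"
 * and what is already in flight or planned is spread evenly over the plan
 * FIFO (cps per slot), and the slot popped for the current interval becomes
 * curr_corr.  The request for this turn, req_sect = sect_in + curr_corr, is
 * finally clamped to c_max_rate converted to sectors per interval. */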
int w_make_resync_request(struct drbd_conf *mdev,
		struct drbd_work *w, int cancel)
{
	unsigned long bit;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
	int max_segment_size;
	int number, i, rollback_i, size, pe, mx;
	int align, queued, sndbuf;

	if (unlikely(cancel))
		return 1;

	if (unlikely(mdev->state.conn < C_CONNECTED)) {
		dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
		return 0;
	}

	if (mdev->state.conn != C_SYNC_TARGET)
		dev_err(DEV, "%s in w_make_resync_request\n",
			drbd_conn_str(mdev->state.conn));

	if (!get_ldev(mdev)) {
		/* Since we only need to access mdev->rsync a
		   get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
		   to continue resync with a broken disk makes no sense at
		   all */
		dev_err(DEV, "Disk broke down during resync!\n");
		mdev->resync_work.cb = w_resync_inactive;
		return 1;
	}

	/* starting with drbd 8.3.8, we can handle multi-bio EEs,
	 * if it should be necessary */
	max_segment_size = mdev->agreed_pro_version < 94 ?
		queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;

	if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
		number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
		mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
	} else {
		mdev->c_sync_rate = mdev->sync_conf.rate;
		number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
	}

	pe = atomic_read(&mdev->rs_pending_cnt);

	mutex_lock(&mdev->data.mutex);
	if (mdev->data.socket)
		mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
	else
		mx = 1;
	mutex_unlock(&mdev->data.mutex);

	/* For resync rates >160MB/sec, allow more pending RS requests */
	if (number > mx)
		mx = number;

	/* Limit the number of pending RS requests to no more than the peer's receive buffer */
	if ((pe + number) > mx)
		number = mx - pe;

	for (i = 0; i < number; i++) {
		/* Stop generating RS requests, when half of the send buffer is filled */
		mutex_lock(&mdev->data.mutex);
		if (mdev->data.socket) {
			queued = mdev->data.socket->sk->sk_wmem_queued;
			sndbuf = mdev->data.socket->sk->sk_sndbuf;
		} else {
			queued = 1;
			sndbuf = 0;
		}
		mutex_unlock(&mdev->data.mutex);
		if (queued > sndbuf / 2)
			goto requeue;

next_sector:
		size = BM_BLOCK_SIZE;
		bit  = drbd_bm_find_next(mdev, mdev->bm_resync_fo);

		if (bit == -1UL) {
			mdev->bm_resync_fo = drbd_bm_bits(mdev);
			mdev->resync_work.cb = w_resync_inactive;
			put_ldev(mdev);
			return 1;
		}

		sector = BM_BIT_TO_SECT(bit);

		if (drbd_try_rs_begin_io(mdev, sector)) {
			mdev->bm_resync_fo = bit;
			goto requeue;
		}
		mdev->bm_resync_fo = bit + 1;

		if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
			drbd_rs_complete_io(mdev, sector);
			goto next_sector;
		}

#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
		/* try to find some adjacent bits.
		 * we stop if we have already the maximum req size.
		 *
		 * Additionally always align bigger requests, in order to
		 * be prepared for all stripe sizes of software RAIDs.
		 */
		align = 1;
		rollback_i = i;
		for (;;) {
			if (size + BM_BLOCK_SIZE > max_segment_size)
				break;

			/* Be always aligned */
			if (sector & ((1<<(align+3))-1))
				break;

			/* do not cross extent boundaries */
			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
				break;
			/* now, is it actually dirty, after all?
			 * caution, drbd_bm_test_bit is tri-state for some
			 * obscure reason; ( b == 0 ) would get the out-of-band
			 * only accidentally right because of the "oddly sized"
			 * adjustment below */
			if (drbd_bm_test_bit(mdev, bit+1) != 1)
				break;
			bit++;
			size += BM_BLOCK_SIZE;
			if ((BM_BLOCK_SIZE << align) <= size)
				align++;
			i++;
		}
		/* if we merged some,
		 * reset the offset to start the next drbd_bm_find_next from */
		if (size > BM_BLOCK_SIZE)
			mdev->bm_resync_fo = bit + 1;
#endif

		/* adjust very last sectors, in case we are oddly sized */
		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;
		if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
			switch (read_for_csum(mdev, sector, size)) {
			case -EIO: /* Disk failure */
				put_ldev(mdev);
				return 0;
			case -EAGAIN: /* allocation failed, or ldev busy */
				drbd_rs_complete_io(mdev, sector);
				mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
				i = rollback_i;
				goto requeue;
			case 0:
				/* everything ok */
				break;
			default:
				BUG();
			}
		} else {
			inc_rs_pending(mdev);
			if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
					       sector, size, ID_SYNCER)) {
				dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
				dec_rs_pending(mdev);
				put_ldev(mdev);
				return 0;
			}
		}
	}

	if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
		/* last syncer _request_ was sent,
		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
		 * next sync group will resume), as soon as we receive the last
		 * resync data block, and the last bit is cleared.
		 * until then resync "work" is "inactive" ...
		 */
		mdev->resync_work.cb = w_resync_inactive;
		put_ldev(mdev);
		return 1;
	}

 requeue:
	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	put_ldev(mdev);
	return 1;
}
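/* Pacing in w_make_resync_request(), with illustrative numbers: c_sync_rate
 * is in KiB/s and each bitmap bit covers BM_BLOCK_SIZE bytes (4 KiB), so
 * number = SLEEP_TIME * c_sync_rate / ((BM_BLOCK_SIZE/1024) * HZ) is the
 * count of 4 KiB resync requests to emit per timer interval.  Assuming
 * SLEEP_TIME is HZ/10 (100 ms) and a rate of 10240 KiB/s, that is
 * 10240 / (4 * 10) = 256 requests every 100 ms.  The dynamic-rate branch
 * goes the other way round: it converts the sector count returned by
 * drbd_rs_controller() back into an equivalent c_sync_rate for reporting. */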
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	int number, i, size;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);

	if (unlikely(cancel))
		return 1;

	if (unlikely(mdev->state.conn < C_CONNECTED)) {
		dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
		return 0;
	}

	number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
	if (atomic_read(&mdev->rs_pending_cnt) > number)
		goto requeue;

	number -= atomic_read(&mdev->rs_pending_cnt);

	sector = mdev->ov_position;
	for (i = 0; i < number; i++) {
		if (sector >= capacity) {
			mdev->resync_work.cb = w_resync_inactive;
			return 1;
		}

		size = BM_BLOCK_SIZE;

		if (drbd_try_rs_begin_io(mdev, sector)) {
			mdev->ov_position = sector;
			goto requeue;
		}

		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;

		inc_rs_pending(mdev);
		if (!drbd_send_ov_request(mdev, sector, size)) {
			dec_rs_pending(mdev);
			return 0;
		}
		sector += BM_SECT_PER_BIT;
	}
	mdev->ov_position = sector;

 requeue:
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	return 1;
}
int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	drbd_resync_finished(mdev);

	return 1;
}

static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	drbd_resync_finished(mdev);

	return 1;
}
int drbd_resync_finished(struct drbd_conf *mdev)
{
	unsigned long db, dt, dbdt;
	unsigned long n_oos;
	union drbd_state os, ns;
	struct drbd_work *w;
	char *khelper_cmd = NULL;

	/* Remove all elements from the resync LRU. Since future actions
	 * might set bits in the (main) bitmap, then the entries in the
	 * resync LRU would be wrong. */
	if (drbd_rs_del_all(mdev)) {
		/* In case this is not possible now, most probably because
		 * there are P_RS_DATA_REPLY Packets lingering on the worker's
		 * queue (or even the read operations for those packets
		 * is not finished by now). Retry in 100ms. */

		__set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ / 10);
		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
		if (w) {
			w->cb = w_resync_finished;
			drbd_queue_work(&mdev->data.work, w);
			return 1;
		}
		dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
	}

	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
	if (dt <= 0)
		dt = 1;
	db = mdev->rs_total;
	dbdt = Bit2KB(db/dt);
	mdev->rs_paused /= HZ;

	if (!get_ldev(mdev))
		goto out;

	spin_lock_irq(&mdev->req_lock);
	os = mdev->state;

	/* This protects us against multiple calls (that can happen in the presence
	   of application IO), and against connectivity loss just before we arrive here. */
	if (os.conn <= C_CONNECTED)
		goto out_unlock;

	ns = os;
	ns.conn = C_CONNECTED;

	dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
	     (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
	     "Online verify " : "Resync",
	     dt + mdev->rs_paused, mdev->rs_paused, dbdt);

	n_oos = drbd_bm_total_weight(mdev);

	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
		if (n_oos) {
			dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
			      n_oos, Bit2KB(1));
			khelper_cmd = "out-of-sync";
		}
	} else {
		D_ASSERT((n_oos - mdev->rs_failed) == 0);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
			khelper_cmd = "after-resync-target";

		if (mdev->csums_tfm && mdev->rs_total) {
			const unsigned long s = mdev->rs_same_csum;
			const unsigned long t = mdev->rs_total;
			const int ratio =
				(t < 100000) ? ((s*100)/t) : (s/(t/100));
			dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
			     "transferred %luK total %luK\n",
			     ratio,
			     Bit2KB(mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total - mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total));
		}
	}

	if (mdev->rs_failed) {
		dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			ns.disk = D_INCONSISTENT;
			ns.pdsk = D_UP_TO_DATE;
		} else {
			ns.disk = D_UP_TO_DATE;
			ns.pdsk = D_INCONSISTENT;
		}
	} else {
		ns.disk = D_UP_TO_DATE;
		ns.pdsk = D_UP_TO_DATE;

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			if (mdev->p_uuid) {
				int i;
				for (i = UI_BITMAP; i <= UI_HISTORY_END; i++)
					_drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
				drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
				_drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
			} else {
				dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
			}
		}

		drbd_uuid_set_bm(mdev, 0UL);

		if (mdev->p_uuid) {
			/* Now the two UUID sets are equal, update what we
			 * know of the peer. */
			int i;
			for (i = UI_CURRENT; i <= UI_HISTORY_END; i++)
				mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
		}
	}

	_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
out_unlock:
	spin_unlock_irq(&mdev->req_lock);
	put_ldev(mdev);
out:
	mdev->rs_total  = 0;
	mdev->rs_failed = 0;
	mdev->rs_paused = 0;
	mdev->ov_start_sector = 0;

	if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
		dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
	}

	if (khelper_cmd)
		drbd_khelper(mdev, khelper_cmd);

	return 1;
}
static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
{
	if (drbd_ee_has_active_page(e)) {
		/* This might happen if sendpage() has not finished */
		spin_lock_irq(&mdev->req_lock);
		list_add_tail(&e->w.list, &mdev->net_ee);
		spin_unlock_irq(&mdev->req_lock);
	} else
		drbd_free_ee(mdev, e);
}
/**
 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
 * @mdev:	DRBD device.
 * @cancel:	The connection will be closed anyways
 */
int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int ok;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		return 1;
	}

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		ok = drbd_send_block(mdev, P_DATA_REPLY, e);
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
			    (unsigned long long)e->sector);

		ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
	}

	move_to_net_ee_or_free(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_block() failed\n");
	return ok;
}
/**
 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
 * @mdev:	DRBD device.
 * @cancel:	The connection will be closed anyways
 */
int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int ok;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		return 1;
	}

	if (get_ldev_if_state(mdev, D_FAILED)) {
		drbd_rs_complete_io(mdev, e->sector);
		put_ldev(mdev);
	}

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
			inc_rs_pending(mdev);
			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
		} else {
			if (__ratelimit(&drbd_ratelimit_state))
				dev_err(DEV, "Not sending RSDataReply, "
				    "partner DISKLESS!\n");
			ok = 1;
		}
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
			    (unsigned long long)e->sector);

		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);

		/* update resync data with failure */
		drbd_rs_failed_io(mdev, e->sector, e->size);
	}

	move_to_net_ee_or_free(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_block() failed\n");
	return ok;
}
int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	struct digest_info *di;
	int digest_size;
	void *digest = NULL;
	int ok, eq = 0;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		return 1;
	}

	drbd_rs_complete_io(mdev, e->sector);

	di = (struct digest_info *)(unsigned long)e->block_id;

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		/* quick hack to try to avoid a race against reconfiguration.
		 * a real fix would be much more involved,
		 * introducing more locking mechanisms */
		if (mdev->csums_tfm) {
			digest_size = crypto_hash_digestsize(mdev->csums_tfm);
			D_ASSERT(digest_size == di->digest_size);
			digest = kmalloc(digest_size, GFP_NOIO);
		}
		if (digest) {
			drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}

		if (eq) {
			drbd_set_in_sync(mdev, e->sector, e->size);
			/* rs_same_csums unit is BM_BLOCK_SIZE */
			mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
			ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
		} else {
			inc_rs_pending(mdev);
			e->block_id = ID_SYNCER;
			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
		}
	} else {
		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
	}

	move_to_net_ee_or_free(mdev, e);

	if (unlikely(!ok))
		dev_err(DEV, "drbd_send_block/ack() failed\n");
	return ok;
}
int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	int digest_size;
	void *digest;
	int ok = 1;

	if (unlikely(cancel))
		goto out;

	if (unlikely((e->flags & EE_WAS_ERROR) != 0))
		goto out;

	digest_size = crypto_hash_digestsize(mdev->verify_tfm);
	/* FIXME if this allocation fails, online verify will not terminate! */
	digest = kmalloc(digest_size, GFP_NOIO);
	if (digest) {
		drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
		inc_rs_pending(mdev);
		ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
					     digest, digest_size, P_OV_REPLY);
		if (!ok)
			dec_rs_pending(mdev);
		kfree(digest);
	}

out:
	drbd_free_ee(mdev, e);
	return ok;
}
void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
{
	if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
		mdev->ov_last_oos_size += size>>9;
	} else {
		mdev->ov_last_oos_start = sector;
		mdev->ov_last_oos_size = size>>9;
	}
	drbd_set_out_of_sync(mdev, sector, size);
	set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
}
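/* drbd_ov_oos_found() coalesces the blocks that online verify flags as out
 * of sync: if the new block directly follows the range recorded in
 * ov_last_oos_start/ov_last_oos_size, that range is simply extended,
 * otherwise a new range is started.  The blocks are also marked out of sync
 * in the bitmap, and WRITE_BM_AFTER_RESYNC requests that the bitmap be
 * written out once the verify/resync run finishes. */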
int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
	struct digest_info *di;
	int digest_size;
	void *digest;
	int ok, eq = 0;

	if (unlikely(cancel)) {
		drbd_free_ee(mdev, e);
		return 1;
	}

	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
	 * the resync lru has been cleaned up already */
	drbd_rs_complete_io(mdev, e->sector);

	di = (struct digest_info *)(unsigned long)e->block_id;

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		digest_size = crypto_hash_digestsize(mdev->verify_tfm);
		digest = kmalloc(digest_size, GFP_NOIO);
		if (digest) {
			drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);

			D_ASSERT(digest_size == di->digest_size);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}
	} else {
		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
	}

	if (!eq)
		drbd_ov_oos_found(mdev, e->sector, e->size);

	ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
			      eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);

	drbd_free_ee(mdev, e);

	if (--mdev->ov_left == 0) {
		drbd_resync_finished(mdev);
	}

	return ok;
}
int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
	complete(&b->done);
	return 1;
}
int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
	struct p_barrier *p = &mdev->data.sbuf.barrier;
	int ok = 1;

	/* really avoid racing with tl_clear.  w.cb may have been referenced
	 * just before it was reassigned and re-queued, so double check that.
	 * actually, this race was harmless, since we only try to send the
	 * barrier packet here, and otherwise do nothing with the object.
	 * but compare with the head of w_clear_epoch */
	spin_lock_irq(&mdev->req_lock);
	if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
		cancel = 1;
	spin_unlock_irq(&mdev->req_lock);
	if (cancel)
		return 1;

	if (!drbd_get_data_sock(mdev))
		return 0;
	p->barrier = b->br_number;
	/* inc_ap_pending was done where this was queued.
	 * dec_ap_pending will be done in got_BarrierAck
	 * or (on connection loss) in w_clear_epoch. */
	ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
				(struct p_header *)p, sizeof(*p), 0);
	drbd_put_data_sock(mdev);

	return ok;
}
int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	if (cancel)
		return 1;
	return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
}
/**
 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
 * @mdev:	DRBD device.
 * @cancel:	The connection will be closed anyways
 */
int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	int ok;

	if (unlikely(cancel)) {
		req_mod(req, send_canceled);
		return 1;
	}

	ok = drbd_send_dblock(mdev, req);
	req_mod(req, ok ? handed_over_to_network : send_failed);

	return ok;
}
/**
 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
 * @mdev:	DRBD device.
 * @cancel:	The connection will be closed anyways
 */
int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
	int ok;

	if (unlikely(cancel)) {
		req_mod(req, send_canceled);
		return 1;
	}

	ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
				(unsigned long)req);

	if (!ok) {
		/* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
		 * so this is probably redundant */
		if (mdev->state.conn >= C_CONNECTED)
			drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
	}
	req_mod(req, ok ? handed_over_to_network : send_failed);

	return ok;
}
int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);

	if (bio_data_dir(req->master_bio) == WRITE)
		drbd_al_begin_io(mdev, req->sector);
	/* Calling drbd_al_begin_io() out of the worker might deadlock
	   theoretically. Practically it cannot deadlock, since this is
	   only used when unfreezing IOs. All the extents of the requests
	   that made it into the TL are already active */

	drbd_req_make_private_bio(req, req->master_bio);
	req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
	generic_make_request(req->private_bio);

	return 1;
}
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
	struct drbd_conf *odev = mdev;

	while (1) {
		if (odev->sync_conf.after == -1)
			return 1;
		odev = minor_to_mdev(odev->sync_conf.after);
		ERR_IF(!odev) return 1;
		if ((odev->state.conn >= C_SYNC_SOURCE &&
		     odev->state.conn <= C_PAUSED_SYNC_T) ||
		    odev->state.aftr_isp || odev->state.peer_isp ||
		    odev->state.user_isp)
			return 0;
	}
}
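/* _drbd_may_sync_now() walks the "sync after" dependency chain: starting
 * from this device, it follows sync_conf.after from minor to minor.  The
 * device may resync only if every device it depends on is neither in an
 * active resync state (C_SYNC_SOURCE .. C_PAUSED_SYNC_T) nor paused by one
 * of the is-suspended flags; reaching the end of the chain (after == -1)
 * means "go ahead". */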
/**
 * _drbd_pause_after() - Pause resync on all devices that may not resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and after_state_ch).
 */
static int _drbd_pause_after(struct drbd_conf *mdev)
{
	struct drbd_conf *odev;
	int i, rv = 0;

	for (i = 0; i < minor_count; i++) {
		odev = minor_to_mdev(i);
		if (!odev)
			continue;
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (!_drbd_may_sync_now(odev))
			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
			       != SS_NOTHING_TO_DO);
	}

	return rv;
}
/**
 * _drbd_resume_next() - Resume resync on all devices that may resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and worker).
 */
static int _drbd_resume_next(struct drbd_conf *mdev)
{
	struct drbd_conf *odev;
	int i, rv = 0;

	for (i = 0; i < minor_count; i++) {
		odev = minor_to_mdev(i);
		if (!odev)
			continue;
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (odev->state.aftr_isp) {
			if (_drbd_may_sync_now(odev))
				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
							CS_HARD, NULL)
				       != SS_NOTHING_TO_DO);
		}
	}

	return rv;
}
void resume_next_sg(struct drbd_conf *mdev)
{
	write_lock_irq(&global_state_lock);
	_drbd_resume_next(mdev);
	write_unlock_irq(&global_state_lock);
}

void suspend_other_sg(struct drbd_conf *mdev)
{
	write_lock_irq(&global_state_lock);
	_drbd_pause_after(mdev);
	write_unlock_irq(&global_state_lock);
}
static int sync_after_error(struct drbd_conf *mdev, int o_minor)
{
	struct drbd_conf *odev;

	if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
		return ERR_SYNC_AFTER;

	/* check for loops */
	odev = minor_to_mdev(o_minor);
	while (1) {
		if (odev == mdev)
			return ERR_SYNC_AFTER_CYCLE;

		/* dependency chain ends here, no cycles. */
		if (odev->sync_conf.after == -1)
			return NO_ERROR;

		/* follow the dependency chain */
		odev = minor_to_mdev(odev->sync_conf.after);
	}
}
int drbd_alter_sa(struct drbd_conf *mdev, int na)
{
	int changes;
	int retcode;

	write_lock_irq(&global_state_lock);
	retcode = sync_after_error(mdev, na);
	if (retcode == NO_ERROR) {
		mdev->sync_conf.after = na;
		do {
			changes  = _drbd_pause_after(mdev);
			changes |= _drbd_resume_next(mdev);
		} while (changes);
	}
	write_unlock_irq(&global_state_lock);
	return retcode;
}
static void ping_peer(struct drbd_conf *mdev)
{
	clear_bit(GOT_PING_ACK, &mdev->flags);
	request_ping(mdev);
	wait_event(mdev->misc_wait,
		   test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
}
/**
 * drbd_start_resync() - Start the resync process
 * @mdev:	DRBD device.
 * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
 *
 * This function might bring you directly into one of the
 * C_PAUSED_SYNC_* states.
 */
void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
{
	union drbd_state ns;
	int r;

	if (mdev->state.conn >= C_SYNC_SOURCE) {
		dev_err(DEV, "Resync already running!\n");
		return;
	}

	/* In case a previous resync run was aborted by an IO error/detach on the peer. */
	drbd_rs_cancel_all(mdev);

	if (side == C_SYNC_TARGET) {
		/* Since application IO was locked out during C_WF_BITMAP_T and
		   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
		   we check whether we might make the data inconsistent. */
		r = drbd_khelper(mdev, "before-resync-target");
		r = (r >> 8) & 0xff;
		if (r > 0) {
			dev_info(DEV, "before-resync-target handler returned %d, "
				 "dropping connection.\n", r);
			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
			return;
		}
	}

	drbd_state_lock(mdev);

	if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
		drbd_state_unlock(mdev);
		return;
	}

	if (side == C_SYNC_TARGET) {
		mdev->bm_resync_fo = 0;
	} else /* side == C_SYNC_SOURCE */ {
		u64 uuid;

		get_random_bytes(&uuid, sizeof(u64));
		drbd_uuid_set(mdev, UI_BITMAP, uuid);
		drbd_send_sync_uuid(mdev, uuid);

		D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
	}

	write_lock_irq(&global_state_lock);
	ns = mdev->state;

	ns.aftr_isp = !_drbd_may_sync_now(mdev);

	ns.conn = side;

	if (side == C_SYNC_TARGET)
		ns.disk = D_INCONSISTENT;
	else /* side == C_SYNC_SOURCE */
		ns.pdsk = D_INCONSISTENT;

	r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
	ns = mdev->state;

	if (ns.conn < C_CONNECTED)
		r = SS_UNKNOWN_ERROR;

	if (r == SS_SUCCESS) {
		unsigned long tw = drbd_bm_total_weight(mdev);
		unsigned long now = jiffies;
		int i;

		mdev->rs_failed    = 0;
		mdev->rs_paused    = 0;
		mdev->rs_same_csum = 0;
		mdev->rs_total     = tw;
		mdev->rs_start     = now;
		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = tw;
			mdev->rs_mark_time[i] = now;
		}
		_drbd_pause_after(mdev);
	}
	write_unlock_irq(&global_state_lock);
	put_ldev(mdev);

	if (r == SS_SUCCESS) {
		dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
		     drbd_conn_str(ns.conn),
		     (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
		     (unsigned long) mdev->rs_total);

		if (mdev->rs_total == 0) {
			/* Peer still reachable? Beware of failing before-resync-target handlers! */
			ping_peer(mdev);
			drbd_resync_finished(mdev);
		}

		atomic_set(&mdev->rs_sect_in, 0);
		mdev->rs_in_flight = 0;
		mdev->rs_planed = 0;
		spin_lock(&mdev->peer_seq_lock);
		fifo_set(&mdev->rs_plan_s, 0);
		spin_unlock(&mdev->peer_seq_lock);
		/* ns.conn may already be != mdev->state.conn,
		 * we may have been paused in between, or become paused until
		 * the timer triggers.
		 * No matter, that is handled in resync_timer_fn() */
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}
	drbd_state_unlock(mdev);
}
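/* Reading drbd_start_resync() top to bottom: any previous resync state is
 * cancelled, the before-resync-target handler may veto the whole operation
 * on the target side, and then, under the state lock and global_state_lock,
 * the target clears its bitmap scan position while the source generates and
 * sends a fresh bitmap UUID.  The new connection state (ns.conn = side)
 * marks the local disk (target) or the peer's disk (source) inconsistent,
 * the rs_* bookkeeping is reset, and on the target the resync timer is
 * kicked immediately so w_make_resync_request() starts issuing requests. */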
int drbd_worker(struct drbd_thread *thi)
{
	struct drbd_conf *mdev = thi->mdev;
	struct drbd_work *w = NULL;
	LIST_HEAD(work_list);
	int intr = 0, i = 0;

	sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));

	while (get_t_state(thi) == Running) {
		drbd_thread_current_set_cpu(mdev);

		if (down_trylock(&mdev->data.work.s)) {
			mutex_lock(&mdev->data.mutex);
			if (mdev->data.socket && !mdev->net_conf->no_cork)
				drbd_tcp_uncork(mdev->data.socket);
			mutex_unlock(&mdev->data.mutex);

			intr = down_interruptible(&mdev->data.work.s);

			mutex_lock(&mdev->data.mutex);
			if (mdev->data.socket && !mdev->net_conf->no_cork)
				drbd_tcp_cork(mdev->data.socket);
			mutex_unlock(&mdev->data.mutex);
		}

		if (intr) {
			D_ASSERT(intr == -EINTR);
			flush_signals(current);
			ERR_IF (get_t_state(thi) == Running)
				continue;
			break;
		}

		if (get_t_state(thi) != Running)
			break;
		/* With this break, we have done a down() but not consumed
		   the entry from the list. The cleanup code takes care of
		   this. */

		spin_lock_irq(&mdev->data.work.q_lock);
		ERR_IF(list_empty(&mdev->data.work.q)) {
			/* something terribly wrong in our logic.
			 * we were able to down() the semaphore,
			 * but the list is empty... doh.
			 *
			 * what is the best thing to do now?
			 * try again from scratch, restarting the receiver,
			 * asender, whatnot? could break even more ugly,
			 * e.g. when we are primary, but no good local data.
			 *
			 * I'll try to get away just starting over this loop.
			 */
			spin_unlock_irq(&mdev->data.work.q_lock);
			continue;
		}
		w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
		list_del_init(&w->list);
		spin_unlock_irq(&mdev->data.work.q_lock);

		if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
			/* dev_warn(DEV, "worker: a callback failed! \n"); */
			if (mdev->state.conn >= C_CONNECTED)
				drbd_force_state(mdev,
						NS(conn, C_NETWORK_FAILURE));
		}
	}

	D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
	D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));

	spin_lock_irq(&mdev->data.work.q_lock);
	while (!list_empty(&mdev->data.work.q)) {
		list_splice_init(&mdev->data.work.q, &work_list);
		spin_unlock_irq(&mdev->data.work.q_lock);

		while (!list_empty(&work_list)) {
			w = list_entry(work_list.next, struct drbd_work, list);
			list_del_init(&w->list);
			w->cb(mdev, w, 1);
			i++; /* dead debugging code */
		}

		spin_lock_irq(&mdev->data.work.q_lock);
	}
	sema_init(&mdev->data.work.s, 0);
	/* DANGEROUS race: if someone did queue his work within the spinlock,
	 * but up() ed outside the spinlock, we could get an up() on the
	 * semaphore without corresponding list entry.
	 */
	spin_unlock_irq(&mdev->data.work.q_lock);

	D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
	/* _drbd_set_state only uses stop_nowait.
	 * wait here for the Exiting receiver. */
	drbd_thread_stop(&mdev->receiver);
	drbd_mdev_cleanup(mdev);

	dev_info(DEV, "worker terminated\n");

	clear_bit(DEVICE_DYING, &mdev->flags);
	clear_bit(CONFIG_PENDING, &mdev->flags);
	wake_up(&mdev->state_wait);

	return 0;
}