return NULL;
}
-/* kick lower level device, if we have more than (arbitrary number)
- * reference counts on it, which typically are locally submitted io
- * requests. don't use unacked_cnt, so we speed up proto A and B, too. */
-static void maybe_kick_lo(struct drbd_conf *mdev)
-{
- if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
- drbd_kick_lo(mdev);
-}
-
static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
{
struct drbd_epoch_entry *e;
LIST_HEAD(reclaimed);
struct drbd_epoch_entry *e, *t;
- maybe_kick_lo(mdev);
spin_lock_irq(&mdev->req_lock);
reclaim_net_ee(mdev, &reclaimed);
spin_unlock_irq(&mdev->req_lock);
atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
int i;
- if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
+ if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count)
i = page_chain_free(page);
else {
struct page *tmp;
struct page *page;
unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
- if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
+ if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
return NULL;
e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
while (!list_empty(head)) {
prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&mdev->req_lock);
- drbd_kick_lo(mdev);
- schedule();
+ io_schedule();
finish_wait(&mdev->ee_wait, &wait);
spin_lock_irq(&mdev->req_lock);
}
/* > e->sector, unless this is the first bio */
bio->bi_sector = sector;
bio->bi_bdev = mdev->ldev->backing_bdev;
- /* we special case some flags in the multi-bio case, see below
- * (REQ_UNPLUG) */
bio->bi_rw = rw;
bio->bi_private = e;
bio->bi_end_io = drbd_endio_sec;
bios = bios->bi_next;
bio->bi_next = NULL;
- /* strip off REQ_UNPLUG unless it is the last bio */
- if (bios)
- bio->bi_rw &= ~REQ_UNPLUG;
-
drbd_generic_make_request(mdev, fault_type, bio);
} while (bios);
- maybe_kick_lo(mdev);
return 0;
fail:
inc_unacked(mdev);
- if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
- drbd_kick_lo(mdev);
-
mdev->current_epoch->barrier_nr = p->barrier;
rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
data_size -= dgs;
ERR_IF(data_size & 0x1ff) return NULL;
- ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL;
+ ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL;
/* even though we trust out peer,
* we sometimes have to double check. */
unsigned len = min_t(int, ds, PAGE_SIZE);
data = kmap(page);
rr = drbd_recv(mdev, data, len);
- if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) {
+ if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) {
dev_err(DEV, "Fault injection: Corrupting data on receive\n");
data[0] = data[0] ^ (unsigned long)-1;
}
if (dgs) {
drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
if (memcmp(dig_in, dig_vv, dgs)) {
- dev_err(DEV, "Digest integrity check FAILED.\n");
+ dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n",
+ (unsigned long long)sector, data_size);
drbd_bcast_ee(mdev, "digest failed",
dgs, dig_in, dig_vv, e);
drbd_free_ee(mdev, e);
return ret;
}
-static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
+/* see also bio_flags_to_wire()
+ * DRBD_REQ_*, because we need to semantically map the flags to data packet
+ * flags and back. We may replicate to other kernel versions. */
+static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
{
- if (mdev->agreed_pro_version >= 95)
- return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
- (dpf & DP_UNPLUG ? REQ_UNPLUG : 0) |
- (dpf & DP_FUA ? REQ_FUA : 0) |
- (dpf & DP_FLUSH ? REQ_FUA : 0) |
- (dpf & DP_DISCARD ? REQ_DISCARD : 0);
- else
- return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0;
+ return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
+ (dpf & DP_FUA ? REQ_FUA : 0) |
+ (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
+ (dpf & DP_DISCARD ? REQ_DISCARD : 0);
}
/* mirrored write */
e->w.cb = e_end_block;
+ dp_flags = be32_to_cpu(p->dp_flags);
+ rw |= wire_flags_to_bio(mdev, dp_flags);
+
+ if (dp_flags & DP_MAY_SET_IN_SYNC)
+ e->flags |= EE_MAY_SET_IN_SYNC;
+
spin_lock(&mdev->epoch_lock);
e->epoch = mdev->current_epoch;
atomic_inc(&e->epoch->epoch_size);
atomic_inc(&e->epoch->active);
spin_unlock(&mdev->epoch_lock);
- dp_flags = be32_to_cpu(p->dp_flags);
- rw |= write_flags_to_bio(mdev, dp_flags);
-
- if (dp_flags & DP_MAY_SET_IN_SYNC)
- e->flags |= EE_MAY_SET_IN_SYNC;
-
/* I'm the receiver, I do hold a net_cnt reference. */
if (!mdev->net_conf->two_primaries) {
spin_lock_irq(&mdev->req_lock);
* The current sync rate used here uses only the most recent two step marks,
* to have a short time average so we can react faster.
*/
-int drbd_rs_should_slow_down(struct drbd_conf *mdev)
+int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector)
{
struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
unsigned long db, dt, dbdt;
+ struct lc_element *tmp;
int curr_events;
int throttle = 0;
if (mdev->sync_conf.c_min_rate == 0)
return 0;
+ spin_lock_irq(&mdev->al_lock);
+ tmp = lc_find(mdev->resync, BM_SECT_TO_EXT(sector));
+ if (tmp) {
+ struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+ if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
+ spin_unlock_irq(&mdev->al_lock);
+ return 0;
+ }
+ /* Do not slow down if app IO is already waiting for this extent */
+ }
+ spin_unlock_irq(&mdev->al_lock);
+
curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
(int)part_stat_read(&disk->part0, sectors[1]) -
atomic_read(&mdev->rs_sect_ev);
+
if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
unsigned long rs_left;
int i;
/* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
* approx. */
- i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
- rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+ i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+
+ if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
+ rs_left = mdev->ov_left;
+ else
+ rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
if (!dt)
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+ if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
(unsigned long long)sector, size);
return FALSE;
case P_RS_DATA_REQUEST:
e->w.cb = w_e_end_rsdata_req;
fault_type = DRBD_FAULT_RS_RD;
+ /* used in the sector offset progress display */
+ mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
break;
case P_OV_REPLY:
if (cmd == P_CSUM_RS_REQUEST) {
D_ASSERT(mdev->agreed_pro_version >= 89);
e->w.cb = w_e_end_csum_rs_req;
+ /* used in the sector offset progress display */
+ mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
} else if (cmd == P_OV_REPLY) {
+ /* track progress, we may need to throttle */
+ atomic_add(size >> 9, &mdev->rs_sect_in);
e->w.cb = w_e_end_ov_reply;
dec_rs_pending(mdev);
/* drbd_rs_begin_io done when we sent this request,
case P_OV_REQUEST:
if (mdev->ov_start_sector == ~(sector_t)0 &&
mdev->agreed_pro_version >= 90) {
+ unsigned long now = jiffies;
+ int i;
mdev->ov_start_sector = sector;
mdev->ov_position = sector;
- mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector);
+ mdev->ov_left = drbd_bm_bits(mdev) - BM_SECT_TO_BIT(sector);
+ mdev->rs_total = mdev->ov_left;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ mdev->rs_mark_left[i] = mdev->ov_left;
+ mdev->rs_mark_time[i] = now;
+ }
dev_info(DEV, "Online Verify start sector: %llu\n",
(unsigned long long)sector);
}
* we would also throttle its application reads.
* In that case, throttling is done on the SyncTarget only.
*/
- if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
- msleep(100);
- if (drbd_rs_begin_io(mdev, e->sector))
+ if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev, sector))
+ schedule_timeout_uninterruptible(HZ/10);
+ if (drbd_rs_begin_io(mdev, sector))
goto out_free_e;
submit_for_resync:
case ASB_CALL_HELPER:
hg = drbd_asb_recover_0p(mdev);
if (hg == -1 && mdev->state.role == R_PRIMARY) {
- self = drbd_set_role(mdev, R_SECONDARY, 0);
+ enum drbd_state_rv rv2;
+
+ drbd_set_role(mdev, R_SECONDARY, 0);
/* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
* we might be here in C_WF_REPORT_PARAMS which is transient.
* we do not need to wait for the after state change work either. */
- self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
- if (self != SS_SUCCESS) {
+ rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
+ if (rv2 != SS_SUCCESS) {
drbd_khelper(mdev, "pri-lost-after-sb");
} else {
dev_warn(DEV, "Successfully gave up primary role.\n");
case ASB_CALL_HELPER:
hg = drbd_asb_recover_0p(mdev);
if (hg == -1) {
+ enum drbd_state_rv rv2;
+
/* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
* we might be here in C_WF_REPORT_PARAMS which is transient.
* we do not need to wait for the after state change work either. */
- self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
- if (self != SS_SUCCESS) {
+ rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
+ if (rv2 != SS_SUCCESS) {
drbd_khelper(mdev, "pri-lost-after-sb");
} else {
dev_warn(DEV, "Successfully gave up primary role.\n");
return C_MASK;
}
if (hg == -1001) {
- dev_alert(DEV, "To resolve this both sides have to support at least protocol\n");
+ dev_alert(DEV, "To resolve this both sides have to support at least protocol 91\n");
return C_MASK;
}
{
struct p_sizes *p = &mdev->data.rbuf.sizes;
enum determine_dev_size dd = unchanged;
- unsigned int max_seg_s;
+ unsigned int max_bio_size;
sector_t p_size, p_usize, my_usize;
int ldsc = 0; /* local disk size changed */
enum dds_flags ddsf;
}
put_ldev(mdev);
}
-#undef min_not_zero
ddsf = be16_to_cpu(p->dds_flags);
if (get_ldev(mdev)) {
}
if (mdev->agreed_pro_version < 94)
- max_seg_s = be32_to_cpu(p->max_segment_size);
+ max_bio_size = be32_to_cpu(p->max_bio_size);
else if (mdev->agreed_pro_version == 94)
- max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
+ max_bio_size = DRBD_MAX_SIZE_H80_PACKET;
else /* drbd 8.3.8 onwards */
- max_seg_s = DRBD_MAX_SEGMENT_SIZE;
+ max_bio_size = DRBD_MAX_BIO_SIZE;
- if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
- drbd_setup_queue_param(mdev, max_seg_s);
+ if (max_bio_size != queue_max_hw_sectors(mdev->rq_queue) << 9)
+ drbd_setup_queue_param(mdev, max_bio_size);
drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
put_ldev(mdev);
{
struct p_req_state *p = &mdev->data.rbuf.req_state;
union drbd_state mask, val;
- int rv;
+ enum drbd_state_rv rv;
mask.i = be32_to_cpu(p->mask);
val.i = be32_to_cpu(p->val);
if (ns.conn == C_WF_REPORT_PARAMS)
ns.conn = C_CONNECTED;
+ if (peer_state.conn == C_AHEAD)
+ ns.conn = C_BEHIND;
+
if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
get_ldev_if_state(mdev, D_NEGOTIATING)) {
int cr; /* consider resync */
wait_event(mdev->misc_wait,
mdev->state.conn == C_WF_SYNC_UUID ||
+ mdev->state.conn == C_BEHIND ||
mdev->state.conn < C_CONNECTED ||
mdev->state.disk < D_NEGOTIATING);
int ok = FALSE;
struct p_header80 *h = &mdev->data.rbuf.header.h80;
- wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
-
- drbd_bm_lock(mdev, "receive bitmap");
+ /* drbd_bm_lock(mdev, "receive bitmap"); By intention no bm_lock */
/* maybe we should use some per thread scratch page,
* and allocate that during initial device creation? */
ok = TRUE;
out:
- drbd_bm_unlock(mdev);
+ /* drbd_bm_unlock(mdev); by intention no lock */
if (ok && mdev->state.conn == C_WF_BITMAP_S)
drbd_start_resync(mdev, C_SYNC_SOURCE);
free_page((unsigned long) buffer);
static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- if (mdev->state.disk >= D_INCONSISTENT)
- drbd_kick_lo(mdev);
-
/* Make sure we've acked all the TCP data associated
* with the data requests being unplugged */
drbd_tcp_quickack(mdev->data.socket);
return TRUE;
}
+static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+ struct p_block_desc *p = &mdev->data.rbuf.block_desc;
+
+ drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
+
+ return TRUE;
+}
+
typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
struct data_cmd {
[P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
[P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
+ [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
/* anything missing from this table is in
* the asender_tbl, see get_asender_cmd */
[P_MAX_CMD] = { 0, 0, NULL },
if (os.conn == C_DISCONNECTING) {
wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
- if (!is_susp(mdev->state)) {
- /* we must not free the tl_hash
- * while application io is still on the fly */
- wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
- drbd_free_tl_hash(mdev);
- }
-
crypto_free_hash(mdev->cram_hmac_tfm);
mdev->cram_hmac_tfm = NULL;
tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
+ if (mdev->state.conn == C_AHEAD &&
+ atomic_read(&mdev->ap_in_flight) == 0 &&
+ list_empty(&mdev->start_resync_work.list)) {
+ struct drbd_work *w = &mdev->start_resync_work;
+ w->cb = w_start_resync;
+ drbd_queue_work_front(&mdev->data.work, w);
+ }
+
return TRUE;
}
drbd_rs_complete_io(mdev, sector);
dec_rs_pending(mdev);
- if (--mdev->ov_left == 0) {
+ --mdev->ov_left;
+
+ /* let's advance progress step marks only for every other megabyte */
+ if ((mdev->ov_left & 0x200) == 0x200)
+ drbd_advance_rs_marks(mdev, mdev->ov_left);
+
+ if (mdev->ov_left == 0) {
w = kmalloc(sizeof(*w), GFP_NOIO);
if (w) {
w->cb = w_ov_finished;