drbd: Introduce new primitives for sending commands
drivers/block/drbd/drbd_receiver.c
1/*
2 drbd_receiver.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25
26#include <linux/module.h>
27
28#include <asm/uaccess.h>
29#include <net/sock.h>
30
31#include <linux/drbd.h>
32#include <linux/fs.h>
33#include <linux/file.h>
34#include <linux/in.h>
35#include <linux/mm.h>
36#include <linux/memcontrol.h>
37#include <linux/mm_inline.h>
38#include <linux/slab.h>
39#include <linux/pkt_sched.h>
40#define __KERNEL_SYSCALLS__
41#include <linux/unistd.h>
42#include <linux/vmalloc.h>
43#include <linux/random.h>
44#include <linux/string.h>
45#include <linux/scatterlist.h>
46#include "drbd_int.h"
47#include "drbd_req.h"
48
49#include "drbd_vli.h"
50
51struct packet_info {
52 enum drbd_packet cmd;
53 unsigned int size;
54 unsigned int vnr;
55};
56
57enum finish_epoch {
58 FE_STILL_LIVE,
59 FE_DESTROYED,
60 FE_RECYCLED,
61};
62
63static int drbd_do_features(struct drbd_tconn *tconn);
64static int drbd_do_auth(struct drbd_tconn *tconn);
65static int drbd_disconnected(int vnr, void *p, void *data);
66
67static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
68static int e_end_block(struct drbd_work *, int);
69
70
71#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
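/* GFP_TRY deliberately omits __GFP_WAIT, so these allocations never block on
 * memory reclaim or trigger write-out; a failed allocation simply yields NULL.
 * drbd_pp_first_pages_or_try_alloc() below relies on this to avoid the
 * "criss-cross" write-out deadlock described in its comment. */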
72
73/*
74 * some helper functions to deal with singly linked page lists,
75 * page->private being our "next" pointer.
76 */
77
78/* If at least n pages are linked at head, get n pages off.
79 * Otherwise, don't modify head, and return NULL.
80 * Locking is the responsibility of the caller.
81 */
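/* Example: with *head == A->B->C->D and n == 2, the loop stops at B; B's
 * page_private is cleared to terminate the returned chain, A (now A->B) is
 * returned, and *head is advanced to C->D. */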
82static struct page *page_chain_del(struct page **head, int n)
83{
84 struct page *page;
85 struct page *tmp;
86
87 BUG_ON(!n);
88 BUG_ON(!head);
89
90 page = *head;
91
92 if (!page)
93 return NULL;
94
95 while (page) {
96 tmp = page_chain_next(page);
97 if (--n == 0)
98 break; /* found sufficient pages */
99 if (tmp == NULL)
100 /* insufficient pages, don't use any of them. */
101 return NULL;
102 page = tmp;
103 }
104
105 /* add end of list marker for the returned list */
106 set_page_private(page, 0);
107 /* actual return value, and adjustment of head */
108 page = *head;
109 *head = tmp;
110 return page;
111}
112
113/* may be used outside of locks to find the tail of a (usually short)
114 * "private" page chain, before adding it back to a global chain head
115 * with page_chain_add() under a spinlock. */
116static struct page *page_chain_tail(struct page *page, int *len)
117{
118 struct page *tmp;
119 int i = 1;
120 while ((tmp = page_chain_next(page)))
121 ++i, page = tmp;
122 if (len)
123 *len = i;
124 return page;
125}
126
127static int page_chain_free(struct page *page)
128{
129 struct page *tmp;
130 int i = 0;
131 page_chain_for_each_safe(page, tmp) {
132 put_page(page);
133 ++i;
134 }
135 return i;
136}
137
138static void page_chain_add(struct page **head,
139 struct page *chain_first, struct page *chain_last)
140{
141#if 1
142 struct page *tmp;
143 tmp = page_chain_tail(chain_first, NULL);
144 BUG_ON(tmp != chain_last);
145#endif
146
147 /* add chain to head */
148 set_page_private(chain_last, (unsigned long)*head);
149 *head = chain_first;
150}
151
152static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
153{
154 struct page *page = NULL;
155 struct page *tmp = NULL;
156 int i = 0;
157
158 /* Yes, testing drbd_pp_vacant outside the lock is racy.
159 * So what. It saves a spin_lock. */
160 if (drbd_pp_vacant >= number) {
161 spin_lock(&drbd_pp_lock);
162 page = page_chain_del(&drbd_pp_pool, number);
163 if (page)
164 drbd_pp_vacant -= number;
165 spin_unlock(&drbd_pp_lock);
166 if (page)
167 return page;
168 }
169
170 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
171 * "criss-cross" setup, that might cause write-out on some other DRBD,
172 * which in turn might block on the other node at this very place. */
173 for (i = 0; i < number; i++) {
174 tmp = alloc_page(GFP_TRY);
175 if (!tmp)
176 break;
177 set_page_private(tmp, (unsigned long)page);
178 page = tmp;
179 }
180
181 if (i == number)
182 return page;
183
184 /* Not enough pages immediately available this time.
185 * No need to jump around here, drbd_pp_alloc will retry this
186 * function "soon". */
187 if (page) {
188 tmp = page_chain_tail(page, NULL);
189 spin_lock(&drbd_pp_lock);
190 page_chain_add(&drbd_pp_pool, page, tmp);
191 drbd_pp_vacant += i;
192 spin_unlock(&drbd_pp_lock);
193 }
194 return NULL;
195}
196
197static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
198{
199 struct drbd_peer_request *peer_req;
200 struct list_head *le, *tle;
201
202 /* The EEs are always appended to the end of the list. Since
203 they are sent in order over the wire, they have to finish
204 in order. As soon as we see the first not finished we can
205 stop to examine the list... */
206
207 list_for_each_safe(le, tle, &mdev->net_ee) {
208 peer_req = list_entry(le, struct drbd_peer_request, w.list);
209 if (drbd_ee_has_active_page(peer_req))
210 break;
211 list_move(le, to_be_freed);
212 }
213}
214
215static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
216{
217 LIST_HEAD(reclaimed);
218 struct drbd_peer_request *peer_req, *t;
219
220 spin_lock_irq(&mdev->tconn->req_lock);
221 reclaim_net_ee(mdev, &reclaimed);
222 spin_unlock_irq(&mdev->tconn->req_lock);
223
224 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
225 drbd_free_net_ee(mdev, peer_req);
226}
227
228/**
229 * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
230 * @mdev: DRBD device.
231 * @number: number of pages requested
232 * @retry: whether to retry, if not enough pages are available right now
233 *
234 * Tries to allocate number pages, first from our own page pool, then from
235 * the kernel, unless this allocation would exceed the max_buffers setting.
236 * Possibly retry until DRBD frees sufficient pages somewhere else.
237 *
238 * Returns a page chain linked via page->private.
239 */
240static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
241{
242 struct page *page = NULL;
243 DEFINE_WAIT(wait);
244
245 /* Yes, we may run up to @number over max_buffers. If we
246 * follow it strictly, the admin will get it wrong anyways. */
247 if (atomic_read(&mdev->pp_in_use) < mdev->tconn->net_conf->max_buffers)
248 page = drbd_pp_first_pages_or_try_alloc(mdev, number);
249
250 while (page == NULL) {
251 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
252
253 drbd_kick_lo_and_reclaim_net(mdev);
254
255 if (atomic_read(&mdev->pp_in_use) < mdev->tconn->net_conf->max_buffers) {
256 page = drbd_pp_first_pages_or_try_alloc(mdev, number);
257 if (page)
258 break;
259 }
260
261 if (!retry)
262 break;
263
264 if (signal_pending(current)) {
265 dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
266 break;
267 }
268
269 schedule();
270 }
271 finish_wait(&drbd_pp_wait, &wait);
272
273 if (page)
274 atomic_add(number, &mdev->pp_in_use);
275 return page;
276}
277
278/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
279 * Is also used from inside another spin_lock_irq(&mdev->tconn->req_lock);
280 * Either links the page chain back to the global pool,
281 * or returns all pages to the system. */
282static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
283{
284 atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
285 int i;
286
287 if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
288 i = page_chain_free(page);
289 else {
290 struct page *tmp;
291 tmp = page_chain_tail(page, &i);
292 spin_lock(&drbd_pp_lock);
293 page_chain_add(&drbd_pp_pool, page, tmp);
294 drbd_pp_vacant += i;
295 spin_unlock(&drbd_pp_lock);
296 }
297 i = atomic_sub_return(i, a);
298 if (i < 0)
299 dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
300 is_net ? "pp_in_use_by_net" : "pp_in_use", i);
301 wake_up(&drbd_pp_wait);
302}
303
304/*
305You need to hold the req_lock:
306 _drbd_wait_ee_list_empty()
307
308You must not have the req_lock:
309 drbd_free_ee()
310 drbd_alloc_ee()
311 drbd_init_ee()
312 drbd_release_ee()
313 drbd_ee_fix_bhs()
314 drbd_process_done_ee()
315 drbd_clear_done_ee()
316 drbd_wait_ee_list_empty()
317*/
318
319struct drbd_peer_request *
320drbd_alloc_ee(struct drbd_conf *mdev, u64 id, sector_t sector,
321 unsigned int data_size, gfp_t gfp_mask) __must_hold(local)
322{
323 struct drbd_peer_request *peer_req;
324 struct page *page;
325 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
326
327 if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
328 return NULL;
329
330 peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
331 if (!peer_req) {
332 if (!(gfp_mask & __GFP_NOWARN))
333 dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
334 return NULL;
335 }
336
337 page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
338 if (!page)
339 goto fail;
340
341 drbd_clear_interval(&peer_req->i);
342 peer_req->i.size = data_size;
343 peer_req->i.sector = sector;
344 peer_req->i.local = false;
345 peer_req->i.waiting = false;
346
347 peer_req->epoch = NULL;
348 peer_req->w.mdev = mdev;
349 peer_req->pages = page;
350 atomic_set(&peer_req->pending_bios, 0);
351 peer_req->flags = 0;
352 /*
353 * The block_id is opaque to the receiver. It is not endianness
354 * converted, and sent back to the sender unchanged.
355 */
356 peer_req->block_id = id;
357
358 return peer_req;
359
360 fail:
361 mempool_free(peer_req, drbd_ee_mempool);
362 return NULL;
363}
364
365void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_peer_request *peer_req,
366 int is_net)
367{
368 if (peer_req->flags & EE_HAS_DIGEST)
369 kfree(peer_req->digest);
370 drbd_pp_free(mdev, peer_req->pages, is_net);
371 D_ASSERT(atomic_read(&peer_req->pending_bios) == 0);
372 D_ASSERT(drbd_interval_empty(&peer_req->i));
373 mempool_free(peer_req, drbd_ee_mempool);
374}
375
376int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
377{
378 LIST_HEAD(work_list);
379 struct drbd_peer_request *peer_req, *t;
380 int count = 0;
381 int is_net = list == &mdev->net_ee;
382
383 spin_lock_irq(&mdev->tconn->req_lock);
384 list_splice_init(list, &work_list);
385 spin_unlock_irq(&mdev->tconn->req_lock);
386
387 list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
388 drbd_free_some_ee(mdev, peer_req, is_net);
389 count++;
390 }
391 return count;
392}
393
394
395/* See also comments in _req_mod(,BARRIER_ACKED)
396 * and receive_Barrier.
397 *
398 * Move entries from net_ee to done_ee, if ready.
399 * Grab done_ee, call all callbacks, free the entries.
400 * The callbacks typically send out ACKs.
401 */
402static int drbd_process_done_ee(struct drbd_conf *mdev)
403{
404 LIST_HEAD(work_list);
405 LIST_HEAD(reclaimed);
406 struct drbd_peer_request *peer_req, *t;
407 int err = 0;
408
409 spin_lock_irq(&mdev->tconn->req_lock);
410 reclaim_net_ee(mdev, &reclaimed);
411 list_splice_init(&mdev->done_ee, &work_list);
412 spin_unlock_irq(&mdev->tconn->req_lock);
413
414 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
415 drbd_free_net_ee(mdev, peer_req);
416
417 /* possible callbacks here:
418 * e_end_block, and e_end_resync_block, e_send_discard_write.
419 * all ignore the last argument.
420 */
421 list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
422 int err2;
423
424 /* list_del not necessary, next/prev members not touched */
425 err2 = peer_req->w.cb(&peer_req->w, !!err);
426 if (!err)
427 err = err2;
428 drbd_free_ee(mdev, peer_req);
429 }
430 wake_up(&mdev->ee_wait);
431
432 return err;
433}
434
435void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
436{
437 DEFINE_WAIT(wait);
438
439 /* avoids spin_lock/unlock
440 * and calling prepare_to_wait in the fast path */
441 while (!list_empty(head)) {
442 prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
443 spin_unlock_irq(&mdev->tconn->req_lock);
444 io_schedule();
445 finish_wait(&mdev->ee_wait, &wait);
446 spin_lock_irq(&mdev->tconn->req_lock);
447 }
448}
449
450void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
451{
452 spin_lock_irq(&mdev->tconn->req_lock);
453 _drbd_wait_ee_list_empty(mdev, head);
454 spin_unlock_irq(&mdev->tconn->req_lock);
455}
456
457/* see also kernel_accept, which is only present since 2.6.18.
458 * also, we want to log exactly which part of it failed */
459static int drbd_accept(const char **what, struct socket *sock, struct socket **newsock)
460{
461 struct sock *sk = sock->sk;
462 int err = 0;
463
464 *what = "listen";
465 err = sock->ops->listen(sock, 5);
466 if (err < 0)
467 goto out;
468
469 *what = "sock_create_lite";
470 err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
471 newsock);
472 if (err < 0)
473 goto out;
474
475 *what = "accept";
476 err = sock->ops->accept(sock, *newsock, 0);
477 if (err < 0) {
478 sock_release(*newsock);
479 *newsock = NULL;
480 goto out;
481 }
482 (*newsock)->ops = sock->ops;
483
484out:
485 return err;
486}
487
488static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
489{
490 mm_segment_t oldfs;
491 struct kvec iov = {
492 .iov_base = buf,
493 .iov_len = size,
494 };
495 struct msghdr msg = {
496 .msg_iovlen = 1,
497 .msg_iov = (struct iovec *)&iov,
498 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
499 };
500 int rv;
501
502 oldfs = get_fs();
503 set_fs(KERNEL_DS);
504 rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
505 set_fs(oldfs);
506
507 return rv;
508}
509
510static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size)
511{
512 mm_segment_t oldfs;
513 struct kvec iov = {
514 .iov_base = buf,
515 .iov_len = size,
516 };
517 struct msghdr msg = {
518 .msg_iovlen = 1,
519 .msg_iov = (struct iovec *)&iov,
520 .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
521 };
522 int rv;
523
524 oldfs = get_fs();
525 set_fs(KERNEL_DS);
526
527 for (;;) {
528 rv = sock_recvmsg(tconn->data.socket, &msg, size, msg.msg_flags);
529 if (rv == size)
530 break;
531
532 /* Note:
533 * ECONNRESET other side closed the connection
534 * ERESTARTSYS (on sock) we got a signal
535 */
536
537 if (rv < 0) {
538 if (rv == -ECONNRESET)
539 conn_info(tconn, "sock was reset by peer\n");
540 else if (rv != -ERESTARTSYS)
541 conn_err(tconn, "sock_recvmsg returned %d\n", rv);
542 break;
543 } else if (rv == 0) {
544 conn_info(tconn, "sock was shut down by peer\n");
545 break;
546 } else {
547 /* signal came in, or peer/link went down,
548 * after we read a partial message
549 */
550 /* D_ASSERT(signal_pending(current)); */
551 break;
552 }
553 };
554
555 set_fs(oldfs);
556
557 if (rv != size)
558 conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
559
560 return rv;
561}
562
563static int drbd_recv_all(struct drbd_tconn *tconn, void *buf, size_t size)
564{
565 int err;
566
567 err = drbd_recv(tconn, buf, size);
568 if (err != size) {
569 if (err >= 0)
570 err = -EIO;
571 } else
572 err = 0;
573 return err;
574}
575
576static int drbd_recv_all_warn(struct drbd_tconn *tconn, void *buf, size_t size)
577{
578 int err;
579
580 err = drbd_recv_all(tconn, buf, size);
581 if (err && !signal_pending(current))
582 conn_warn(tconn, "short read (expected size %d)\n", (int)size);
583 return err;
584}
585
586/* quoting tcp(7):
587 * On individual connections, the socket buffer size must be set prior to the
588 * listen(2) or connect(2) calls in order to have it take effect.
589 * This is our wrapper to do so.
590 */
591static void drbd_setbufsize(struct socket *sock, unsigned int snd,
592 unsigned int rcv)
593{
594 /* open coded SO_SNDBUF, SO_RCVBUF */
595 if (snd) {
596 sock->sk->sk_sndbuf = snd;
597 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
598 }
599 if (rcv) {
600 sock->sk->sk_rcvbuf = rcv;
601 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
602 }
603}
604
605static struct socket *drbd_try_connect(struct drbd_tconn *tconn)
606{
607 const char *what;
608 struct socket *sock;
609 struct sockaddr_in6 src_in6;
610 int err;
611 int disconnect_on_error = 1;
612
613 if (!get_net_conf(tconn))
614 return NULL;
615
616 what = "sock_create_kern";
617 err = sock_create_kern(((struct sockaddr *)tconn->net_conf->my_addr)->sa_family,
618 SOCK_STREAM, IPPROTO_TCP, &sock);
619 if (err < 0) {
620 sock = NULL;
621 goto out;
622 }
623
624 sock->sk->sk_rcvtimeo =
625 sock->sk->sk_sndtimeo = tconn->net_conf->try_connect_int*HZ;
626 drbd_setbufsize(sock, tconn->net_conf->sndbuf_size,
627 tconn->net_conf->rcvbuf_size);
628
629 /* explicitly bind to the configured IP as source IP
630 * for the outgoing connections.
631 * This is needed for multihomed hosts and to be
632 * able to use lo: interfaces for drbd.
633 * Make sure to use 0 as port number, so linux selects
634 * a free one dynamically.
635 */
636 memcpy(&src_in6, tconn->net_conf->my_addr,
637 min_t(int, tconn->net_conf->my_addr_len, sizeof(src_in6)));
638 if (((struct sockaddr *)tconn->net_conf->my_addr)->sa_family == AF_INET6)
639 src_in6.sin6_port = 0;
640 else
641 ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
642
643 what = "bind before connect";
644 err = sock->ops->bind(sock,
645 (struct sockaddr *) &src_in6,
646 tconn->net_conf->my_addr_len);
647 if (err < 0)
648 goto out;
649
650 /* connect may fail, peer not yet available.
651 * stay C_WF_CONNECTION, don't go Disconnecting! */
652 disconnect_on_error = 0;
653 what = "connect";
654 err = sock->ops->connect(sock,
655 (struct sockaddr *)tconn->net_conf->peer_addr,
656 tconn->net_conf->peer_addr_len, 0);
657
658out:
659 if (err < 0) {
660 if (sock) {
661 sock_release(sock);
662 sock = NULL;
663 }
664 switch (-err) {
665 /* timeout, busy, signal pending */
666 case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
667 case EINTR: case ERESTARTSYS:
668 /* peer not (yet) available, network problem */
669 case ECONNREFUSED: case ENETUNREACH:
670 case EHOSTDOWN: case EHOSTUNREACH:
671 disconnect_on_error = 0;
672 break;
673 default:
674 conn_err(tconn, "%s failed, err = %d\n", what, err);
675 }
676 if (disconnect_on_error)
677 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
678 }
679 put_net_conf(tconn);
680 return sock;
681}
682
683static struct socket *drbd_wait_for_connect(struct drbd_tconn *tconn)
684{
685 int timeo, err;
686 struct socket *s_estab = NULL, *s_listen;
687 const char *what;
688
689 if (!get_net_conf(tconn))
690 return NULL;
691
692 what = "sock_create_kern";
693 err = sock_create_kern(((struct sockaddr *)tconn->net_conf->my_addr)->sa_family,
694 SOCK_STREAM, IPPROTO_TCP, &s_listen);
695 if (err) {
696 s_listen = NULL;
697 goto out;
698 }
699
700 timeo = tconn->net_conf->try_connect_int * HZ;
701 timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
702
703 s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
704 s_listen->sk->sk_rcvtimeo = timeo;
705 s_listen->sk->sk_sndtimeo = timeo;
706 drbd_setbufsize(s_listen, tconn->net_conf->sndbuf_size,
707 tconn->net_conf->rcvbuf_size);
708
709 what = "bind before listen";
710 err = s_listen->ops->bind(s_listen,
711 (struct sockaddr *) tconn->net_conf->my_addr,
712 tconn->net_conf->my_addr_len);
713 if (err < 0)
714 goto out;
715
716 err = drbd_accept(&what, s_listen, &s_estab);
717
718out:
719 if (s_listen)
720 sock_release(s_listen);
721 if (err < 0) {
722 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
723 conn_err(tconn, "%s failed, err = %d\n", what, err);
724 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
725 }
726 }
727 put_net_conf(tconn);
728
729 return s_estab;
730}
731
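/* drbd_send_fp() transmits a bare header with no payload, carrying only the
 * given command, over the chosen socket; drbd_connect() uses it to tag the two
 * freshly established sockets as P_INITIAL_DATA resp. P_INITIAL_META. The '!'
 * presumably turns the 0-on-success result of _conn_send_cmd() into a
 * true-on-success boolean. */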
732static int drbd_send_fp(struct drbd_tconn *tconn, struct drbd_socket *sock, enum drbd_packet cmd)
733{
734 struct p_header *h = tconn->data.sbuf;
735
736 return !_conn_send_cmd(tconn, 0, sock, cmd, h, sizeof(*h), 0);
737}
738
739static enum drbd_packet drbd_recv_fp(struct drbd_tconn *tconn, struct socket *sock)
740{
741 struct p_header80 h;
742 int rr;
743
744 rr = drbd_recv_short(sock, &h, sizeof(h), 0);
745
746 if (rr == sizeof(h) && h.magic == cpu_to_be32(DRBD_MAGIC))
747 return be16_to_cpu(h.command);
748
749 return 0xffff;
750}
751
752/**
753 * drbd_socket_okay() - Free the socket if its connection is not okay
754 * @sock: pointer to the pointer to the socket.
755 */
756static int drbd_socket_okay(struct socket **sock)
757{
758 int rr;
759 char tb[4];
760
761 if (!*sock)
762 return false;
763
764 rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
765
766 if (rr > 0 || rr == -EAGAIN) {
767 return true;
768 } else {
769 sock_release(*sock);
770 *sock = NULL;
771 return false;
772 }
773}
774/* Gets called if a connection is established, or if a new minor gets created
775 in a connection */
776int drbd_connected(int vnr, void *p, void *data)
777{
778 struct drbd_conf *mdev = (struct drbd_conf *)p;
779 int err;
780
781 atomic_set(&mdev->packet_seq, 0);
782 mdev->peer_seq = 0;
783
784 mdev->state_mutex = mdev->tconn->agreed_pro_version < 100 ?
785 &mdev->tconn->cstate_mutex :
786 &mdev->own_state_mutex;
787
788 err = drbd_send_sync_param(mdev);
789 if (!err)
790 err = drbd_send_sizes(mdev, 0, 0);
791 if (!err)
792 err = drbd_send_uuids(mdev);
793 if (!err)
794 err = drbd_send_state(mdev);
795 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
796 clear_bit(RESIZE_PENDING, &mdev->flags);
797 mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
798 return err;
799}
800
801/*
802 * return values:
803 * 1 yes, we have a valid connection
804 * 0 oops, did not work out, please try again
805 * -1 peer talks different language,
806 * no point in trying again, please go standalone.
807 * -2 We do not have a network config...
808 */
809static int drbd_connect(struct drbd_tconn *tconn)
810{
811 struct socket *sock, *msock;
812 int try, h, ok;
813
814 if (conn_request_state(tconn, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
815 return -2;
816
817 clear_bit(DISCARD_CONCURRENT, &tconn->flags);
818
819 /* Assume that the peer only understands protocol 80 until we know better. */
820 tconn->agreed_pro_version = 80;
821
822 do {
823 struct socket *s;
824
825 for (try = 0;;) {
826 /* 3 tries, this should take less than a second! */
827 s = drbd_try_connect(tconn);
828 if (s || ++try >= 3)
829 break;
830 /* give the other side time to call bind() & listen() */
831 schedule_timeout_interruptible(HZ / 10);
832 }
833
834 if (s) {
835 if (!tconn->data.socket) {
836 tconn->data.socket = s;
837 drbd_send_fp(tconn, &tconn->data, P_INITIAL_DATA);
838 } else if (!tconn->meta.socket) {
839 tconn->meta.socket = s;
840 drbd_send_fp(tconn, &tconn->meta, P_INITIAL_META);
841 } else {
842 conn_err(tconn, "Logic error in drbd_connect()\n");
843 goto out_release_sockets;
844 }
845 }
846
847 if (tconn->data.socket && tconn->meta.socket) {
848 schedule_timeout_interruptible(tconn->net_conf->ping_timeo*HZ/10);
849 ok = drbd_socket_okay(&tconn->data.socket);
850 ok = drbd_socket_okay(&tconn->meta.socket) && ok;
851 if (ok)
852 break;
853 }
854
855retry:
856 s = drbd_wait_for_connect(tconn);
857 if (s) {
858 try = drbd_recv_fp(tconn, s);
859 drbd_socket_okay(&tconn->data.socket);
860 drbd_socket_okay(&tconn->meta.socket);
861 switch (try) {
862 case P_INITIAL_DATA:
863 if (tconn->data.socket) {
864 conn_warn(tconn, "initial packet S crossed\n");
865 sock_release(tconn->data.socket);
866 }
867 tconn->data.socket = s;
868 break;
869 case P_INITIAL_META:
870 if (tconn->meta.socket) {
871 conn_warn(tconn, "initial packet M crossed\n");
872 sock_release(tconn->meta.socket);
873 }
874 tconn->meta.socket = s;
875 set_bit(DISCARD_CONCURRENT, &tconn->flags);
876 break;
877 default:
878 conn_warn(tconn, "Error receiving initial packet\n");
879 sock_release(s);
880 if (random32() & 1)
881 goto retry;
882 }
883 }
884
885 if (tconn->cstate <= C_DISCONNECTING)
886 goto out_release_sockets;
887 if (signal_pending(current)) {
888 flush_signals(current);
889 smp_rmb();
890 if (get_t_state(&tconn->receiver) == EXITING)
891 goto out_release_sockets;
892 }
893
894 if (tconn->data.socket && tconn->meta.socket) {
895 ok = drbd_socket_okay(&tconn->data.socket);
896 ok = drbd_socket_okay(&tconn->meta.socket) && ok;
897 if (ok)
898 break;
899 }
900 } while (1);
901
902 sock = tconn->data.socket;
903 msock = tconn->meta.socket;
904
905 msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
906 sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
907
908 sock->sk->sk_allocation = GFP_NOIO;
909 msock->sk->sk_allocation = GFP_NOIO;
910
911 sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
912 msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
913
914 /* NOT YET ...
915 * sock->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10;
916 * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
917 * first set it to the P_CONNECTION_FEATURES timeout,
918 * which we set to 4x the configured ping_timeout. */
919 sock->sk->sk_sndtimeo =
920 sock->sk->sk_rcvtimeo = tconn->net_conf->ping_timeo*4*HZ/10;
921
922 msock->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10;
923 msock->sk->sk_rcvtimeo = tconn->net_conf->ping_int*HZ;
924
925 /* we don't want delays.
926 * we use TCP_CORK where appropriate, though */
927 drbd_tcp_nodelay(sock);
928 drbd_tcp_nodelay(msock);
929
930 tconn->last_received = jiffies;
931
932 h = drbd_do_features(tconn);
933 if (h <= 0)
934 return h;
935
936 if (tconn->cram_hmac_tfm) {
937 /* drbd_request_state(mdev, NS(conn, WFAuth)); */
938 switch (drbd_do_auth(tconn)) {
939 case -1:
940 conn_err(tconn, "Authentication of peer failed\n");
941 return -1;
942 case 0:
943 conn_err(tconn, "Authentication of peer failed, trying again.\n");
944 return 0;
945 }
946 }
947
948 if (conn_request_state(tconn, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE) < SS_SUCCESS)
949 return 0;
950
951 sock->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10;
952 sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
953
954 drbd_thread_start(&tconn->asender);
955
956 if (drbd_send_protocol(tconn) == -EOPNOTSUPP)
957 return -1;
958
959 return !idr_for_each(&tconn->volumes, drbd_connected, tconn);
960
961out_release_sockets:
962 if (tconn->data.socket) {
963 sock_release(tconn->data.socket);
964 tconn->data.socket = NULL;
965 }
966 if (tconn->meta.socket) {
967 sock_release(tconn->meta.socket);
968 tconn->meta.socket = NULL;
969 }
970 return -1;
971}
972
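/* Two on-the-wire header layouts are recognized below: h80 carries a 32-bit
 * magic, a 16-bit command and a 16-bit length, while h95 carries a 16-bit
 * magic, a 16-bit command and a 32-bit field whose low 24 bits hold the
 * payload length (field widths as implied by the byte-order conversions
 * used in the code). */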
973static int decode_header(struct drbd_tconn *tconn, struct p_header *h, struct packet_info *pi)
974{
975 if (h->h80.magic == cpu_to_be32(DRBD_MAGIC)) {
976 pi->cmd = be16_to_cpu(h->h80.command);
977 pi->size = be16_to_cpu(h->h80.length);
978 pi->vnr = 0;
979 } else if (h->h95.magic == cpu_to_be16(DRBD_MAGIC_BIG)) {
980 pi->cmd = be16_to_cpu(h->h95.command);
981 pi->size = be32_to_cpu(h->h95.length) & 0x00ffffff;
982 pi->vnr = 0;
983 } else {
984 conn_err(tconn, "magic?? on data m: 0x%08x c: %d l: %d\n",
985 be32_to_cpu(h->h80.magic),
986 be16_to_cpu(h->h80.command),
987 be16_to_cpu(h->h80.length));
988 return -EINVAL;
989 }
990 return 0;
991}
992
993static int drbd_recv_header(struct drbd_tconn *tconn, struct packet_info *pi)
994{
995 struct p_header *h = tconn->data.rbuf;
996 int err;
997
998 err = drbd_recv_all_warn(tconn, h, sizeof(*h));
999 if (err)
1000 return err;
1001
1002 err = decode_header(tconn, h, pi);
1003 tconn->last_received = jiffies;
1004
1005 return err;
1006}
1007
1008static void drbd_flush(struct drbd_conf *mdev)
1009{
1010 int rv;
1011
1012 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
1013 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
1014 NULL);
1015 if (rv) {
1016 dev_err(DEV, "local disk flush failed with status %d\n", rv);
1017 /* would rather check on EOPNOTSUPP, but that is not reliable.
1018 * don't try again for ANY return value != 0
1019 * if (rv == -EOPNOTSUPP) */
1020 drbd_bump_write_ordering(mdev, WO_drain_io);
1021 }
1022 put_ldev(mdev);
1023 }
1024}
1025
1026/**
1027 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
1028 * @mdev: DRBD device.
1029 * @epoch: Epoch object.
1030 * @ev: Epoch event.
1031 */
1032static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1033 struct drbd_epoch *epoch,
1034 enum epoch_event ev)
1035{
1036 int epoch_size;
1037 struct drbd_epoch *next_epoch;
1038 enum finish_epoch rv = FE_STILL_LIVE;
1039
1040 spin_lock(&mdev->epoch_lock);
1041 do {
1042 next_epoch = NULL;
1043
1044 epoch_size = atomic_read(&epoch->epoch_size);
1045
1046 switch (ev & ~EV_CLEANUP) {
1047 case EV_PUT:
1048 atomic_dec(&epoch->active);
1049 break;
1050 case EV_GOT_BARRIER_NR:
1051 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1052 break;
1053 case EV_BECAME_LAST:
1054 /* nothing to do*/
1055 break;
1056 }
1057
1058 if (epoch_size != 0 &&
1059 atomic_read(&epoch->active) == 0 &&
1060 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
1061 if (!(ev & EV_CLEANUP)) {
1062 spin_unlock(&mdev->epoch_lock);
1063 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
1064 spin_lock(&mdev->epoch_lock);
1065 }
1066 dec_unacked(mdev);
1067
1068 if (mdev->current_epoch != epoch) {
1069 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1070 list_del(&epoch->list);
1071 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1072 mdev->epochs--;
1073 kfree(epoch);
1074
1075 if (rv == FE_STILL_LIVE)
1076 rv = FE_DESTROYED;
1077 } else {
1078 epoch->flags = 0;
1079 atomic_set(&epoch->epoch_size, 0);
1080 /* atomic_set(&epoch->active, 0); is already zero */
1081 if (rv == FE_STILL_LIVE)
1082 rv = FE_RECYCLED;
1083 wake_up(&mdev->ee_wait);
1084 }
1085 }
1086
1087 if (!next_epoch)
1088 break;
1089
1090 epoch = next_epoch;
1091 } while (1);
1092
1093 spin_unlock(&mdev->epoch_lock);
1094
1095 return rv;
1096}
1097
1098/**
1099 * drbd_bump_write_ordering() - Fall back to another write ordering method
1100 * @mdev: DRBD device.
1101 * @wo: Write ordering method to try.
1102 */
1103void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
1104{
1105 enum write_ordering_e pwo;
1106 static char *write_ordering_str[] = {
1107 [WO_none] = "none",
1108 [WO_drain_io] = "drain",
1109 [WO_bdev_flush] = "flush",
1110 };
1111
1112 pwo = mdev->write_ordering;
1113 wo = min(pwo, wo);
1114 if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
1115 wo = WO_drain_io;
1116 if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
1117 wo = WO_none;
1118 mdev->write_ordering = wo;
1119 if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
1120 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
1121}
1122
1123/**
1124 * drbd_submit_peer_request()
1125 * @mdev: DRBD device.
1126 * @peer_req: peer request
1127 * @rw: flag field, see bio->bi_rw
1128 *
1129 * May spread the pages to multiple bios,
1130 * depending on bio_add_page restrictions.
1131 *
1132 * Returns 0 if all bios have been submitted,
1133 * -ENOMEM if we could not allocate enough bios,
1134 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
1135 * single page to an empty bio (which should never happen and likely indicates
1136 * that the lower level IO stack is in some way broken). This has been observed
1137 * on certain Xen deployments.
1138 */
1139/* TODO allocate from our own bio_set. */
1140int drbd_submit_peer_request(struct drbd_conf *mdev,
1141 struct drbd_peer_request *peer_req,
1142 const unsigned rw, const int fault_type)
1143{
1144 struct bio *bios = NULL;
1145 struct bio *bio;
1146 struct page *page = peer_req->pages;
1147 sector_t sector = peer_req->i.sector;
1148 unsigned ds = peer_req->i.size;
1149 unsigned n_bios = 0;
1150 unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
1151 int err = -ENOMEM;
1152
1153 /* In most cases, we will only need one bio. But in case the lower
1154 * level restrictions happen to be different at this offset on this
1155 * side than those of the sending peer, we may need to submit the
1156 * request in more than one bio.
1157 *
1158 * Plain bio_alloc is good enough here, this is no DRBD internally
1159 * generated bio, but a bio allocated on behalf of the peer.
1160 */
1161next_bio:
1162 bio = bio_alloc(GFP_NOIO, nr_pages);
1163 if (!bio) {
1164 dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
1165 goto fail;
1166 }
1167 /* > peer_req->i.sector, unless this is the first bio */
1168 bio->bi_sector = sector;
1169 bio->bi_bdev = mdev->ldev->backing_bdev;
1170 bio->bi_rw = rw;
1171 bio->bi_private = peer_req;
1172 bio->bi_end_io = drbd_peer_request_endio;
1173
1174 bio->bi_next = bios;
1175 bios = bio;
1176 ++n_bios;
1177
1178 page_chain_for_each(page) {
1179 unsigned len = min_t(unsigned, ds, PAGE_SIZE);
1180 if (!bio_add_page(bio, page, len, 0)) {
1181 /* A single page must always be possible!
1182 * But in case it fails anyways,
1183 * we deal with it, and complain (below). */
1184 if (bio->bi_vcnt == 0) {
1185 dev_err(DEV,
1186 "bio_add_page failed for len=%u, "
1187 "bi_vcnt=0 (bi_sector=%llu)\n",
1188 len, (unsigned long long)bio->bi_sector);
1189 err = -ENOSPC;
1190 goto fail;
1191 }
1192 goto next_bio;
1193 }
1194 ds -= len;
1195 sector += len >> 9;
1196 --nr_pages;
1197 }
1198 D_ASSERT(page == NULL);
1199 D_ASSERT(ds == 0);
1200
1201 atomic_set(&peer_req->pending_bios, n_bios);
1202 do {
1203 bio = bios;
1204 bios = bios->bi_next;
1205 bio->bi_next = NULL;
1206
1207 drbd_generic_make_request(mdev, fault_type, bio);
1208 } while (bios);
1209 return 0;
1210
1211fail:
1212 while (bios) {
1213 bio = bios;
1214 bios = bios->bi_next;
1215 bio_put(bio);
1216 }
1217 return err;
1218}
1219
1220static void drbd_remove_epoch_entry_interval(struct drbd_conf *mdev,
1221 struct drbd_peer_request *peer_req)
1222{
1223 struct drbd_interval *i = &peer_req->i;
1224
1225 drbd_remove_interval(&mdev->write_requests, i);
1226 drbd_clear_interval(i);
1227
1228 /* Wake up any processes waiting for this peer request to complete. */
1229 if (i->waiting)
1230 wake_up(&mdev->misc_wait);
1231}
1232
1233static int receive_Barrier(struct drbd_tconn *tconn, struct packet_info *pi)
1234{
1235 struct drbd_conf *mdev;
1236 int rv;
1237 struct p_barrier *p = tconn->data.rbuf;
1238 struct drbd_epoch *epoch;
1239
1240 mdev = vnr_to_mdev(tconn, pi->vnr);
1241 if (!mdev)
1242 return -EIO;
1243
1244 inc_unacked(mdev);
1245
1246 mdev->current_epoch->barrier_nr = p->barrier;
1247 rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
1248
1249 /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1250 * the activity log, which means it would not be resynced in case the
1251 * R_PRIMARY crashes now.
1252 * Therefore we must send the barrier_ack after the barrier request was
1253 * completed. */
1254 switch (mdev->write_ordering) {
1255 case WO_none:
1256 if (rv == FE_RECYCLED)
1257 return 0;
1258
1259 /* receiver context, in the writeout path of the other node.
1260 * avoid potential distributed deadlock */
1261 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1262 if (epoch)
1263 break;
1264 else
1265 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1266 /* Fall through */
1267
1268 case WO_bdev_flush:
1269 case WO_drain_io:
1270 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1271 drbd_flush(mdev);
1272
1273 if (atomic_read(&mdev->current_epoch->epoch_size)) {
1274 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1275 if (epoch)
1276 break;
1277 }
1278
1279 epoch = mdev->current_epoch;
1280 wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);
1281
1282 D_ASSERT(atomic_read(&epoch->active) == 0);
1283 D_ASSERT(epoch->flags == 0);
1284
1285 return 0;
1286 default:
1287 dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
1288 return -EIO;
1289 }
1290
1291 epoch->flags = 0;
1292 atomic_set(&epoch->epoch_size, 0);
1293 atomic_set(&epoch->active, 0);
1294
1295 spin_lock(&mdev->epoch_lock);
1296 if (atomic_read(&mdev->current_epoch->epoch_size)) {
1297 list_add(&epoch->list, &mdev->current_epoch->list);
1298 mdev->current_epoch = epoch;
1299 mdev->epochs++;
1300 } else {
1301 /* The current_epoch got recycled while we allocated this one... */
1302 kfree(epoch);
1303 }
1304 spin_unlock(&mdev->epoch_lock);
1305
1306 return 0;
1307}
1308
1309/* used from receive_RSDataReply (recv_resync_read)
1310 * and from receive_Data */
1311static struct drbd_peer_request *
1312read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector,
1313 int data_size) __must_hold(local)
1314{
1315 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
1316 struct drbd_peer_request *peer_req;
1317 struct page *page;
1318 int dgs, ds, err;
1319 void *dig_in = mdev->tconn->int_dig_in;
1320 void *dig_vv = mdev->tconn->int_dig_vv;
1321 unsigned long *data;
1322
1323 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_r_tfm) ?
1324 crypto_hash_digestsize(mdev->tconn->integrity_r_tfm) : 0;
1325
1326 if (dgs) {
1327 err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs);
1328 if (err)
1329 return NULL;
1330 }
1331
1332 data_size -= dgs;
1333
1334 if (!expect(data_size != 0))
1335 return NULL;
1336 if (!expect(IS_ALIGNED(data_size, 512)))
1337 return NULL;
1338 if (!expect(data_size <= DRBD_MAX_BIO_SIZE))
1339 return NULL;
1340
1341 /* even though we trust our peer,
1342 * we sometimes have to double check. */
1343 if (sector + (data_size>>9) > capacity) {
1344 dev_err(DEV, "request from peer beyond end of local disk: "
1345 "capacity: %llus < sector: %llus + size: %u\n",
1346 (unsigned long long)capacity,
1347 (unsigned long long)sector, data_size);
1348 return NULL;
1349 }
1350
1351 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1352 * "criss-cross" setup, that might cause write-out on some other DRBD,
1353 * which in turn might block on the other node at this very place. */
1354 peer_req = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
1355 if (!peer_req)
1356 return NULL;
1357
1358 ds = data_size;
1359 page = peer_req->pages;
1360 page_chain_for_each(page) {
1361 unsigned len = min_t(int, ds, PAGE_SIZE);
1362 data = kmap(page);
1363 err = drbd_recv_all_warn(mdev->tconn, data, len);
1364 if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) {
1365 dev_err(DEV, "Fault injection: Corrupting data on receive\n");
1366 data[0] = data[0] ^ (unsigned long)-1;
1367 }
1368 kunmap(page);
1369 if (err) {
1370 drbd_free_ee(mdev, peer_req);
1371 return NULL;
1372 }
1373 ds -= len;
1374 }
1375
1376 if (dgs) {
1377 drbd_csum_ee(mdev, mdev->tconn->integrity_r_tfm, peer_req, dig_vv);
1378 if (memcmp(dig_in, dig_vv, dgs)) {
1379 dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n",
1380 (unsigned long long)sector, data_size);
1381 drbd_free_ee(mdev, peer_req);
1382 return NULL;
1383 }
1384 }
1385 mdev->recv_cnt += data_size>>9;
1386 return peer_req;
1387}
1388
1389/* drbd_drain_block() just takes a data block
1390 * out of the socket input buffer, and discards it.
1391 */
1392static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1393{
1394 struct page *page;
1395 int err = 0;
1396 void *data;
1397
1398 if (!data_size)
1399 return 0;
1400
1401 page = drbd_pp_alloc(mdev, 1, 1);
1402
1403 data = kmap(page);
1404 while (data_size) {
1405 unsigned int len = min_t(int, data_size, PAGE_SIZE);
1406
1407 err = drbd_recv_all_warn(mdev->tconn, data, len);
1408 if (err)
1409 break;
1410 data_size -= len;
1411 }
1412 kunmap(page);
1413 drbd_pp_free(mdev, page, 0);
1414 return err;
1415}
1416
1417static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1418 sector_t sector, int data_size)
1419{
1420 struct bio_vec *bvec;
1421 struct bio *bio;
1422 int dgs, err, i, expect;
1423 void *dig_in = mdev->tconn->int_dig_in;
1424 void *dig_vv = mdev->tconn->int_dig_vv;
1425
1426 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_r_tfm) ?
1427 crypto_hash_digestsize(mdev->tconn->integrity_r_tfm) : 0;
1428
1429 if (dgs) {
1430 err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs);
1431 if (err)
1432 return err;
1433 }
1434
1435 data_size -= dgs;
1436
1437 /* optimistically update recv_cnt. if receiving fails below,
1438 * we disconnect anyways, and counters will be reset. */
1439 mdev->recv_cnt += data_size>>9;
1440
1441 bio = req->master_bio;
1442 D_ASSERT(sector == bio->bi_sector);
1443
1444 bio_for_each_segment(bvec, bio, i) {
1445 void *mapped = kmap(bvec->bv_page) + bvec->bv_offset;
1446 expect = min_t(int, data_size, bvec->bv_len);
1447 err = drbd_recv_all_warn(mdev->tconn, mapped, expect);
1448 kunmap(bvec->bv_page);
1449 if (err)
1450 return err;
1451 data_size -= expect;
1452 }
1453
1454 if (dgs) {
1455 drbd_csum_bio(mdev, mdev->tconn->integrity_r_tfm, bio, dig_vv);
1456 if (memcmp(dig_in, dig_vv, dgs)) {
1457 dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
1458 return -EINVAL;
1459 }
1460 }
1461
1462 D_ASSERT(data_size == 0);
1463 return 0;
1464}
1465
1466/* e_end_resync_block() is called via
1467 * drbd_process_done_ee() by asender only */
1468static int e_end_resync_block(struct drbd_work *w, int unused)
1469{
1470 struct drbd_peer_request *peer_req =
1471 container_of(w, struct drbd_peer_request, w);
1472 struct drbd_conf *mdev = w->mdev;
1473 sector_t sector = peer_req->i.sector;
1474 int err;
1475
1476 D_ASSERT(drbd_interval_empty(&peer_req->i));
1477
1478 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1479 drbd_set_in_sync(mdev, sector, peer_req->i.size);
1480 err = drbd_send_ack(mdev, P_RS_WRITE_ACK, peer_req);
1481 } else {
1482 /* Record failure to sync */
1483 drbd_rs_failed_io(mdev, sector, peer_req->i.size);
1484
1485 err = drbd_send_ack(mdev, P_NEG_ACK, peer_req);
1486 }
1487 dec_unacked(mdev);
1488
1489 return err;
1490}
1491
1492static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
1493{
1494 struct drbd_peer_request *peer_req;
1495
1496 peer_req = read_in_block(mdev, ID_SYNCER, sector, data_size);
1497 if (!peer_req)
1498 goto fail;
1499
1500 dec_rs_pending(mdev);
1501
1502 inc_unacked(mdev);
1503 /* corresponding dec_unacked() in e_end_resync_block()
1504 * respective _drbd_clear_done_ee */
1505
1506 peer_req->w.cb = e_end_resync_block;
1507
1508 spin_lock_irq(&mdev->tconn->req_lock);
1509 list_add(&peer_req->w.list, &mdev->sync_ee);
1510 spin_unlock_irq(&mdev->tconn->req_lock);
1511
1512 atomic_add(data_size >> 9, &mdev->rs_sect_ev);
1513 if (drbd_submit_peer_request(mdev, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
1514 return 0;
1515
1516 /* don't care for the reason here */
1517 dev_err(DEV, "submit failed, triggering re-connect\n");
1518 spin_lock_irq(&mdev->tconn->req_lock);
1519 list_del(&peer_req->w.list);
1520 spin_unlock_irq(&mdev->tconn->req_lock);
1521
1522 drbd_free_ee(mdev, peer_req);
1523fail:
1524 put_ldev(mdev);
1525 return -EIO;
1526}
1527
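/* The peer echoes back the block_id we originally sent; for requests that
 * originated on this node that id is simply the kernel address of the
 * drbd_request (compare the "block_id is opaque" note in drbd_alloc_ee).
 * find_request() therefore only trusts the pointer after finding its
 * interval in the given request tree. */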
1528static struct drbd_request *
1529find_request(struct drbd_conf *mdev, struct rb_root *root, u64 id,
1530 sector_t sector, bool missing_ok, const char *func)
1531{
1532 struct drbd_request *req;
1533
1534 /* Request object according to our peer */
1535 req = (struct drbd_request *)(unsigned long)id;
1536 if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
1537 return req;
1538 if (!missing_ok) {
1539 dev_err(DEV, "%s: failed to find request %lu, sector %llus\n", func,
1540 (unsigned long)id, (unsigned long long)sector);
1541 }
1542 return NULL;
1543}
1544
1545static int receive_DataReply(struct drbd_tconn *tconn, struct packet_info *pi)
1546{
1547 struct drbd_conf *mdev;
1548 struct drbd_request *req;
1549 sector_t sector;
1550 int err;
1551 struct p_data *p = tconn->data.rbuf;
1552
1553 mdev = vnr_to_mdev(tconn, pi->vnr);
1554 if (!mdev)
1555 return -EIO;
1556
1557 sector = be64_to_cpu(p->sector);
1558
1559 spin_lock_irq(&mdev->tconn->req_lock);
1560 req = find_request(mdev, &mdev->read_requests, p->block_id, sector, false, __func__);
1561 spin_unlock_irq(&mdev->tconn->req_lock);
1562 if (unlikely(!req))
1563 return -EIO;
1564
1565 /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
1566 * special casing it there for the various failure cases.
1567 * still no race with drbd_fail_pending_reads */
1568 err = recv_dless_read(mdev, req, sector, pi->size);
1569 if (!err)
1570 req_mod(req, DATA_RECEIVED);
1571 /* else: nothing. handled from drbd_disconnect...
1572 * I don't think we may complete this just yet
1573 * in case we are "on-disconnect: freeze" */
1574
1575 return err;
1576}
1577
1578static int receive_RSDataReply(struct drbd_tconn *tconn, struct packet_info *pi)
1579{
1580 struct drbd_conf *mdev;
1581 sector_t sector;
1582 int err;
1583 struct p_data *p = tconn->data.rbuf;
1584
1585 mdev = vnr_to_mdev(tconn, pi->vnr);
1586 if (!mdev)
1587 return -EIO;
1588
1589 sector = be64_to_cpu(p->sector);
1590 D_ASSERT(p->block_id == ID_SYNCER);
1591
1592 if (get_ldev(mdev)) {
1593 /* data is submitted to disk within recv_resync_read.
1594 * corresponding put_ldev done below on error,
1595 * or in drbd_peer_request_endio. */
1596 err = recv_resync_read(mdev, sector, pi->size);
1597 } else {
1598 if (__ratelimit(&drbd_ratelimit_state))
1599 dev_err(DEV, "Can not write resync data to local disk.\n");
1600
1601 err = drbd_drain_block(mdev, pi->size);
1602
1603 drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size);
1604 }
1605
1606 atomic_add(pi->size >> 9, &mdev->rs_sect_in);
1607
1608 return err;
1609}
1610
1611static int w_restart_write(struct drbd_work *w, int cancel)
1612{
1613 struct drbd_request *req = container_of(w, struct drbd_request, w);
1614 struct drbd_conf *mdev = w->mdev;
1615 struct bio *bio;
1616 unsigned long start_time;
1617 unsigned long flags;
1618
1619 spin_lock_irqsave(&mdev->tconn->req_lock, flags);
1620 if (!expect(req->rq_state & RQ_POSTPONED)) {
1621 spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
1622 return -EIO;
1623 }
1624 bio = req->master_bio;
1625 start_time = req->start_time;
1626 /* Postponed requests will not have their master_bio completed! */
1627 __req_mod(req, DISCARD_WRITE, NULL);
1628 spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
1629
1630 while (__drbd_make_request(mdev, bio, start_time))
1631 /* retry */ ;
1632 return 0;
1633}
1634
1635static void restart_conflicting_writes(struct drbd_conf *mdev,
1636 sector_t sector, int size)
1637{
1638 struct drbd_interval *i;
1639 struct drbd_request *req;
1640
1641 drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
1642 if (!i->local)
1643 continue;
1644 req = container_of(i, struct drbd_request, i);
1645 if (req->rq_state & RQ_LOCAL_PENDING ||
1646 !(req->rq_state & RQ_POSTPONED))
1647 continue;
1648 if (expect(list_empty(&req->w.list))) {
1649 req->w.mdev = mdev;
1650 req->w.cb = w_restart_write;
1651 drbd_queue_work(&mdev->tconn->data.work, &req->w);
1652 }
1653 }
1654}
1655
1656/* e_end_block() is called via drbd_process_done_ee().
1657 * this means this function only runs in the asender thread
1658 */
1659static int e_end_block(struct drbd_work *w, int cancel)
1660{
1661 struct drbd_peer_request *peer_req =
1662 container_of(w, struct drbd_peer_request, w);
1663 struct drbd_conf *mdev = w->mdev;
1664 sector_t sector = peer_req->i.sector;
1665 int err = 0, pcmd;
1666
1667 if (mdev->tconn->net_conf->wire_protocol == DRBD_PROT_C) {
1668 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1669 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1670 mdev->state.conn <= C_PAUSED_SYNC_T &&
1671 peer_req->flags & EE_MAY_SET_IN_SYNC) ?
1672 P_RS_WRITE_ACK : P_WRITE_ACK;
1673 err = drbd_send_ack(mdev, pcmd, peer_req);
1674 if (pcmd == P_RS_WRITE_ACK)
1675 drbd_set_in_sync(mdev, sector, peer_req->i.size);
1676 } else {
1677 err = drbd_send_ack(mdev, P_NEG_ACK, peer_req);
1678 /* we expect it to be marked out of sync anyways...
1679 * maybe assert this? */
1680 }
1681 dec_unacked(mdev);
1682 }
1683 /* we delete from the conflict detection hash _after_ we sent out the
1684 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
1685 if (mdev->tconn->net_conf->two_primaries) {
1686 spin_lock_irq(&mdev->tconn->req_lock);
1687 D_ASSERT(!drbd_interval_empty(&peer_req->i));
1688 drbd_remove_epoch_entry_interval(mdev, peer_req);
1689 if (peer_req->flags & EE_RESTART_REQUESTS)
1690 restart_conflicting_writes(mdev, sector, peer_req->i.size);
1691 spin_unlock_irq(&mdev->tconn->req_lock);
1692 } else
1693 D_ASSERT(drbd_interval_empty(&peer_req->i));
1694
1695 drbd_may_finish_epoch(mdev, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
1696
1697 return err;
1698}
1699
1700static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
1701{
1702 struct drbd_conf *mdev = w->mdev;
1703 struct drbd_peer_request *peer_req =
1704 container_of(w, struct drbd_peer_request, w);
1705 int err;
1706
1707 err = drbd_send_ack(mdev, ack, peer_req);
1708 dec_unacked(mdev);
1709
1710 return err;
1711}
1712
1713static int e_send_discard_write(struct drbd_work *w, int unused)
1714{
1715 return e_send_ack(w, P_DISCARD_WRITE);
1716}
1717
1718static int e_send_retry_write(struct drbd_work *w, int unused)
1719{
1720 struct drbd_tconn *tconn = w->mdev->tconn;
1721
1722 return e_send_ack(w, tconn->agreed_pro_version >= 100 ?
1723 P_RETRY_WRITE : P_DISCARD_WRITE);
1724}
1725
3e394da1
AG
1726static bool seq_greater(u32 a, u32 b)
1727{
1728 /*
1729 * We assume 32-bit wrap-around here.
1730 * For 24-bit wrap-around, we would have to shift:
1731 * a <<= 8; b <<= 8;
1732 */
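 /* Example: seq_greater(1, 0xffffffff) is true, since
 * (s32)1 - (s32)0xffffffff == 1 - (-1) == 2 > 0. */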
1733 return (s32)a - (s32)b > 0;
1734}
1735
1736static u32 seq_max(u32 a, u32 b)
1737{
1738 return seq_greater(a, b) ? a : b;
1739}
1740
7be8da07
AG
1741static bool need_peer_seq(struct drbd_conf *mdev)
1742{
1743 struct drbd_tconn *tconn = mdev->tconn;
1744
1745 /*
1746 * We only need to keep track of the last packet_seq number of our peer
1747 * if we are in dual-primary mode and we have the discard flag set; see
1748 * handle_write_conflicts().
1749 */
1750 return tconn->net_conf->two_primaries &&
1751 test_bit(DISCARD_CONCURRENT, &tconn->flags);
1752}
1753
43ae077d 1754static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq)
3e394da1 1755{
3c13b680 1756 unsigned int newest_peer_seq;
3e394da1 1757
7be8da07
AG
1758 if (need_peer_seq(mdev)) {
1759 spin_lock(&mdev->peer_seq_lock);
3c13b680
LE
1760 newest_peer_seq = seq_max(mdev->peer_seq, peer_seq);
1761 mdev->peer_seq = newest_peer_seq;
7be8da07 1762 spin_unlock(&mdev->peer_seq_lock);
3c13b680
LE
1763 /* wake up only if we actually changed mdev->peer_seq */
1764 if (peer_seq == newest_peer_seq)
7be8da07
AG
1765 wake_up(&mdev->seq_wait);
1766 }
3e394da1
AG
1767}
1768
b411b363
PR
1769/* Called from receive_Data.
1770 * Synchronize packets on sock with packets on msock.
1771 *
1772 * This is here so even when a P_DATA packet traveling via sock overtook an Ack
1773 * packet traveling on msock, they are still processed in the order they have
1774 * been sent.
1775 *
1776 * Note: we don't care for Ack packets overtaking P_DATA packets.
1777 *
1778 * In case packet_seq is larger than mdev->peer_seq number, there are
1779 * outstanding packets on the msock. We wait for them to arrive.
1780 * In case we are the logically next packet, we update mdev->peer_seq
1781 * ourselves. Correctly handles 32bit wrap around.
1782 *
1783 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
1784 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
1785 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
 1786 * 1<<11 == 2048 seconds aka ages for the 32bit wrap around...
1787 *
1788 * returns 0 if we may process the packet,
1789 * -ERESTARTSYS if we were interrupted (by disconnect signal). */
7be8da07 1790static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_seq)
b411b363
PR
1791{
1792 DEFINE_WAIT(wait);
b411b363 1793 long timeout;
7be8da07
AG
1794 int ret;
1795
1796 if (!need_peer_seq(mdev))
1797 return 0;
1798
b411b363
PR
1799 spin_lock(&mdev->peer_seq_lock);
1800 for (;;) {
7be8da07
AG
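 /* peer_seq <= mdev->peer_seq + 1 (modulo 2^32): this is the packet
 * we have been waiting for (or an older one), so we may process it. */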
1801 if (!seq_greater(peer_seq - 1, mdev->peer_seq)) {
1802 mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq);
1803 ret = 0;
b411b363 1804 break;
7be8da07 1805 }
b411b363
PR
1806 if (signal_pending(current)) {
1807 ret = -ERESTARTSYS;
1808 break;
1809 }
7be8da07 1810 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
b411b363 1811 spin_unlock(&mdev->peer_seq_lock);
71b1c1eb
AG
1812 timeout = mdev->tconn->net_conf->ping_timeo*HZ/10;
1813 timeout = schedule_timeout(timeout);
b411b363 1814 spin_lock(&mdev->peer_seq_lock);
7be8da07 1815 if (!timeout) {
b411b363 1816 ret = -ETIMEDOUT;
71b1c1eb 1817 dev_err(DEV, "Timed out waiting for missing ack packets; disconnecting\n");
b411b363
PR
1818 break;
1819 }
1820 }
b411b363 1821 spin_unlock(&mdev->peer_seq_lock);
7be8da07 1822 finish_wait(&mdev->seq_wait, &wait);
b411b363
PR
1823 return ret;
1824}
1825
688593c5
LE
1826/* see also bio_flags_to_wire()
1827 * DRBD_REQ_*, because we need to semantically map the flags to data packet
1828 * flags and back. We may replicate to other kernel versions. */
1829static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
76d2e7ec 1830{
688593c5
LE
1831 return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
1832 (dpf & DP_FUA ? REQ_FUA : 0) |
1833 (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
1834 (dpf & DP_DISCARD ? REQ_DISCARD : 0);
76d2e7ec
PR
1835}
1836
7be8da07
AG
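/* Fail every postponed local request overlapping [sector, sector + size):
 * clear RQ_POSTPONED and complete it as NEG_ACKED.
 * Called with tconn->req_lock held; drops and re-takes it internally. */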
1837static void fail_postponed_requests(struct drbd_conf *mdev, sector_t sector,
1838 unsigned int size)
1839{
1840 struct drbd_interval *i;
1841
1842 repeat:
1843 drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
1844 struct drbd_request *req;
1845 struct bio_and_error m;
1846
1847 if (!i->local)
1848 continue;
1849 req = container_of(i, struct drbd_request, i);
1850 if (!(req->rq_state & RQ_POSTPONED))
1851 continue;
1852 req->rq_state &= ~RQ_POSTPONED;
1853 __req_mod(req, NEG_ACKED, &m);
1854 spin_unlock_irq(&mdev->tconn->req_lock);
1855 if (m.bio)
1856 complete_master_bio(mdev, &m);
1857 spin_lock_irq(&mdev->tconn->req_lock);
1858 goto repeat;
1859 }
1860}
1861
1862static int handle_write_conflicts(struct drbd_conf *mdev,
1863 struct drbd_peer_request *peer_req)
1864{
1865 struct drbd_tconn *tconn = mdev->tconn;
1866 bool resolve_conflicts = test_bit(DISCARD_CONCURRENT, &tconn->flags);
1867 sector_t sector = peer_req->i.sector;
1868 const unsigned int size = peer_req->i.size;
1869 struct drbd_interval *i;
1870 bool equal;
1871 int err;
1872
1873 /*
1874 * Inserting the peer request into the write_requests tree will prevent
1875 * new conflicting local requests from being added.
1876 */
1877 drbd_insert_interval(&mdev->write_requests, &peer_req->i);
1878
1879 repeat:
1880 drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
1881 if (i == &peer_req->i)
1882 continue;
1883
1884 if (!i->local) {
1885 /*
1886 * Our peer has sent a conflicting remote request; this
1887 * should not happen in a two-node setup. Wait for the
1888 * earlier peer request to complete.
1889 */
1890 err = drbd_wait_misc(mdev, i);
1891 if (err)
1892 goto out;
1893 goto repeat;
1894 }
1895
1896 equal = i->sector == sector && i->size == size;
1897 if (resolve_conflicts) {
1898 /*
1899 * If the peer request is fully contained within the
1900 * overlapping request, it can be discarded; otherwise,
1901 * it will be retried once all overlapping requests
1902 * have completed.
1903 */
1904 bool discard = i->sector <= sector && i->sector +
1905 (i->size >> 9) >= sector + (size >> 9);
1906
1907 if (!equal)
1908 dev_alert(DEV, "Concurrent writes detected: "
1909 "local=%llus +%u, remote=%llus +%u, "
1910 "assuming %s came first\n",
1911 (unsigned long long)i->sector, i->size,
1912 (unsigned long long)sector, size,
1913 discard ? "local" : "remote");
1914
1915 inc_unacked(mdev);
1916 peer_req->w.cb = discard ? e_send_discard_write :
1917 e_send_retry_write;
1918 list_add_tail(&peer_req->w.list, &mdev->done_ee);
1919 wake_asender(mdev->tconn);
1920
1921 err = -ENOENT;
1922 goto out;
1923 } else {
1924 struct drbd_request *req =
1925 container_of(i, struct drbd_request, i);
1926
1927 if (!equal)
1928 dev_alert(DEV, "Concurrent writes detected: "
1929 "local=%llus +%u, remote=%llus +%u\n",
1930 (unsigned long long)i->sector, i->size,
1931 (unsigned long long)sector, size);
1932
1933 if (req->rq_state & RQ_LOCAL_PENDING ||
1934 !(req->rq_state & RQ_POSTPONED)) {
1935 /*
1936 * Wait for the node with the discard flag to
1937 * decide if this request will be discarded or
1938 * retried. Requests that are discarded will
1939 * disappear from the write_requests tree.
1940 *
1941 * In addition, wait for the conflicting
1942 * request to finish locally before submitting
1943 * the conflicting peer request.
1944 */
1945 err = drbd_wait_misc(mdev, &req->i);
1946 if (err) {
1947 _conn_request_state(mdev->tconn,
1948 NS(conn, C_TIMEOUT),
1949 CS_HARD);
1950 fail_postponed_requests(mdev, sector, size);
1951 goto out;
1952 }
1953 goto repeat;
1954 }
1955 /*
1956 * Remember to restart the conflicting requests after
1957 * the new peer request has completed.
1958 */
1959 peer_req->flags |= EE_RESTART_REQUESTS;
1960 }
1961 }
1962 err = 0;
1963
1964 out:
1965 if (err)
1966 drbd_remove_epoch_entry_interval(mdev, peer_req);
1967 return err;
1968}
1969
b411b363 1970/* mirrored write */
4a76b161 1971static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 1972{
4a76b161 1973 struct drbd_conf *mdev;
b411b363 1974 sector_t sector;
db830c46 1975 struct drbd_peer_request *peer_req;
4a76b161 1976 struct p_data *p = tconn->data.rbuf;
7be8da07 1977 u32 peer_seq = be32_to_cpu(p->seq_num);
b411b363
PR
1978 int rw = WRITE;
1979 u32 dp_flags;
7be8da07 1980 int err;
b411b363 1981
4a76b161
AG
1982 mdev = vnr_to_mdev(tconn, pi->vnr);
1983 if (!mdev)
1984 return -EIO;
1985
7be8da07 1986 if (!get_ldev(mdev)) {
82bc0194
AG
1987 int err2;
1988
7be8da07 1989 err = wait_for_and_update_peer_seq(mdev, peer_seq);
e2857216 1990 drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size);
b411b363 1991 atomic_inc(&mdev->current_epoch->epoch_size);
e2857216 1992 err2 = drbd_drain_block(mdev, pi->size);
82bc0194
AG
1993 if (!err)
1994 err = err2;
1995 return err;
b411b363
PR
1996 }
1997
fcefa62e
AG
1998 /*
1999 * Corresponding put_ldev done either below (on various errors), or in
2000 * drbd_peer_request_endio, if we successfully submit the data at the
2001 * end of this function.
2002 */
b411b363
PR
2003
2004 sector = be64_to_cpu(p->sector);
e2857216 2005 peer_req = read_in_block(mdev, p->block_id, sector, pi->size);
db830c46 2006 if (!peer_req) {
b411b363 2007 put_ldev(mdev);
82bc0194 2008 return -EIO;
b411b363
PR
2009 }
2010
db830c46 2011 peer_req->w.cb = e_end_block;
b411b363 2012
688593c5
LE
2013 dp_flags = be32_to_cpu(p->dp_flags);
2014 rw |= wire_flags_to_bio(mdev, dp_flags);
2015
2016 if (dp_flags & DP_MAY_SET_IN_SYNC)
db830c46 2017 peer_req->flags |= EE_MAY_SET_IN_SYNC;
688593c5 2018
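 /* Account this request in the current write epoch; epochs are the
 * units that P_BARRIER / P_BARRIER_ACK processing operates on. */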
b411b363 2019 spin_lock(&mdev->epoch_lock);
db830c46
AG
2020 peer_req->epoch = mdev->current_epoch;
2021 atomic_inc(&peer_req->epoch->epoch_size);
2022 atomic_inc(&peer_req->epoch->active);
b411b363
PR
2023 spin_unlock(&mdev->epoch_lock);
2024
7be8da07
AG
2025 if (mdev->tconn->net_conf->two_primaries) {
2026 err = wait_for_and_update_peer_seq(mdev, peer_seq);
2027 if (err)
b411b363 2028 goto out_interrupted;
87eeee41 2029 spin_lock_irq(&mdev->tconn->req_lock);
7be8da07
AG
2030 err = handle_write_conflicts(mdev, peer_req);
2031 if (err) {
2032 spin_unlock_irq(&mdev->tconn->req_lock);
2033 if (err == -ENOENT) {
b411b363 2034 put_ldev(mdev);
82bc0194 2035 return 0;
b411b363 2036 }
7be8da07 2037 goto out_interrupted;
b411b363 2038 }
7be8da07
AG
2039 } else
2040 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 2041 list_add(&peer_req->w.list, &mdev->active_ee);
87eeee41 2042 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 2043
89e58e75 2044 switch (mdev->tconn->net_conf->wire_protocol) {
b411b363
PR
2045 case DRBD_PROT_C:
2046 inc_unacked(mdev);
2047 /* corresponding dec_unacked() in e_end_block()
2048 * respective _drbd_clear_done_ee */
2049 break;
2050 case DRBD_PROT_B:
2051 /* I really don't like it that the receiver thread
2052 * sends on the msock, but anyways */
db830c46 2053 drbd_send_ack(mdev, P_RECV_ACK, peer_req);
b411b363
PR
2054 break;
2055 case DRBD_PROT_A:
2056 /* nothing to do */
2057 break;
2058 }
2059
6719fb03 2060 if (mdev->state.pdsk < D_INCONSISTENT) {
b411b363 2061 /* In case we have the only disk of the cluster, */
db830c46
AG
2062 drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size);
2063 peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
2064 peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
181286ad 2065 drbd_al_begin_io(mdev, &peer_req->i);
b411b363
PR
2066 }
2067
82bc0194
AG
2068 err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR);
2069 if (!err)
2070 return 0;
b411b363 2071
10f6d992
LE
2072 /* don't care for the reason here */
2073 dev_err(DEV, "submit failed, triggering re-connect\n");
87eeee41 2074 spin_lock_irq(&mdev->tconn->req_lock);
db830c46
AG
2075 list_del(&peer_req->w.list);
2076 drbd_remove_epoch_entry_interval(mdev, peer_req);
87eeee41 2077 spin_unlock_irq(&mdev->tconn->req_lock);
db830c46 2078 if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
181286ad 2079 drbd_al_complete_io(mdev, &peer_req->i);
22cc37a9 2080
b411b363 2081out_interrupted:
db830c46 2082 drbd_may_finish_epoch(mdev, peer_req->epoch, EV_PUT + EV_CLEANUP);
b411b363 2083 put_ldev(mdev);
db830c46 2084 drbd_free_ee(mdev, peer_req);
82bc0194 2085 return err;
b411b363
PR
2086}
2087
0f0601f4
LE
2088/* We may throttle resync, if the lower device seems to be busy,
2089 * and current sync rate is above c_min_rate.
2090 *
2091 * To decide whether or not the lower device is busy, we use a scheme similar
 2092 * to MD RAID's is_mddev_idle(): if the partition stats reveal a "significant"
 2093 * amount (more than 64 sectors) of activity we cannot account for with our own resync
2094 * activity, it obviously is "busy".
2095 *
2096 * The current sync rate used here uses only the most recent two step marks,
2097 * to have a short time average so we can react faster.
2098 */
e3555d85 2099int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector)
0f0601f4
LE
2100{
2101 struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
2102 unsigned long db, dt, dbdt;
e3555d85 2103 struct lc_element *tmp;
0f0601f4
LE
2104 int curr_events;
2105 int throttle = 0;
2106
2107 /* feature disabled? */
f399002e 2108 if (mdev->ldev->dc.c_min_rate == 0)
0f0601f4
LE
2109 return 0;
2110
e3555d85
PR
2111 spin_lock_irq(&mdev->al_lock);
2112 tmp = lc_find(mdev->resync, BM_SECT_TO_EXT(sector));
2113 if (tmp) {
2114 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
2115 if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
2116 spin_unlock_irq(&mdev->al_lock);
2117 return 0;
2118 }
2119 /* Do not slow down if app IO is already waiting for this extent */
2120 }
2121 spin_unlock_irq(&mdev->al_lock);
2122
0f0601f4
LE
2123 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
2124 (int)part_stat_read(&disk->part0, sectors[1]) -
2125 atomic_read(&mdev->rs_sect_ev);
e3555d85 2126
0f0601f4
LE
2127 if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
2128 unsigned long rs_left;
2129 int i;
2130
2131 mdev->rs_last_events = curr_events;
2132
2133 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2134 * approx. */
2649f080
LE
2135 i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
2136
2137 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
2138 rs_left = mdev->ov_left;
2139 else
2140 rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
0f0601f4
LE
2141
2142 dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
2143 if (!dt)
2144 dt++;
2145 db = mdev->rs_mark_left[i] - rs_left;
2146 dbdt = Bit2KB(db/dt);
2147
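 /* dbdt is the short-term resync rate in KiB/s over the last two sync
 * marks; request throttling only if it exceeds the configured c_min_rate. */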
f399002e 2148 if (dbdt > mdev->ldev->dc.c_min_rate)
0f0601f4
LE
2149 throttle = 1;
2150 }
2151 return throttle;
2152}
2153
2154
4a76b161 2155static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 2156{
4a76b161 2157 struct drbd_conf *mdev;
b411b363 2158 sector_t sector;
4a76b161 2159 sector_t capacity;
db830c46 2160 struct drbd_peer_request *peer_req;
b411b363 2161 struct digest_info *di = NULL;
b18b37be 2162 int size, verb;
b411b363 2163 unsigned int fault_type;
4a76b161
AG
2164 struct p_block_req *p = tconn->data.rbuf;
2165
2166 mdev = vnr_to_mdev(tconn, pi->vnr);
2167 if (!mdev)
2168 return -EIO;
2169 capacity = drbd_get_capacity(mdev->this_bdev);
b411b363
PR
2170
2171 sector = be64_to_cpu(p->sector);
2172 size = be32_to_cpu(p->blksize);
2173
c670a398 2174 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
b411b363
PR
2175 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2176 (unsigned long long)sector, size);
82bc0194 2177 return -EINVAL;
b411b363
PR
2178 }
2179 if (sector + (size>>9) > capacity) {
2180 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2181 (unsigned long long)sector, size);
82bc0194 2182 return -EINVAL;
b411b363
PR
2183 }
2184
2185 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
b18b37be 2186 verb = 1;
e2857216 2187 switch (pi->cmd) {
b18b37be
PR
2188 case P_DATA_REQUEST:
2189 drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
2190 break;
2191 case P_RS_DATA_REQUEST:
2192 case P_CSUM_RS_REQUEST:
2193 case P_OV_REQUEST:
2194 drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
2195 break;
2196 case P_OV_REPLY:
2197 verb = 0;
2198 dec_rs_pending(mdev);
2199 drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
2200 break;
2201 default:
49ba9b1b 2202 BUG();
b18b37be
PR
2203 }
2204 if (verb && __ratelimit(&drbd_ratelimit_state))
b411b363
PR
2205 dev_err(DEV, "Can not satisfy peer's read request, "
2206 "no local data.\n");
b18b37be 2207
a821cc4a 2208 /* drain possible payload */
e2857216 2209 return drbd_drain_block(mdev, pi->size);
b411b363
PR
2210 }
2211
2212 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2213 * "criss-cross" setup, that might cause write-out on some other DRBD,
2214 * which in turn might block on the other node at this very place. */
db830c46
AG
2215 peer_req = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
2216 if (!peer_req) {
b411b363 2217 put_ldev(mdev);
82bc0194 2218 return -ENOMEM;
b411b363
PR
2219 }
2220
e2857216 2221 switch (pi->cmd) {
b411b363 2222 case P_DATA_REQUEST:
db830c46 2223 peer_req->w.cb = w_e_end_data_req;
b411b363 2224 fault_type = DRBD_FAULT_DT_RD;
80a40e43
LE
2225 /* application IO, don't drbd_rs_begin_io */
2226 goto submit;
2227
b411b363 2228 case P_RS_DATA_REQUEST:
db830c46 2229 peer_req->w.cb = w_e_end_rsdata_req;
b411b363 2230 fault_type = DRBD_FAULT_RS_RD;
5f9915bb
LE
2231 /* used in the sector offset progress display */
2232 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
b411b363
PR
2233 break;
2234
2235 case P_OV_REPLY:
2236 case P_CSUM_RS_REQUEST:
2237 fault_type = DRBD_FAULT_RS_RD;
e2857216 2238 di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
b411b363
PR
2239 if (!di)
2240 goto out_free_e;
2241
e2857216 2242 di->digest_size = pi->size;
b411b363
PR
2243 di->digest = (((char *)di)+sizeof(struct digest_info));
2244
db830c46
AG
2245 peer_req->digest = di;
2246 peer_req->flags |= EE_HAS_DIGEST;
c36c3ced 2247
e2857216 2248 if (drbd_recv_all(mdev->tconn, di->digest, pi->size))
b411b363
PR
2249 goto out_free_e;
2250
e2857216 2251 if (pi->cmd == P_CSUM_RS_REQUEST) {
31890f4a 2252 D_ASSERT(mdev->tconn->agreed_pro_version >= 89);
db830c46 2253 peer_req->w.cb = w_e_end_csum_rs_req;
5f9915bb
LE
2254 /* used in the sector offset progress display */
2255 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
e2857216 2256 } else if (pi->cmd == P_OV_REPLY) {
2649f080
LE
2257 /* track progress, we may need to throttle */
2258 atomic_add(size >> 9, &mdev->rs_sect_in);
db830c46 2259 peer_req->w.cb = w_e_end_ov_reply;
b411b363 2260 dec_rs_pending(mdev);
0f0601f4
LE
2261 /* drbd_rs_begin_io done when we sent this request,
2262 * but accounting still needs to be done. */
2263 goto submit_for_resync;
b411b363
PR
2264 }
2265 break;
2266
2267 case P_OV_REQUEST:
b411b363 2268 if (mdev->ov_start_sector == ~(sector_t)0 &&
31890f4a 2269 mdev->tconn->agreed_pro_version >= 90) {
de228bba
LE
2270 unsigned long now = jiffies;
2271 int i;
b411b363
PR
2272 mdev->ov_start_sector = sector;
2273 mdev->ov_position = sector;
30b743a2
LE
2274 mdev->ov_left = drbd_bm_bits(mdev) - BM_SECT_TO_BIT(sector);
2275 mdev->rs_total = mdev->ov_left;
de228bba
LE
2276 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2277 mdev->rs_mark_left[i] = mdev->ov_left;
2278 mdev->rs_mark_time[i] = now;
2279 }
b411b363
PR
2280 dev_info(DEV, "Online Verify start sector: %llu\n",
2281 (unsigned long long)sector);
2282 }
db830c46 2283 peer_req->w.cb = w_e_end_ov_req;
b411b363 2284 fault_type = DRBD_FAULT_RS_RD;
b411b363
PR
2285 break;
2286
b411b363 2287 default:
49ba9b1b 2288 BUG();
b411b363
PR
2289 }
2290
0f0601f4
LE
2291 /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2292 * wrt the receiver, but it is not as straightforward as it may seem.
2293 * Various places in the resync start and stop logic assume resync
2294 * requests are processed in order, requeuing this on the worker thread
2295 * introduces a bunch of new code for synchronization between threads.
2296 *
2297 * Unlimited throttling before drbd_rs_begin_io may stall the resync
2298 * "forever", throttling after drbd_rs_begin_io will lock that extent
2299 * for application writes for the same time. For now, just throttle
2300 * here, where the rest of the code expects the receiver to sleep for
2301 * a while, anyways.
2302 */
2303
2304 /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2305 * this defers syncer requests for some time, before letting at least
 2306 * one request through. The resync controller on the receiving side
2307 * will adapt to the incoming rate accordingly.
2308 *
2309 * We cannot throttle here if remote is Primary/SyncTarget:
2310 * we would also throttle its application reads.
2311 * In that case, throttling is done on the SyncTarget only.
2312 */
e3555d85
PR
2313 if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev, sector))
2314 schedule_timeout_uninterruptible(HZ/10);
2315 if (drbd_rs_begin_io(mdev, sector))
80a40e43 2316 goto out_free_e;
b411b363 2317
0f0601f4
LE
2318submit_for_resync:
2319 atomic_add(size >> 9, &mdev->rs_sect_ev);
2320
80a40e43 2321submit:
b411b363 2322 inc_unacked(mdev);
87eeee41 2323 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 2324 list_add_tail(&peer_req->w.list, &mdev->read_ee);
87eeee41 2325 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 2326
fbe29dec 2327 if (drbd_submit_peer_request(mdev, peer_req, READ, fault_type) == 0)
82bc0194 2328 return 0;
b411b363 2329
10f6d992
LE
2330 /* don't care for the reason here */
2331 dev_err(DEV, "submit failed, triggering re-connect\n");
87eeee41 2332 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 2333 list_del(&peer_req->w.list);
87eeee41 2334 spin_unlock_irq(&mdev->tconn->req_lock);
22cc37a9
LE
2335 /* no drbd_rs_complete_io(), we are dropping the connection anyways */
2336
b411b363 2337out_free_e:
b411b363 2338 put_ldev(mdev);
db830c46 2339 drbd_free_ee(mdev, peer_req);
82bc0194 2340 return -EIO;
b411b363
PR
2341}
2342
2343static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2344{
2345 int self, peer, rv = -100;
2346 unsigned long ch_self, ch_peer;
2347
2348 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2349 peer = mdev->p_uuid[UI_BITMAP] & 1;
2350
2351 ch_peer = mdev->p_uuid[UI_SIZE];
2352 ch_self = mdev->comm_bm_set;
2353
89e58e75 2354 switch (mdev->tconn->net_conf->after_sb_0p) {
b411b363
PR
2355 case ASB_CONSENSUS:
2356 case ASB_DISCARD_SECONDARY:
2357 case ASB_CALL_HELPER:
2358 dev_err(DEV, "Configuration error.\n");
2359 break;
2360 case ASB_DISCONNECT:
2361 break;
2362 case ASB_DISCARD_YOUNGER_PRI:
2363 if (self == 0 && peer == 1) {
2364 rv = -1;
2365 break;
2366 }
2367 if (self == 1 && peer == 0) {
2368 rv = 1;
2369 break;
2370 }
2371 /* Else fall through to one of the other strategies... */
2372 case ASB_DISCARD_OLDER_PRI:
2373 if (self == 0 && peer == 1) {
2374 rv = 1;
2375 break;
2376 }
2377 if (self == 1 && peer == 0) {
2378 rv = -1;
2379 break;
2380 }
2381 /* Else fall through to one of the other strategies... */
ad19bf6e 2382 dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
b411b363
PR
2383 "Using discard-least-changes instead\n");
2384 case ASB_DISCARD_ZERO_CHG:
2385 if (ch_peer == 0 && ch_self == 0) {
25703f83 2386 rv = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags)
b411b363
PR
2387 ? -1 : 1;
2388 break;
2389 } else {
2390 if (ch_peer == 0) { rv = 1; break; }
2391 if (ch_self == 0) { rv = -1; break; }
2392 }
89e58e75 2393 if (mdev->tconn->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
b411b363
PR
2394 break;
2395 case ASB_DISCARD_LEAST_CHG:
2396 if (ch_self < ch_peer)
2397 rv = -1;
2398 else if (ch_self > ch_peer)
2399 rv = 1;
2400 else /* ( ch_self == ch_peer ) */
2401 /* Well, then use something else. */
25703f83 2402 rv = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags)
b411b363
PR
2403 ? -1 : 1;
2404 break;
2405 case ASB_DISCARD_LOCAL:
2406 rv = -1;
2407 break;
2408 case ASB_DISCARD_REMOTE:
2409 rv = 1;
2410 }
2411
2412 return rv;
2413}
2414
2415static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2416{
6184ea21 2417 int hg, rv = -100;
b411b363 2418
89e58e75 2419 switch (mdev->tconn->net_conf->after_sb_1p) {
b411b363
PR
2420 case ASB_DISCARD_YOUNGER_PRI:
2421 case ASB_DISCARD_OLDER_PRI:
2422 case ASB_DISCARD_LEAST_CHG:
2423 case ASB_DISCARD_LOCAL:
2424 case ASB_DISCARD_REMOTE:
2425 dev_err(DEV, "Configuration error.\n");
2426 break;
2427 case ASB_DISCONNECT:
2428 break;
2429 case ASB_CONSENSUS:
2430 hg = drbd_asb_recover_0p(mdev);
2431 if (hg == -1 && mdev->state.role == R_SECONDARY)
2432 rv = hg;
2433 if (hg == 1 && mdev->state.role == R_PRIMARY)
2434 rv = hg;
2435 break;
2436 case ASB_VIOLENTLY:
2437 rv = drbd_asb_recover_0p(mdev);
2438 break;
2439 case ASB_DISCARD_SECONDARY:
2440 return mdev->state.role == R_PRIMARY ? 1 : -1;
2441 case ASB_CALL_HELPER:
2442 hg = drbd_asb_recover_0p(mdev);
2443 if (hg == -1 && mdev->state.role == R_PRIMARY) {
bb437946
AG
2444 enum drbd_state_rv rv2;
2445
2446 drbd_set_role(mdev, R_SECONDARY, 0);
b411b363
PR
2447 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2448 * we might be here in C_WF_REPORT_PARAMS which is transient.
2449 * we do not need to wait for the after state change work either. */
bb437946
AG
2450 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2451 if (rv2 != SS_SUCCESS) {
b411b363
PR
2452 drbd_khelper(mdev, "pri-lost-after-sb");
2453 } else {
2454 dev_warn(DEV, "Successfully gave up primary role.\n");
2455 rv = hg;
2456 }
2457 } else
2458 rv = hg;
2459 }
2460
2461 return rv;
2462}
2463
2464static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2465{
6184ea21 2466 int hg, rv = -100;
b411b363 2467
89e58e75 2468 switch (mdev->tconn->net_conf->after_sb_2p) {
b411b363
PR
2469 case ASB_DISCARD_YOUNGER_PRI:
2470 case ASB_DISCARD_OLDER_PRI:
2471 case ASB_DISCARD_LEAST_CHG:
2472 case ASB_DISCARD_LOCAL:
2473 case ASB_DISCARD_REMOTE:
2474 case ASB_CONSENSUS:
2475 case ASB_DISCARD_SECONDARY:
2476 dev_err(DEV, "Configuration error.\n");
2477 break;
2478 case ASB_VIOLENTLY:
2479 rv = drbd_asb_recover_0p(mdev);
2480 break;
2481 case ASB_DISCONNECT:
2482 break;
2483 case ASB_CALL_HELPER:
2484 hg = drbd_asb_recover_0p(mdev);
2485 if (hg == -1) {
bb437946
AG
2486 enum drbd_state_rv rv2;
2487
b411b363
PR
2488 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2489 * we might be here in C_WF_REPORT_PARAMS which is transient.
2490 * we do not need to wait for the after state change work either. */
bb437946
AG
2491 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2492 if (rv2 != SS_SUCCESS) {
b411b363
PR
2493 drbd_khelper(mdev, "pri-lost-after-sb");
2494 } else {
2495 dev_warn(DEV, "Successfully gave up primary role.\n");
2496 rv = hg;
2497 }
2498 } else
2499 rv = hg;
2500 }
2501
2502 return rv;
2503}
2504
2505static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2506 u64 bits, u64 flags)
2507{
2508 if (!uuid) {
2509 dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
2510 return;
2511 }
2512 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2513 text,
2514 (unsigned long long)uuid[UI_CURRENT],
2515 (unsigned long long)uuid[UI_BITMAP],
2516 (unsigned long long)uuid[UI_HISTORY_START],
2517 (unsigned long long)uuid[UI_HISTORY_END],
2518 (unsigned long long)bits,
2519 (unsigned long long)flags);
2520}
2521
2522/*
2523 100 after split brain try auto recover
2524 2 C_SYNC_SOURCE set BitMap
2525 1 C_SYNC_SOURCE use BitMap
2526 0 no Sync
2527 -1 C_SYNC_TARGET use BitMap
2528 -2 C_SYNC_TARGET set BitMap
2529 -100 after split brain, disconnect
2530-1000 unrelated data
4a23f264
PR
2531-1091 requires proto 91
2532-1096 requires proto 96
b411b363
PR
2533 */
2534static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2535{
2536 u64 self, peer;
2537 int i, j;
2538
2539 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2540 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2541
2542 *rule_nr = 10;
2543 if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2544 return 0;
2545
2546 *rule_nr = 20;
2547 if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2548 peer != UUID_JUST_CREATED)
2549 return -2;
2550
2551 *rule_nr = 30;
2552 if (self != UUID_JUST_CREATED &&
2553 (peer == UUID_JUST_CREATED || peer == (u64)0))
2554 return 2;
2555
2556 if (self == peer) {
2557 int rct, dc; /* roles at crash time */
2558
2559 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2560
31890f4a 2561 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2562 return -1091;
b411b363
PR
2563
2564 if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2565 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2566 dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
2567 drbd_uuid_set_bm(mdev, 0UL);
2568
2569 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2570 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2571 *rule_nr = 34;
2572 } else {
2573 dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
2574 *rule_nr = 36;
2575 }
2576
2577 return 1;
2578 }
2579
2580 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2581
31890f4a 2582 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2583 return -1091;
b411b363
PR
2584
2585 if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2586 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2587 dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2588
2589 mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
2590 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
2591 mdev->p_uuid[UI_BITMAP] = 0UL;
2592
2593 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2594 *rule_nr = 35;
2595 } else {
2596 dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
2597 *rule_nr = 37;
2598 }
2599
2600 return -1;
2601 }
2602
2603 /* Common power [off|failure] */
2604 rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
2605 (mdev->p_uuid[UI_FLAGS] & 2);
2606 /* lowest bit is set when we were primary,
2607 * next bit (weight 2) is set when peer was primary */
2608 *rule_nr = 40;
2609
2610 switch (rct) {
2611 case 0: /* !self_pri && !peer_pri */ return 0;
2612 case 1: /* self_pri && !peer_pri */ return 1;
2613 case 2: /* !self_pri && peer_pri */ return -1;
2614 case 3: /* self_pri && peer_pri */
25703f83 2615 dc = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags);
b411b363
PR
2616 return dc ? -1 : 1;
2617 }
2618 }
2619
2620 *rule_nr = 50;
2621 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2622 if (self == peer)
2623 return -1;
2624
2625 *rule_nr = 51;
2626 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2627 if (self == peer) {
31890f4a 2628 if (mdev->tconn->agreed_pro_version < 96 ?
4a23f264
PR
2629 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
2630 (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
2631 peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) {
b411b363
PR
2632 /* The last P_SYNC_UUID did not get though. Undo the last start of
2633 resync as sync source modifications of the peer's UUIDs. */
2634
31890f4a 2635 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2636 return -1091;
b411b363
PR
2637
2638 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2639 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
4a23f264
PR
2640
2641 dev_info(DEV, "Did not got last syncUUID packet, corrected:\n");
2642 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2643
b411b363
PR
2644 return -1;
2645 }
2646 }
2647
2648 *rule_nr = 60;
2649 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2650 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2651 peer = mdev->p_uuid[i] & ~((u64)1);
2652 if (self == peer)
2653 return -2;
2654 }
2655
2656 *rule_nr = 70;
2657 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2658 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2659 if (self == peer)
2660 return 1;
2661
2662 *rule_nr = 71;
2663 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2664 if (self == peer) {
31890f4a 2665 if (mdev->tconn->agreed_pro_version < 96 ?
4a23f264
PR
2666 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
2667 (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
2668 self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
b411b363
PR
 2669 /* The last P_SYNC_UUID did not get through. Undo the last start of
2670 resync as sync source modifications of our UUIDs. */
2671
31890f4a 2672 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2673 return -1091;
b411b363
PR
2674
2675 _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2676 _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
2677
4a23f264 2678 dev_info(DEV, "Last syncUUID did not get through, corrected:\n");
b411b363
PR
2679 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2680 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2681
2682 return 1;
2683 }
2684 }
2685
2686
2687 *rule_nr = 80;
d8c2a36b 2688 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
b411b363
PR
2689 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2690 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2691 if (self == peer)
2692 return 2;
2693 }
2694
2695 *rule_nr = 90;
2696 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2697 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2698 if (self == peer && self != ((u64)0))
2699 return 100;
2700
2701 *rule_nr = 100;
2702 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2703 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2704 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
2705 peer = mdev->p_uuid[j] & ~((u64)1);
2706 if (self == peer)
2707 return -100;
2708 }
2709 }
2710
2711 return -1000;
2712}
2713
2714/* drbd_sync_handshake() returns the new conn state on success, or
2715 CONN_MASK (-1) on failure.
2716 */
2717static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
2718 enum drbd_disk_state peer_disk) __must_hold(local)
2719{
2720 int hg, rule_nr;
2721 enum drbd_conns rv = C_MASK;
2722 enum drbd_disk_state mydisk;
2723
2724 mydisk = mdev->state.disk;
2725 if (mydisk == D_NEGOTIATING)
2726 mydisk = mdev->new_state_tmp.disk;
2727
2728 dev_info(DEV, "drbd_sync_handshake:\n");
2729 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
2730 drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
2731 mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2732
2733 hg = drbd_uuid_compare(mdev, &rule_nr);
2734
2735 dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
2736
2737 if (hg == -1000) {
2738 dev_alert(DEV, "Unrelated data, aborting!\n");
2739 return C_MASK;
2740 }
4a23f264
PR
2741 if (hg < -1000) {
2742 dev_alert(DEV, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
b411b363
PR
2743 return C_MASK;
2744 }
2745
2746 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
2747 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
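 /* Per the rule table above: abs(hg) == 1 means a bitmap-based sync,
 * abs(hg) == 2 means a full sync (set the whole bitmap). */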
2748 int f = (hg == -100) || abs(hg) == 2;
2749 hg = mydisk > D_INCONSISTENT ? 1 : -1;
2750 if (f)
2751 hg = hg*2;
2752 dev_info(DEV, "Becoming sync %s due to disk states.\n",
2753 hg > 0 ? "source" : "target");
2754 }
2755
3a11a487
AG
2756 if (abs(hg) == 100)
2757 drbd_khelper(mdev, "initial-split-brain");
2758
89e58e75 2759 if (hg == 100 || (hg == -100 && mdev->tconn->net_conf->always_asbp)) {
b411b363
PR
2760 int pcount = (mdev->state.role == R_PRIMARY)
2761 + (peer_role == R_PRIMARY);
2762 int forced = (hg == -100);
2763
2764 switch (pcount) {
2765 case 0:
2766 hg = drbd_asb_recover_0p(mdev);
2767 break;
2768 case 1:
2769 hg = drbd_asb_recover_1p(mdev);
2770 break;
2771 case 2:
2772 hg = drbd_asb_recover_2p(mdev);
2773 break;
2774 }
2775 if (abs(hg) < 100) {
2776 dev_warn(DEV, "Split-Brain detected, %d primaries, "
2777 "automatically solved. Sync from %s node\n",
2778 pcount, (hg < 0) ? "peer" : "this");
2779 if (forced) {
2780 dev_warn(DEV, "Doing a full sync, since"
2781 " UUIDs where ambiguous.\n");
2782 hg = hg*2;
2783 }
2784 }
2785 }
2786
2787 if (hg == -100) {
89e58e75 2788 if (mdev->tconn->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
b411b363 2789 hg = -1;
89e58e75 2790 if (!mdev->tconn->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
b411b363
PR
2791 hg = 1;
2792
2793 if (abs(hg) < 100)
2794 dev_warn(DEV, "Split-Brain detected, manually solved. "
2795 "Sync from %s node\n",
2796 (hg < 0) ? "peer" : "this");
2797 }
2798
2799 if (hg == -100) {
580b9767
LE
2800 /* FIXME this log message is not correct if we end up here
2801 * after an attempted attach on a diskless node.
2802 * We just refuse to attach -- well, we drop the "connection"
2803 * to that disk, in a way... */
3a11a487 2804 dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
b411b363
PR
2805 drbd_khelper(mdev, "split-brain");
2806 return C_MASK;
2807 }
2808
2809 if (hg > 0 && mydisk <= D_INCONSISTENT) {
2810 dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
2811 return C_MASK;
2812 }
2813
2814 if (hg < 0 && /* by intention we do not use mydisk here. */
2815 mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
89e58e75 2816 switch (mdev->tconn->net_conf->rr_conflict) {
b411b363
PR
2817 case ASB_CALL_HELPER:
2818 drbd_khelper(mdev, "pri-lost");
2819 /* fall through */
2820 case ASB_DISCONNECT:
2821 dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
2822 return C_MASK;
2823 case ASB_VIOLENTLY:
2824 dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
2825 "assumption\n");
2826 }
2827 }
2828
8169e41b 2829 if (mdev->tconn->net_conf->dry_run || test_bit(CONN_DRY_RUN, &mdev->tconn->flags)) {
cf14c2e9
PR
2830 if (hg == 0)
2831 dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
2832 else
2833 dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.",
2834 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
2835 abs(hg) >= 2 ? "full" : "bit-map based");
2836 return C_MASK;
2837 }
2838
b411b363
PR
2839 if (abs(hg) >= 2) {
2840 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
20ceb2b2
LE
2841 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
2842 BM_LOCKED_SET_ALLOWED))
b411b363
PR
2843 return C_MASK;
2844 }
2845
2846 if (hg > 0) { /* become sync source. */
2847 rv = C_WF_BITMAP_S;
2848 } else if (hg < 0) { /* become sync target */
2849 rv = C_WF_BITMAP_T;
2850 } else {
2851 rv = C_CONNECTED;
2852 if (drbd_bm_total_weight(mdev)) {
2853 dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
2854 drbd_bm_total_weight(mdev));
2855 }
2856 }
2857
2858 return rv;
2859}
2860
2861/* returns 1 if invalid */
2862static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
2863{
2864 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
2865 if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
2866 (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
2867 return 0;
2868
2869 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
2870 if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
2871 self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
2872 return 1;
2873
2874 /* everything else is valid if they are equal on both sides. */
2875 if (peer == self)
2876 return 0;
2877
 2878 /* everything else is invalid. */
2879 return 1;
2880}
2881
e2857216 2882static int receive_protocol(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 2883{
e6ef8a5c 2884 struct p_protocol *p = tconn->data.rbuf;
b411b363 2885 int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
cf14c2e9 2886 int p_want_lose, p_two_primaries, cf;
b411b363
PR
2887 char p_integrity_alg[SHARED_SECRET_MAX] = "";
2888
b411b363
PR
2889 p_proto = be32_to_cpu(p->protocol);
2890 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
2891 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
2892 p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
b411b363 2893 p_two_primaries = be32_to_cpu(p->two_primaries);
cf14c2e9
PR
2894 cf = be32_to_cpu(p->conn_flags);
2895 p_want_lose = cf & CF_WANT_LOSE;
2896
7204624c 2897 clear_bit(CONN_DRY_RUN, &tconn->flags);
cf14c2e9
PR
2898
2899 if (cf & CF_DRY_RUN)
7204624c 2900 set_bit(CONN_DRY_RUN, &tconn->flags);
b411b363 2901
7204624c
PR
2902 if (p_proto != tconn->net_conf->wire_protocol) {
2903 conn_err(tconn, "incompatible communication protocols\n");
b411b363
PR
2904 goto disconnect;
2905 }
2906
7204624c
PR
2907 if (cmp_after_sb(p_after_sb_0p, tconn->net_conf->after_sb_0p)) {
2908 conn_err(tconn, "incompatible after-sb-0pri settings\n");
b411b363
PR
2909 goto disconnect;
2910 }
2911
7204624c
PR
2912 if (cmp_after_sb(p_after_sb_1p, tconn->net_conf->after_sb_1p)) {
2913 conn_err(tconn, "incompatible after-sb-1pri settings\n");
b411b363
PR
2914 goto disconnect;
2915 }
2916
7204624c
PR
2917 if (cmp_after_sb(p_after_sb_2p, tconn->net_conf->after_sb_2p)) {
2918 conn_err(tconn, "incompatible after-sb-2pri settings\n");
b411b363
PR
2919 goto disconnect;
2920 }
2921
7204624c
PR
2922 if (p_want_lose && tconn->net_conf->want_lose) {
2923 conn_err(tconn, "both sides have the 'want_lose' flag set\n");
b411b363
PR
2924 goto disconnect;
2925 }
2926
7204624c
PR
2927 if (p_two_primaries != tconn->net_conf->two_primaries) {
2928 conn_err(tconn, "incompatible setting of the two-primaries options\n");
b411b363
PR
2929 goto disconnect;
2930 }
2931
7204624c
PR
2932 if (tconn->agreed_pro_version >= 87) {
2933 unsigned char *my_alg = tconn->net_conf->integrity_alg;
82bc0194 2934 int err;
b411b363 2935
e2857216 2936 err = drbd_recv_all(tconn, p_integrity_alg, pi->size);
82bc0194
AG
2937 if (err)
2938 return err;
b411b363
PR
2939
2940 p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
2941 if (strcmp(p_integrity_alg, my_alg)) {
7204624c 2942 conn_err(tconn, "incompatible setting of the data-integrity-alg\n");
b411b363
PR
2943 goto disconnect;
2944 }
7204624c 2945 conn_info(tconn, "data-integrity-alg: %s\n",
b411b363
PR
2946 my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
2947 }
2948
82bc0194 2949 return 0;
b411b363
PR
2950
2951disconnect:
7204624c 2952 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 2953 return -EIO;
b411b363
PR
2954}
2955
2956/* helper function
2957 * input: alg name, feature name
2958 * return: NULL (alg name was "")
2959 * ERR_PTR(error) if something goes wrong
2960 * or the crypto hash ptr, if it worked out ok. */
2961struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
2962 const char *alg, const char *name)
2963{
2964 struct crypto_hash *tfm;
2965
2966 if (!alg[0])
2967 return NULL;
2968
2969 tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
2970 if (IS_ERR(tfm)) {
2971 dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
2972 alg, name, PTR_ERR(tfm));
2973 return tfm;
2974 }
2975 if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
2976 crypto_free_hash(tfm);
2977 dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
2978 return ERR_PTR(-EINVAL);
2979 }
2980 return tfm;
2981}
2982
4a76b161
AG
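/* Read and discard the remaining pi->size bytes of payload from the data
 * socket, in DRBD_SOCKET_BUFFER_SIZE sized chunks. */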
2983static int ignore_remaining_packet(struct drbd_tconn *tconn, struct packet_info *pi)
2984{
2985 void *buffer = tconn->data.rbuf;
2986 int size = pi->size;
2987
2988 while (size) {
2989 int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
2990 s = drbd_recv(tconn, buffer, s);
2991 if (s <= 0) {
2992 if (s < 0)
2993 return s;
2994 break;
2995 }
2996 size -= s;
2997 }
2998 if (size)
2999 return -EIO;
3000 return 0;
3001}
3002
3003/*
3004 * config_unknown_volume - device configuration command for unknown volume
3005 *
3006 * When a device is added to an existing connection, the node on which the
3007 * device is added first will send configuration commands to its peer but the
3008 * peer will not know about the device yet. It will warn and ignore these
3009 * commands. Once the device is added on the second node, the second node will
3010 * send the same device configuration commands, but in the other direction.
3011 *
3012 * (We can also end up here if drbd is misconfigured.)
3013 */
3014static int config_unknown_volume(struct drbd_tconn *tconn, struct packet_info *pi)
3015{
3016 conn_warn(tconn, "Volume %u unknown; ignoring %s packet\n",
3017 pi->vnr, cmdname(pi->cmd));
3018 return ignore_remaining_packet(tconn, pi);
3019}
3020
3021static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3022{
4a76b161
AG
3023 struct drbd_conf *mdev;
3024 struct p_rs_param_95 *p = tconn->data.rbuf;
b411b363
PR
3025 unsigned int header_size, data_size, exp_max_sz;
3026 struct crypto_hash *verify_tfm = NULL;
3027 struct crypto_hash *csums_tfm = NULL;
4a76b161 3028 const int apv = tconn->agreed_pro_version;
778f271d
PR
3029 int *rs_plan_s = NULL;
3030 int fifo_size = 0;
82bc0194 3031 int err;
b411b363 3032
4a76b161
AG
3033 mdev = vnr_to_mdev(tconn, pi->vnr);
3034 if (!mdev)
3035 return config_unknown_volume(tconn, pi);
3036
b411b363
PR
3037 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
3038 : apv == 88 ? sizeof(struct p_rs_param)
3039 + SHARED_SECRET_MAX
8e26f9cc
PR
3040 : apv <= 94 ? sizeof(struct p_rs_param_89)
3041 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
b411b363 3042
e2857216 3043 if (pi->size > exp_max_sz) {
b411b363 3044 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
e2857216 3045 pi->size, exp_max_sz);
82bc0194 3046 return -EIO;
b411b363
PR
3047 }
3048
3049 if (apv <= 88) {
257d0af6 3050 header_size = sizeof(struct p_rs_param) - sizeof(struct p_header);
e2857216 3051 data_size = pi->size - header_size;
8e26f9cc 3052 } else if (apv <= 94) {
257d0af6 3053 header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header);
e2857216 3054 data_size = pi->size - header_size;
b411b363 3055 D_ASSERT(data_size == 0);
8e26f9cc 3056 } else {
257d0af6 3057 header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header);
e2857216 3058 data_size = pi->size - header_size;
b411b363
PR
3059 D_ASSERT(data_size == 0);
3060 }
3061
3062 /* initialize verify_alg and csums_alg */
3063 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
3064
82bc0194
AG
3065 err = drbd_recv_all(mdev->tconn, &p->head.payload, header_size);
3066 if (err)
3067 return err;
b411b363 3068
f399002e
LE
3069 if (get_ldev(mdev)) {
3070 mdev->ldev->dc.resync_rate = be32_to_cpu(p->rate);
3071 put_ldev(mdev);
3072 }
b411b363
PR
3073
3074 if (apv >= 88) {
3075 if (apv == 88) {
3076 if (data_size > SHARED_SECRET_MAX) {
3077 dev_err(DEV, "verify-alg too long, "
3078 "peer wants %u, accepting only %u byte\n",
3079 data_size, SHARED_SECRET_MAX);
82bc0194 3080 return -EIO;
b411b363
PR
3081 }
3082
82bc0194
AG
3083 err = drbd_recv_all(mdev->tconn, p->verify_alg, data_size);
3084 if (err)
3085 return err;
b411b363
PR
3086
3087 /* we expect NUL terminated string */
3088 /* but just in case someone tries to be evil */
3089 D_ASSERT(p->verify_alg[data_size-1] == 0);
3090 p->verify_alg[data_size-1] = 0;
3091
3092 } else /* apv >= 89 */ {
3093 /* we still expect NUL terminated strings */
3094 /* but just in case someone tries to be evil */
3095 D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
3096 D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
3097 p->verify_alg[SHARED_SECRET_MAX-1] = 0;
3098 p->csums_alg[SHARED_SECRET_MAX-1] = 0;
3099 }
3100
f399002e 3101 if (strcmp(mdev->tconn->net_conf->verify_alg, p->verify_alg)) {
b411b363
PR
3102 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
3103 dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
f399002e 3104 mdev->tconn->net_conf->verify_alg, p->verify_alg);
b411b363
PR
3105 goto disconnect;
3106 }
3107 verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
3108 p->verify_alg, "verify-alg");
3109 if (IS_ERR(verify_tfm)) {
3110 verify_tfm = NULL;
3111 goto disconnect;
3112 }
3113 }
3114
f399002e 3115 if (apv >= 89 && strcmp(mdev->tconn->net_conf->csums_alg, p->csums_alg)) {
b411b363
PR
3116 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
3117 dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
f399002e 3118 mdev->tconn->net_conf->csums_alg, p->csums_alg);
b411b363
PR
3119 goto disconnect;
3120 }
3121 csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
3122 p->csums_alg, "csums-alg");
3123 if (IS_ERR(csums_tfm)) {
3124 csums_tfm = NULL;
3125 goto disconnect;
3126 }
3127 }
3128
f399002e
LE
3129 if (apv > 94 && get_ldev(mdev)) {
3130 mdev->ldev->dc.resync_rate = be32_to_cpu(p->rate);
3131 mdev->ldev->dc.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
3132 mdev->ldev->dc.c_delay_target = be32_to_cpu(p->c_delay_target);
3133 mdev->ldev->dc.c_fill_target = be32_to_cpu(p->c_fill_target);
3134 mdev->ldev->dc.c_max_rate = be32_to_cpu(p->c_max_rate);
778f271d 3135
f399002e 3136 fifo_size = (mdev->ldev->dc.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
778f271d
PR
3137 if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
3138 rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
3139 if (!rs_plan_s) {
3140 dev_err(DEV, "kmalloc of fifo_buffer failed");
f399002e 3141 put_ldev(mdev);
778f271d
PR
3142 goto disconnect;
3143 }
3144 }
f399002e 3145 put_ldev(mdev);
8e26f9cc 3146 }
b411b363
PR
3147
3148 spin_lock(&mdev->peer_seq_lock);
3149 /* lock against drbd_nl_syncer_conf() */
3150 if (verify_tfm) {
f399002e
LE
3151 strcpy(mdev->tconn->net_conf->verify_alg, p->verify_alg);
3152 mdev->tconn->net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
3153 crypto_free_hash(mdev->tconn->verify_tfm);
3154 mdev->tconn->verify_tfm = verify_tfm;
b411b363
PR
3155 dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
3156 }
3157 if (csums_tfm) {
f399002e
LE
3158 strcpy(mdev->tconn->net_conf->csums_alg, p->csums_alg);
3159 mdev->tconn->net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
3160 crypto_free_hash(mdev->tconn->csums_tfm);
3161 mdev->tconn->csums_tfm = csums_tfm;
b411b363
PR
3162 dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
3163 }
778f271d
PR
3164 if (fifo_size != mdev->rs_plan_s.size) {
3165 kfree(mdev->rs_plan_s.values);
3166 mdev->rs_plan_s.values = rs_plan_s;
3167 mdev->rs_plan_s.size = fifo_size;
3168 mdev->rs_planed = 0;
3169 }
b411b363
PR
3170 spin_unlock(&mdev->peer_seq_lock);
3171 }
82bc0194 3172 return 0;
b411b363 3173
b411b363
PR
3174disconnect:
3175 /* just for completeness: actually not needed,
3176 * as this is not reached if csums_tfm was ok. */
3177 crypto_free_hash(csums_tfm);
3178 /* but free the verify_tfm again, if csums_tfm did not work out */
3179 crypto_free_hash(verify_tfm);
38fa9988 3180 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3181 return -EIO;
b411b363
PR
3182}
3183
b411b363
PR
3184/* warn if the arguments differ by more than 12.5% */
3185static void warn_if_differ_considerably(struct drbd_conf *mdev,
3186 const char *s, sector_t a, sector_t b)
3187{
3188 sector_t d;
3189 if (a == 0 || b == 0)
3190 return;
3191 d = (a > b) ? (a - b) : (b - a);
3192 if (d > (a>>3) || d > (b>>3))
3193 dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
3194 (unsigned long long)a, (unsigned long long)b);
3195}
3196
4a76b161 3197static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3198{
4a76b161
AG
3199 struct drbd_conf *mdev;
3200 struct p_sizes *p = tconn->data.rbuf;
b411b363 3201 enum determine_dev_size dd = unchanged;
b411b363
PR
3202 sector_t p_size, p_usize, my_usize;
3203 int ldsc = 0; /* local disk size changed */
e89b591c 3204 enum dds_flags ddsf;
b411b363 3205
4a76b161
AG
3206 mdev = vnr_to_mdev(tconn, pi->vnr);
3207 if (!mdev)
3208 return config_unknown_volume(tconn, pi);
3209
b411b363
PR
3210 p_size = be64_to_cpu(p->d_size);
3211 p_usize = be64_to_cpu(p->u_size);
3212
b411b363
PR
3213 /* just store the peer's disk size for now.
3214 * we still need to figure out whether we accept that. */
3215 mdev->p_size = p_size;
3216
b411b363
PR
3217 if (get_ldev(mdev)) {
3218 warn_if_differ_considerably(mdev, "lower level device sizes",
3219 p_size, drbd_get_max_capacity(mdev->ldev));
3220 warn_if_differ_considerably(mdev, "user requested size",
3221 p_usize, mdev->ldev->dc.disk_size);
3222
3223 /* if this is the first connect, or an otherwise expected
3224 * param exchange, choose the minimum */
3225 if (mdev->state.conn == C_WF_REPORT_PARAMS)
3226 p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
3227 p_usize);
3228
3229 my_usize = mdev->ldev->dc.disk_size;
3230
3231 if (mdev->ldev->dc.disk_size != p_usize) {
3232 mdev->ldev->dc.disk_size = p_usize;
3233 dev_info(DEV, "Peer sets u_size to %lu sectors\n",
3234 (unsigned long)mdev->ldev->dc.disk_size);
3235 }
3236
3237 /* Never shrink a device with usable data during connect.
3238 But allow online shrinking if we are connected. */
a393db6f 3239 if (drbd_new_dev_size(mdev, mdev->ldev, 0) <
b411b363
PR
3240 drbd_get_capacity(mdev->this_bdev) &&
3241 mdev->state.disk >= D_OUTDATED &&
3242 mdev->state.conn < C_CONNECTED) {
3243 dev_err(DEV, "The peer's disk size is too small!\n");
38fa9988 3244 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363
PR
3245 mdev->ldev->dc.disk_size = my_usize;
3246 put_ldev(mdev);
82bc0194 3247 return -EIO;
b411b363
PR
3248 }
3249 put_ldev(mdev);
3250 }
b411b363 3251
e89b591c 3252 ddsf = be16_to_cpu(p->dds_flags);
b411b363 3253 if (get_ldev(mdev)) {
24c4830c 3254 dd = drbd_determine_dev_size(mdev, ddsf);
b411b363
PR
3255 put_ldev(mdev);
3256 if (dd == dev_size_error)
82bc0194 3257 return -EIO;
b411b363
PR
3258 drbd_md_sync(mdev);
3259 } else {
3260 /* I am diskless, need to accept the peer's size. */
3261 drbd_set_my_capacity(mdev, p_size);
3262 }
3263
99432fcc
PR
3264 mdev->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
3265 drbd_reconsider_max_bio_size(mdev);
3266
b411b363
PR
3267 if (get_ldev(mdev)) {
3268 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
3269 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
3270 ldsc = 1;
3271 }
3272
b411b363
PR
3273 put_ldev(mdev);
3274 }
3275
3276 if (mdev->state.conn > C_WF_REPORT_PARAMS) {
3277 if (be64_to_cpu(p->c_size) !=
3278 drbd_get_capacity(mdev->this_bdev) || ldsc) {
3279 /* we have different sizes, probably peer
3280 * needs to know my new size... */
e89b591c 3281 drbd_send_sizes(mdev, 0, ddsf);
b411b363
PR
3282 }
3283 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
3284 (dd == grew && mdev->state.conn == C_CONNECTED)) {
3285 if (mdev->state.pdsk >= D_INCONSISTENT &&
e89b591c
PR
3286 mdev->state.disk >= D_INCONSISTENT) {
3287 if (ddsf & DDSF_NO_RESYNC)
3288 dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
3289 else
3290 resync_after_online_grow(mdev);
3291 } else
b411b363
PR
3292 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
3293 }
3294 }
3295
82bc0194 3296 return 0;
b411b363
PR
3297}
3298
4a76b161 3299static int receive_uuids(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3300{
4a76b161
AG
3301 struct drbd_conf *mdev;
3302 struct p_uuids *p = tconn->data.rbuf;
b411b363 3303 u64 *p_uuid;
62b0da3a 3304 int i, updated_uuids = 0;
b411b363 3305
4a76b161
AG
3306 mdev = vnr_to_mdev(tconn, pi->vnr);
3307 if (!mdev)
3308 return config_unknown_volume(tconn, pi);
3309
b411b363
PR
3310 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
3311
3312 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
3313 p_uuid[i] = be64_to_cpu(p->uuid[i]);
3314
3315 kfree(mdev->p_uuid);
3316 mdev->p_uuid = p_uuid;
3317
3318 if (mdev->state.conn < C_CONNECTED &&
3319 mdev->state.disk < D_INCONSISTENT &&
3320 mdev->state.role == R_PRIMARY &&
3321 (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
3322 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
3323 (unsigned long long)mdev->ed_uuid);
38fa9988 3324 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3325 return -EIO;
b411b363
PR
3326 }
3327
3328 if (get_ldev(mdev)) {
3329 int skip_initial_sync =
3330 mdev->state.conn == C_CONNECTED &&
31890f4a 3331 mdev->tconn->agreed_pro_version >= 90 &&
b411b363
PR
3332 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
3333 (p_uuid[UI_FLAGS] & 8);
3334 if (skip_initial_sync) {
3335 dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
3336 drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
20ceb2b2
LE
3337 "clear_n_write from receive_uuids",
3338 BM_LOCKED_TEST_ALLOWED);
b411b363
PR
3339 _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
3340 _drbd_uuid_set(mdev, UI_BITMAP, 0);
3341 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3342 CS_VERBOSE, NULL);
3343 drbd_md_sync(mdev);
62b0da3a 3344 updated_uuids = 1;
b411b363
PR
3345 }
3346 put_ldev(mdev);
18a50fa2
PR
3347 } else if (mdev->state.disk < D_INCONSISTENT &&
3348 mdev->state.role == R_PRIMARY) {
3349 /* I am a diskless primary, the peer just created a new current UUID
3350 for me. */
62b0da3a 3351 updated_uuids = drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
b411b363
PR
3352 }
3353
 3354 /* Before we test for the disk state, we should wait until a possibly
3355 ongoing cluster wide state change is finished. That is important if
3356 we are primary and are detaching from our disk. We need to see the
3357 new disk state... */
8410da8f
PR
3358 mutex_lock(mdev->state_mutex);
3359 mutex_unlock(mdev->state_mutex);
b411b363 3360 if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
62b0da3a
LE
3361 updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3362
3363 if (updated_uuids)
3364 drbd_print_uuids(mdev, "receiver updated UUIDs to");
b411b363 3365
82bc0194 3366 return 0;
b411b363
PR
3367}
3368
3369/**
3370 * convert_state() - Converts the peer's view of the cluster state to our point of view
3371 * @ps: The state as seen by the peer.
3372 */
3373static union drbd_state convert_state(union drbd_state ps)
3374{
3375 union drbd_state ms;
3376
3377 static enum drbd_conns c_tab[] = {
3378 [C_CONNECTED] = C_CONNECTED,
3379
3380 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3381 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3382 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3383 [C_VERIFY_S] = C_VERIFY_T,
3384 [C_MASK] = C_MASK,
3385 };
3386
3387 ms.i = ps.i;
3388
3389 ms.conn = c_tab[ps.conn];
3390 ms.peer = ps.role;
3391 ms.role = ps.peer;
3392 ms.pdsk = ps.disk;
3393 ms.disk = ps.pdsk;
3394 ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3395
3396 return ms;
3397}
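/*
 * For illustration (a sketch, not part of the original source): if the peer
 * reports { role = R_PRIMARY, peer = R_SECONDARY, disk = D_UP_TO_DATE,
 * pdsk = D_INCONSISTENT }, convert_state() yields our point of view
 * { role = R_SECONDARY, peer = R_PRIMARY, disk = D_INCONSISTENT,
 *   pdsk = D_UP_TO_DATE }: role/peer and disk/pdsk are swapped, while the
 * connection state is mapped through c_tab[].
 */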
3398
4a76b161 3399static int receive_req_state(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3400{
4a76b161
AG
3401 struct drbd_conf *mdev;
3402 struct p_req_state *p = tconn->data.rbuf;
b411b363 3403 union drbd_state mask, val;
bf885f8a 3404 enum drbd_state_rv rv;
b411b363 3405
4a76b161
AG
3406 mdev = vnr_to_mdev(tconn, pi->vnr);
3407 if (!mdev)
3408 return -EIO;
3409
b411b363
PR
3410 mask.i = be32_to_cpu(p->mask);
3411 val.i = be32_to_cpu(p->val);
3412
25703f83 3413 if (test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags) &&
8410da8f 3414 mutex_is_locked(mdev->state_mutex)) {
b411b363 3415 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
82bc0194 3416 return 0;
b411b363
PR
3417 }
3418
3419 mask = convert_state(mask);
3420 val = convert_state(val);
3421
dfafcc8a
PR
3422 rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
3423 drbd_send_sr_reply(mdev, rv);
b411b363 3424
b411b363
PR
3425 drbd_md_sync(mdev);
3426
82bc0194 3427 return 0;
b411b363
PR
3428}
3429
e2857216 3430static int receive_req_conn_state(struct drbd_tconn *tconn, struct packet_info *pi)
dfafcc8a 3431{
e6ef8a5c 3432 struct p_req_state *p = tconn->data.rbuf;
dfafcc8a
PR
3433 union drbd_state mask, val;
3434 enum drbd_state_rv rv;
3435
3436 mask.i = be32_to_cpu(p->mask);
3437 val.i = be32_to_cpu(p->val);
3438
3439 if (test_bit(DISCARD_CONCURRENT, &tconn->flags) &&
3440 mutex_is_locked(&tconn->cstate_mutex)) {
3441 conn_send_sr_reply(tconn, SS_CONCURRENT_ST_CHG);
82bc0194 3442 return 0;
dfafcc8a
PR
3443 }
3444
3445 mask = convert_state(mask);
3446 val = convert_state(val);
3447
778bcf2e 3448 rv = conn_request_state(tconn, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
dfafcc8a
PR
3449 conn_send_sr_reply(tconn, rv);
3450
82bc0194 3451 return 0;
dfafcc8a
PR
3452}
3453
4a76b161 3454static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3455{
4a76b161
AG
3456 struct drbd_conf *mdev;
3457 struct p_state *p = tconn->data.rbuf;
4ac4aada 3458 union drbd_state os, ns, peer_state;
b411b363 3459 enum drbd_disk_state real_peer_disk;
65d922c3 3460 enum chg_state_flags cs_flags;
b411b363
PR
3461 int rv;
3462
4a76b161
AG
3463 mdev = vnr_to_mdev(tconn, pi->vnr);
3464 if (!mdev)
3465 return config_unknown_volume(tconn, pi);
3466
b411b363
PR
3467 peer_state.i = be32_to_cpu(p->state);
3468
3469 real_peer_disk = peer_state.disk;
3470 if (peer_state.disk == D_NEGOTIATING) {
3471 real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
3472 dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
3473 }
3474
87eeee41 3475 spin_lock_irq(&mdev->tconn->req_lock);
b411b363 3476 retry:
78bae59b 3477 os = ns = drbd_read_state(mdev);
87eeee41 3478 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 3479
e9ef7bb6
LE
3480 /* peer says his disk is uptodate, while we think it is inconsistent,
3481 * and this happens while we think we have a sync going on. */
3482 if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
3483 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
3484 /* If we are (becoming) SyncSource, but peer is still in sync
3485 * preparation, ignore its uptodate-ness to avoid flapping, it
3486 * will change to inconsistent once the peer reaches active
3487 * syncing states.
3488 * It may have changed syncer-paused flags, however, so we
3489 * cannot ignore this completely. */
3490 if (peer_state.conn > C_CONNECTED &&
3491 peer_state.conn < C_SYNC_SOURCE)
3492 real_peer_disk = D_INCONSISTENT;
3493
3494 /* if peer_state changes to connected at the same time,
3495 * it explicitly notifies us that it finished resync.
3496 * Maybe we should finish it up, too? */
3497 else if (os.conn >= C_SYNC_SOURCE &&
3498 peer_state.conn == C_CONNECTED) {
3499 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
3500 drbd_resync_finished(mdev);
82bc0194 3501 return 0;
e9ef7bb6
LE
3502 }
3503 }
3504
3505 /* peer says his disk is inconsistent, while we think it is uptodate,
3506 * and this happens while the peer still thinks we have a sync going on,
3507 * but we think we are already done with the sync.
3508 * We ignore this to avoid flapping pdsk.
3509 * This should not happen, if the peer is a recent version of drbd. */
3510 if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
3511 os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
3512 real_peer_disk = D_UP_TO_DATE;
3513
4ac4aada
LE
3514 if (ns.conn == C_WF_REPORT_PARAMS)
3515 ns.conn = C_CONNECTED;
b411b363 3516
67531718
PR
3517 if (peer_state.conn == C_AHEAD)
3518 ns.conn = C_BEHIND;
3519
b411b363
PR
3520 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3521 get_ldev_if_state(mdev, D_NEGOTIATING)) {
3522 int cr; /* consider resync */
3523
3524 /* if we established a new connection */
4ac4aada 3525 cr = (os.conn < C_CONNECTED);
b411b363
PR
3526 /* if we had an established connection
3527 * and one of the nodes newly attaches a disk */
4ac4aada 3528 cr |= (os.conn == C_CONNECTED &&
b411b363 3529 (peer_state.disk == D_NEGOTIATING ||
4ac4aada 3530 os.disk == D_NEGOTIATING));
b411b363
PR
3531 /* if we have both been inconsistent, and the peer has been
3532 * forced to be UpToDate with --overwrite-data */
3533 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3534 /* if we had been plain connected, and the admin requested to
3535 * start a sync by "invalidate" or "invalidate-remote" */
4ac4aada 3536 cr |= (os.conn == C_CONNECTED &&
b411b363
PR
3537 (peer_state.conn >= C_STARTING_SYNC_S &&
3538 peer_state.conn <= C_WF_BITMAP_T));
3539
3540 if (cr)
4ac4aada 3541 ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
b411b363
PR
3542
3543 put_ldev(mdev);
4ac4aada
LE
3544 if (ns.conn == C_MASK) {
3545 ns.conn = C_CONNECTED;
b411b363 3546 if (mdev->state.disk == D_NEGOTIATING) {
82f59cc6 3547 drbd_force_state(mdev, NS(disk, D_FAILED));
b411b363
PR
3548 } else if (peer_state.disk == D_NEGOTIATING) {
3549 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3550 peer_state.disk = D_DISKLESS;
580b9767 3551 real_peer_disk = D_DISKLESS;
b411b363 3552 } else {
8169e41b 3553 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->tconn->flags))
82bc0194 3554 return -EIO;
4ac4aada 3555 D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
38fa9988 3556 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3557 return -EIO;
b411b363
PR
3558 }
3559 }
3560 }
3561
87eeee41 3562 spin_lock_irq(&mdev->tconn->req_lock);
78bae59b 3563 if (os.i != drbd_read_state(mdev).i)
b411b363
PR
3564 goto retry;
3565 clear_bit(CONSIDER_RESYNC, &mdev->flags);
b411b363
PR
3566 ns.peer = peer_state.role;
3567 ns.pdsk = real_peer_disk;
3568 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
4ac4aada 3569 if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
b411b363 3570 ns.disk = mdev->new_state_tmp.disk;
4ac4aada 3571 cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
2aebfabb 3572 if (ns.pdsk == D_CONSISTENT && drbd_suspended(mdev) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
481c6f50 3573 test_bit(NEW_CUR_UUID, &mdev->flags)) {
8554df1c 3574 /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
481c6f50 3575 for temporary network outages! */
87eeee41 3576 spin_unlock_irq(&mdev->tconn->req_lock);
481c6f50 3577 dev_err(DEV, "Aborting Connect, cannot thaw IO with a peer that is only Consistent\n");
2f5cdd0b 3578 tl_clear(mdev->tconn);
481c6f50
PR
3579 drbd_uuid_new_current(mdev);
3580 clear_bit(NEW_CUR_UUID, &mdev->flags);
38fa9988 3581 conn_request_state(mdev->tconn, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
82bc0194 3582 return -EIO;
481c6f50 3583 }
65d922c3 3584 rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
78bae59b 3585 ns = drbd_read_state(mdev);
87eeee41 3586 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
3587
3588 if (rv < SS_SUCCESS) {
38fa9988 3589 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3590 return -EIO;
b411b363
PR
3591 }
3592
4ac4aada
LE
3593 if (os.conn > C_WF_REPORT_PARAMS) {
3594 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
b411b363
PR
3595 peer_state.disk != D_NEGOTIATING ) {
3596 /* we want resync, peer has not yet decided to sync... */
3597 /* Nowadays only used when forcing a node into primary role and
3598 setting its disk to UpToDate with that */
3599 drbd_send_uuids(mdev);
3600 drbd_send_state(mdev);
3601 }
3602 }
3603
89e58e75 3604 mdev->tconn->net_conf->want_lose = 0;
b411b363
PR
3605
3606 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3607
82bc0194 3608 return 0;
b411b363
PR
3609}
3610
4a76b161 3611static int receive_sync_uuid(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3612{
4a76b161
AG
3613 struct drbd_conf *mdev;
3614 struct p_rs_uuid *p = tconn->data.rbuf;
3615
3616 mdev = vnr_to_mdev(tconn, pi->vnr);
3617 if (!mdev)
3618 return -EIO;
b411b363
PR
3619
3620 wait_event(mdev->misc_wait,
3621 mdev->state.conn == C_WF_SYNC_UUID ||
c4752ef1 3622 mdev->state.conn == C_BEHIND ||
b411b363
PR
3623 mdev->state.conn < C_CONNECTED ||
3624 mdev->state.disk < D_NEGOTIATING);
3625
3626 /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
3627
b411b363
PR
3628 /* Here the _drbd_uuid_ functions are right, current should
3629 _not_ be rotated into the history */
3630 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
3631 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
3632 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
3633
62b0da3a 3634 drbd_print_uuids(mdev, "updated sync uuid");
b411b363
PR
3635 drbd_start_resync(mdev, C_SYNC_TARGET);
3636
3637 put_ldev(mdev);
3638 } else
3639 dev_err(DEV, "Ignoring SyncUUID packet!\n");
3640
82bc0194 3641 return 0;
b411b363
PR
3642}
3643
2c46407d
AG
3644/**
3645 * receive_bitmap_plain
3646 *
3647 * Return 0 when done, 1 when another iteration is needed, and a negative error
3648 * code upon failure.
3649 */
3650static int
02918be2 3651receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
fc56815c 3652 struct p_header *h, struct bm_xfer_ctx *c)
b411b363 3653{
fc56815c 3654 unsigned long *buffer = (unsigned long *)h->payload;
b411b363
PR
3655 unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
3656 unsigned want = num_words * sizeof(long);
2c46407d 3657 int err;
b411b363 3658
02918be2
PR
3659 if (want != data_size) {
3660 dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
2c46407d 3661 return -EIO;
b411b363
PR
3662 }
3663 if (want == 0)
2c46407d 3664 return 0;
82bc0194
AG
3665 err = drbd_recv_all(mdev->tconn, buffer, want);
3666 if (err)
2c46407d 3667 return err;
b411b363
PR
3668
3669 drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
3670
3671 c->word_offset += num_words;
3672 c->bit_offset = c->word_offset * BITS_PER_LONG;
3673 if (c->bit_offset > c->bm_bits)
3674 c->bit_offset = c->bm_bits;
3675
2c46407d 3676 return 1;
b411b363
PR
3677}
3678
a02d1240
AG
3679static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
3680{
3681 return (enum drbd_bitmap_code)(p->encoding & 0x0f);
3682}
3683
3684static int dcbp_get_start(struct p_compressed_bm *p)
3685{
3686 return (p->encoding & 0x80) != 0;
3687}
3688
3689static int dcbp_get_pad_bits(struct p_compressed_bm *p)
3690{
3691 return (p->encoding >> 4) & 0x7;
3692}
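/*
 * Layout of p->encoding implied by the three helpers above (illustration
 * only): bits 0..3 carry the bitmap code, bits 4..6 the number of pad bits,
 * and bit 7 the start toggle.  For example, encoding = 0x92 decodes to
 * code 2, 1 pad bit, and a run that starts with set bits.
 */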
3693
2c46407d
AG
3694/**
3695 * recv_bm_rle_bits
3696 *
3697 * Return 0 when done, 1 when another iteration is needed, and a negative error
3698 * code upon failure.
3699 */
3700static int
b411b363
PR
3701recv_bm_rle_bits(struct drbd_conf *mdev,
3702 struct p_compressed_bm *p,
c6d25cfe
PR
3703 struct bm_xfer_ctx *c,
3704 unsigned int len)
b411b363
PR
3705{
3706 struct bitstream bs;
3707 u64 look_ahead;
3708 u64 rl;
3709 u64 tmp;
3710 unsigned long s = c->bit_offset;
3711 unsigned long e;
a02d1240 3712 int toggle = dcbp_get_start(p);
b411b363
PR
3713 int have;
3714 int bits;
3715
a02d1240 3716 bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
b411b363
PR
3717
3718 bits = bitstream_get_bits(&bs, &look_ahead, 64);
3719 if (bits < 0)
2c46407d 3720 return -EIO;
b411b363
PR
3721
3722 for (have = bits; have > 0; s += rl, toggle = !toggle) {
3723 bits = vli_decode_bits(&rl, look_ahead);
3724 if (bits <= 0)
2c46407d 3725 return -EIO;
b411b363
PR
3726
3727 if (toggle) {
3728 e = s + rl -1;
3729 if (e >= c->bm_bits) {
3730 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
2c46407d 3731 return -EIO;
b411b363
PR
3732 }
3733 _drbd_bm_set_bits(mdev, s, e);
3734 }
3735
3736 if (have < bits) {
3737 dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
3738 have, bits, look_ahead,
3739 (unsigned int)(bs.cur.b - p->code),
3740 (unsigned int)bs.buf_len);
2c46407d 3741 return -EIO;
b411b363
PR
3742 }
3743 look_ahead >>= bits;
3744 have -= bits;
3745
3746 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
3747 if (bits < 0)
2c46407d 3748 return -EIO;
b411b363
PR
3749 look_ahead |= tmp << have;
3750 have += bits;
3751 }
3752
3753 c->bit_offset = s;
3754 bm_xfer_ctx_bit_to_word_offset(c);
3755
2c46407d 3756 return (s != c->bm_bits);
b411b363
PR
3757}
3758
2c46407d
AG
3759/**
3760 * decode_bitmap_c
3761 *
3762 * Return 0 when done, 1 when another iteration is needed, and a negative error
3763 * code upon failure.
3764 */
3765static int
b411b363
PR
3766decode_bitmap_c(struct drbd_conf *mdev,
3767 struct p_compressed_bm *p,
c6d25cfe
PR
3768 struct bm_xfer_ctx *c,
3769 unsigned int len)
b411b363 3770{
a02d1240 3771 if (dcbp_get_code(p) == RLE_VLI_Bits)
c6d25cfe 3772 return recv_bm_rle_bits(mdev, p, c, len);
b411b363
PR
3773
3774 /* other variants had been implemented for evaluation,
3775 * but have been dropped as this one turned out to be "best"
3776 * during all our tests. */
3777
3778 dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
38fa9988 3779 conn_request_state(mdev->tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
2c46407d 3780 return -EIO;
b411b363
PR
3781}
3782
3783void INFO_bm_xfer_stats(struct drbd_conf *mdev,
3784 const char *direction, struct bm_xfer_ctx *c)
3785{
3786 /* what would it take to transfer it "plaintext" */
c012949a 3787 unsigned plain = sizeof(struct p_header) *
b411b363
PR
3788 ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
3789 + c->bm_words * sizeof(long);
3790 unsigned total = c->bytes[0] + c->bytes[1];
3791 unsigned r;
3792
3793 /* total can not be zero. but just in case: */
3794 if (total == 0)
3795 return;
3796
3797 /* don't report if not compressed */
3798 if (total >= plain)
3799 return;
3800
3801 /* total < plain. check for overflow, still */
3802 r = (total > UINT_MAX/1000) ? (total / (plain/1000))
3803 : (1000 * total / plain);
3804
3805 if (r > 1000)
3806 r = 1000;
3807
3808 r = 1000 - r;
3809 dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
3810 "total %u; compression: %u.%u%%\n",
3811 direction,
3812 c->bytes[1], c->packets[1],
3813 c->bytes[0], c->packets[0],
3814 total, r/10, r % 10);
3815}
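/*
 * Worked example of the per-mille calculation above (illustrative numbers
 * only): with plain = 10000 bytes and total = 250 bytes,
 *     r = 1000 * 250 / 10000 = 25,  then r = 1000 - 25 = 975,
 * and the log line reports "compression: 97.5%".
 */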
3816
3817/* Since we are processing the bitfield from lower addresses to higher,
 3818 it does not matter if we process it in 32 bit chunks or 64 bit
3819 chunks as long as it is little endian. (Understand it as byte stream,
 3820 beginning with the lowest byte...) If we used big endian
3821 we would need to process it from the highest address to the lowest,
3822 in order to be agnostic to the 32 vs 64 bits issue.
3823
 3824 Returns 0 on success, or a negative error code on failure. */
4a76b161 3825static int receive_bitmap(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3826{
4a76b161 3827 struct drbd_conf *mdev;
b411b363 3828 struct bm_xfer_ctx c;
2c46407d 3829 int err;
4a76b161
AG
3830 struct p_header *h = tconn->data.rbuf;
3831
3832 mdev = vnr_to_mdev(tconn, pi->vnr);
3833 if (!mdev)
3834 return -EIO;
b411b363 3835
20ceb2b2
LE
3836 drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED);
3837 /* you are supposed to send additional out-of-sync information
3838 * if you actually set bits during this phase */
b411b363 3839
b411b363
PR
3840 c = (struct bm_xfer_ctx) {
3841 .bm_bits = drbd_bm_bits(mdev),
3842 .bm_words = drbd_bm_words(mdev),
3843 };
3844
2c46407d 3845 for(;;) {
e2857216
AG
3846 if (pi->cmd == P_BITMAP) {
3847 err = receive_bitmap_plain(mdev, pi->size, h, &c);
3848 } else if (pi->cmd == P_COMPRESSED_BITMAP) {
b411b363
PR
3849 /* MAYBE: sanity check that we speak proto >= 90,
3850 * and the feature is enabled! */
3851 struct p_compressed_bm *p;
3852
e2857216 3853 if (pi->size > BM_PACKET_PAYLOAD_BYTES) {
b411b363 3854 dev_err(DEV, "ReportCBitmap packet too large\n");
82bc0194 3855 err = -EIO;
b411b363
PR
3856 goto out;
3857 }
fc56815c
AG
3858
3859 p = mdev->tconn->data.rbuf;
e2857216 3860 err = drbd_recv_all(mdev->tconn, p->head.payload, pi->size);
82bc0194
AG
3861 if (err)
3862 goto out;
e2857216
AG
3863 if (pi->size <= (sizeof(*p) - sizeof(p->head))) {
3864 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", pi->size);
82bc0194 3865 err = -EIO;
78fcbdae 3866 goto out;
b411b363 3867 }
e2857216 3868 err = decode_bitmap_c(mdev, p, &c, pi->size);
b411b363 3869 } else {
e2857216 3870 dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
82bc0194 3871 err = -EIO;
b411b363
PR
3872 goto out;
3873 }
3874
e2857216
AG
3875 c.packets[pi->cmd == P_BITMAP]++;
3876 c.bytes[pi->cmd == P_BITMAP] += sizeof(struct p_header) + pi->size;
b411b363 3877
2c46407d
AG
3878 if (err <= 0) {
3879 if (err < 0)
3880 goto out;
b411b363 3881 break;
2c46407d 3882 }
e2857216 3883 err = drbd_recv_header(mdev->tconn, pi);
82bc0194 3884 if (err)
b411b363 3885 goto out;
2c46407d 3886 }
b411b363
PR
3887
3888 INFO_bm_xfer_stats(mdev, "receive", &c);
3889
3890 if (mdev->state.conn == C_WF_BITMAP_T) {
de1f8e4a
AG
3891 enum drbd_state_rv rv;
3892
82bc0194
AG
3893 err = drbd_send_bitmap(mdev);
3894 if (err)
b411b363
PR
3895 goto out;
3896 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
de1f8e4a
AG
3897 rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
3898 D_ASSERT(rv == SS_SUCCESS);
b411b363
PR
3899 } else if (mdev->state.conn != C_WF_BITMAP_S) {
3900 /* admin may have requested C_DISCONNECTING,
3901 * other threads may have noticed network errors */
3902 dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
3903 drbd_conn_str(mdev->state.conn));
3904 }
82bc0194 3905 err = 0;
b411b363 3906
b411b363 3907 out:
20ceb2b2 3908 drbd_bm_unlock(mdev);
82bc0194 3909 if (!err && mdev->state.conn == C_WF_BITMAP_S)
b411b363 3910 drbd_start_resync(mdev, C_SYNC_SOURCE);
82bc0194 3911 return err;
b411b363
PR
3912}
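/*
 * Illustration of the endianness note above receive_bitmap(): on a
 * little-endian host the same byte stream yields the same bit positions
 * whether it is merged in 32-bit or 64-bit words.  A standalone userspace
 * sketch of that claim (example only, never built with the driver):
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int bit32(const uint32_t *w, unsigned int i) { return (w[i / 32] >> (i % 32)) & 1; }
static int bit64(const uint64_t *w, unsigned int i) { return (w[i / 64] >> (i % 64)) & 1; }

int main(void)
{
	unsigned char stream[16] = {	/* an arbitrary chunk of a bitmap transfer */
		0xa5, 0x00, 0xff, 0x10, 0x03, 0x80, 0x7e, 0x01,
		0x00, 0xc4, 0x12, 0x9f, 0x55, 0xaa, 0x0f, 0xf0,
	};
	uint32_t w32[4];
	uint64_t w64[2];
	unsigned int i;

	memcpy(w32, stream, sizeof(stream));
	memcpy(w64, stream, sizeof(stream));

	for (i = 0; i < 128; i++)	/* never triggers on a little-endian host */
		if (bit32(w32, i) != bit64(w64, i))
			printf("bit %u differs (big-endian host?)\n", i);
	return 0;
}
#endif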
3913
4a76b161 3914static int receive_skip(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3915{
4a76b161 3916 conn_warn(tconn, "skipping unknown optional packet type %d, l: %d!\n",
e2857216 3917 pi->cmd, pi->size);
2de876ef 3918
4a76b161 3919 return ignore_remaining_packet(tconn, pi);
2de876ef
PR
3920}
3921
4a76b161 3922static int receive_UnplugRemote(struct drbd_tconn *tconn, struct packet_info *pi)
0ced55a3 3923{
e7f52dfb
LE
3924 /* Make sure we've acked all the TCP data associated
3925 * with the data requests being unplugged */
4a76b161 3926 drbd_tcp_quickack(tconn->data.socket);
0ced55a3 3927
82bc0194 3928 return 0;
0ced55a3
PR
3929}
3930
4a76b161 3931static int receive_out_of_sync(struct drbd_tconn *tconn, struct packet_info *pi)
73a01a18 3932{
4a76b161
AG
3933 struct drbd_conf *mdev;
3934 struct p_block_desc *p = tconn->data.rbuf;
3935
3936 mdev = vnr_to_mdev(tconn, pi->vnr);
3937 if (!mdev)
3938 return -EIO;
73a01a18 3939
f735e363
LE
3940 switch (mdev->state.conn) {
3941 case C_WF_SYNC_UUID:
3942 case C_WF_BITMAP_T:
3943 case C_BEHIND:
3944 break;
3945 default:
3946 dev_err(DEV, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
3947 drbd_conn_str(mdev->state.conn));
3948 }
3949
73a01a18
PR
3950 drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
3951
82bc0194 3952 return 0;
73a01a18
PR
3953}
3954
02918be2
PR
3955struct data_cmd {
3956 int expect_payload;
3957 size_t pkt_size;
4a76b161 3958 int (*fn)(struct drbd_tconn *, struct packet_info *);
02918be2
PR
3959};
3960
3961static struct data_cmd drbd_cmd_handler[] = {
4a76b161
AG
3962 [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
3963 [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
3964 [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
3965 [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
3966 [P_BITMAP] = { 1, sizeof(struct p_header), receive_bitmap } ,
3967 [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header), receive_bitmap } ,
3968 [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header), receive_UnplugRemote },
3969 [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3970 [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3971 [P_SYNC_PARAM] = { 1, sizeof(struct p_header), receive_SyncParam },
3972 [P_SYNC_PARAM89] = { 1, sizeof(struct p_header), receive_SyncParam },
3973 [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
3974 [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
3975 [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
3976 [P_STATE] = { 0, sizeof(struct p_state), receive_state },
3977 [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
3978 [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
3979 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3980 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
3981 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
3982 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
3983 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
3984 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
b411b363
PR
3985};
3986
eefc2f7d 3987static void drbdd(struct drbd_tconn *tconn)
b411b363 3988{
e6ef8a5c 3989 struct p_header *header = tconn->data.rbuf;
77351055 3990 struct packet_info pi;
02918be2 3991 size_t shs; /* sub header size */
82bc0194 3992 int err;
b411b363 3993
eefc2f7d 3994 while (get_t_state(&tconn->receiver) == RUNNING) {
deebe195
AG
3995 struct data_cmd *cmd;
3996
eefc2f7d 3997 drbd_thread_current_set_cpu(&tconn->receiver);
69bc7bc3 3998 if (drbd_recv_header(tconn, &pi))
02918be2 3999 goto err_out;
b411b363 4000
deebe195 4001 cmd = &drbd_cmd_handler[pi.cmd];
4a76b161 4002 if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
eefc2f7d 4003 conn_err(tconn, "unknown packet type %d, l: %d!\n", pi.cmd, pi.size);
02918be2 4004 goto err_out;
0b33a916 4005 }
b411b363 4006
deebe195
AG
4007 shs = cmd->pkt_size - sizeof(struct p_header);
4008 if (pi.size - shs > 0 && !cmd->expect_payload) {
eefc2f7d 4009 conn_err(tconn, "No payload expected %s l:%d\n", cmdname(pi.cmd), pi.size);
02918be2 4010 goto err_out;
b411b363 4011 }
b411b363 4012
c13f7e1a 4013 if (shs) {
a5c31904
AG
4014 err = drbd_recv_all_warn(tconn, &header->payload, shs);
4015 if (err)
c13f7e1a 4016 goto err_out;
e2857216 4017 pi.size -= shs;
c13f7e1a
LE
4018 }
4019
4a76b161
AG
4020 err = cmd->fn(tconn, &pi);
4021 if (err) {
eefc2f7d 4022 conn_err(tconn, "error receiving %s, l: %d!\n",
77351055 4023 cmdname(pi.cmd), pi.size);
02918be2 4024 goto err_out;
b411b363
PR
4025 }
4026 }
82bc0194 4027 return;
b411b363 4028
82bc0194
AG
4029 err_out:
4030 conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
b411b363
PR
4031}
4032
0e29d163 4033void conn_flush_workqueue(struct drbd_tconn *tconn)
b411b363
PR
4034{
4035 struct drbd_wq_barrier barr;
4036
4037 barr.w.cb = w_prev_work_done;
0e29d163 4038 barr.w.tconn = tconn;
b411b363 4039 init_completion(&barr.done);
0e29d163 4040 drbd_queue_work(&tconn->data.work, &barr.w);
b411b363
PR
4041 wait_for_completion(&barr.done);
4042}
4043
360cc740 4044static void drbd_disconnect(struct drbd_tconn *tconn)
b411b363 4045{
bbeb641c 4046 enum drbd_conns oc;
b411b363 4047 int rv = SS_UNKNOWN_ERROR;
b411b363 4048
bbeb641c 4049 if (tconn->cstate == C_STANDALONE)
b411b363 4050 return;
b411b363
PR
4051
4052 /* asender does not clean up anything. it must not interfere, either */
360cc740
PR
4053 drbd_thread_stop(&tconn->asender);
4054 drbd_free_sock(tconn);
4055
4056 idr_for_each(&tconn->volumes, drbd_disconnected, tconn);
360cc740
PR
4057 conn_info(tconn, "Connection closed\n");
4058
cb703454
PR
4059 if (conn_highest_role(tconn) == R_PRIMARY && conn_highest_pdsk(tconn) >= D_UNKNOWN)
4060 conn_try_outdate_peer_async(tconn);
4061
360cc740 4062 spin_lock_irq(&tconn->req_lock);
bbeb641c
PR
4063 oc = tconn->cstate;
4064 if (oc >= C_UNCONNECTED)
4065 rv = _conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE);
4066
360cc740
PR
4067 spin_unlock_irq(&tconn->req_lock);
4068
bbeb641c 4069 if (oc == C_DISCONNECTING) {
360cc740
PR
4070 wait_event(tconn->net_cnt_wait, atomic_read(&tconn->net_cnt) == 0);
4071
4072 crypto_free_hash(tconn->cram_hmac_tfm);
4073 tconn->cram_hmac_tfm = NULL;
4074
4075 kfree(tconn->net_conf);
4076 tconn->net_conf = NULL;
bbeb641c 4077 conn_request_state(tconn, NS(conn, C_STANDALONE), CS_VERBOSE);
360cc740
PR
4078 }
4079}
4080
4081static int drbd_disconnected(int vnr, void *p, void *data)
4082{
4083 struct drbd_conf *mdev = (struct drbd_conf *)p;
4084 enum drbd_fencing_p fp;
4085 unsigned int i;
b411b363 4086
85719573 4087 /* wait for current activity to cease. */
87eeee41 4088 spin_lock_irq(&mdev->tconn->req_lock);
b411b363
PR
4089 _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
4090 _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
4091 _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
87eeee41 4092 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
4093
4094 /* We do not have data structures that would allow us to
4095 * get the rs_pending_cnt down to 0 again.
4096 * * On C_SYNC_TARGET we do not have any data structures describing
4097 * the pending RSDataRequest's we have sent.
4098 * * On C_SYNC_SOURCE there is no data structure that tracks
4099 * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
4100 * And no, it is not the sum of the reference counts in the
4101 * resync_LRU. The resync_LRU tracks the whole operation including
4102 * the disk-IO, while the rs_pending_cnt only tracks the blocks
4103 * on the fly. */
4104 drbd_rs_cancel_all(mdev);
4105 mdev->rs_total = 0;
4106 mdev->rs_failed = 0;
4107 atomic_set(&mdev->rs_pending_cnt, 0);
4108 wake_up(&mdev->misc_wait);
4109
7fde2be9
PR
4110 del_timer(&mdev->request_timer);
4111
b411b363 4112 del_timer_sync(&mdev->resync_timer);
b411b363
PR
4113 resync_timer_fn((unsigned long)mdev);
4114
b411b363
PR
4115 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
4116 * w_make_resync_request etc. which may still be on the worker queue
4117 * to be "canceled" */
a21e9298 4118 drbd_flush_workqueue(mdev);
b411b363
PR
4119
4120 /* This also does reclaim_net_ee(). If we do this too early, we might
4121 * miss some resync ee and pages.*/
4122 drbd_process_done_ee(mdev);
4123
4124 kfree(mdev->p_uuid);
4125 mdev->p_uuid = NULL;
4126
2aebfabb 4127 if (!drbd_suspended(mdev))
2f5cdd0b 4128 tl_clear(mdev->tconn);
b411b363 4129
b411b363
PR
4130 drbd_md_sync(mdev);
4131
4132 fp = FP_DONT_CARE;
4133 if (get_ldev(mdev)) {
4134 fp = mdev->ldev->dc.fencing;
4135 put_ldev(mdev);
4136 }
4137
20ceb2b2
LE
4138 /* serialize with bitmap writeout triggered by the state change,
4139 * if any. */
4140 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
4141
b411b363
PR
4142 /* tcp_close and release of sendpage pages can be deferred. I don't
4143 * want to use SO_LINGER, because apparently it can be deferred for
4144 * more than 20 seconds (longest time I checked).
4145 *
4146 * Actually we don't care for exactly when the network stack does its
4147 * put_page(), but release our reference on these pages right here.
4148 */
4149 i = drbd_release_ee(mdev, &mdev->net_ee);
4150 if (i)
4151 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
435f0740
LE
4152 i = atomic_read(&mdev->pp_in_use_by_net);
4153 if (i)
4154 dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
b411b363
PR
4155 i = atomic_read(&mdev->pp_in_use);
4156 if (i)
45bb912b 4157 dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
b411b363
PR
4158
4159 D_ASSERT(list_empty(&mdev->read_ee));
4160 D_ASSERT(list_empty(&mdev->active_ee));
4161 D_ASSERT(list_empty(&mdev->sync_ee));
4162 D_ASSERT(list_empty(&mdev->done_ee));
4163
4164 /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
4165 atomic_set(&mdev->current_epoch->epoch_size, 0);
4166 D_ASSERT(list_empty(&mdev->current_epoch->list));
360cc740
PR
4167
4168 return 0;
b411b363
PR
4169}
4170
4171/*
4172 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
4173 * we can agree on is stored in agreed_pro_version.
4174 *
4175 * feature flags and the reserved array should be enough room for future
4176 * enhancements of the handshake protocol, and possible plugins...
4177 *
4178 * for now, they are expected to be zero, but ignored.
4179 */
6038178e 4180static int drbd_send_features(struct drbd_tconn *tconn)
b411b363 4181{
e6b3ea83 4182 /* ASSERT current == mdev->tconn->receiver ... */
6038178e 4183 struct p_connection_features *p = tconn->data.sbuf;
e8d17b01 4184 int err;
b411b363 4185
8a22cccc
PR
4186 if (mutex_lock_interruptible(&tconn->data.mutex)) {
4187 conn_err(tconn, "interrupted during initial handshake\n");
e8d17b01 4188 return -EINTR;
b411b363
PR
4189 }
4190
8a22cccc
PR
4191 if (tconn->data.socket == NULL) {
4192 mutex_unlock(&tconn->data.mutex);
e8d17b01 4193 return -EIO;
b411b363
PR
4194 }
4195
4196 memset(p, 0, sizeof(*p));
4197 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
4198 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
6038178e 4199 err = _conn_send_cmd(tconn, 0, &tconn->data, P_CONNECTION_FEATURES,
ecf2363c 4200 &p->head, sizeof(*p), 0);
8a22cccc 4201 mutex_unlock(&tconn->data.mutex);
e8d17b01 4202 return err;
b411b363
PR
4203}
4204
4205/*
4206 * return values:
4207 * 1 yes, we have a valid connection
4208 * 0 oops, did not work out, please try again
4209 * -1 peer talks different language,
4210 * no point in trying again, please go standalone.
4211 */
6038178e 4212static int drbd_do_features(struct drbd_tconn *tconn)
b411b363 4213{
65d11ed6 4214 /* ASSERT current == tconn->receiver ... */
6038178e
AG
4215 struct p_connection_features *p = tconn->data.rbuf;
4216 const int expect = sizeof(struct p_connection_features) - sizeof(struct p_header80);
77351055 4217 struct packet_info pi;
a5c31904 4218 int err;
b411b363 4219
6038178e 4220 err = drbd_send_features(tconn);
e8d17b01 4221 if (err)
b411b363
PR
4222 return 0;
4223
69bc7bc3
AG
4224 err = drbd_recv_header(tconn, &pi);
4225 if (err)
b411b363
PR
4226 return 0;
4227
6038178e
AG
4228 if (pi.cmd != P_CONNECTION_FEATURES) {
4229 conn_err(tconn, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
77351055 4230 cmdname(pi.cmd), pi.cmd);
b411b363
PR
4231 return -1;
4232 }
4233
77351055 4234 if (pi.size != expect) {
6038178e 4235 conn_err(tconn, "expected ConnectionFeatures length: %u, received: %u\n",
77351055 4236 expect, pi.size);
b411b363
PR
4237 return -1;
4238 }
4239
a5c31904
AG
4240 err = drbd_recv_all_warn(tconn, &p->head.payload, expect);
4241 if (err)
b411b363 4242 return 0;
b411b363 4243
b411b363
PR
4244 p->protocol_min = be32_to_cpu(p->protocol_min);
4245 p->protocol_max = be32_to_cpu(p->protocol_max);
4246 if (p->protocol_max == 0)
4247 p->protocol_max = p->protocol_min;
4248
4249 if (PRO_VERSION_MAX < p->protocol_min ||
4250 PRO_VERSION_MIN > p->protocol_max)
4251 goto incompat;
4252
65d11ed6 4253 tconn->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
b411b363 4254
65d11ed6
PR
4255 conn_info(tconn, "Handshake successful: "
4256 "Agreed network protocol version %d\n", tconn->agreed_pro_version);
b411b363
PR
4257
4258 return 1;
4259
4260 incompat:
65d11ed6 4261 conn_err(tconn, "incompatible DRBD dialects: "
b411b363
PR
4262 "I support %d-%d, peer supports %d-%d\n",
4263 PRO_VERSION_MIN, PRO_VERSION_MAX,
4264 p->protocol_min, p->protocol_max);
4265 return -1;
4266}
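/*
 * Example of the version agreement above (hypothetical numbers, not the
 * actual PRO_VERSION_MIN/PRO_VERSION_MAX constants): if we support protocol
 * versions 86..96 and the peer announces protocol_min = 90, protocol_max =
 * 100, the ranges overlap and agreed_pro_version = min(96, 100) = 96.  Had
 * the peer announced 97..100, PRO_VERSION_MAX < protocol_min would hold and
 * we would take the "incompat" branch instead.
 */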
4267
4268#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
13e6037d 4269static int drbd_do_auth(struct drbd_tconn *tconn)
b411b363
PR
4270{
4271 dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
4272 dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
b10d96cb 4273 return -1;
b411b363
PR
4274}
4275#else
4276#define CHALLENGE_LEN 64
b10d96cb
JT
4277
4278/* Return value:
4279 1 - auth succeeded,
4280 0 - failed, try again (network error),
4281 -1 - auth failed, don't try again.
4282*/
4283
13e6037d 4284static int drbd_do_auth(struct drbd_tconn *tconn)
b411b363
PR
4285{
4286 char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
4287 struct scatterlist sg;
4288 char *response = NULL;
4289 char *right_response = NULL;
4290 char *peers_ch = NULL;
13e6037d 4291 unsigned int key_len = strlen(tconn->net_conf->shared_secret);
b411b363
PR
4292 unsigned int resp_size;
4293 struct hash_desc desc;
77351055 4294 struct packet_info pi;
69bc7bc3 4295 int err, rv;
b411b363 4296
13e6037d 4297 desc.tfm = tconn->cram_hmac_tfm;
b411b363
PR
4298 desc.flags = 0;
4299
13e6037d
PR
4300 rv = crypto_hash_setkey(tconn->cram_hmac_tfm,
4301 (u8 *)tconn->net_conf->shared_secret, key_len);
b411b363 4302 if (rv) {
13e6037d 4303 conn_err(tconn, "crypto_hash_setkey() failed with %d\n", rv);
b10d96cb 4304 rv = -1;
b411b363
PR
4305 goto fail;
4306 }
4307
4308 get_random_bytes(my_challenge, CHALLENGE_LEN);
4309
ce9879cb 4310 rv = !conn_send_cmd2(tconn, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
b411b363
PR
4311 if (!rv)
4312 goto fail;
4313
69bc7bc3
AG
4314 err = drbd_recv_header(tconn, &pi);
4315 if (err) {
4316 rv = 0;
b411b363 4317 goto fail;
69bc7bc3 4318 }
b411b363 4319
77351055 4320 if (pi.cmd != P_AUTH_CHALLENGE) {
13e6037d 4321 conn_err(tconn, "expected AuthChallenge packet, received: %s (0x%04x)\n",
77351055 4322 cmdname(pi.cmd), pi.cmd);
b411b363
PR
4323 rv = 0;
4324 goto fail;
4325 }
4326
77351055 4327 if (pi.size > CHALLENGE_LEN * 2) {
13e6037d 4328 conn_err(tconn, "expected AuthChallenge payload too big.\n");
b10d96cb 4329 rv = -1;
b411b363
PR
4330 goto fail;
4331 }
4332
77351055 4333 peers_ch = kmalloc(pi.size, GFP_NOIO);
b411b363 4334 if (peers_ch == NULL) {
13e6037d 4335 conn_err(tconn, "kmalloc of peers_ch failed\n");
b10d96cb 4336 rv = -1;
b411b363
PR
4337 goto fail;
4338 }
4339
a5c31904
AG
4340 err = drbd_recv_all_warn(tconn, peers_ch, pi.size);
4341 if (err) {
b411b363
PR
4342 rv = 0;
4343 goto fail;
4344 }
4345
13e6037d 4346 resp_size = crypto_hash_digestsize(tconn->cram_hmac_tfm);
b411b363
PR
4347 response = kmalloc(resp_size, GFP_NOIO);
4348 if (response == NULL) {
13e6037d 4349 conn_err(tconn, "kmalloc of response failed\n");
b10d96cb 4350 rv = -1;
b411b363
PR
4351 goto fail;
4352 }
4353
4354 sg_init_table(&sg, 1);
77351055 4355 sg_set_buf(&sg, peers_ch, pi.size);
b411b363
PR
4356
4357 rv = crypto_hash_digest(&desc, &sg, sg.length, response);
4358 if (rv) {
13e6037d 4359 conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv);
b10d96cb 4360 rv = -1;
b411b363
PR
4361 goto fail;
4362 }
4363
ce9879cb 4364 rv = !conn_send_cmd2(tconn, P_AUTH_RESPONSE, response, resp_size);
b411b363
PR
4365 if (!rv)
4366 goto fail;
4367
69bc7bc3
AG
4368 err = drbd_recv_header(tconn, &pi);
4369 if (err) {
4370 rv = 0;
b411b363 4371 goto fail;
69bc7bc3 4372 }
b411b363 4373
77351055 4374 if (pi.cmd != P_AUTH_RESPONSE) {
13e6037d 4375 conn_err(tconn, "expected AuthResponse packet, received: %s (0x%04x)\n",
77351055 4376 cmdname(pi.cmd), pi.cmd);
b411b363
PR
4377 rv = 0;
4378 goto fail;
4379 }
4380
77351055 4381 if (pi.size != resp_size) {
13e6037d 4382 conn_err(tconn, "expected AuthResponse payload of wrong size\n");
b411b363
PR
4383 rv = 0;
4384 goto fail;
4385 }
4386
a5c31904
AG
4387 err = drbd_recv_all_warn(tconn, response , resp_size);
4388 if (err) {
b411b363
PR
4389 rv = 0;
4390 goto fail;
4391 }
4392
4393 right_response = kmalloc(resp_size, GFP_NOIO);
2d1ee87d 4394 if (right_response == NULL) {
13e6037d 4395 conn_err(tconn, "kmalloc of right_response failed\n");
b10d96cb 4396 rv = -1;
b411b363
PR
4397 goto fail;
4398 }
4399
4400 sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
4401
4402 rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
4403 if (rv) {
13e6037d 4404 conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv);
b10d96cb 4405 rv = -1;
b411b363
PR
4406 goto fail;
4407 }
4408
4409 rv = !memcmp(response, right_response, resp_size);
4410
4411 if (rv)
13e6037d
PR
4412 conn_info(tconn, "Peer authenticated using %d bytes of '%s' HMAC\n",
4413 resp_size, tconn->net_conf->cram_hmac_alg);
b10d96cb
JT
4414 else
4415 rv = -1;
b411b363
PR
4416
4417 fail:
4418 kfree(peers_ch);
4419 kfree(response);
4420 kfree(right_response);
4421
4422 return rv;
4423}
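/*
 * Summary of the exchange implemented above (descriptive only): both sides
 * send a random challenge (P_AUTH_CHALLENGE), answer the peer's challenge
 * with P_AUTH_RESPONSE carrying HMAC(shared_secret, peer's challenge), and
 * then recompute the HMAC over their own challenge to verify the response
 * they received.  Only a matching response yields rv = 1.
 */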
4424#endif
4425
4426int drbdd_init(struct drbd_thread *thi)
4427{
392c8801 4428 struct drbd_tconn *tconn = thi->tconn;
b411b363
PR
4429 int h;
4430
4d641dd7 4431 conn_info(tconn, "receiver (re)started\n");
b411b363
PR
4432
4433 do {
4d641dd7 4434 h = drbd_connect(tconn);
b411b363 4435 if (h == 0) {
4d641dd7 4436 drbd_disconnect(tconn);
20ee6390 4437 schedule_timeout_interruptible(HZ);
b411b363
PR
4438 }
4439 if (h == -1) {
4d641dd7 4440 conn_warn(tconn, "Discarding network configuration.\n");
bbeb641c 4441 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363
PR
4442 }
4443 } while (h == 0);
4444
4445 if (h > 0) {
4d641dd7
PR
4446 if (get_net_conf(tconn)) {
4447 drbdd(tconn);
4448 put_net_conf(tconn);
b411b363
PR
4449 }
4450 }
4451
4d641dd7 4452 drbd_disconnect(tconn);
b411b363 4453
4d641dd7 4454 conn_info(tconn, "receiver terminated\n");
b411b363
PR
4455 return 0;
4456}
4457
4458/* ********* acknowledge sender ******** */
4459
e05e1e59 4460static int got_conn_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi)
e4f78ede 4461{
e6ef8a5c 4462 struct p_req_state_reply *p = tconn->meta.rbuf;
e4f78ede
PR
4463 int retcode = be32_to_cpu(p->retcode);
4464
4465 if (retcode >= SS_SUCCESS) {
4466 set_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags);
4467 } else {
4468 set_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags);
4469 conn_err(tconn, "Requested state change failed by peer: %s (%d)\n",
4470 drbd_set_st_err_str(retcode), retcode);
4471 }
4472 wake_up(&tconn->ping_wait);
4473
4474 return true;
4475}
4476
1952e916 4477static int got_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4478{
1952e916
AG
4479 struct drbd_conf *mdev;
4480 struct p_req_state_reply *p = tconn->meta.rbuf;
b411b363
PR
4481 int retcode = be32_to_cpu(p->retcode);
4482
1952e916
AG
4483 mdev = vnr_to_mdev(tconn, pi->vnr);
4484 if (!mdev)
4485 return false;
4486
e4f78ede
PR
4487 if (retcode >= SS_SUCCESS) {
4488 set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
4489 } else {
4490 set_bit(CL_ST_CHG_FAIL, &mdev->flags);
4491 dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
4492 drbd_set_st_err_str(retcode), retcode);
b411b363 4493 }
e4f78ede
PR
4494 wake_up(&mdev->state_wait);
4495
81e84650 4496 return true;
b411b363
PR
4497}
4498
e05e1e59 4499static int got_Ping(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4500{
a17647aa 4501 return !drbd_send_ping_ack(tconn);
b411b363
PR
4502
4503}
4504
e05e1e59 4505static int got_PingAck(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363
PR
4506{
4507 /* restore idle timeout */
2a67d8b9
PR
4508 tconn->meta.socket->sk->sk_rcvtimeo = tconn->net_conf->ping_int*HZ;
4509 if (!test_and_set_bit(GOT_PING_ACK, &tconn->flags))
4510 wake_up(&tconn->ping_wait);
b411b363 4511
81e84650 4512 return true;
b411b363
PR
4513}
4514
1952e916 4515static int got_IsInSync(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4516{
1952e916
AG
4517 struct drbd_conf *mdev;
4518 struct p_block_ack *p = tconn->meta.rbuf;
b411b363
PR
4519 sector_t sector = be64_to_cpu(p->sector);
4520 int blksize = be32_to_cpu(p->blksize);
4521
1952e916
AG
4522 mdev = vnr_to_mdev(tconn, pi->vnr);
4523 if (!mdev)
4524 return false;
4525
31890f4a 4526 D_ASSERT(mdev->tconn->agreed_pro_version >= 89);
b411b363
PR
4527
4528 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4529
1d53f09e
LE
4530 if (get_ldev(mdev)) {
4531 drbd_rs_complete_io(mdev, sector);
4532 drbd_set_in_sync(mdev, sector, blksize);
4533 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4534 mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4535 put_ldev(mdev);
4536 }
b411b363 4537 dec_rs_pending(mdev);
778f271d 4538 atomic_add(blksize >> 9, &mdev->rs_sect_in);
b411b363 4539
81e84650 4540 return true;
b411b363
PR
4541}
4542
bc9c5c41
AG
4543static int
4544validate_req_change_req_state(struct drbd_conf *mdev, u64 id, sector_t sector,
4545 struct rb_root *root, const char *func,
4546 enum drbd_req_event what, bool missing_ok)
b411b363
PR
4547{
4548 struct drbd_request *req;
4549 struct bio_and_error m;
4550
87eeee41 4551 spin_lock_irq(&mdev->tconn->req_lock);
bc9c5c41 4552 req = find_request(mdev, root, id, sector, missing_ok, func);
b411b363 4553 if (unlikely(!req)) {
87eeee41 4554 spin_unlock_irq(&mdev->tconn->req_lock);
81e84650 4555 return false;
b411b363
PR
4556 }
4557 __req_mod(req, what, &m);
87eeee41 4558 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
4559
4560 if (m.bio)
4561 complete_master_bio(mdev, &m);
81e84650 4562 return true;
b411b363
PR
4563}
4564
1952e916 4565static int got_BlockAck(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4566{
1952e916
AG
4567 struct drbd_conf *mdev;
4568 struct p_block_ack *p = tconn->meta.rbuf;
b411b363
PR
4569 sector_t sector = be64_to_cpu(p->sector);
4570 int blksize = be32_to_cpu(p->blksize);
4571 enum drbd_req_event what;
4572
1952e916
AG
4573 mdev = vnr_to_mdev(tconn, pi->vnr);
4574 if (!mdev)
4575 return false;
4576
b411b363
PR
4577 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4578
579b57ed 4579 if (p->block_id == ID_SYNCER) {
b411b363
PR
4580 drbd_set_in_sync(mdev, sector, blksize);
4581 dec_rs_pending(mdev);
81e84650 4582 return true;
b411b363 4583 }
e05e1e59 4584 switch (pi->cmd) {
b411b363 4585 case P_RS_WRITE_ACK:
89e58e75 4586 D_ASSERT(mdev->tconn->net_conf->wire_protocol == DRBD_PROT_C);
8554df1c 4587 what = WRITE_ACKED_BY_PEER_AND_SIS;
b411b363
PR
4588 break;
4589 case P_WRITE_ACK:
89e58e75 4590 D_ASSERT(mdev->tconn->net_conf->wire_protocol == DRBD_PROT_C);
8554df1c 4591 what = WRITE_ACKED_BY_PEER;
b411b363
PR
4592 break;
4593 case P_RECV_ACK:
89e58e75 4594 D_ASSERT(mdev->tconn->net_conf->wire_protocol == DRBD_PROT_B);
8554df1c 4595 what = RECV_ACKED_BY_PEER;
b411b363 4596 break;
7be8da07 4597 case P_DISCARD_WRITE:
89e58e75 4598 D_ASSERT(mdev->tconn->net_conf->wire_protocol == DRBD_PROT_C);
7be8da07
AG
4599 what = DISCARD_WRITE;
4600 break;
4601 case P_RETRY_WRITE:
4602 D_ASSERT(mdev->tconn->net_conf->wire_protocol == DRBD_PROT_C);
4603 what = POSTPONE_WRITE;
b411b363
PR
4604 break;
4605 default:
4606 D_ASSERT(0);
81e84650 4607 return false;
b411b363
PR
4608 }
4609
4610 return validate_req_change_req_state(mdev, p->block_id, sector,
bc9c5c41
AG
4611 &mdev->write_requests, __func__,
4612 what, false);
b411b363
PR
4613}
4614
1952e916 4615static int got_NegAck(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4616{
1952e916
AG
4617 struct drbd_conf *mdev;
4618 struct p_block_ack *p = tconn->meta.rbuf;
b411b363 4619 sector_t sector = be64_to_cpu(p->sector);
2deb8336 4620 int size = be32_to_cpu(p->blksize);
1952e916
AG
4621 bool missing_ok = tconn->net_conf->wire_protocol == DRBD_PROT_A ||
4622 tconn->net_conf->wire_protocol == DRBD_PROT_B;
c3afd8f5 4623 bool found;
b411b363 4624
1952e916
AG
4625 mdev = vnr_to_mdev(tconn, pi->vnr);
4626 if (!mdev)
4627 return false;
4628
b411b363
PR
4629 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4630
579b57ed 4631 if (p->block_id == ID_SYNCER) {
b411b363
PR
4632 dec_rs_pending(mdev);
4633 drbd_rs_failed_io(mdev, sector, size);
81e84650 4634 return true;
b411b363 4635 }
2deb8336 4636
c3afd8f5 4637 found = validate_req_change_req_state(mdev, p->block_id, sector,
bc9c5c41 4638 &mdev->write_requests, __func__,
8554df1c 4639 NEG_ACKED, missing_ok);
c3afd8f5
AG
4640 if (!found) {
4641 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
4642 The master bio might already be completed, therefore the
4643 request is no longer in the collision hash. */
4644 /* In Protocol B we might already have got a P_RECV_ACK
4645 but then get a P_NEG_ACK afterwards. */
4646 if (!missing_ok)
2deb8336 4647 return false;
c3afd8f5 4648 drbd_set_out_of_sync(mdev, sector, size);
2deb8336 4649 }
2deb8336 4650 return true;
b411b363
PR
4651}
4652
1952e916 4653static int got_NegDReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4654{
1952e916
AG
4655 struct drbd_conf *mdev;
4656 struct p_block_ack *p = tconn->meta.rbuf;
b411b363
PR
4657 sector_t sector = be64_to_cpu(p->sector);
4658
1952e916
AG
4659 mdev = vnr_to_mdev(tconn, pi->vnr);
4660 if (!mdev)
4661 return false;
4662
b411b363 4663 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
7be8da07 4664
b411b363
PR
4665 dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
4666 (unsigned long long)sector, be32_to_cpu(p->blksize));
4667
4668 return validate_req_change_req_state(mdev, p->block_id, sector,
bc9c5c41 4669 &mdev->read_requests, __func__,
8554df1c 4670 NEG_ACKED, false);
b411b363
PR
4671}
4672
1952e916 4673static int got_NegRSDReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4674{
1952e916 4675 struct drbd_conf *mdev;
b411b363
PR
4676 sector_t sector;
4677 int size;
1952e916
AG
4678 struct p_block_ack *p = tconn->meta.rbuf;
4679
4680 mdev = vnr_to_mdev(tconn, pi->vnr);
4681 if (!mdev)
4682 return false;
b411b363
PR
4683
4684 sector = be64_to_cpu(p->sector);
4685 size = be32_to_cpu(p->blksize);
b411b363
PR
4686
4687 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4688
4689 dec_rs_pending(mdev);
4690
4691 if (get_ldev_if_state(mdev, D_FAILED)) {
4692 drbd_rs_complete_io(mdev, sector);
e05e1e59 4693 switch (pi->cmd) {
d612d309
PR
4694 case P_NEG_RS_DREPLY:
4695 drbd_rs_failed_io(mdev, sector, size);
4696 case P_RS_CANCEL:
4697 break;
4698 default:
4699 D_ASSERT(0);
4700 put_ldev(mdev);
4701 return false;
4702 }
b411b363
PR
4703 put_ldev(mdev);
4704 }
4705
81e84650 4706 return true;
b411b363
PR
4707}
4708
1952e916 4709static int got_BarrierAck(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4710{
1952e916
AG
4711 struct drbd_conf *mdev;
4712 struct p_barrier_ack *p = tconn->meta.rbuf;
4713
4714 mdev = vnr_to_mdev(tconn, pi->vnr);
4715 if (!mdev)
4716 return false;
b411b363 4717
2f5cdd0b 4718 tl_release(mdev->tconn, p->barrier, be32_to_cpu(p->set_size));
b411b363 4719
c4752ef1
PR
4720 if (mdev->state.conn == C_AHEAD &&
4721 atomic_read(&mdev->ap_in_flight) == 0 &&
370a43e7
PR
4722 !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) {
4723 mdev->start_resync_timer.expires = jiffies + HZ;
4724 add_timer(&mdev->start_resync_timer);
c4752ef1
PR
4725 }
4726
81e84650 4727 return true;
b411b363
PR
4728}
4729
1952e916 4730static int got_OVResult(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4731{
1952e916
AG
4732 struct drbd_conf *mdev;
4733 struct p_block_ack *p = tconn->meta.rbuf;
b411b363
PR
4734 struct drbd_work *w;
4735 sector_t sector;
4736 int size;
4737
1952e916
AG
4738 mdev = vnr_to_mdev(tconn, pi->vnr);
4739 if (!mdev)
4740 return false;
4741
b411b363
PR
4742 sector = be64_to_cpu(p->sector);
4743 size = be32_to_cpu(p->blksize);
4744
4745 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4746
4747 if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
8f7bed77 4748 drbd_ov_out_of_sync_found(mdev, sector, size);
b411b363 4749 else
8f7bed77 4750 ov_out_of_sync_print(mdev);
b411b363 4751
1d53f09e 4752 if (!get_ldev(mdev))
81e84650 4753 return true;
1d53f09e 4754
b411b363
PR
4755 drbd_rs_complete_io(mdev, sector);
4756 dec_rs_pending(mdev);
4757
ea5442af
LE
4758 --mdev->ov_left;
4759
4760 /* let's advance progress step marks only for every other megabyte */
4761 if ((mdev->ov_left & 0x200) == 0x200)
4762 drbd_advance_rs_marks(mdev, mdev->ov_left);
4763
4764 if (mdev->ov_left == 0) {
b411b363
PR
4765 w = kmalloc(sizeof(*w), GFP_NOIO);
4766 if (w) {
4767 w->cb = w_ov_finished;
a21e9298 4768 w->mdev = mdev;
e42325a5 4769 drbd_queue_work_front(&mdev->tconn->data.work, w);
b411b363
PR
4770 } else {
4771 dev_err(DEV, "kmalloc(w) failed.");
8f7bed77 4772 ov_out_of_sync_print(mdev);
b411b363
PR
4773 drbd_resync_finished(mdev);
4774 }
4775 }
1d53f09e 4776 put_ldev(mdev);
81e84650 4777 return true;
b411b363
PR
4778}
4779
1952e916 4780static int got_skip(struct drbd_tconn *tconn, struct packet_info *pi)
0ced55a3 4781{
81e84650 4782 return true;
0ced55a3
PR
4783}
4784
32862ec7
PR
4785static int tconn_process_done_ee(struct drbd_tconn *tconn)
4786{
082a3439
PR
4787 struct drbd_conf *mdev;
4788 int i, not_empty = 0;
32862ec7
PR
4789
4790 do {
4791 clear_bit(SIGNAL_ASENDER, &tconn->flags);
4792 flush_signals(current);
082a3439 4793 idr_for_each_entry(&tconn->volumes, mdev, i) {
e2b3032b 4794 if (drbd_process_done_ee(mdev))
082a3439
PR
4795 return 1; /* error */
4796 }
32862ec7 4797 set_bit(SIGNAL_ASENDER, &tconn->flags);
082a3439
PR
4798
4799 spin_lock_irq(&tconn->req_lock);
4800 idr_for_each_entry(&tconn->volumes, mdev, i) {
4801 not_empty = !list_empty(&mdev->done_ee);
4802 if (not_empty)
4803 break;
4804 }
4805 spin_unlock_irq(&tconn->req_lock);
32862ec7
PR
4806 } while (not_empty);
4807
4808 return 0;
4809}
4810
7201b972
AG
4811struct asender_cmd {
4812 size_t pkt_size;
1952e916 4813 int (*fn)(struct drbd_tconn *tconn, struct packet_info *);
7201b972
AG
4814};
4815
4816static struct asender_cmd asender_tbl[] = {
1952e916
AG
4817 [P_PING] = { sizeof(struct p_header), got_Ping },
4818 [P_PING_ACK] = { sizeof(struct p_header), got_PingAck },
4819 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4820 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4821 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4822 [P_DISCARD_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
4823 [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
4824 [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
4825 [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply },
4826 [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
4827 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
4828 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4829 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
4830 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
4831 [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply },
4832 [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
4833 [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
7201b972
AG
4834};
4835
b411b363
PR
4836int drbd_asender(struct drbd_thread *thi)
4837{
392c8801 4838 struct drbd_tconn *tconn = thi->tconn;
e6ef8a5c 4839 struct p_header *h = tconn->meta.rbuf;
b411b363 4840 struct asender_cmd *cmd = NULL;
77351055 4841 struct packet_info pi;
257d0af6 4842 int rv;
b411b363
PR
4843 void *buf = h;
4844 int received = 0;
257d0af6 4845 int expect = sizeof(struct p_header);
f36af18c 4846 int ping_timeout_active = 0;
b411b363 4847
b411b363
PR
4848 current->policy = SCHED_RR; /* Make this a realtime task! */
4849 current->rt_priority = 2; /* more important than all other tasks */
4850
e77a0a5c 4851 while (get_t_state(thi) == RUNNING) {
80822284 4852 drbd_thread_current_set_cpu(thi);
32862ec7 4853 if (test_and_clear_bit(SEND_PING, &tconn->flags)) {
a17647aa 4854 if (drbd_send_ping(tconn)) {
32862ec7 4855 conn_err(tconn, "drbd_send_ping has failed\n");
841ce241
AG
4856 goto reconnect;
4857 }
32862ec7
PR
4858 tconn->meta.socket->sk->sk_rcvtimeo =
4859 tconn->net_conf->ping_timeo*HZ/10;
f36af18c 4860 ping_timeout_active = 1;
b411b363
PR
4861 }
4862
32862ec7
PR
4863 /* TODO: conditionally cork; it may hurt latency if we cork without
4864 much to send */
4865 if (!tconn->net_conf->no_cork)
4866 drbd_tcp_cork(tconn->meta.socket);
082a3439
PR
4867 if (tconn_process_done_ee(tconn)) {
4868 conn_err(tconn, "tconn_process_done_ee() failed\n");
32862ec7 4869 goto reconnect;
082a3439 4870 }
b411b363 4871 /* but unconditionally uncork unless disabled */
32862ec7
PR
4872 if (!tconn->net_conf->no_cork)
4873 drbd_tcp_uncork(tconn->meta.socket);
b411b363
PR
4874
4875 /* short circuit, recv_msg would return EINTR anyways. */
4876 if (signal_pending(current))
4877 continue;
4878
32862ec7
PR
4879 rv = drbd_recv_short(tconn->meta.socket, buf, expect-received, 0);
4880 clear_bit(SIGNAL_ASENDER, &tconn->flags);
b411b363
PR
4881
4882 flush_signals(current);
4883
4884 /* Note:
4885 * -EINTR (on meta) we got a signal
4886 * -EAGAIN (on meta) rcvtimeo expired
4887 * -ECONNRESET other side closed the connection
4888 * -ERESTARTSYS (on data) we got a signal
4889 * rv < 0 other than above: unexpected error!
4890 * rv == expected: full header or command
4891 * rv < expected: "woken" by signal during receive
4892 * rv == 0 : "connection shut down by peer"
4893 */
4894 if (likely(rv > 0)) {
4895 received += rv;
4896 buf += rv;
4897 } else if (rv == 0) {
32862ec7 4898 conn_err(tconn, "meta connection shut down by peer.\n");
b411b363
PR
4899 goto reconnect;
4900 } else if (rv == -EAGAIN) {
cb6518cb
LE
4901 /* If the data socket received something meanwhile,
4902 * that is good enough: peer is still alive. */
32862ec7
PR
4903 if (time_after(tconn->last_received,
4904 jiffies - tconn->meta.socket->sk->sk_rcvtimeo))
cb6518cb 4905 continue;
f36af18c 4906 if (ping_timeout_active) {
32862ec7 4907 conn_err(tconn, "PingAck did not arrive in time.\n");
b411b363
PR
4908 goto reconnect;
4909 }
32862ec7 4910 set_bit(SEND_PING, &tconn->flags);
b411b363
PR
4911 continue;
4912 } else if (rv == -EINTR) {
4913 continue;
4914 } else {
32862ec7 4915 conn_err(tconn, "sock_recvmsg returned %d\n", rv);
b411b363
PR
4916 goto reconnect;
4917 }
4918
4919 if (received == expect && cmd == NULL) {
8172f3e9 4920 if (decode_header(tconn, h, &pi))
b411b363 4921 goto reconnect;
7201b972 4922 cmd = &asender_tbl[pi.cmd];
1952e916 4923 if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) {
32862ec7 4924 conn_err(tconn, "unknown command %d on meta (l: %d)\n",
77351055 4925 pi.cmd, pi.size);
b411b363
PR
4926 goto disconnect;
4927 }
4928 expect = cmd->pkt_size;
77351055 4929 if (pi.size != expect - sizeof(struct p_header)) {
32862ec7 4930 conn_err(tconn, "Wrong packet size on meta (c: %d, l: %d)\n",
77351055 4931 pi.cmd, pi.size);
b411b363 4932 goto reconnect;
257d0af6 4933 }
b411b363
PR
4934 }
4935 if (received == expect) {
a4fbda8e
PR
4936 bool rv;
4937
1952e916
AG
4938 rv = cmd->fn(tconn, &pi);
4939 if (!rv) {
4940 conn_err(tconn, "%pf failed\n", cmd->fn);
b411b363 4941 goto reconnect;
1952e916 4942 }
b411b363 4943
a4fbda8e
PR
4944 tconn->last_received = jiffies;
4945
f36af18c
LE
4946 /* the idle_timeout (ping-int)
4947 * has been restored in got_PingAck() */
7201b972 4948 if (cmd == &asender_tbl[P_PING_ACK])
f36af18c
LE
4949 ping_timeout_active = 0;
4950
b411b363
PR
4951 buf = h;
4952 received = 0;
257d0af6 4953 expect = sizeof(struct p_header);
b411b363
PR
4954 cmd = NULL;
4955 }
4956 }
4957
4958 if (0) {
4959reconnect:
bbeb641c 4960 conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
b411b363
PR
4961 }
4962 if (0) {
4963disconnect:
bbeb641c 4964 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363 4965 }
32862ec7 4966 clear_bit(SIGNAL_ASENDER, &tconn->flags);
b411b363 4967
32862ec7 4968 conn_info(tconn, "asender terminated\n");
b411b363
PR
4969
4970 return 0;
4971}