drbd: introduce stop-sector to online verify
[deliverable/linux.git] / drivers / block / drbd / drbd_receiver.c
CommitLineData
b411b363
PR
1/*
2 drbd_receiver.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25
b411b363
PR
26#include <linux/module.h>
27
28#include <asm/uaccess.h>
29#include <net/sock.h>
30
b411b363
PR
31#include <linux/drbd.h>
32#include <linux/fs.h>
33#include <linux/file.h>
34#include <linux/in.h>
35#include <linux/mm.h>
36#include <linux/memcontrol.h>
37#include <linux/mm_inline.h>
38#include <linux/slab.h>
b411b363
PR
39#include <linux/pkt_sched.h>
40#define __KERNEL_SYSCALLS__
41#include <linux/unistd.h>
42#include <linux/vmalloc.h>
43#include <linux/random.h>
b411b363
PR
44#include <linux/string.h>
45#include <linux/scatterlist.h>
46#include "drbd_int.h"
b411b363
PR
47#include "drbd_req.h"
48
49#include "drbd_vli.h"
50
77351055
PR
51struct packet_info {
52 enum drbd_packet cmd;
e2857216
AG
53 unsigned int size;
54 unsigned int vnr;
e658983a 55 void *data;
77351055
PR
56};
57
b411b363
PR
58enum finish_epoch {
59 FE_STILL_LIVE,
60 FE_DESTROYED,
61 FE_RECYCLED,
62};
63
6038178e 64static int drbd_do_features(struct drbd_tconn *tconn);
13e6037d 65static int drbd_do_auth(struct drbd_tconn *tconn);
c141ebda 66static int drbd_disconnected(struct drbd_conf *mdev);
b411b363 67
1e9dd291 68static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *, struct drbd_epoch *, enum epoch_event);
99920dc5 69static int e_end_block(struct drbd_work *, int);
b411b363 70
b411b363
PR
71
72#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
73
45bb912b
LE
74/*
75 * some helper functions to deal with single linked page lists,
76 * page->private being our "next" pointer.
77 */
78
79/* If at least n pages are linked at head, get n pages off.
80 * Otherwise, don't modify head, and return NULL.
81 * Locking is the responsibility of the caller.
82 */
83static struct page *page_chain_del(struct page **head, int n)
84{
85 struct page *page;
86 struct page *tmp;
87
88 BUG_ON(!n);
89 BUG_ON(!head);
90
91 page = *head;
23ce4227
PR
92
93 if (!page)
94 return NULL;
95
45bb912b
LE
96 while (page) {
97 tmp = page_chain_next(page);
98 if (--n == 0)
99 break; /* found sufficient pages */
100 if (tmp == NULL)
101 /* insufficient pages, don't use any of them. */
102 return NULL;
103 page = tmp;
104 }
105
106 /* add end of list marker for the returned list */
107 set_page_private(page, 0);
108 /* actual return value, and adjustment of head */
109 page = *head;
110 *head = tmp;
111 return page;
112}
113
114/* may be used outside of locks to find the tail of a (usually short)
115 * "private" page chain, before adding it back to a global chain head
116 * with page_chain_add() under a spinlock. */
117static struct page *page_chain_tail(struct page *page, int *len)
118{
119 struct page *tmp;
120 int i = 1;
121 while ((tmp = page_chain_next(page)))
122 ++i, page = tmp;
123 if (len)
124 *len = i;
125 return page;
126}
127
128static int page_chain_free(struct page *page)
129{
130 struct page *tmp;
131 int i = 0;
132 page_chain_for_each_safe(page, tmp) {
133 put_page(page);
134 ++i;
135 }
136 return i;
137}
138
139static void page_chain_add(struct page **head,
140 struct page *chain_first, struct page *chain_last)
141{
142#if 1
143 struct page *tmp;
144 tmp = page_chain_tail(chain_first, NULL);
145 BUG_ON(tmp != chain_last);
146#endif
147
148 /* add chain to head */
149 set_page_private(chain_last, (unsigned long)*head);
150 *head = chain_first;
151}
152
18c2d522
AG
153static struct page *__drbd_alloc_pages(struct drbd_conf *mdev,
154 unsigned int number)
b411b363
PR
155{
156 struct page *page = NULL;
45bb912b 157 struct page *tmp = NULL;
18c2d522 158 unsigned int i = 0;
b411b363
PR
159
160 /* Yes, testing drbd_pp_vacant outside the lock is racy.
161 * So what. It saves a spin_lock. */
45bb912b 162 if (drbd_pp_vacant >= number) {
b411b363 163 spin_lock(&drbd_pp_lock);
45bb912b
LE
164 page = page_chain_del(&drbd_pp_pool, number);
165 if (page)
166 drbd_pp_vacant -= number;
b411b363 167 spin_unlock(&drbd_pp_lock);
45bb912b
LE
168 if (page)
169 return page;
b411b363 170 }
45bb912b 171
b411b363
PR
172 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
173 * "criss-cross" setup, that might cause write-out on some other DRBD,
174 * which in turn might block on the other node at this very place. */
45bb912b
LE
175 for (i = 0; i < number; i++) {
176 tmp = alloc_page(GFP_TRY);
177 if (!tmp)
178 break;
179 set_page_private(tmp, (unsigned long)page);
180 page = tmp;
181 }
182
183 if (i == number)
184 return page;
185
186 /* Not enough pages immediately available this time.
c37c8ecf 187 * No need to jump around here, drbd_alloc_pages will retry this
45bb912b
LE
188 * function "soon". */
189 if (page) {
190 tmp = page_chain_tail(page, NULL);
191 spin_lock(&drbd_pp_lock);
192 page_chain_add(&drbd_pp_pool, page, tmp);
193 drbd_pp_vacant += i;
194 spin_unlock(&drbd_pp_lock);
195 }
196 return NULL;
b411b363
PR
197}
198
a990be46
AG
199static void reclaim_finished_net_peer_reqs(struct drbd_conf *mdev,
200 struct list_head *to_be_freed)
b411b363 201{
db830c46 202 struct drbd_peer_request *peer_req;
b411b363
PR
203 struct list_head *le, *tle;
204
205 /* The EEs are always appended to the end of the list. Since
206 they are sent in order over the wire, they have to finish
207 in order. As soon as we see the first not finished we can
208 stop to examine the list... */
209
210 list_for_each_safe(le, tle, &mdev->net_ee) {
db830c46 211 peer_req = list_entry(le, struct drbd_peer_request, w.list);
045417f7 212 if (drbd_peer_req_has_active_page(peer_req))
b411b363
PR
213 break;
214 list_move(le, to_be_freed);
215 }
216}
217
218static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
219{
220 LIST_HEAD(reclaimed);
db830c46 221 struct drbd_peer_request *peer_req, *t;
b411b363 222
87eeee41 223 spin_lock_irq(&mdev->tconn->req_lock);
a990be46 224 reclaim_finished_net_peer_reqs(mdev, &reclaimed);
87eeee41 225 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 226
db830c46 227 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
3967deb1 228 drbd_free_net_peer_req(mdev, peer_req);
b411b363
PR
229}
230
231/**
c37c8ecf 232 * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
b411b363 233 * @mdev: DRBD device.
45bb912b
LE
234 * @number: number of pages requested
235 * @retry: whether to retry, if not enough pages are available right now
236 *
237 * Tries to allocate number pages, first from our own page pool, then from
238 * the kernel, unless this allocation would exceed the max_buffers setting.
239 * Possibly retry until DRBD frees sufficient pages somewhere else.
b411b363 240 *
45bb912b 241 * Returns a page chain linked via page->private.
b411b363 242 */
c37c8ecf
AG
243struct page *drbd_alloc_pages(struct drbd_conf *mdev, unsigned int number,
244 bool retry)
b411b363
PR
245{
246 struct page *page = NULL;
44ed167d 247 struct net_conf *nc;
b411b363 248 DEFINE_WAIT(wait);
44ed167d 249 int mxb;
b411b363 250
45bb912b
LE
251 /* Yes, we may run up to @number over max_buffers. If we
252 * follow it strictly, the admin will get it wrong anyways. */
44ed167d
PR
253 rcu_read_lock();
254 nc = rcu_dereference(mdev->tconn->net_conf);
255 mxb = nc ? nc->max_buffers : 1000000;
256 rcu_read_unlock();
257
258 if (atomic_read(&mdev->pp_in_use) < mxb)
18c2d522 259 page = __drbd_alloc_pages(mdev, number);
b411b363 260
45bb912b 261 while (page == NULL) {
b411b363
PR
262 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
263
264 drbd_kick_lo_and_reclaim_net(mdev);
265
44ed167d 266 if (atomic_read(&mdev->pp_in_use) < mxb) {
18c2d522 267 page = __drbd_alloc_pages(mdev, number);
b411b363
PR
268 if (page)
269 break;
270 }
271
272 if (!retry)
273 break;
274
275 if (signal_pending(current)) {
c37c8ecf 276 dev_warn(DEV, "drbd_alloc_pages interrupted!\n");
b411b363
PR
277 break;
278 }
279
280 schedule();
281 }
282 finish_wait(&drbd_pp_wait, &wait);
283
45bb912b
LE
284 if (page)
285 atomic_add(number, &mdev->pp_in_use);
b411b363
PR
286 return page;
287}
288
c37c8ecf 289/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
87eeee41 290 * Is also used from inside an other spin_lock_irq(&mdev->tconn->req_lock);
45bb912b
LE
291 * Either links the page chain back to the global pool,
292 * or returns all pages to the system. */
5cc287e0 293static void drbd_free_pages(struct drbd_conf *mdev, struct page *page, int is_net)
b411b363 294{
435f0740 295 atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
b411b363 296 int i;
435f0740 297
81a5d60e 298 if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
45bb912b
LE
299 i = page_chain_free(page);
300 else {
301 struct page *tmp;
302 tmp = page_chain_tail(page, &i);
303 spin_lock(&drbd_pp_lock);
304 page_chain_add(&drbd_pp_pool, page, tmp);
305 drbd_pp_vacant += i;
306 spin_unlock(&drbd_pp_lock);
b411b363 307 }
435f0740 308 i = atomic_sub_return(i, a);
45bb912b 309 if (i < 0)
435f0740
LE
310 dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
311 is_net ? "pp_in_use_by_net" : "pp_in_use", i);
b411b363
PR
312 wake_up(&drbd_pp_wait);
313}
314
315/*
316You need to hold the req_lock:
317 _drbd_wait_ee_list_empty()
318
319You must not have the req_lock:
3967deb1 320 drbd_free_peer_req()
0db55363 321 drbd_alloc_peer_req()
7721f567 322 drbd_free_peer_reqs()
b411b363 323 drbd_ee_fix_bhs()
a990be46 324 drbd_finish_peer_reqs()
b411b363
PR
325 drbd_clear_done_ee()
326 drbd_wait_ee_list_empty()
327*/
328
f6ffca9f 329struct drbd_peer_request *
0db55363
AG
330drbd_alloc_peer_req(struct drbd_conf *mdev, u64 id, sector_t sector,
331 unsigned int data_size, gfp_t gfp_mask) __must_hold(local)
b411b363 332{
db830c46 333 struct drbd_peer_request *peer_req;
b411b363 334 struct page *page;
45bb912b 335 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
b411b363 336
0cf9d27e 337 if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
b411b363
PR
338 return NULL;
339
db830c46
AG
340 peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
341 if (!peer_req) {
b411b363 342 if (!(gfp_mask & __GFP_NOWARN))
0db55363 343 dev_err(DEV, "%s: allocation failed\n", __func__);
b411b363
PR
344 return NULL;
345 }
346
c37c8ecf 347 page = drbd_alloc_pages(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
45bb912b
LE
348 if (!page)
349 goto fail;
b411b363 350
db830c46
AG
351 drbd_clear_interval(&peer_req->i);
352 peer_req->i.size = data_size;
353 peer_req->i.sector = sector;
354 peer_req->i.local = false;
355 peer_req->i.waiting = false;
356
357 peer_req->epoch = NULL;
a21e9298 358 peer_req->w.mdev = mdev;
db830c46
AG
359 peer_req->pages = page;
360 atomic_set(&peer_req->pending_bios, 0);
361 peer_req->flags = 0;
9a8e7753
AG
362 /*
363 * The block_id is opaque to the receiver. It is not endianness
364 * converted, and sent back to the sender unchanged.
365 */
db830c46 366 peer_req->block_id = id;
b411b363 367
db830c46 368 return peer_req;
b411b363 369
45bb912b 370 fail:
db830c46 371 mempool_free(peer_req, drbd_ee_mempool);
b411b363
PR
372 return NULL;
373}
374
3967deb1 375void __drbd_free_peer_req(struct drbd_conf *mdev, struct drbd_peer_request *peer_req,
f6ffca9f 376 int is_net)
b411b363 377{
db830c46
AG
378 if (peer_req->flags & EE_HAS_DIGEST)
379 kfree(peer_req->digest);
5cc287e0 380 drbd_free_pages(mdev, peer_req->pages, is_net);
db830c46
AG
381 D_ASSERT(atomic_read(&peer_req->pending_bios) == 0);
382 D_ASSERT(drbd_interval_empty(&peer_req->i));
383 mempool_free(peer_req, drbd_ee_mempool);
b411b363
PR
384}
385
7721f567 386int drbd_free_peer_reqs(struct drbd_conf *mdev, struct list_head *list)
b411b363
PR
387{
388 LIST_HEAD(work_list);
db830c46 389 struct drbd_peer_request *peer_req, *t;
b411b363 390 int count = 0;
435f0740 391 int is_net = list == &mdev->net_ee;
b411b363 392
87eeee41 393 spin_lock_irq(&mdev->tconn->req_lock);
b411b363 394 list_splice_init(list, &work_list);
87eeee41 395 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 396
db830c46 397 list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
3967deb1 398 __drbd_free_peer_req(mdev, peer_req, is_net);
b411b363
PR
399 count++;
400 }
401 return count;
402}
403
a990be46
AG
404/*
405 * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
b411b363 406 */
a990be46 407static int drbd_finish_peer_reqs(struct drbd_conf *mdev)
b411b363
PR
408{
409 LIST_HEAD(work_list);
410 LIST_HEAD(reclaimed);
db830c46 411 struct drbd_peer_request *peer_req, *t;
e2b3032b 412 int err = 0;
b411b363 413
87eeee41 414 spin_lock_irq(&mdev->tconn->req_lock);
a990be46 415 reclaim_finished_net_peer_reqs(mdev, &reclaimed);
b411b363 416 list_splice_init(&mdev->done_ee, &work_list);
87eeee41 417 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 418
db830c46 419 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
3967deb1 420 drbd_free_net_peer_req(mdev, peer_req);
b411b363
PR
421
422 /* possible callbacks here:
7be8da07 423 * e_end_block, and e_end_resync_block, e_send_discard_write.
b411b363
PR
424 * all ignore the last argument.
425 */
db830c46 426 list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
e2b3032b
AG
427 int err2;
428
b411b363 429 /* list_del not necessary, next/prev members not touched */
e2b3032b
AG
430 err2 = peer_req->w.cb(&peer_req->w, !!err);
431 if (!err)
432 err = err2;
3967deb1 433 drbd_free_peer_req(mdev, peer_req);
b411b363
PR
434 }
435 wake_up(&mdev->ee_wait);
436
e2b3032b 437 return err;
b411b363
PR
438}
439
d4da1537
AG
440static void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
441 struct list_head *head)
b411b363
PR
442{
443 DEFINE_WAIT(wait);
444
445 /* avoids spin_lock/unlock
446 * and calling prepare_to_wait in the fast path */
447 while (!list_empty(head)) {
448 prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
87eeee41 449 spin_unlock_irq(&mdev->tconn->req_lock);
7eaceacc 450 io_schedule();
b411b363 451 finish_wait(&mdev->ee_wait, &wait);
87eeee41 452 spin_lock_irq(&mdev->tconn->req_lock);
b411b363
PR
453 }
454}
455
d4da1537
AG
456static void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
457 struct list_head *head)
b411b363 458{
87eeee41 459 spin_lock_irq(&mdev->tconn->req_lock);
b411b363 460 _drbd_wait_ee_list_empty(mdev, head);
87eeee41 461 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
462}
463
dbd9eea0 464static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
b411b363
PR
465{
466 mm_segment_t oldfs;
467 struct kvec iov = {
468 .iov_base = buf,
469 .iov_len = size,
470 };
471 struct msghdr msg = {
472 .msg_iovlen = 1,
473 .msg_iov = (struct iovec *)&iov,
474 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
475 };
476 int rv;
477
478 oldfs = get_fs();
479 set_fs(KERNEL_DS);
480 rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
481 set_fs(oldfs);
482
483 return rv;
484}
485
de0ff338 486static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size)
b411b363
PR
487{
488 mm_segment_t oldfs;
489 struct kvec iov = {
490 .iov_base = buf,
491 .iov_len = size,
492 };
493 struct msghdr msg = {
494 .msg_iovlen = 1,
495 .msg_iov = (struct iovec *)&iov,
496 .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
497 };
498 int rv;
499
500 oldfs = get_fs();
501 set_fs(KERNEL_DS);
502
503 for (;;) {
de0ff338 504 rv = sock_recvmsg(tconn->data.socket, &msg, size, msg.msg_flags);
b411b363
PR
505 if (rv == size)
506 break;
507
508 /* Note:
509 * ECONNRESET other side closed the connection
510 * ERESTARTSYS (on sock) we got a signal
511 */
512
513 if (rv < 0) {
514 if (rv == -ECONNRESET)
de0ff338 515 conn_info(tconn, "sock was reset by peer\n");
b411b363 516 else if (rv != -ERESTARTSYS)
de0ff338 517 conn_err(tconn, "sock_recvmsg returned %d\n", rv);
b411b363
PR
518 break;
519 } else if (rv == 0) {
de0ff338 520 conn_info(tconn, "sock was shut down by peer\n");
b411b363
PR
521 break;
522 } else {
523 /* signal came in, or peer/link went down,
524 * after we read a partial message
525 */
526 /* D_ASSERT(signal_pending(current)); */
527 break;
528 }
529 };
530
531 set_fs(oldfs);
532
533 if (rv != size)
bbeb641c 534 conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
b411b363
PR
535
536 return rv;
537}
538
c6967746
AG
539static int drbd_recv_all(struct drbd_tconn *tconn, void *buf, size_t size)
540{
541 int err;
542
543 err = drbd_recv(tconn, buf, size);
544 if (err != size) {
545 if (err >= 0)
546 err = -EIO;
547 } else
548 err = 0;
549 return err;
550}
551
a5c31904
AG
552static int drbd_recv_all_warn(struct drbd_tconn *tconn, void *buf, size_t size)
553{
554 int err;
555
556 err = drbd_recv_all(tconn, buf, size);
557 if (err && !signal_pending(current))
558 conn_warn(tconn, "short read (expected size %d)\n", (int)size);
559 return err;
560}
561
5dbf1673
LE
562/* quoting tcp(7):
563 * On individual connections, the socket buffer size must be set prior to the
564 * listen(2) or connect(2) calls in order to have it take effect.
565 * This is our wrapper to do so.
566 */
567static void drbd_setbufsize(struct socket *sock, unsigned int snd,
568 unsigned int rcv)
569{
570 /* open coded SO_SNDBUF, SO_RCVBUF */
571 if (snd) {
572 sock->sk->sk_sndbuf = snd;
573 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
574 }
575 if (rcv) {
576 sock->sk->sk_rcvbuf = rcv;
577 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
578 }
579}
580
eac3e990 581static struct socket *drbd_try_connect(struct drbd_tconn *tconn)
b411b363
PR
582{
583 const char *what;
584 struct socket *sock;
585 struct sockaddr_in6 src_in6;
44ed167d
PR
586 struct sockaddr_in6 peer_in6;
587 struct net_conf *nc;
588 int err, peer_addr_len, my_addr_len;
69ef82de 589 int sndbuf_size, rcvbuf_size, connect_int;
b411b363
PR
590 int disconnect_on_error = 1;
591
44ed167d
PR
592 rcu_read_lock();
593 nc = rcu_dereference(tconn->net_conf);
594 if (!nc) {
595 rcu_read_unlock();
b411b363 596 return NULL;
44ed167d 597 }
44ed167d
PR
598 sndbuf_size = nc->sndbuf_size;
599 rcvbuf_size = nc->rcvbuf_size;
69ef82de 600 connect_int = nc->connect_int;
089c075d 601 rcu_read_unlock();
44ed167d 602
089c075d
AG
603 my_addr_len = min_t(int, tconn->my_addr_len, sizeof(src_in6));
604 memcpy(&src_in6, &tconn->my_addr, my_addr_len);
44ed167d 605
089c075d 606 if (((struct sockaddr *)&tconn->my_addr)->sa_family == AF_INET6)
44ed167d
PR
607 src_in6.sin6_port = 0;
608 else
609 ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
610
089c075d
AG
611 peer_addr_len = min_t(int, tconn->peer_addr_len, sizeof(src_in6));
612 memcpy(&peer_in6, &tconn->peer_addr, peer_addr_len);
b411b363
PR
613
614 what = "sock_create_kern";
44ed167d
PR
615 err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family,
616 SOCK_STREAM, IPPROTO_TCP, &sock);
b411b363
PR
617 if (err < 0) {
618 sock = NULL;
619 goto out;
620 }
621
622 sock->sk->sk_rcvtimeo =
69ef82de 623 sock->sk->sk_sndtimeo = connect_int * HZ;
44ed167d 624 drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
b411b363
PR
625
626 /* explicitly bind to the configured IP as source IP
627 * for the outgoing connections.
628 * This is needed for multihomed hosts and to be
629 * able to use lo: interfaces for drbd.
630 * Make sure to use 0 as port number, so linux selects
631 * a free one dynamically.
632 */
b411b363 633 what = "bind before connect";
44ed167d 634 err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
b411b363
PR
635 if (err < 0)
636 goto out;
637
638 /* connect may fail, peer not yet available.
639 * stay C_WF_CONNECTION, don't go Disconnecting! */
640 disconnect_on_error = 0;
641 what = "connect";
44ed167d 642 err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
b411b363
PR
643
644out:
645 if (err < 0) {
646 if (sock) {
647 sock_release(sock);
648 sock = NULL;
649 }
650 switch (-err) {
651 /* timeout, busy, signal pending */
652 case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
653 case EINTR: case ERESTARTSYS:
654 /* peer not (yet) available, network problem */
655 case ECONNREFUSED: case ENETUNREACH:
656 case EHOSTDOWN: case EHOSTUNREACH:
657 disconnect_on_error = 0;
658 break;
659 default:
eac3e990 660 conn_err(tconn, "%s failed, err = %d\n", what, err);
b411b363
PR
661 }
662 if (disconnect_on_error)
bbeb641c 663 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363 664 }
44ed167d 665
b411b363
PR
666 return sock;
667}
668
7a426fd8
PR
669struct accept_wait_data {
670 struct drbd_tconn *tconn;
671 struct socket *s_listen;
672 struct completion door_bell;
673 void (*original_sk_state_change)(struct sock *sk);
674
675};
676
677static void incomming_connection(struct sock *sk)
678{
679 struct accept_wait_data *ad = sk->sk_user_data;
680 struct drbd_tconn *tconn = ad->tconn;
681
682 if (sk->sk_state != TCP_ESTABLISHED)
683 conn_warn(tconn, "unexpected tcp state change. sk_state = %d\n", sk->sk_state);
684
685 write_lock_bh(&sk->sk_callback_lock);
686 sk->sk_state_change = ad->original_sk_state_change;
687 sk->sk_user_data = NULL;
688 write_unlock_bh(&sk->sk_callback_lock);
689
690 sk->sk_state_change(sk);
691 complete(&ad->door_bell);
692}
693
694static int prepare_listen_socket(struct drbd_tconn *tconn, struct accept_wait_data *ad)
b411b363 695{
1f3e509b 696 int err, sndbuf_size, rcvbuf_size, my_addr_len;
44ed167d 697 struct sockaddr_in6 my_addr;
1f3e509b 698 struct socket *s_listen;
44ed167d 699 struct net_conf *nc;
b411b363
PR
700 const char *what;
701
44ed167d
PR
702 rcu_read_lock();
703 nc = rcu_dereference(tconn->net_conf);
704 if (!nc) {
705 rcu_read_unlock();
7a426fd8 706 return -EIO;
44ed167d 707 }
44ed167d
PR
708 sndbuf_size = nc->sndbuf_size;
709 rcvbuf_size = nc->rcvbuf_size;
44ed167d 710 rcu_read_unlock();
b411b363 711
089c075d
AG
712 my_addr_len = min_t(int, tconn->my_addr_len, sizeof(struct sockaddr_in6));
713 memcpy(&my_addr, &tconn->my_addr, my_addr_len);
714
b411b363 715 what = "sock_create_kern";
44ed167d 716 err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family,
1f3e509b 717 SOCK_STREAM, IPPROTO_TCP, &s_listen);
b411b363
PR
718 if (err) {
719 s_listen = NULL;
720 goto out;
721 }
722
1f3e509b 723 s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
44ed167d 724 drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
b411b363
PR
725
726 what = "bind before listen";
44ed167d 727 err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
b411b363
PR
728 if (err < 0)
729 goto out;
730
7a426fd8
PR
731 ad->s_listen = s_listen;
732 write_lock_bh(&s_listen->sk->sk_callback_lock);
733 ad->original_sk_state_change = s_listen->sk->sk_state_change;
734 s_listen->sk->sk_state_change = incomming_connection;
735 s_listen->sk->sk_user_data = ad;
736 write_unlock_bh(&s_listen->sk->sk_callback_lock);
737
2820fd39
PR
738 what = "listen";
739 err = s_listen->ops->listen(s_listen, 5);
740 if (err < 0)
741 goto out;
742
7a426fd8 743 return 0;
1f3e509b
PR
744out:
745 if (s_listen)
746 sock_release(s_listen);
747 if (err < 0) {
748 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
749 conn_err(tconn, "%s failed, err = %d\n", what, err);
750 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
751 }
752 }
753
7a426fd8 754 return -EIO;
1f3e509b
PR
755}
756
7a426fd8 757static struct socket *drbd_wait_for_connect(struct drbd_tconn *tconn, struct accept_wait_data *ad)
1f3e509b
PR
758{
759 int timeo, connect_int, err = 0;
760 struct socket *s_estab = NULL;
1f3e509b
PR
761 struct net_conf *nc;
762
763 rcu_read_lock();
764 nc = rcu_dereference(tconn->net_conf);
765 if (!nc) {
766 rcu_read_unlock();
767 return NULL;
768 }
769 connect_int = nc->connect_int;
770 rcu_read_unlock();
771
772 timeo = connect_int * HZ;
773 timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
774
7a426fd8
PR
775 err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
776 if (err <= 0)
777 return NULL;
b411b363 778
7a426fd8 779 err = kernel_accept(ad->s_listen, &s_estab, 0);
b411b363
PR
780 if (err < 0) {
781 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
1f3e509b 782 conn_err(tconn, "accept failed, err = %d\n", err);
bbeb641c 783 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363
PR
784 }
785 }
b411b363
PR
786
787 return s_estab;
788}
789
e658983a 790static int decode_header(struct drbd_tconn *, void *, struct packet_info *);
b411b363 791
9f5bdc33
AG
792static int send_first_packet(struct drbd_tconn *tconn, struct drbd_socket *sock,
793 enum drbd_packet cmd)
794{
795 if (!conn_prepare_command(tconn, sock))
796 return -EIO;
e658983a 797 return conn_send_command(tconn, sock, cmd, 0, NULL, 0);
b411b363
PR
798}
799
9f5bdc33 800static int receive_first_packet(struct drbd_tconn *tconn, struct socket *sock)
b411b363 801{
9f5bdc33
AG
802 unsigned int header_size = drbd_header_size(tconn);
803 struct packet_info pi;
804 int err;
b411b363 805
9f5bdc33
AG
806 err = drbd_recv_short(sock, tconn->data.rbuf, header_size, 0);
807 if (err != header_size) {
808 if (err >= 0)
809 err = -EIO;
810 return err;
811 }
812 err = decode_header(tconn, tconn->data.rbuf, &pi);
813 if (err)
814 return err;
815 return pi.cmd;
b411b363
PR
816}
817
818/**
819 * drbd_socket_okay() - Free the socket if its connection is not okay
b411b363
PR
820 * @sock: pointer to the pointer to the socket.
821 */
dbd9eea0 822static int drbd_socket_okay(struct socket **sock)
b411b363
PR
823{
824 int rr;
825 char tb[4];
826
827 if (!*sock)
81e84650 828 return false;
b411b363 829
dbd9eea0 830 rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
b411b363
PR
831
832 if (rr > 0 || rr == -EAGAIN) {
81e84650 833 return true;
b411b363
PR
834 } else {
835 sock_release(*sock);
836 *sock = NULL;
81e84650 837 return false;
b411b363
PR
838 }
839}
2325eb66
PR
840/* Gets called if a connection is established, or if a new minor gets created
841 in a connection */
c141ebda 842int drbd_connected(struct drbd_conf *mdev)
907599e0 843{
0829f5ed 844 int err;
907599e0
PR
845
846 atomic_set(&mdev->packet_seq, 0);
847 mdev->peer_seq = 0;
848
8410da8f
PR
849 mdev->state_mutex = mdev->tconn->agreed_pro_version < 100 ?
850 &mdev->tconn->cstate_mutex :
851 &mdev->own_state_mutex;
852
0829f5ed
AG
853 err = drbd_send_sync_param(mdev);
854 if (!err)
855 err = drbd_send_sizes(mdev, 0, 0);
856 if (!err)
857 err = drbd_send_uuids(mdev);
858 if (!err)
43de7c85 859 err = drbd_send_current_state(mdev);
907599e0
PR
860 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
861 clear_bit(RESIZE_PENDING, &mdev->flags);
8b924f1d 862 mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
0829f5ed 863 return err;
907599e0
PR
864}
865
b411b363
PR
866/*
867 * return values:
868 * 1 yes, we have a valid connection
869 * 0 oops, did not work out, please try again
870 * -1 peer talks different language,
871 * no point in trying again, please go standalone.
872 * -2 We do not have a network config...
873 */
81fa2e67 874static int conn_connect(struct drbd_tconn *tconn)
b411b363 875{
7da35862 876 struct drbd_socket sock, msock;
c141ebda 877 struct drbd_conf *mdev;
44ed167d 878 struct net_conf *nc;
c141ebda 879 int vnr, timeout, try, h, ok;
08b165ba 880 bool discard_my_data;
a1096a6e 881 enum drbd_state_rv rv;
7a426fd8
PR
882 struct accept_wait_data ad = {
883 .tconn = tconn,
884 .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
885 };
b411b363 886
bbeb641c 887 if (conn_request_state(tconn, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
b411b363
PR
888 return -2;
889
7da35862
PR
890 mutex_init(&sock.mutex);
891 sock.sbuf = tconn->data.sbuf;
892 sock.rbuf = tconn->data.rbuf;
893 sock.socket = NULL;
894 mutex_init(&msock.mutex);
895 msock.sbuf = tconn->meta.sbuf;
896 msock.rbuf = tconn->meta.rbuf;
897 msock.socket = NULL;
898
907599e0 899 clear_bit(DISCARD_CONCURRENT, &tconn->flags);
0916e0e3
AG
900
901 /* Assume that the peer only understands protocol 80 until we know better. */
902 tconn->agreed_pro_version = 80;
b411b363 903
7a426fd8
PR
904 if (prepare_listen_socket(tconn, &ad))
905 return 0;
906
b411b363 907 do {
2bf89621
AG
908 struct socket *s;
909
b411b363
PR
910 for (try = 0;;) {
911 /* 3 tries, this should take less than a second! */
907599e0 912 s = drbd_try_connect(tconn);
b411b363
PR
913 if (s || ++try >= 3)
914 break;
915 /* give the other side time to call bind() & listen() */
20ee6390 916 schedule_timeout_interruptible(HZ / 10);
b411b363
PR
917 }
918
919 if (s) {
7da35862
PR
920 if (!sock.socket) {
921 sock.socket = s;
922 send_first_packet(tconn, &sock, P_INITIAL_DATA);
923 } else if (!msock.socket) {
924 msock.socket = s;
925 send_first_packet(tconn, &msock, P_INITIAL_META);
b411b363 926 } else {
81fa2e67 927 conn_err(tconn, "Logic error in conn_connect()\n");
b411b363
PR
928 goto out_release_sockets;
929 }
930 }
931
7da35862
PR
932 if (sock.socket && msock.socket) {
933 rcu_read_lock();
934 nc = rcu_dereference(tconn->net_conf);
935 timeout = nc->ping_timeo * HZ / 10;
936 rcu_read_unlock();
937 schedule_timeout_interruptible(timeout);
938 ok = drbd_socket_okay(&sock.socket);
939 ok = drbd_socket_okay(&msock.socket) && ok;
b411b363
PR
940 if (ok)
941 break;
942 }
943
944retry:
7a426fd8 945 s = drbd_wait_for_connect(tconn, &ad);
b411b363 946 if (s) {
9f5bdc33 947 try = receive_first_packet(tconn, s);
7da35862
PR
948 drbd_socket_okay(&sock.socket);
949 drbd_socket_okay(&msock.socket);
b411b363 950 switch (try) {
e5d6f33a 951 case P_INITIAL_DATA:
7da35862 952 if (sock.socket) {
907599e0 953 conn_warn(tconn, "initial packet S crossed\n");
7da35862 954 sock_release(sock.socket);
b411b363 955 }
7da35862 956 sock.socket = s;
b411b363 957 break;
e5d6f33a 958 case P_INITIAL_META:
7da35862 959 if (msock.socket) {
907599e0 960 conn_warn(tconn, "initial packet M crossed\n");
7da35862 961 sock_release(msock.socket);
b411b363 962 }
7da35862 963 msock.socket = s;
907599e0 964 set_bit(DISCARD_CONCURRENT, &tconn->flags);
b411b363
PR
965 break;
966 default:
907599e0 967 conn_warn(tconn, "Error receiving initial packet\n");
b411b363
PR
968 sock_release(s);
969 if (random32() & 1)
970 goto retry;
971 }
972 }
973
bbeb641c 974 if (tconn->cstate <= C_DISCONNECTING)
b411b363
PR
975 goto out_release_sockets;
976 if (signal_pending(current)) {
977 flush_signals(current);
978 smp_rmb();
907599e0 979 if (get_t_state(&tconn->receiver) == EXITING)
b411b363
PR
980 goto out_release_sockets;
981 }
982
7da35862
PR
983 if (sock.socket && &msock.socket) {
984 ok = drbd_socket_okay(&sock.socket);
985 ok = drbd_socket_okay(&msock.socket) && ok;
b411b363
PR
986 if (ok)
987 break;
988 }
989 } while (1);
990
7a426fd8
PR
991 if (ad.s_listen)
992 sock_release(ad.s_listen);
993
7da35862
PR
994 sock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */
995 msock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */
b411b363 996
7da35862
PR
997 sock.socket->sk->sk_allocation = GFP_NOIO;
998 msock.socket->sk->sk_allocation = GFP_NOIO;
b411b363 999
7da35862
PR
1000 sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
1001 msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
b411b363 1002
b411b363 1003 /* NOT YET ...
7da35862
PR
1004 * sock.socket->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10;
1005 * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
6038178e 1006 * first set it to the P_CONNECTION_FEATURES timeout,
b411b363 1007 * which we set to 4x the configured ping_timeout. */
44ed167d
PR
1008 rcu_read_lock();
1009 nc = rcu_dereference(tconn->net_conf);
1010
7da35862
PR
1011 sock.socket->sk->sk_sndtimeo =
1012 sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
44ed167d 1013
7da35862 1014 msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
44ed167d 1015 timeout = nc->timeout * HZ / 10;
08b165ba 1016 discard_my_data = nc->discard_my_data;
44ed167d 1017 rcu_read_unlock();
b411b363 1018
7da35862 1019 msock.socket->sk->sk_sndtimeo = timeout;
b411b363
PR
1020
1021 /* we don't want delays.
25985edc 1022 * we use TCP_CORK where appropriate, though */
7da35862
PR
1023 drbd_tcp_nodelay(sock.socket);
1024 drbd_tcp_nodelay(msock.socket);
b411b363 1025
7da35862
PR
1026 tconn->data.socket = sock.socket;
1027 tconn->meta.socket = msock.socket;
907599e0 1028 tconn->last_received = jiffies;
b411b363 1029
6038178e 1030 h = drbd_do_features(tconn);
b411b363
PR
1031 if (h <= 0)
1032 return h;
1033
907599e0 1034 if (tconn->cram_hmac_tfm) {
b411b363 1035 /* drbd_request_state(mdev, NS(conn, WFAuth)); */
907599e0 1036 switch (drbd_do_auth(tconn)) {
b10d96cb 1037 case -1:
907599e0 1038 conn_err(tconn, "Authentication of peer failed\n");
b411b363 1039 return -1;
b10d96cb 1040 case 0:
907599e0 1041 conn_err(tconn, "Authentication of peer failed, trying again.\n");
b10d96cb 1042 return 0;
b411b363
PR
1043 }
1044 }
1045
7da35862
PR
1046 tconn->data.socket->sk->sk_sndtimeo = timeout;
1047 tconn->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
b411b363 1048
387eb308 1049 if (drbd_send_protocol(tconn) == -EOPNOTSUPP)
7e2455c1 1050 return -1;
b411b363 1051
a1096a6e
PR
1052 set_bit(STATE_SENT, &tconn->flags);
1053
c141ebda
PR
1054 rcu_read_lock();
1055 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
1056 kref_get(&mdev->kref);
1057 rcu_read_unlock();
08b165ba
PR
1058
1059 if (discard_my_data)
1060 set_bit(DISCARD_MY_DATA, &mdev->flags);
1061 else
1062 clear_bit(DISCARD_MY_DATA, &mdev->flags);
1063
c141ebda
PR
1064 drbd_connected(mdev);
1065 kref_put(&mdev->kref, &drbd_minor_destroy);
1066 rcu_read_lock();
1067 }
1068 rcu_read_unlock();
1069
a1096a6e
PR
1070 rv = conn_request_state(tconn, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
1071 if (rv < SS_SUCCESS) {
1072 clear_bit(STATE_SENT, &tconn->flags);
823bd832 1073 return 0;
a1096a6e 1074 }
823bd832
PR
1075
1076 drbd_thread_start(&tconn->asender);
1077
08b165ba
PR
1078 mutex_lock(&tconn->conf_update);
1079 /* The discard_my_data flag is a single-shot modifier to the next
1080 * connection attempt, the handshake of which is now well underway.
1081 * No need for rcu style copying of the whole struct
1082 * just to clear a single value. */
1083 tconn->net_conf->discard_my_data = 0;
1084 mutex_unlock(&tconn->conf_update);
1085
d3fcb490 1086 return h;
b411b363
PR
1087
1088out_release_sockets:
7a426fd8
PR
1089 if (ad.s_listen)
1090 sock_release(ad.s_listen);
7da35862
PR
1091 if (sock.socket)
1092 sock_release(sock.socket);
1093 if (msock.socket)
1094 sock_release(msock.socket);
b411b363
PR
1095 return -1;
1096}
1097
e658983a 1098static int decode_header(struct drbd_tconn *tconn, void *header, struct packet_info *pi)
b411b363 1099{
e658983a
AG
1100 unsigned int header_size = drbd_header_size(tconn);
1101
0c8e36d9
AG
1102 if (header_size == sizeof(struct p_header100) &&
1103 *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
1104 struct p_header100 *h = header;
1105 if (h->pad != 0) {
1106 conn_err(tconn, "Header padding is not zero\n");
1107 return -EINVAL;
1108 }
1109 pi->vnr = be16_to_cpu(h->volume);
1110 pi->cmd = be16_to_cpu(h->command);
1111 pi->size = be32_to_cpu(h->length);
1112 } else if (header_size == sizeof(struct p_header95) &&
1113 *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
e658983a 1114 struct p_header95 *h = header;
e658983a 1115 pi->cmd = be16_to_cpu(h->command);
b55d84ba
AG
1116 pi->size = be32_to_cpu(h->length);
1117 pi->vnr = 0;
e658983a
AG
1118 } else if (header_size == sizeof(struct p_header80) &&
1119 *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
1120 struct p_header80 *h = header;
1121 pi->cmd = be16_to_cpu(h->command);
1122 pi->size = be16_to_cpu(h->length);
77351055 1123 pi->vnr = 0;
02918be2 1124 } else {
e658983a
AG
1125 conn_err(tconn, "Wrong magic value 0x%08x in protocol version %d\n",
1126 be32_to_cpu(*(__be32 *)header),
1127 tconn->agreed_pro_version);
8172f3e9 1128 return -EINVAL;
b411b363 1129 }
e658983a 1130 pi->data = header + header_size;
8172f3e9 1131 return 0;
257d0af6
PR
1132}
1133
9ba7aa00 1134static int drbd_recv_header(struct drbd_tconn *tconn, struct packet_info *pi)
257d0af6 1135{
e658983a 1136 void *buffer = tconn->data.rbuf;
69bc7bc3 1137 int err;
257d0af6 1138
e658983a 1139 err = drbd_recv_all_warn(tconn, buffer, drbd_header_size(tconn));
a5c31904 1140 if (err)
69bc7bc3 1141 return err;
257d0af6 1142
e658983a 1143 err = decode_header(tconn, buffer, pi);
9ba7aa00 1144 tconn->last_received = jiffies;
b411b363 1145
69bc7bc3 1146 return err;
b411b363
PR
1147}
1148
4b0007c0 1149static void drbd_flush(struct drbd_tconn *tconn)
b411b363
PR
1150{
1151 int rv;
4b0007c0
PR
1152 struct drbd_conf *mdev;
1153 int vnr;
1154
1155 if (tconn->write_ordering >= WO_bdev_flush) {
615e087f 1156 rcu_read_lock();
4b0007c0 1157 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
615e087f
LE
1158 if (!get_ldev(mdev))
1159 continue;
1160 kref_get(&mdev->kref);
1161 rcu_read_unlock();
1162
1163 rv = blkdev_issue_flush(mdev->ldev->backing_bdev,
1164 GFP_NOIO, NULL);
1165 if (rv) {
1166 dev_info(DEV, "local disk flush failed with status %d\n", rv);
1167 /* would rather check on EOPNOTSUPP, but that is not reliable.
1168 * don't try again for ANY return value != 0
1169 * if (rv == -EOPNOTSUPP) */
1170 drbd_bump_write_ordering(tconn, WO_drain_io);
4b0007c0 1171 }
615e087f
LE
1172 put_ldev(mdev);
1173 kref_put(&mdev->kref, &drbd_minor_destroy);
1174
1175 rcu_read_lock();
1176 if (rv)
1177 break;
b411b363 1178 }
615e087f 1179 rcu_read_unlock();
b411b363 1180 }
b411b363
PR
1181}
1182
1183/**
1184 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
1185 * @mdev: DRBD device.
1186 * @epoch: Epoch object.
1187 * @ev: Epoch event.
1188 */
1e9dd291 1189static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn,
b411b363
PR
1190 struct drbd_epoch *epoch,
1191 enum epoch_event ev)
1192{
2451fc3b 1193 int epoch_size;
b411b363 1194 struct drbd_epoch *next_epoch;
b411b363
PR
1195 enum finish_epoch rv = FE_STILL_LIVE;
1196
12038a3a 1197 spin_lock(&tconn->epoch_lock);
b411b363
PR
1198 do {
1199 next_epoch = NULL;
b411b363
PR
1200
1201 epoch_size = atomic_read(&epoch->epoch_size);
1202
1203 switch (ev & ~EV_CLEANUP) {
1204 case EV_PUT:
1205 atomic_dec(&epoch->active);
1206 break;
1207 case EV_GOT_BARRIER_NR:
1208 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
b411b363
PR
1209 break;
1210 case EV_BECAME_LAST:
1211 /* nothing to do*/
1212 break;
1213 }
1214
b411b363
PR
1215 if (epoch_size != 0 &&
1216 atomic_read(&epoch->active) == 0 &&
85d73513 1217 (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
b411b363 1218 if (!(ev & EV_CLEANUP)) {
12038a3a 1219 spin_unlock(&tconn->epoch_lock);
9ed57dcb 1220 drbd_send_b_ack(epoch->tconn, epoch->barrier_nr, epoch_size);
12038a3a 1221 spin_lock(&tconn->epoch_lock);
b411b363 1222 }
9ed57dcb
LE
1223#if 0
1224 /* FIXME: dec unacked on connection, once we have
1225 * something to count pending connection packets in. */
85d73513 1226 if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
9ed57dcb
LE
1227 dec_unacked(epoch->tconn);
1228#endif
b411b363 1229
12038a3a 1230 if (tconn->current_epoch != epoch) {
b411b363
PR
1231 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1232 list_del(&epoch->list);
1233 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
12038a3a 1234 tconn->epochs--;
b411b363
PR
1235 kfree(epoch);
1236
1237 if (rv == FE_STILL_LIVE)
1238 rv = FE_DESTROYED;
1239 } else {
1240 epoch->flags = 0;
1241 atomic_set(&epoch->epoch_size, 0);
698f9315 1242 /* atomic_set(&epoch->active, 0); is already zero */
b411b363
PR
1243 if (rv == FE_STILL_LIVE)
1244 rv = FE_RECYCLED;
1245 }
1246 }
1247
1248 if (!next_epoch)
1249 break;
1250
1251 epoch = next_epoch;
1252 } while (1);
1253
12038a3a 1254 spin_unlock(&tconn->epoch_lock);
b411b363 1255
b411b363
PR
1256 return rv;
1257}
1258
1259/**
1260 * drbd_bump_write_ordering() - Fall back to an other write ordering method
4b0007c0 1261 * @tconn: DRBD connection.
b411b363
PR
1262 * @wo: Write ordering method to try.
1263 */
4b0007c0 1264void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo)
b411b363 1265{
daeda1cc 1266 struct disk_conf *dc;
4b0007c0 1267 struct drbd_conf *mdev;
b411b363 1268 enum write_ordering_e pwo;
4b0007c0 1269 int vnr;
b411b363
PR
1270 static char *write_ordering_str[] = {
1271 [WO_none] = "none",
1272 [WO_drain_io] = "drain",
1273 [WO_bdev_flush] = "flush",
b411b363
PR
1274 };
1275
4b0007c0 1276 pwo = tconn->write_ordering;
b411b363 1277 wo = min(pwo, wo);
daeda1cc 1278 rcu_read_lock();
4b0007c0 1279 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
27eb13e9 1280 if (!get_ldev_if_state(mdev, D_ATTACHING))
4b0007c0
PR
1281 continue;
1282 dc = rcu_dereference(mdev->ldev->disk_conf);
1283
1284 if (wo == WO_bdev_flush && !dc->disk_flushes)
1285 wo = WO_drain_io;
1286 if (wo == WO_drain_io && !dc->disk_drain)
1287 wo = WO_none;
1288 put_ldev(mdev);
1289 }
daeda1cc 1290 rcu_read_unlock();
4b0007c0
PR
1291 tconn->write_ordering = wo;
1292 if (pwo != tconn->write_ordering || wo == WO_bdev_flush)
1293 conn_info(tconn, "Method to ensure write ordering: %s\n", write_ordering_str[tconn->write_ordering]);
b411b363
PR
1294}
1295
45bb912b 1296/**
fbe29dec 1297 * drbd_submit_peer_request()
45bb912b 1298 * @mdev: DRBD device.
db830c46 1299 * @peer_req: peer request
45bb912b 1300 * @rw: flag field, see bio->bi_rw
10f6d992
LE
1301 *
1302 * May spread the pages to multiple bios,
1303 * depending on bio_add_page restrictions.
1304 *
1305 * Returns 0 if all bios have been submitted,
1306 * -ENOMEM if we could not allocate enough bios,
1307 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
1308 * single page to an empty bio (which should never happen and likely indicates
1309 * that the lower level IO stack is in some way broken). This has been observed
1310 * on certain Xen deployments.
45bb912b
LE
1311 */
1312/* TODO allocate from our own bio_set. */
fbe29dec
AG
1313int drbd_submit_peer_request(struct drbd_conf *mdev,
1314 struct drbd_peer_request *peer_req,
1315 const unsigned rw, const int fault_type)
45bb912b
LE
1316{
1317 struct bio *bios = NULL;
1318 struct bio *bio;
db830c46
AG
1319 struct page *page = peer_req->pages;
1320 sector_t sector = peer_req->i.sector;
1321 unsigned ds = peer_req->i.size;
45bb912b
LE
1322 unsigned n_bios = 0;
1323 unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
10f6d992 1324 int err = -ENOMEM;
45bb912b
LE
1325
1326 /* In most cases, we will only need one bio. But in case the lower
1327 * level restrictions happen to be different at this offset on this
1328 * side than those of the sending peer, we may need to submit the
da4a75d2
LE
1329 * request in more than one bio.
1330 *
1331 * Plain bio_alloc is good enough here, this is no DRBD internally
1332 * generated bio, but a bio allocated on behalf of the peer.
1333 */
45bb912b
LE
1334next_bio:
1335 bio = bio_alloc(GFP_NOIO, nr_pages);
1336 if (!bio) {
1337 dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
1338 goto fail;
1339 }
db830c46 1340 /* > peer_req->i.sector, unless this is the first bio */
45bb912b
LE
1341 bio->bi_sector = sector;
1342 bio->bi_bdev = mdev->ldev->backing_bdev;
45bb912b 1343 bio->bi_rw = rw;
db830c46 1344 bio->bi_private = peer_req;
fcefa62e 1345 bio->bi_end_io = drbd_peer_request_endio;
45bb912b
LE
1346
1347 bio->bi_next = bios;
1348 bios = bio;
1349 ++n_bios;
1350
1351 page_chain_for_each(page) {
1352 unsigned len = min_t(unsigned, ds, PAGE_SIZE);
1353 if (!bio_add_page(bio, page, len, 0)) {
10f6d992
LE
1354 /* A single page must always be possible!
1355 * But in case it fails anyways,
1356 * we deal with it, and complain (below). */
1357 if (bio->bi_vcnt == 0) {
1358 dev_err(DEV,
1359 "bio_add_page failed for len=%u, "
1360 "bi_vcnt=0 (bi_sector=%llu)\n",
1361 len, (unsigned long long)bio->bi_sector);
1362 err = -ENOSPC;
1363 goto fail;
1364 }
45bb912b
LE
1365 goto next_bio;
1366 }
1367 ds -= len;
1368 sector += len >> 9;
1369 --nr_pages;
1370 }
1371 D_ASSERT(page == NULL);
1372 D_ASSERT(ds == 0);
1373
db830c46 1374 atomic_set(&peer_req->pending_bios, n_bios);
45bb912b
LE
1375 do {
1376 bio = bios;
1377 bios = bios->bi_next;
1378 bio->bi_next = NULL;
1379
45bb912b 1380 drbd_generic_make_request(mdev, fault_type, bio);
45bb912b 1381 } while (bios);
45bb912b
LE
1382 return 0;
1383
1384fail:
1385 while (bios) {
1386 bio = bios;
1387 bios = bios->bi_next;
1388 bio_put(bio);
1389 }
10f6d992 1390 return err;
45bb912b
LE
1391}
1392
53840641 1393static void drbd_remove_epoch_entry_interval(struct drbd_conf *mdev,
db830c46 1394 struct drbd_peer_request *peer_req)
53840641 1395{
db830c46 1396 struct drbd_interval *i = &peer_req->i;
53840641
AG
1397
1398 drbd_remove_interval(&mdev->write_requests, i);
1399 drbd_clear_interval(i);
1400
6c852bec 1401 /* Wake up any processes waiting for this peer request to complete. */
53840641
AG
1402 if (i->waiting)
1403 wake_up(&mdev->misc_wait);
1404}
1405
77fede51
PR
1406void conn_wait_active_ee_empty(struct drbd_tconn *tconn)
1407{
1408 struct drbd_conf *mdev;
1409 int vnr;
1410
1411 rcu_read_lock();
1412 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
1413 kref_get(&mdev->kref);
1414 rcu_read_unlock();
1415 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1416 kref_put(&mdev->kref, &drbd_minor_destroy);
1417 rcu_read_lock();
1418 }
1419 rcu_read_unlock();
1420}
1421
4a76b161 1422static int receive_Barrier(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 1423{
2451fc3b 1424 int rv;
e658983a 1425 struct p_barrier *p = pi->data;
b411b363
PR
1426 struct drbd_epoch *epoch;
1427
9ed57dcb
LE
1428 /* FIXME these are unacked on connection,
1429 * not a specific (peer)device.
1430 */
12038a3a 1431 tconn->current_epoch->barrier_nr = p->barrier;
9ed57dcb 1432 tconn->current_epoch->tconn = tconn;
1e9dd291 1433 rv = drbd_may_finish_epoch(tconn, tconn->current_epoch, EV_GOT_BARRIER_NR);
b411b363
PR
1434
1435 /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1436 * the activity log, which means it would not be resynced in case the
1437 * R_PRIMARY crashes now.
1438 * Therefore we must send the barrier_ack after the barrier request was
1439 * completed. */
4b0007c0 1440 switch (tconn->write_ordering) {
b411b363
PR
1441 case WO_none:
1442 if (rv == FE_RECYCLED)
82bc0194 1443 return 0;
2451fc3b
PR
1444
1445 /* receiver context, in the writeout path of the other node.
1446 * avoid potential distributed deadlock */
1447 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1448 if (epoch)
1449 break;
1450 else
9ed57dcb 1451 conn_warn(tconn, "Allocation of an epoch failed, slowing down\n");
2451fc3b 1452 /* Fall through */
b411b363
PR
1453
1454 case WO_bdev_flush:
1455 case WO_drain_io:
77fede51 1456 conn_wait_active_ee_empty(tconn);
4b0007c0 1457 drbd_flush(tconn);
2451fc3b 1458
12038a3a 1459 if (atomic_read(&tconn->current_epoch->epoch_size)) {
2451fc3b
PR
1460 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1461 if (epoch)
1462 break;
b411b363
PR
1463 }
1464
82bc0194 1465 return 0;
2451fc3b 1466 default:
9ed57dcb 1467 conn_err(tconn, "Strangeness in tconn->write_ordering %d\n", tconn->write_ordering);
82bc0194 1468 return -EIO;
b411b363
PR
1469 }
1470
1471 epoch->flags = 0;
1472 atomic_set(&epoch->epoch_size, 0);
1473 atomic_set(&epoch->active, 0);
1474
12038a3a
PR
1475 spin_lock(&tconn->epoch_lock);
1476 if (atomic_read(&tconn->current_epoch->epoch_size)) {
1477 list_add(&epoch->list, &tconn->current_epoch->list);
1478 tconn->current_epoch = epoch;
1479 tconn->epochs++;
b411b363
PR
1480 } else {
1481 /* The current_epoch got recycled while we allocated this one... */
1482 kfree(epoch);
1483 }
12038a3a 1484 spin_unlock(&tconn->epoch_lock);
b411b363 1485
82bc0194 1486 return 0;
b411b363
PR
1487}
1488
1489/* used from receive_RSDataReply (recv_resync_read)
1490 * and from receive_Data */
f6ffca9f
AG
1491static struct drbd_peer_request *
1492read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector,
1493 int data_size) __must_hold(local)
b411b363 1494{
6666032a 1495 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
db830c46 1496 struct drbd_peer_request *peer_req;
b411b363 1497 struct page *page;
a5c31904 1498 int dgs, ds, err;
a0638456
PR
1499 void *dig_in = mdev->tconn->int_dig_in;
1500 void *dig_vv = mdev->tconn->int_dig_vv;
6b4388ac 1501 unsigned long *data;
b411b363 1502
88104ca4
AG
1503 dgs = 0;
1504 if (mdev->tconn->peer_integrity_tfm) {
1505 dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
9f5bdc33
AG
1506 /*
1507 * FIXME: Receive the incoming digest into the receive buffer
1508 * here, together with its struct p_data?
1509 */
a5c31904
AG
1510 err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs);
1511 if (err)
b411b363 1512 return NULL;
88104ca4 1513 data_size -= dgs;
b411b363
PR
1514 }
1515
841ce241
AG
1516 if (!expect(data_size != 0))
1517 return NULL;
1518 if (!expect(IS_ALIGNED(data_size, 512)))
1519 return NULL;
1520 if (!expect(data_size <= DRBD_MAX_BIO_SIZE))
1521 return NULL;
b411b363 1522
6666032a
LE
1523 /* even though we trust out peer,
1524 * we sometimes have to double check. */
1525 if (sector + (data_size>>9) > capacity) {
fdda6544
LE
1526 dev_err(DEV, "request from peer beyond end of local disk: "
1527 "capacity: %llus < sector: %llus + size: %u\n",
6666032a
LE
1528 (unsigned long long)capacity,
1529 (unsigned long long)sector, data_size);
1530 return NULL;
1531 }
1532
b411b363
PR
1533 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1534 * "criss-cross" setup, that might cause write-out on some other DRBD,
1535 * which in turn might block on the other node at this very place. */
0db55363 1536 peer_req = drbd_alloc_peer_req(mdev, id, sector, data_size, GFP_NOIO);
db830c46 1537 if (!peer_req)
b411b363 1538 return NULL;
45bb912b 1539
b411b363 1540 ds = data_size;
db830c46 1541 page = peer_req->pages;
45bb912b
LE
1542 page_chain_for_each(page) {
1543 unsigned len = min_t(int, ds, PAGE_SIZE);
6b4388ac 1544 data = kmap(page);
a5c31904 1545 err = drbd_recv_all_warn(mdev->tconn, data, len);
0cf9d27e 1546 if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) {
6b4388ac
PR
1547 dev_err(DEV, "Fault injection: Corrupting data on receive\n");
1548 data[0] = data[0] ^ (unsigned long)-1;
1549 }
b411b363 1550 kunmap(page);
a5c31904 1551 if (err) {
3967deb1 1552 drbd_free_peer_req(mdev, peer_req);
b411b363
PR
1553 return NULL;
1554 }
a5c31904 1555 ds -= len;
b411b363
PR
1556 }
1557
1558 if (dgs) {
5b614abe 1559 drbd_csum_ee(mdev, mdev->tconn->peer_integrity_tfm, peer_req, dig_vv);
b411b363 1560 if (memcmp(dig_in, dig_vv, dgs)) {
470be44a
LE
1561 dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n",
1562 (unsigned long long)sector, data_size);
3967deb1 1563 drbd_free_peer_req(mdev, peer_req);
b411b363
PR
1564 return NULL;
1565 }
1566 }
1567 mdev->recv_cnt += data_size>>9;
db830c46 1568 return peer_req;
b411b363
PR
1569}
1570
1571/* drbd_drain_block() just takes a data block
1572 * out of the socket input buffer, and discards it.
1573 */
1574static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1575{
1576 struct page *page;
a5c31904 1577 int err = 0;
b411b363
PR
1578 void *data;
1579
c3470cde 1580 if (!data_size)
fc5be839 1581 return 0;
c3470cde 1582
c37c8ecf 1583 page = drbd_alloc_pages(mdev, 1, 1);
b411b363
PR
1584
1585 data = kmap(page);
1586 while (data_size) {
fc5be839
AG
1587 unsigned int len = min_t(int, data_size, PAGE_SIZE);
1588
a5c31904
AG
1589 err = drbd_recv_all_warn(mdev->tconn, data, len);
1590 if (err)
b411b363 1591 break;
a5c31904 1592 data_size -= len;
b411b363
PR
1593 }
1594 kunmap(page);
5cc287e0 1595 drbd_free_pages(mdev, page, 0);
fc5be839 1596 return err;
b411b363
PR
1597}
1598
1599static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1600 sector_t sector, int data_size)
1601{
1602 struct bio_vec *bvec;
1603 struct bio *bio;
a5c31904 1604 int dgs, err, i, expect;
a0638456
PR
1605 void *dig_in = mdev->tconn->int_dig_in;
1606 void *dig_vv = mdev->tconn->int_dig_vv;
b411b363 1607
88104ca4
AG
1608 dgs = 0;
1609 if (mdev->tconn->peer_integrity_tfm) {
1610 dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
a5c31904
AG
1611 err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs);
1612 if (err)
1613 return err;
88104ca4 1614 data_size -= dgs;
b411b363
PR
1615 }
1616
b411b363
PR
1617 /* optimistically update recv_cnt. if receiving fails below,
1618 * we disconnect anyways, and counters will be reset. */
1619 mdev->recv_cnt += data_size>>9;
1620
1621 bio = req->master_bio;
1622 D_ASSERT(sector == bio->bi_sector);
1623
1624 bio_for_each_segment(bvec, bio, i) {
a5c31904 1625 void *mapped = kmap(bvec->bv_page) + bvec->bv_offset;
b411b363 1626 expect = min_t(int, data_size, bvec->bv_len);
a5c31904 1627 err = drbd_recv_all_warn(mdev->tconn, mapped, expect);
b411b363 1628 kunmap(bvec->bv_page);
a5c31904
AG
1629 if (err)
1630 return err;
1631 data_size -= expect;
b411b363
PR
1632 }
1633
1634 if (dgs) {
5b614abe 1635 drbd_csum_bio(mdev, mdev->tconn->peer_integrity_tfm, bio, dig_vv);
b411b363
PR
1636 if (memcmp(dig_in, dig_vv, dgs)) {
1637 dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
28284cef 1638 return -EINVAL;
b411b363
PR
1639 }
1640 }
1641
1642 D_ASSERT(data_size == 0);
28284cef 1643 return 0;
b411b363
PR
1644}
1645
a990be46
AG
1646/*
1647 * e_end_resync_block() is called in asender context via
1648 * drbd_finish_peer_reqs().
1649 */
99920dc5 1650static int e_end_resync_block(struct drbd_work *w, int unused)
b411b363 1651{
8050e6d0
AG
1652 struct drbd_peer_request *peer_req =
1653 container_of(w, struct drbd_peer_request, w);
00d56944 1654 struct drbd_conf *mdev = w->mdev;
db830c46 1655 sector_t sector = peer_req->i.sector;
99920dc5 1656 int err;
b411b363 1657
db830c46 1658 D_ASSERT(drbd_interval_empty(&peer_req->i));
b411b363 1659
db830c46
AG
1660 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1661 drbd_set_in_sync(mdev, sector, peer_req->i.size);
99920dc5 1662 err = drbd_send_ack(mdev, P_RS_WRITE_ACK, peer_req);
b411b363
PR
1663 } else {
1664 /* Record failure to sync */
db830c46 1665 drbd_rs_failed_io(mdev, sector, peer_req->i.size);
b411b363 1666
99920dc5 1667 err = drbd_send_ack(mdev, P_NEG_ACK, peer_req);
b411b363
PR
1668 }
1669 dec_unacked(mdev);
1670
99920dc5 1671 return err;
b411b363
PR
1672}
1673
1674static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
1675{
db830c46 1676 struct drbd_peer_request *peer_req;
b411b363 1677
db830c46
AG
1678 peer_req = read_in_block(mdev, ID_SYNCER, sector, data_size);
1679 if (!peer_req)
45bb912b 1680 goto fail;
b411b363
PR
1681
1682 dec_rs_pending(mdev);
1683
b411b363
PR
1684 inc_unacked(mdev);
1685 /* corresponding dec_unacked() in e_end_resync_block()
1686 * respective _drbd_clear_done_ee */
1687
db830c46 1688 peer_req->w.cb = e_end_resync_block;
45bb912b 1689
87eeee41 1690 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 1691 list_add(&peer_req->w.list, &mdev->sync_ee);
87eeee41 1692 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 1693
0f0601f4 1694 atomic_add(data_size >> 9, &mdev->rs_sect_ev);
fbe29dec 1695 if (drbd_submit_peer_request(mdev, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
e1c1b0fc 1696 return 0;
b411b363 1697
10f6d992
LE
1698 /* don't care for the reason here */
1699 dev_err(DEV, "submit failed, triggering re-connect\n");
87eeee41 1700 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 1701 list_del(&peer_req->w.list);
87eeee41 1702 spin_unlock_irq(&mdev->tconn->req_lock);
22cc37a9 1703
3967deb1 1704 drbd_free_peer_req(mdev, peer_req);
45bb912b
LE
1705fail:
1706 put_ldev(mdev);
e1c1b0fc 1707 return -EIO;
b411b363
PR
1708}
1709
668eebc6 1710static struct drbd_request *
bc9c5c41
AG
1711find_request(struct drbd_conf *mdev, struct rb_root *root, u64 id,
1712 sector_t sector, bool missing_ok, const char *func)
51624585 1713{
51624585
AG
1714 struct drbd_request *req;
1715
bc9c5c41
AG
1716 /* Request object according to our peer */
1717 req = (struct drbd_request *)(unsigned long)id;
5e472264 1718 if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
668eebc6 1719 return req;
c3afd8f5 1720 if (!missing_ok) {
5af172ed 1721 dev_err(DEV, "%s: failed to find request 0x%lx, sector %llus\n", func,
c3afd8f5
AG
1722 (unsigned long)id, (unsigned long long)sector);
1723 }
51624585
AG
1724 return NULL;
1725}
1726
4a76b161 1727static int receive_DataReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 1728{
4a76b161 1729 struct drbd_conf *mdev;
b411b363
PR
1730 struct drbd_request *req;
1731 sector_t sector;
82bc0194 1732 int err;
e658983a 1733 struct p_data *p = pi->data;
4a76b161
AG
1734
1735 mdev = vnr_to_mdev(tconn, pi->vnr);
1736 if (!mdev)
1737 return -EIO;
b411b363
PR
1738
1739 sector = be64_to_cpu(p->sector);
1740
87eeee41 1741 spin_lock_irq(&mdev->tconn->req_lock);
bc9c5c41 1742 req = find_request(mdev, &mdev->read_requests, p->block_id, sector, false, __func__);
87eeee41 1743 spin_unlock_irq(&mdev->tconn->req_lock);
c3afd8f5 1744 if (unlikely(!req))
82bc0194 1745 return -EIO;
b411b363 1746
24c4830c 1747 /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
b411b363
PR
1748 * special casing it there for the various failure cases.
1749 * still no race with drbd_fail_pending_reads */
e2857216 1750 err = recv_dless_read(mdev, req, sector, pi->size);
82bc0194 1751 if (!err)
8554df1c 1752 req_mod(req, DATA_RECEIVED);
b411b363
PR
1753 /* else: nothing. handled from drbd_disconnect...
1754 * I don't think we may complete this just yet
1755 * in case we are "on-disconnect: freeze" */
1756
82bc0194 1757 return err;
b411b363
PR
1758}
1759
4a76b161 1760static int receive_RSDataReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 1761{
4a76b161 1762 struct drbd_conf *mdev;
b411b363 1763 sector_t sector;
82bc0194 1764 int err;
e658983a 1765 struct p_data *p = pi->data;
4a76b161
AG
1766
1767 mdev = vnr_to_mdev(tconn, pi->vnr);
1768 if (!mdev)
1769 return -EIO;
b411b363
PR
1770
1771 sector = be64_to_cpu(p->sector);
1772 D_ASSERT(p->block_id == ID_SYNCER);
1773
1774 if (get_ldev(mdev)) {
1775 /* data is submitted to disk within recv_resync_read.
1776 * corresponding put_ldev done below on error,
fcefa62e 1777 * or in drbd_peer_request_endio. */
e2857216 1778 err = recv_resync_read(mdev, sector, pi->size);
b411b363
PR
1779 } else {
1780 if (__ratelimit(&drbd_ratelimit_state))
1781 dev_err(DEV, "Can not write resync data to local disk.\n");
1782
e2857216 1783 err = drbd_drain_block(mdev, pi->size);
b411b363 1784
e2857216 1785 drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size);
b411b363
PR
1786 }
1787
e2857216 1788 atomic_add(pi->size >> 9, &mdev->rs_sect_in);
778f271d 1789
82bc0194 1790 return err;
b411b363
PR
1791}
1792
7be8da07
AG
1793static void restart_conflicting_writes(struct drbd_conf *mdev,
1794 sector_t sector, int size)
1795{
1796 struct drbd_interval *i;
1797 struct drbd_request *req;
1798
1799 drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
1800 if (!i->local)
1801 continue;
1802 req = container_of(i, struct drbd_request, i);
1803 if (req->rq_state & RQ_LOCAL_PENDING ||
1804 !(req->rq_state & RQ_POSTPONED))
1805 continue;
2312f0b3
LE
1806 /* as it is RQ_POSTPONED, this will cause it to
1807 * be queued on the retry workqueue. */
1808 __req_mod(req, DISCARD_WRITE, NULL);
7be8da07
AG
1809 }
1810}
1811
a990be46
AG
1812/*
1813 * e_end_block() is called in asender context via drbd_finish_peer_reqs().
b411b363 1814 */
99920dc5 1815static int e_end_block(struct drbd_work *w, int cancel)
b411b363 1816{
8050e6d0
AG
1817 struct drbd_peer_request *peer_req =
1818 container_of(w, struct drbd_peer_request, w);
00d56944 1819 struct drbd_conf *mdev = w->mdev;
db830c46 1820 sector_t sector = peer_req->i.sector;
99920dc5 1821 int err = 0, pcmd;
b411b363 1822
303d1448 1823 if (peer_req->flags & EE_SEND_WRITE_ACK) {
db830c46 1824 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
b411b363
PR
1825 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1826 mdev->state.conn <= C_PAUSED_SYNC_T &&
db830c46 1827 peer_req->flags & EE_MAY_SET_IN_SYNC) ?
b411b363 1828 P_RS_WRITE_ACK : P_WRITE_ACK;
99920dc5 1829 err = drbd_send_ack(mdev, pcmd, peer_req);
b411b363 1830 if (pcmd == P_RS_WRITE_ACK)
db830c46 1831 drbd_set_in_sync(mdev, sector, peer_req->i.size);
b411b363 1832 } else {
99920dc5 1833 err = drbd_send_ack(mdev, P_NEG_ACK, peer_req);
b411b363
PR
1834 /* we expect it to be marked out of sync anyways...
1835 * maybe assert this? */
1836 }
1837 dec_unacked(mdev);
1838 }
1839 /* we delete from the conflict detection hash _after_ we sent out the
1840 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
302bdeae 1841 if (peer_req->flags & EE_IN_INTERVAL_TREE) {
87eeee41 1842 spin_lock_irq(&mdev->tconn->req_lock);
db830c46
AG
1843 D_ASSERT(!drbd_interval_empty(&peer_req->i));
1844 drbd_remove_epoch_entry_interval(mdev, peer_req);
7be8da07
AG
1845 if (peer_req->flags & EE_RESTART_REQUESTS)
1846 restart_conflicting_writes(mdev, sector, peer_req->i.size);
87eeee41 1847 spin_unlock_irq(&mdev->tconn->req_lock);
bb3bfe96 1848 } else
db830c46 1849 D_ASSERT(drbd_interval_empty(&peer_req->i));
b411b363 1850
1e9dd291 1851 drbd_may_finish_epoch(mdev->tconn, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
b411b363 1852
99920dc5 1853 return err;
b411b363
PR
1854}
1855
7be8da07 1856static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
b411b363 1857{
7be8da07 1858 struct drbd_conf *mdev = w->mdev;
8050e6d0
AG
1859 struct drbd_peer_request *peer_req =
1860 container_of(w, struct drbd_peer_request, w);
99920dc5 1861 int err;
b411b363 1862
99920dc5 1863 err = drbd_send_ack(mdev, ack, peer_req);
b411b363
PR
1864 dec_unacked(mdev);
1865
99920dc5 1866 return err;
b411b363
PR
1867}
1868
99920dc5 1869static int e_send_discard_write(struct drbd_work *w, int unused)
7be8da07
AG
1870{
1871 return e_send_ack(w, P_DISCARD_WRITE);
1872}
1873
99920dc5 1874static int e_send_retry_write(struct drbd_work *w, int unused)
7be8da07
AG
1875{
1876 struct drbd_tconn *tconn = w->mdev->tconn;
1877
1878 return e_send_ack(w, tconn->agreed_pro_version >= 100 ?
1879 P_RETRY_WRITE : P_DISCARD_WRITE);
1880}
1881
3e394da1
AG
1882static bool seq_greater(u32 a, u32 b)
1883{
1884 /*
1885 * We assume 32-bit wrap-around here.
1886 * For 24-bit wrap-around, we would have to shift:
1887 * a <<= 8; b <<= 8;
1888 */
1889 return (s32)a - (s32)b > 0;
1890}
1891
1892static u32 seq_max(u32 a, u32 b)
1893{
1894 return seq_greater(a, b) ? a : b;
1895}
1896
7be8da07
AG
1897static bool need_peer_seq(struct drbd_conf *mdev)
1898{
1899 struct drbd_tconn *tconn = mdev->tconn;
302bdeae 1900 int tp;
7be8da07
AG
1901
1902 /*
1903 * We only need to keep track of the last packet_seq number of our peer
1904 * if we are in dual-primary mode and we have the discard flag set; see
1905 * handle_write_conflicts().
1906 */
302bdeae
PR
1907
1908 rcu_read_lock();
1909 tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
1910 rcu_read_unlock();
1911
1912 return tp && test_bit(DISCARD_CONCURRENT, &tconn->flags);
7be8da07
AG
1913}
1914
43ae077d 1915static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq)
3e394da1 1916{
3c13b680 1917 unsigned int newest_peer_seq;
3e394da1 1918
7be8da07
AG
1919 if (need_peer_seq(mdev)) {
1920 spin_lock(&mdev->peer_seq_lock);
3c13b680
LE
1921 newest_peer_seq = seq_max(mdev->peer_seq, peer_seq);
1922 mdev->peer_seq = newest_peer_seq;
7be8da07 1923 spin_unlock(&mdev->peer_seq_lock);
3c13b680
LE
1924 /* wake up only if we actually changed mdev->peer_seq */
1925 if (peer_seq == newest_peer_seq)
7be8da07
AG
1926 wake_up(&mdev->seq_wait);
1927 }
3e394da1
AG
1928}
1929
d93f6302
LE
1930static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
1931{
1932 return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
1933}
1934
1935/* maybe change sync_ee into interval trees as well? */
3ea35df8 1936static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_peer_request *peer_req)
d93f6302
LE
1937{
1938 struct drbd_peer_request *rs_req;
1939 bool rv = 0;
1940
1941 spin_lock_irq(&mdev->tconn->req_lock);
1942 list_for_each_entry(rs_req, &mdev->sync_ee, w.list) {
1943 if (overlaps(peer_req->i.sector, peer_req->i.size,
1944 rs_req->i.sector, rs_req->i.size)) {
1945 rv = 1;
1946 break;
1947 }
1948 }
1949 spin_unlock_irq(&mdev->tconn->req_lock);
1950
d93f6302
LE
1951 return rv;
1952}
1953
b411b363
PR
1954/* Called from receive_Data.
1955 * Synchronize packets on sock with packets on msock.
1956 *
 1957 * This is here so that even when a P_DATA packet traveling via sock overtakes an Ack
 1958 * packet traveling on msock, they are still processed in the order they
 1959 * were sent.
1960 *
1961 * Note: we don't care for Ack packets overtaking P_DATA packets.
1962 *
1963 * In case packet_seq is larger than mdev->peer_seq number, there are
1964 * outstanding packets on the msock. We wait for them to arrive.
1965 * In case we are the logically next packet, we update mdev->peer_seq
1966 * ourselves. Correctly handles 32bit wrap around.
1967 *
1968 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
1969 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
1970 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
 1971 * 1<<11 == 2048 seconds aka ages for the 32bit wrap around...
1972 *
1973 * returns 0 if we may process the packet,
1974 * -ERESTARTSYS if we were interrupted (by disconnect signal). */
7be8da07 1975static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_seq)
b411b363
PR
1976{
1977 DEFINE_WAIT(wait);
b411b363 1978 long timeout;
7be8da07
AG
1979 int ret;
1980
1981 if (!need_peer_seq(mdev))
1982 return 0;
1983
b411b363
PR
1984 spin_lock(&mdev->peer_seq_lock);
1985 for (;;) {
7be8da07
AG
1986 if (!seq_greater(peer_seq - 1, mdev->peer_seq)) {
1987 mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq);
1988 ret = 0;
b411b363 1989 break;
7be8da07 1990 }
b411b363
PR
1991 if (signal_pending(current)) {
1992 ret = -ERESTARTSYS;
1993 break;
1994 }
7be8da07 1995 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
b411b363 1996 spin_unlock(&mdev->peer_seq_lock);
44ed167d
PR
1997 rcu_read_lock();
1998 timeout = rcu_dereference(mdev->tconn->net_conf)->ping_timeo*HZ/10;
1999 rcu_read_unlock();
71b1c1eb 2000 timeout = schedule_timeout(timeout);
b411b363 2001 spin_lock(&mdev->peer_seq_lock);
7be8da07 2002 if (!timeout) {
b411b363 2003 ret = -ETIMEDOUT;
71b1c1eb 2004 dev_err(DEV, "Timed out waiting for missing ack packets; disconnecting\n");
b411b363
PR
2005 break;
2006 }
2007 }
b411b363 2008 spin_unlock(&mdev->peer_seq_lock);
7be8da07 2009 finish_wait(&mdev->seq_wait, &wait);
b411b363
PR
2010 return ret;
2011}
2012
688593c5
LE
2013/* see also bio_flags_to_wire()
2014 * DRBD_REQ_*, because we need to semantically map the flags to data packet
2015 * flags and back. We may replicate to other kernel versions. */
2016static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
76d2e7ec 2017{
688593c5
LE
2018 return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
2019 (dpf & DP_FUA ? REQ_FUA : 0) |
2020 (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
2021 (dpf & DP_DISCARD ? REQ_DISCARD : 0);
76d2e7ec
PR
2022}
2023
7be8da07
AG
2024static void fail_postponed_requests(struct drbd_conf *mdev, sector_t sector,
2025 unsigned int size)
2026{
2027 struct drbd_interval *i;
2028
2029 repeat:
2030 drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
2031 struct drbd_request *req;
2032 struct bio_and_error m;
2033
2034 if (!i->local)
2035 continue;
2036 req = container_of(i, struct drbd_request, i);
2037 if (!(req->rq_state & RQ_POSTPONED))
2038 continue;
2039 req->rq_state &= ~RQ_POSTPONED;
2040 __req_mod(req, NEG_ACKED, &m);
2041 spin_unlock_irq(&mdev->tconn->req_lock);
2042 if (m.bio)
2043 complete_master_bio(mdev, &m);
2044 spin_lock_irq(&mdev->tconn->req_lock);
2045 goto repeat;
2046 }
2047}
2048
2049static int handle_write_conflicts(struct drbd_conf *mdev,
2050 struct drbd_peer_request *peer_req)
2051{
2052 struct drbd_tconn *tconn = mdev->tconn;
2053 bool resolve_conflicts = test_bit(DISCARD_CONCURRENT, &tconn->flags);
2054 sector_t sector = peer_req->i.sector;
2055 const unsigned int size = peer_req->i.size;
2056 struct drbd_interval *i;
2057 bool equal;
2058 int err;
2059
2060 /*
2061 * Inserting the peer request into the write_requests tree will prevent
2062 * new conflicting local requests from being added.
2063 */
2064 drbd_insert_interval(&mdev->write_requests, &peer_req->i);
2065
2066 repeat:
2067 drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
2068 if (i == &peer_req->i)
2069 continue;
2070
2071 if (!i->local) {
2072 /*
2073 * Our peer has sent a conflicting remote request; this
2074 * should not happen in a two-node setup. Wait for the
2075 * earlier peer request to complete.
2076 */
2077 err = drbd_wait_misc(mdev, i);
2078 if (err)
2079 goto out;
2080 goto repeat;
2081 }
2082
2083 equal = i->sector == sector && i->size == size;
2084 if (resolve_conflicts) {
2085 /*
2086 * If the peer request is fully contained within the
2087 * overlapping request, it can be discarded; otherwise,
2088 * it will be retried once all overlapping requests
2089 * have completed.
2090 */
2091 bool discard = i->sector <= sector && i->sector +
2092 (i->size >> 9) >= sector + (size >> 9);
2093
2094 if (!equal)
2095 dev_alert(DEV, "Concurrent writes detected: "
2096 "local=%llus +%u, remote=%llus +%u, "
2097 "assuming %s came first\n",
2098 (unsigned long long)i->sector, i->size,
2099 (unsigned long long)sector, size,
2100 discard ? "local" : "remote");
2101
2102 inc_unacked(mdev);
2103 peer_req->w.cb = discard ? e_send_discard_write :
2104 e_send_retry_write;
2105 list_add_tail(&peer_req->w.list, &mdev->done_ee);
2106 wake_asender(mdev->tconn);
2107
2108 err = -ENOENT;
2109 goto out;
2110 } else {
2111 struct drbd_request *req =
2112 container_of(i, struct drbd_request, i);
2113
2114 if (!equal)
2115 dev_alert(DEV, "Concurrent writes detected: "
2116 "local=%llus +%u, remote=%llus +%u\n",
2117 (unsigned long long)i->sector, i->size,
2118 (unsigned long long)sector, size);
2119
2120 if (req->rq_state & RQ_LOCAL_PENDING ||
2121 !(req->rq_state & RQ_POSTPONED)) {
2122 /*
2123 * Wait for the node with the discard flag to
2124 * decide if this request will be discarded or
2125 * retried. Requests that are discarded will
2126 * disappear from the write_requests tree.
2127 *
2128 * In addition, wait for the conflicting
2129 * request to finish locally before submitting
2130 * the conflicting peer request.
2131 */
2132 err = drbd_wait_misc(mdev, &req->i);
2133 if (err) {
2134 _conn_request_state(mdev->tconn,
2135 NS(conn, C_TIMEOUT),
2136 CS_HARD);
2137 fail_postponed_requests(mdev, sector, size);
2138 goto out;
2139 }
2140 goto repeat;
2141 }
2142 /*
2143 * Remember to restart the conflicting requests after
2144 * the new peer request has completed.
2145 */
2146 peer_req->flags |= EE_RESTART_REQUESTS;
2147 }
2148 }
2149 err = 0;
2150
2151 out:
2152 if (err)
2153 drbd_remove_epoch_entry_interval(mdev, peer_req);
2154 return err;
2155}
2156
b411b363 2157/* mirrored write */
4a76b161 2158static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 2159{
4a76b161 2160 struct drbd_conf *mdev;
b411b363 2161 sector_t sector;
db830c46 2162 struct drbd_peer_request *peer_req;
e658983a 2163 struct p_data *p = pi->data;
7be8da07 2164 u32 peer_seq = be32_to_cpu(p->seq_num);
b411b363
PR
2165 int rw = WRITE;
2166 u32 dp_flags;
302bdeae 2167 int err, tp;
b411b363 2168
4a76b161
AG
2169 mdev = vnr_to_mdev(tconn, pi->vnr);
2170 if (!mdev)
2171 return -EIO;
2172
7be8da07 2173 if (!get_ldev(mdev)) {
82bc0194
AG
2174 int err2;
2175
7be8da07 2176 err = wait_for_and_update_peer_seq(mdev, peer_seq);
e2857216 2177 drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size);
12038a3a 2178 atomic_inc(&tconn->current_epoch->epoch_size);
e2857216 2179 err2 = drbd_drain_block(mdev, pi->size);
82bc0194
AG
2180 if (!err)
2181 err = err2;
2182 return err;
b411b363
PR
2183 }
2184
fcefa62e
AG
2185 /*
2186 * Corresponding put_ldev done either below (on various errors), or in
2187 * drbd_peer_request_endio, if we successfully submit the data at the
2188 * end of this function.
2189 */
b411b363
PR
2190
2191 sector = be64_to_cpu(p->sector);
e2857216 2192 peer_req = read_in_block(mdev, p->block_id, sector, pi->size);
db830c46 2193 if (!peer_req) {
b411b363 2194 put_ldev(mdev);
82bc0194 2195 return -EIO;
b411b363
PR
2196 }
2197
db830c46 2198 peer_req->w.cb = e_end_block;
b411b363 2199
688593c5
LE
2200 dp_flags = be32_to_cpu(p->dp_flags);
2201 rw |= wire_flags_to_bio(mdev, dp_flags);
2202
2203 if (dp_flags & DP_MAY_SET_IN_SYNC)
db830c46 2204 peer_req->flags |= EE_MAY_SET_IN_SYNC;
688593c5 2205
12038a3a
PR
2206 spin_lock(&tconn->epoch_lock);
2207 peer_req->epoch = tconn->current_epoch;
db830c46
AG
2208 atomic_inc(&peer_req->epoch->epoch_size);
2209 atomic_inc(&peer_req->epoch->active);
12038a3a 2210 spin_unlock(&tconn->epoch_lock);
b411b363 2211
302bdeae
PR
2212 rcu_read_lock();
2213 tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
2214 rcu_read_unlock();
2215 if (tp) {
2216 peer_req->flags |= EE_IN_INTERVAL_TREE;
7be8da07
AG
2217 err = wait_for_and_update_peer_seq(mdev, peer_seq);
2218 if (err)
b411b363 2219 goto out_interrupted;
87eeee41 2220 spin_lock_irq(&mdev->tconn->req_lock);
7be8da07
AG
2221 err = handle_write_conflicts(mdev, peer_req);
2222 if (err) {
2223 spin_unlock_irq(&mdev->tconn->req_lock);
2224 if (err == -ENOENT) {
b411b363 2225 put_ldev(mdev);
82bc0194 2226 return 0;
b411b363 2227 }
7be8da07 2228 goto out_interrupted;
b411b363 2229 }
7be8da07
AG
2230 } else
2231 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 2232 list_add(&peer_req->w.list, &mdev->active_ee);
87eeee41 2233 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 2234
d93f6302 2235 if (mdev->state.conn == C_SYNC_TARGET)
3ea35df8 2236 wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, peer_req));
d93f6302 2237
303d1448 2238 if (mdev->tconn->agreed_pro_version < 100) {
44ed167d
PR
2239 rcu_read_lock();
2240 switch (rcu_dereference(mdev->tconn->net_conf)->wire_protocol) {
303d1448
PR
2241 case DRBD_PROT_C:
2242 dp_flags |= DP_SEND_WRITE_ACK;
2243 break;
2244 case DRBD_PROT_B:
2245 dp_flags |= DP_SEND_RECEIVE_ACK;
2246 break;
2247 }
44ed167d 2248 rcu_read_unlock();
303d1448
PR
2249 }
2250
2251 if (dp_flags & DP_SEND_WRITE_ACK) {
2252 peer_req->flags |= EE_SEND_WRITE_ACK;
b411b363
PR
2253 inc_unacked(mdev);
2254 /* corresponding dec_unacked() in e_end_block()
2255 * respective _drbd_clear_done_ee */
303d1448
PR
2256 }
2257
2258 if (dp_flags & DP_SEND_RECEIVE_ACK) {
b411b363
PR
2259 /* I really don't like it that the receiver thread
2260 * sends on the msock, but anyways */
db830c46 2261 drbd_send_ack(mdev, P_RECV_ACK, peer_req);
b411b363
PR
2262 }
2263
6719fb03 2264 if (mdev->state.pdsk < D_INCONSISTENT) {
b411b363 2265 /* In case we have the only disk of the cluster, */
db830c46
AG
2266 drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size);
2267 peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
2268 peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
181286ad 2269 drbd_al_begin_io(mdev, &peer_req->i);
b411b363
PR
2270 }
2271
82bc0194
AG
2272 err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR);
2273 if (!err)
2274 return 0;
b411b363 2275
10f6d992
LE
2276 /* don't care for the reason here */
2277 dev_err(DEV, "submit failed, triggering re-connect\n");
87eeee41 2278 spin_lock_irq(&mdev->tconn->req_lock);
db830c46
AG
2279 list_del(&peer_req->w.list);
2280 drbd_remove_epoch_entry_interval(mdev, peer_req);
87eeee41 2281 spin_unlock_irq(&mdev->tconn->req_lock);
db830c46 2282 if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
181286ad 2283 drbd_al_complete_io(mdev, &peer_req->i);
22cc37a9 2284
b411b363 2285out_interrupted:
1e9dd291 2286 drbd_may_finish_epoch(tconn, peer_req->epoch, EV_PUT + EV_CLEANUP);
b411b363 2287 put_ldev(mdev);
3967deb1 2288 drbd_free_peer_req(mdev, peer_req);
82bc0194 2289 return err;
b411b363
PR
2290}
2291
0f0601f4
LE
2292/* We may throttle resync, if the lower device seems to be busy,
2293 * and current sync rate is above c_min_rate.
2294 *
 2295 * To decide whether or not the lower device is busy, we use a scheme similar
 2296 * to MD RAID's is_mddev_idle(): if the partition stats reveal a "significant"
 2297 * amount (more than 64 sectors) of activity that we cannot account for with our
 2298 * own resync activity, it obviously is "busy".
2299 *
2300 * The current sync rate used here uses only the most recent two step marks,
2301 * to have a short time average so we can react faster.
2302 */
e3555d85 2303int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector)
0f0601f4
LE
2304{
2305 struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
2306 unsigned long db, dt, dbdt;
e3555d85 2307 struct lc_element *tmp;
0f0601f4
LE
2308 int curr_events;
2309 int throttle = 0;
daeda1cc
PR
2310 unsigned int c_min_rate;
2311
2312 rcu_read_lock();
2313 c_min_rate = rcu_dereference(mdev->ldev->disk_conf)->c_min_rate;
2314 rcu_read_unlock();
0f0601f4
LE
2315
2316 /* feature disabled? */
daeda1cc 2317 if (c_min_rate == 0)
0f0601f4
LE
2318 return 0;
2319
e3555d85
PR
2320 spin_lock_irq(&mdev->al_lock);
2321 tmp = lc_find(mdev->resync, BM_SECT_TO_EXT(sector));
2322 if (tmp) {
2323 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
2324 if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
2325 spin_unlock_irq(&mdev->al_lock);
2326 return 0;
2327 }
2328 /* Do not slow down if app IO is already waiting for this extent */
2329 }
2330 spin_unlock_irq(&mdev->al_lock);
2331
0f0601f4
LE
2332 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
2333 (int)part_stat_read(&disk->part0, sectors[1]) -
2334 atomic_read(&mdev->rs_sect_ev);
e3555d85 2335
0f0601f4
LE
2336 if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
2337 unsigned long rs_left;
2338 int i;
2339
2340 mdev->rs_last_events = curr_events;
2341
2342 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2343 * approx. */
2649f080
LE
2344 i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
2345
2346 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
2347 rs_left = mdev->ov_left;
2348 else
2349 rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
0f0601f4
LE
2350
2351 dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
2352 if (!dt)
2353 dt++;
2354 db = mdev->rs_mark_left[i] - rs_left;
2355 dbdt = Bit2KB(db/dt);
2356
daeda1cc 2357 if (dbdt > c_min_rate)
0f0601f4
LE
2358 throttle = 1;
2359 }
2360 return throttle;
2361}
2362
2363
4a76b161 2364static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 2365{
4a76b161 2366 struct drbd_conf *mdev;
b411b363 2367 sector_t sector;
4a76b161 2368 sector_t capacity;
db830c46 2369 struct drbd_peer_request *peer_req;
b411b363 2370 struct digest_info *di = NULL;
b18b37be 2371 int size, verb;
b411b363 2372 unsigned int fault_type;
e658983a 2373 struct p_block_req *p = pi->data;
4a76b161
AG
2374
2375 mdev = vnr_to_mdev(tconn, pi->vnr);
2376 if (!mdev)
2377 return -EIO;
2378 capacity = drbd_get_capacity(mdev->this_bdev);
b411b363
PR
2379
2380 sector = be64_to_cpu(p->sector);
2381 size = be32_to_cpu(p->blksize);
2382
c670a398 2383 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
b411b363
PR
2384 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2385 (unsigned long long)sector, size);
82bc0194 2386 return -EINVAL;
b411b363
PR
2387 }
2388 if (sector + (size>>9) > capacity) {
2389 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2390 (unsigned long long)sector, size);
82bc0194 2391 return -EINVAL;
b411b363
PR
2392 }
2393
2394 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
b18b37be 2395 verb = 1;
e2857216 2396 switch (pi->cmd) {
b18b37be
PR
2397 case P_DATA_REQUEST:
2398 drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
2399 break;
2400 case P_RS_DATA_REQUEST:
2401 case P_CSUM_RS_REQUEST:
2402 case P_OV_REQUEST:
2403 drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
2404 break;
2405 case P_OV_REPLY:
2406 verb = 0;
2407 dec_rs_pending(mdev);
2408 drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
2409 break;
2410 default:
49ba9b1b 2411 BUG();
b18b37be
PR
2412 }
2413 if (verb && __ratelimit(&drbd_ratelimit_state))
b411b363
PR
2414 dev_err(DEV, "Can not satisfy peer's read request, "
2415 "no local data.\n");
b18b37be 2416
a821cc4a 2417 /* drain the possibly present payload */
e2857216 2418 return drbd_drain_block(mdev, pi->size);
b411b363
PR
2419 }
2420
2421 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2422 * "criss-cross" setup, that might cause write-out on some other DRBD,
2423 * which in turn might block on the other node at this very place. */
0db55363 2424 peer_req = drbd_alloc_peer_req(mdev, p->block_id, sector, size, GFP_NOIO);
db830c46 2425 if (!peer_req) {
b411b363 2426 put_ldev(mdev);
82bc0194 2427 return -ENOMEM;
b411b363
PR
2428 }
2429
e2857216 2430 switch (pi->cmd) {
b411b363 2431 case P_DATA_REQUEST:
db830c46 2432 peer_req->w.cb = w_e_end_data_req;
b411b363 2433 fault_type = DRBD_FAULT_DT_RD;
80a40e43
LE
2434 /* application IO, don't drbd_rs_begin_io */
2435 goto submit;
2436
b411b363 2437 case P_RS_DATA_REQUEST:
db830c46 2438 peer_req->w.cb = w_e_end_rsdata_req;
b411b363 2439 fault_type = DRBD_FAULT_RS_RD;
5f9915bb
LE
2440 /* used in the sector offset progress display */
2441 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
b411b363
PR
2442 break;
2443
2444 case P_OV_REPLY:
2445 case P_CSUM_RS_REQUEST:
2446 fault_type = DRBD_FAULT_RS_RD;
e2857216 2447 di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
b411b363
PR
2448 if (!di)
2449 goto out_free_e;
2450
e2857216 2451 di->digest_size = pi->size;
b411b363
PR
2452 di->digest = (((char *)di)+sizeof(struct digest_info));
2453
db830c46
AG
2454 peer_req->digest = di;
2455 peer_req->flags |= EE_HAS_DIGEST;
c36c3ced 2456
e2857216 2457 if (drbd_recv_all(mdev->tconn, di->digest, pi->size))
b411b363
PR
2458 goto out_free_e;
2459
e2857216 2460 if (pi->cmd == P_CSUM_RS_REQUEST) {
31890f4a 2461 D_ASSERT(mdev->tconn->agreed_pro_version >= 89);
db830c46 2462 peer_req->w.cb = w_e_end_csum_rs_req;
5f9915bb
LE
2463 /* used in the sector offset progress display */
2464 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
e2857216 2465 } else if (pi->cmd == P_OV_REPLY) {
2649f080
LE
2466 /* track progress, we may need to throttle */
2467 atomic_add(size >> 9, &mdev->rs_sect_in);
db830c46 2468 peer_req->w.cb = w_e_end_ov_reply;
b411b363 2469 dec_rs_pending(mdev);
0f0601f4
LE
2470 /* drbd_rs_begin_io done when we sent this request,
2471 * but accounting still needs to be done. */
2472 goto submit_for_resync;
b411b363
PR
2473 }
2474 break;
2475
2476 case P_OV_REQUEST:
b411b363 2477 if (mdev->ov_start_sector == ~(sector_t)0 &&
31890f4a 2478 mdev->tconn->agreed_pro_version >= 90) {
de228bba
LE
2479 unsigned long now = jiffies;
2480 int i;
b411b363
PR
2481 mdev->ov_start_sector = sector;
2482 mdev->ov_position = sector;
30b743a2
LE
2483 mdev->ov_left = drbd_bm_bits(mdev) - BM_SECT_TO_BIT(sector);
2484 mdev->rs_total = mdev->ov_left;
de228bba
LE
2485 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2486 mdev->rs_mark_left[i] = mdev->ov_left;
2487 mdev->rs_mark_time[i] = now;
2488 }
b411b363
PR
2489 dev_info(DEV, "Online Verify start sector: %llu\n",
2490 (unsigned long long)sector);
2491 }
db830c46 2492 peer_req->w.cb = w_e_end_ov_req;
b411b363 2493 fault_type = DRBD_FAULT_RS_RD;
b411b363
PR
2494 break;
2495
b411b363 2496 default:
49ba9b1b 2497 BUG();
b411b363
PR
2498 }
2499
0f0601f4
LE
2500 /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2501 * wrt the receiver, but it is not as straightforward as it may seem.
2502 * Various places in the resync start and stop logic assume resync
2503 * requests are processed in order, requeuing this on the worker thread
2504 * introduces a bunch of new code for synchronization between threads.
2505 *
2506 * Unlimited throttling before drbd_rs_begin_io may stall the resync
2507 * "forever", throttling after drbd_rs_begin_io will lock that extent
2508 * for application writes for the same time. For now, just throttle
2509 * here, where the rest of the code expects the receiver to sleep for
2510 * a while, anyways.
2511 */
2512
2513 /* Throttle before drbd_rs_begin_io, as that locks out application IO;
 2514 * this defers syncer requests for some time, before letting at least
 2515 * one request through. The resync controller on the receiving side
2516 * will adapt to the incoming rate accordingly.
2517 *
2518 * We cannot throttle here if remote is Primary/SyncTarget:
2519 * we would also throttle its application reads.
2520 * In that case, throttling is done on the SyncTarget only.
2521 */
e3555d85
PR
2522 if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev, sector))
2523 schedule_timeout_uninterruptible(HZ/10);
2524 if (drbd_rs_begin_io(mdev, sector))
80a40e43 2525 goto out_free_e;
b411b363 2526
0f0601f4
LE
2527submit_for_resync:
2528 atomic_add(size >> 9, &mdev->rs_sect_ev);
2529
80a40e43 2530submit:
b411b363 2531 inc_unacked(mdev);
87eeee41 2532 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 2533 list_add_tail(&peer_req->w.list, &mdev->read_ee);
87eeee41 2534 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 2535
fbe29dec 2536 if (drbd_submit_peer_request(mdev, peer_req, READ, fault_type) == 0)
82bc0194 2537 return 0;
b411b363 2538
10f6d992
LE
2539 /* don't care for the reason here */
2540 dev_err(DEV, "submit failed, triggering re-connect\n");
87eeee41 2541 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 2542 list_del(&peer_req->w.list);
87eeee41 2543 spin_unlock_irq(&mdev->tconn->req_lock);
22cc37a9
LE
2544 /* no drbd_rs_complete_io(), we are dropping the connection anyways */
2545
b411b363 2546out_free_e:
b411b363 2547 put_ldev(mdev);
3967deb1 2548 drbd_free_peer_req(mdev, peer_req);
82bc0194 2549 return -EIO;
b411b363
PR
2550}
2551
2552static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2553{
2554 int self, peer, rv = -100;
2555 unsigned long ch_self, ch_peer;
44ed167d 2556 enum drbd_after_sb_p after_sb_0p;
b411b363
PR
2557
2558 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2559 peer = mdev->p_uuid[UI_BITMAP] & 1;
2560
2561 ch_peer = mdev->p_uuid[UI_SIZE];
2562 ch_self = mdev->comm_bm_set;
2563
44ed167d
PR
2564 rcu_read_lock();
2565 after_sb_0p = rcu_dereference(mdev->tconn->net_conf)->after_sb_0p;
2566 rcu_read_unlock();
2567 switch (after_sb_0p) {
b411b363
PR
2568 case ASB_CONSENSUS:
2569 case ASB_DISCARD_SECONDARY:
2570 case ASB_CALL_HELPER:
44ed167d 2571 case ASB_VIOLENTLY:
b411b363
PR
2572 dev_err(DEV, "Configuration error.\n");
2573 break;
2574 case ASB_DISCONNECT:
2575 break;
2576 case ASB_DISCARD_YOUNGER_PRI:
2577 if (self == 0 && peer == 1) {
2578 rv = -1;
2579 break;
2580 }
2581 if (self == 1 && peer == 0) {
2582 rv = 1;
2583 break;
2584 }
2585 /* Else fall through to one of the other strategies... */
2586 case ASB_DISCARD_OLDER_PRI:
2587 if (self == 0 && peer == 1) {
2588 rv = 1;
2589 break;
2590 }
2591 if (self == 1 && peer == 0) {
2592 rv = -1;
2593 break;
2594 }
2595 /* Else fall through to one of the other strategies... */
ad19bf6e 2596 dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
b411b363
PR
2597 "Using discard-least-changes instead\n");
2598 case ASB_DISCARD_ZERO_CHG:
2599 if (ch_peer == 0 && ch_self == 0) {
25703f83 2600 rv = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags)
b411b363
PR
2601 ? -1 : 1;
2602 break;
2603 } else {
2604 if (ch_peer == 0) { rv = 1; break; }
2605 if (ch_self == 0) { rv = -1; break; }
2606 }
44ed167d 2607 if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
b411b363
PR
2608 break;
2609 case ASB_DISCARD_LEAST_CHG:
2610 if (ch_self < ch_peer)
2611 rv = -1;
2612 else if (ch_self > ch_peer)
2613 rv = 1;
2614 else /* ( ch_self == ch_peer ) */
2615 /* Well, then use something else. */
25703f83 2616 rv = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags)
b411b363
PR
2617 ? -1 : 1;
2618 break;
2619 case ASB_DISCARD_LOCAL:
2620 rv = -1;
2621 break;
2622 case ASB_DISCARD_REMOTE:
2623 rv = 1;
2624 }
2625
2626 return rv;
2627}
2628
2629static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2630{
6184ea21 2631 int hg, rv = -100;
44ed167d 2632 enum drbd_after_sb_p after_sb_1p;
b411b363 2633
44ed167d
PR
2634 rcu_read_lock();
2635 after_sb_1p = rcu_dereference(mdev->tconn->net_conf)->after_sb_1p;
2636 rcu_read_unlock();
2637 switch (after_sb_1p) {
b411b363
PR
2638 case ASB_DISCARD_YOUNGER_PRI:
2639 case ASB_DISCARD_OLDER_PRI:
2640 case ASB_DISCARD_LEAST_CHG:
2641 case ASB_DISCARD_LOCAL:
2642 case ASB_DISCARD_REMOTE:
44ed167d 2643 case ASB_DISCARD_ZERO_CHG:
b411b363
PR
2644 dev_err(DEV, "Configuration error.\n");
2645 break;
2646 case ASB_DISCONNECT:
2647 break;
2648 case ASB_CONSENSUS:
2649 hg = drbd_asb_recover_0p(mdev);
2650 if (hg == -1 && mdev->state.role == R_SECONDARY)
2651 rv = hg;
2652 if (hg == 1 && mdev->state.role == R_PRIMARY)
2653 rv = hg;
2654 break;
2655 case ASB_VIOLENTLY:
2656 rv = drbd_asb_recover_0p(mdev);
2657 break;
2658 case ASB_DISCARD_SECONDARY:
2659 return mdev->state.role == R_PRIMARY ? 1 : -1;
2660 case ASB_CALL_HELPER:
2661 hg = drbd_asb_recover_0p(mdev);
2662 if (hg == -1 && mdev->state.role == R_PRIMARY) {
bb437946
AG
2663 enum drbd_state_rv rv2;
2664
2665 drbd_set_role(mdev, R_SECONDARY, 0);
b411b363
PR
2666 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2667 * we might be here in C_WF_REPORT_PARAMS which is transient.
2668 * we do not need to wait for the after state change work either. */
bb437946
AG
2669 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2670 if (rv2 != SS_SUCCESS) {
b411b363
PR
2671 drbd_khelper(mdev, "pri-lost-after-sb");
2672 } else {
2673 dev_warn(DEV, "Successfully gave up primary role.\n");
2674 rv = hg;
2675 }
2676 } else
2677 rv = hg;
2678 }
2679
2680 return rv;
2681}
2682
2683static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2684{
6184ea21 2685 int hg, rv = -100;
44ed167d 2686 enum drbd_after_sb_p after_sb_2p;
b411b363 2687
44ed167d
PR
2688 rcu_read_lock();
2689 after_sb_2p = rcu_dereference(mdev->tconn->net_conf)->after_sb_2p;
2690 rcu_read_unlock();
2691 switch (after_sb_2p) {
b411b363
PR
2692 case ASB_DISCARD_YOUNGER_PRI:
2693 case ASB_DISCARD_OLDER_PRI:
2694 case ASB_DISCARD_LEAST_CHG:
2695 case ASB_DISCARD_LOCAL:
2696 case ASB_DISCARD_REMOTE:
2697 case ASB_CONSENSUS:
2698 case ASB_DISCARD_SECONDARY:
44ed167d 2699 case ASB_DISCARD_ZERO_CHG:
b411b363
PR
2700 dev_err(DEV, "Configuration error.\n");
2701 break;
2702 case ASB_VIOLENTLY:
2703 rv = drbd_asb_recover_0p(mdev);
2704 break;
2705 case ASB_DISCONNECT:
2706 break;
2707 case ASB_CALL_HELPER:
2708 hg = drbd_asb_recover_0p(mdev);
2709 if (hg == -1) {
bb437946
AG
2710 enum drbd_state_rv rv2;
2711
b411b363
PR
2712 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2713 * we might be here in C_WF_REPORT_PARAMS which is transient.
2714 * we do not need to wait for the after state change work either. */
bb437946
AG
2715 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2716 if (rv2 != SS_SUCCESS) {
b411b363
PR
2717 drbd_khelper(mdev, "pri-lost-after-sb");
2718 } else {
2719 dev_warn(DEV, "Successfully gave up primary role.\n");
2720 rv = hg;
2721 }
2722 } else
2723 rv = hg;
2724 }
2725
2726 return rv;
2727}
2728
2729static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2730 u64 bits, u64 flags)
2731{
2732 if (!uuid) {
2733 dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
2734 return;
2735 }
2736 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2737 text,
2738 (unsigned long long)uuid[UI_CURRENT],
2739 (unsigned long long)uuid[UI_BITMAP],
2740 (unsigned long long)uuid[UI_HISTORY_START],
2741 (unsigned long long)uuid[UI_HISTORY_END],
2742 (unsigned long long)bits,
2743 (unsigned long long)flags);
2744}
2745
2746/*
2747 100 after split brain try auto recover
2748 2 C_SYNC_SOURCE set BitMap
2749 1 C_SYNC_SOURCE use BitMap
2750 0 no Sync
2751 -1 C_SYNC_TARGET use BitMap
2752 -2 C_SYNC_TARGET set BitMap
2753 -100 after split brain, disconnect
2754-1000 unrelated data
4a23f264
PR
2755-1091 requires proto 91
2756-1096 requires proto 96
b411b363
PR
2757 */
2758static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2759{
2760 u64 self, peer;
2761 int i, j;
2762
2763 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2764 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2765
2766 *rule_nr = 10;
2767 if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2768 return 0;
2769
2770 *rule_nr = 20;
2771 if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2772 peer != UUID_JUST_CREATED)
2773 return -2;
2774
2775 *rule_nr = 30;
2776 if (self != UUID_JUST_CREATED &&
2777 (peer == UUID_JUST_CREATED || peer == (u64)0))
2778 return 2;
2779
2780 if (self == peer) {
2781 int rct, dc; /* roles at crash time */
2782
2783 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2784
31890f4a 2785 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2786 return -1091;
b411b363
PR
2787
2788 if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2789 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2790 dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
2791 drbd_uuid_set_bm(mdev, 0UL);
2792
2793 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2794 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2795 *rule_nr = 34;
2796 } else {
2797 dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
2798 *rule_nr = 36;
2799 }
2800
2801 return 1;
2802 }
2803
2804 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2805
31890f4a 2806 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2807 return -1091;
b411b363
PR
2808
2809 if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2810 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2811 dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2812
2813 mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
2814 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
2815 mdev->p_uuid[UI_BITMAP] = 0UL;
2816
2817 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2818 *rule_nr = 35;
2819 } else {
2820 dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
2821 *rule_nr = 37;
2822 }
2823
2824 return -1;
2825 }
2826
2827 /* Common power [off|failure] */
2828 rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
2829 (mdev->p_uuid[UI_FLAGS] & 2);
2830 /* lowest bit is set when we were primary,
2831 * next bit (weight 2) is set when peer was primary */
2832 *rule_nr = 40;
2833
2834 switch (rct) {
2835 case 0: /* !self_pri && !peer_pri */ return 0;
2836 case 1: /* self_pri && !peer_pri */ return 1;
2837 case 2: /* !self_pri && peer_pri */ return -1;
2838 case 3: /* self_pri && peer_pri */
25703f83 2839 dc = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags);
b411b363
PR
2840 return dc ? -1 : 1;
2841 }
2842 }
2843
2844 *rule_nr = 50;
2845 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2846 if (self == peer)
2847 return -1;
2848
2849 *rule_nr = 51;
2850 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2851 if (self == peer) {
31890f4a 2852 if (mdev->tconn->agreed_pro_version < 96 ?
4a23f264
PR
2853 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
2854 (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
2855 peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) {
b411b363
PR
 2856 /* The last P_SYNC_UUID did not get through. Undo the sync-source
 2857 modifications of the peer's UUIDs made at the last start of resync. */
2858
31890f4a 2859 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2860 return -1091;
b411b363
PR
2861
2862 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2863 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
4a23f264 2864
1882e22d 2865 dev_info(DEV, "Lost last syncUUID packet, corrected:\n");
4a23f264
PR
2866 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2867
b411b363
PR
2868 return -1;
2869 }
2870 }
2871
2872 *rule_nr = 60;
2873 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2874 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2875 peer = mdev->p_uuid[i] & ~((u64)1);
2876 if (self == peer)
2877 return -2;
2878 }
2879
2880 *rule_nr = 70;
2881 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2882 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2883 if (self == peer)
2884 return 1;
2885
2886 *rule_nr = 71;
2887 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2888 if (self == peer) {
31890f4a 2889 if (mdev->tconn->agreed_pro_version < 96 ?
4a23f264
PR
2890 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
2891 (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
2892 self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
b411b363
PR
 2893 /* The last P_SYNC_UUID did not get through. Undo the sync-source
 2894 modifications of our UUIDs made at the last start of resync. */
2895
31890f4a 2896 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2897 return -1091;
b411b363
PR
2898
2899 _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2900 _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
2901
4a23f264 2902 dev_info(DEV, "Last syncUUID did not get through, corrected:\n");
b411b363
PR
2903 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2904 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2905
2906 return 1;
2907 }
2908 }
2909
2910
2911 *rule_nr = 80;
d8c2a36b 2912 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
b411b363
PR
2913 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2914 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2915 if (self == peer)
2916 return 2;
2917 }
2918
2919 *rule_nr = 90;
2920 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2921 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2922 if (self == peer && self != ((u64)0))
2923 return 100;
2924
2925 *rule_nr = 100;
2926 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2927 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2928 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
2929 peer = mdev->p_uuid[j] & ~((u64)1);
2930 if (self == peer)
2931 return -100;
2932 }
2933 }
2934
2935 return -1000;
2936}
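/*
 * Illustrative decoding (hypothetical helper) of the "requires proto"
 * return values listed in the table above drbd_uuid_compare(): values
 * below -1000 encode the minimum required protocol version as
 * -(hg + 1000), e.g. -1091 -> 91 and -1096 -> 96, which matches how
 * drbd_sync_handshake() reports it.
 */
static int demo_required_proto(int hg)
{
	return hg < -1000 ? -hg - 1000 : 0;
}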
2937
2938/* drbd_sync_handshake() returns the new conn state on success, or
2939 CONN_MASK (-1) on failure.
2940 */
2941static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
2942 enum drbd_disk_state peer_disk) __must_hold(local)
2943{
b411b363
PR
2944 enum drbd_conns rv = C_MASK;
2945 enum drbd_disk_state mydisk;
44ed167d 2946 struct net_conf *nc;
6dff2902 2947 int hg, rule_nr, rr_conflict, tentative;
b411b363
PR
2948
2949 mydisk = mdev->state.disk;
2950 if (mydisk == D_NEGOTIATING)
2951 mydisk = mdev->new_state_tmp.disk;
2952
2953 dev_info(DEV, "drbd_sync_handshake:\n");
2954 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
2955 drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
2956 mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2957
2958 hg = drbd_uuid_compare(mdev, &rule_nr);
2959
2960 dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
2961
2962 if (hg == -1000) {
2963 dev_alert(DEV, "Unrelated data, aborting!\n");
2964 return C_MASK;
2965 }
4a23f264
PR
2966 if (hg < -1000) {
2967 dev_alert(DEV, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
b411b363
PR
2968 return C_MASK;
2969 }
2970
2971 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
2972 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
2973 int f = (hg == -100) || abs(hg) == 2;
2974 hg = mydisk > D_INCONSISTENT ? 1 : -1;
2975 if (f)
2976 hg = hg*2;
2977 dev_info(DEV, "Becoming sync %s due to disk states.\n",
2978 hg > 0 ? "source" : "target");
2979 }
2980
3a11a487
AG
2981 if (abs(hg) == 100)
2982 drbd_khelper(mdev, "initial-split-brain");
2983
44ed167d
PR
2984 rcu_read_lock();
2985 nc = rcu_dereference(mdev->tconn->net_conf);
2986
2987 if (hg == 100 || (hg == -100 && nc->always_asbp)) {
b411b363
PR
2988 int pcount = (mdev->state.role == R_PRIMARY)
2989 + (peer_role == R_PRIMARY);
2990 int forced = (hg == -100);
2991
2992 switch (pcount) {
2993 case 0:
2994 hg = drbd_asb_recover_0p(mdev);
2995 break;
2996 case 1:
2997 hg = drbd_asb_recover_1p(mdev);
2998 break;
2999 case 2:
3000 hg = drbd_asb_recover_2p(mdev);
3001 break;
3002 }
3003 if (abs(hg) < 100) {
3004 dev_warn(DEV, "Split-Brain detected, %d primaries, "
3005 "automatically solved. Sync from %s node\n",
3006 pcount, (hg < 0) ? "peer" : "this");
3007 if (forced) {
3008 dev_warn(DEV, "Doing a full sync, since"
3009 " UUIDs where ambiguous.\n");
3010 hg = hg*2;
3011 }
3012 }
3013 }
3014
3015 if (hg == -100) {
08b165ba 3016 if (test_bit(DISCARD_MY_DATA, &mdev->flags) && !(mdev->p_uuid[UI_FLAGS]&1))
b411b363 3017 hg = -1;
08b165ba 3018 if (!test_bit(DISCARD_MY_DATA, &mdev->flags) && (mdev->p_uuid[UI_FLAGS]&1))
b411b363
PR
3019 hg = 1;
3020
3021 if (abs(hg) < 100)
3022 dev_warn(DEV, "Split-Brain detected, manually solved. "
3023 "Sync from %s node\n",
3024 (hg < 0) ? "peer" : "this");
3025 }
44ed167d 3026 rr_conflict = nc->rr_conflict;
6dff2902 3027 tentative = nc->tentative;
44ed167d 3028 rcu_read_unlock();
b411b363
PR
3029
3030 if (hg == -100) {
580b9767
LE
3031 /* FIXME this log message is not correct if we end up here
3032 * after an attempted attach on a diskless node.
3033 * We just refuse to attach -- well, we drop the "connection"
3034 * to that disk, in a way... */
3a11a487 3035 dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
b411b363
PR
3036 drbd_khelper(mdev, "split-brain");
3037 return C_MASK;
3038 }
3039
3040 if (hg > 0 && mydisk <= D_INCONSISTENT) {
3041 dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
3042 return C_MASK;
3043 }
3044
3045 if (hg < 0 && /* by intention we do not use mydisk here. */
3046 mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
44ed167d 3047 switch (rr_conflict) {
b411b363
PR
3048 case ASB_CALL_HELPER:
3049 drbd_khelper(mdev, "pri-lost");
3050 /* fall through */
3051 case ASB_DISCONNECT:
3052 dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
3053 return C_MASK;
3054 case ASB_VIOLENTLY:
3055 dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
3056 "assumption\n");
3057 }
3058 }
3059
6dff2902 3060 if (tentative || test_bit(CONN_DRY_RUN, &mdev->tconn->flags)) {
cf14c2e9
PR
3061 if (hg == 0)
3062 dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
3063 else
3064 dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.",
3065 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
3066 abs(hg) >= 2 ? "full" : "bit-map based");
3067 return C_MASK;
3068 }
3069
b411b363
PR
3070 if (abs(hg) >= 2) {
3071 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
20ceb2b2
LE
3072 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
3073 BM_LOCKED_SET_ALLOWED))
b411b363
PR
3074 return C_MASK;
3075 }
3076
3077 if (hg > 0) { /* become sync source. */
3078 rv = C_WF_BITMAP_S;
3079 } else if (hg < 0) { /* become sync target */
3080 rv = C_WF_BITMAP_T;
3081 } else {
3082 rv = C_CONNECTED;
3083 if (drbd_bm_total_weight(mdev)) {
3084 dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
3085 drbd_bm_total_weight(mdev));
3086 }
3087 }
3088
3089 return rv;
3090}
3091
f179d76d 3092static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
b411b363
PR
3093{
3094 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
f179d76d
PR
3095 if (peer == ASB_DISCARD_REMOTE)
3096 return ASB_DISCARD_LOCAL;
b411b363
PR
3097
3098 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
f179d76d
PR
3099 if (peer == ASB_DISCARD_LOCAL)
3100 return ASB_DISCARD_REMOTE;
b411b363
PR
3101
3102 /* everything else is valid if they are equal on both sides. */
f179d76d 3103 return peer;
b411b363
PR
3104}
3105
e2857216 3106static int receive_protocol(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3107{
e658983a 3108 struct p_protocol *p = pi->data;
036b17ea
PR
3109 enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
3110 int p_proto, p_discard_my_data, p_two_primaries, cf;
3111 struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
3112 char integrity_alg[SHARED_SECRET_MAX] = "";
accdbcc5 3113 struct crypto_hash *peer_integrity_tfm = NULL;
7aca6c75 3114 void *int_dig_in = NULL, *int_dig_vv = NULL;
b411b363 3115
b411b363
PR
3116 p_proto = be32_to_cpu(p->protocol);
3117 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
3118 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
3119 p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
b411b363 3120 p_two_primaries = be32_to_cpu(p->two_primaries);
cf14c2e9 3121 cf = be32_to_cpu(p->conn_flags);
6139f60d 3122 p_discard_my_data = cf & CF_DISCARD_MY_DATA;
cf14c2e9 3123
86db0618
AG
3124 if (tconn->agreed_pro_version >= 87) {
3125 int err;
3126
88104ca4 3127 if (pi->size > sizeof(integrity_alg))
86db0618 3128 return -EIO;
88104ca4 3129 err = drbd_recv_all(tconn, integrity_alg, pi->size);
86db0618
AG
3130 if (err)
3131 return err;
036b17ea
PR
3132 integrity_alg[SHARED_SECRET_MAX - 1] = 0;
3133 }
88104ca4 3134
7d4c782c 3135 if (pi->cmd != P_PROTOCOL_UPDATE) {
fbc12f45 3136 clear_bit(CONN_DRY_RUN, &tconn->flags);
036b17ea 3137
fbc12f45
AG
3138 if (cf & CF_DRY_RUN)
3139 set_bit(CONN_DRY_RUN, &tconn->flags);
cf14c2e9 3140
fbc12f45
AG
3141 rcu_read_lock();
3142 nc = rcu_dereference(tconn->net_conf);
b411b363 3143
fbc12f45 3144 if (p_proto != nc->wire_protocol) {
d505d9be 3145 conn_err(tconn, "incompatible %s settings\n", "protocol");
fbc12f45
AG
3146 goto disconnect_rcu_unlock;
3147 }
44ed167d 3148
fbc12f45 3149 if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
d505d9be 3150 conn_err(tconn, "incompatible %s settings\n", "after-sb-0pri");
fbc12f45
AG
3151 goto disconnect_rcu_unlock;
3152 }
b411b363 3153
fbc12f45 3154 if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
d505d9be 3155 conn_err(tconn, "incompatible %s settings\n", "after-sb-1pri");
fbc12f45
AG
3156 goto disconnect_rcu_unlock;
3157 }
b411b363 3158
fbc12f45 3159 if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
d505d9be 3160 conn_err(tconn, "incompatible %s settings\n", "after-sb-2pri");
fbc12f45
AG
3161 goto disconnect_rcu_unlock;
3162 }
b411b363 3163
fbc12f45 3164 if (p_discard_my_data && nc->discard_my_data) {
d505d9be 3165 conn_err(tconn, "incompatible %s settings\n", "discard-my-data");
fbc12f45
AG
3166 goto disconnect_rcu_unlock;
3167 }
b411b363 3168
fbc12f45 3169 if (p_two_primaries != nc->two_primaries) {
d505d9be 3170 conn_err(tconn, "incompatible %s settings\n", "allow-two-primaries");
fbc12f45
AG
3171 goto disconnect_rcu_unlock;
3172 }
b411b363 3173
fbc12f45 3174 if (strcmp(integrity_alg, nc->integrity_alg)) {
d505d9be 3175 conn_err(tconn, "incompatible %s settings\n", "data-integrity-alg");
fbc12f45
AG
3176 goto disconnect_rcu_unlock;
3177 }
b411b363 3178
fbc12f45 3179 rcu_read_unlock();
036b17ea 3180 }
7d4c782c
AG
3181
3182 if (integrity_alg[0]) {
3183 int hash_size;
3184
3185 /*
3186 * We can only change the peer data integrity algorithm
3187 * here. Changing our own data integrity algorithm
3188 * requires that we send a P_PROTOCOL_UPDATE packet at
3189 * the same time; otherwise, the peer has no way to
3190 * tell between which packets the algorithm should
3191 * change.
3192 */
3193
3194 peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
3195 if (!peer_integrity_tfm) {
3196 conn_err(tconn, "peer data-integrity-alg %s not supported\n",
3197 integrity_alg);
3198 goto disconnect;
3199 }
3200
3201 hash_size = crypto_hash_digestsize(peer_integrity_tfm);
3202 int_dig_in = kmalloc(hash_size, GFP_KERNEL);
3203 int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
3204 if (!(int_dig_in && int_dig_vv)) {
3205 conn_err(tconn, "Allocation of buffers for data integrity checking failed\n");
3206 goto disconnect;
3207 }
3208 }
3209
3210 new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
3211 if (!new_net_conf) {
3212 conn_err(tconn, "Allocation of new net_conf failed\n");
3213 goto disconnect;
3214 }
3215
3216 mutex_lock(&tconn->data.mutex);
3217 mutex_lock(&tconn->conf_update);
3218 old_net_conf = tconn->net_conf;
3219 *new_net_conf = *old_net_conf;
3220
3221 new_net_conf->wire_protocol = p_proto;
3222 new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
3223 new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
3224 new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
3225 new_net_conf->two_primaries = p_two_primaries;
3226
3227 rcu_assign_pointer(tconn->net_conf, new_net_conf);
3228 mutex_unlock(&tconn->conf_update);
3229 mutex_unlock(&tconn->data.mutex);
3230
3231 crypto_free_hash(tconn->peer_integrity_tfm);
3232 kfree(tconn->int_dig_in);
3233 kfree(tconn->int_dig_vv);
3234 tconn->peer_integrity_tfm = peer_integrity_tfm;
3235 tconn->int_dig_in = int_dig_in;
3236 tconn->int_dig_vv = int_dig_vv;
3237
3238 if (strcmp(old_net_conf->integrity_alg, integrity_alg))
3239 conn_info(tconn, "peer data-integrity-alg: %s\n",
3240 integrity_alg[0] ? integrity_alg : "(none)");
3241
3242 synchronize_rcu();
3243 kfree(old_net_conf);
82bc0194 3244 return 0;
b411b363 3245
44ed167d
PR
3246disconnect_rcu_unlock:
3247 rcu_read_unlock();
b411b363 3248disconnect:
b792c35c 3249 crypto_free_hash(peer_integrity_tfm);
036b17ea
PR
3250 kfree(int_dig_in);
3251 kfree(int_dig_vv);
7204624c 3252 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3253 return -EIO;
b411b363
PR
3254}
3255
3256/* helper function
3257 * input: alg name, feature name
3258 * return: NULL (alg name was "")
3259 * ERR_PTR(error) if something goes wrong
3260 * or the crypto hash ptr, if it worked out ok. */
3261struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
3262 const char *alg, const char *name)
3263{
3264 struct crypto_hash *tfm;
3265
3266 if (!alg[0])
3267 return NULL;
3268
3269 tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
3270 if (IS_ERR(tfm)) {
3271 dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
3272 alg, name, PTR_ERR(tfm));
3273 return tfm;
3274 }
b411b363
PR
3275 return tfm;
3276}
3277
4a76b161
AG
3278static int ignore_remaining_packet(struct drbd_tconn *tconn, struct packet_info *pi)
3279{
3280 void *buffer = tconn->data.rbuf;
3281 int size = pi->size;
3282
3283 while (size) {
3284 int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
3285 s = drbd_recv(tconn, buffer, s);
3286 if (s <= 0) {
3287 if (s < 0)
3288 return s;
3289 break;
3290 }
3291 size -= s;
3292 }
3293 if (size)
3294 return -EIO;
3295 return 0;
3296}
3297
3298/*
3299 * config_unknown_volume - device configuration command for unknown volume
3300 *
3301 * When a device is added to an existing connection, the node on which the
3302 * device is added first will send configuration commands to its peer but the
3303 * peer will not know about the device yet. It will warn and ignore these
3304 * commands. Once the device is added on the second node, the second node will
3305 * send the same device configuration commands, but in the other direction.
3306 *
3307 * (We can also end up here if drbd is misconfigured.)
3308 */
3309static int config_unknown_volume(struct drbd_tconn *tconn, struct packet_info *pi)
3310{
2fcb8f30
AG
3311 conn_warn(tconn, "%s packet received for volume %u, which is not configured locally\n",
3312 cmdname(pi->cmd), pi->vnr);
4a76b161
AG
3313 return ignore_remaining_packet(tconn, pi);
3314}
3315
3316static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3317{
4a76b161 3318 struct drbd_conf *mdev;
e658983a 3319 struct p_rs_param_95 *p;
b411b363
PR
3320 unsigned int header_size, data_size, exp_max_sz;
3321 struct crypto_hash *verify_tfm = NULL;
3322 struct crypto_hash *csums_tfm = NULL;
2ec91e0e 3323 struct net_conf *old_net_conf, *new_net_conf = NULL;
813472ce 3324 struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
4a76b161 3325 const int apv = tconn->agreed_pro_version;
813472ce 3326 struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
778f271d 3327 int fifo_size = 0;
82bc0194 3328 int err;
b411b363 3329
4a76b161
AG
3330 mdev = vnr_to_mdev(tconn, pi->vnr);
3331 if (!mdev)
3332 return config_unknown_volume(tconn, pi);
3333
b411b363
PR
3334 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
3335 : apv == 88 ? sizeof(struct p_rs_param)
3336 + SHARED_SECRET_MAX
8e26f9cc
PR
3337 : apv <= 94 ? sizeof(struct p_rs_param_89)
3338 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
b411b363 3339
e2857216 3340 if (pi->size > exp_max_sz) {
b411b363 3341 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
e2857216 3342 pi->size, exp_max_sz);
82bc0194 3343 return -EIO;
b411b363
PR
3344 }
3345
3346 if (apv <= 88) {
e658983a 3347 header_size = sizeof(struct p_rs_param);
e2857216 3348 data_size = pi->size - header_size;
8e26f9cc 3349 } else if (apv <= 94) {
e658983a 3350 header_size = sizeof(struct p_rs_param_89);
e2857216 3351 data_size = pi->size - header_size;
b411b363 3352 D_ASSERT(data_size == 0);
8e26f9cc 3353 } else {
e658983a 3354 header_size = sizeof(struct p_rs_param_95);
e2857216 3355 data_size = pi->size - header_size;
b411b363
PR
3356 D_ASSERT(data_size == 0);
3357 }
3358
3359 /* initialize verify_alg and csums_alg */
e658983a 3360 p = pi->data;
b411b363
PR
3361 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
3362
e658983a 3363 err = drbd_recv_all(mdev->tconn, p, header_size);
82bc0194
AG
3364 if (err)
3365 return err;
b411b363 3366
daeda1cc
PR
3367 mutex_lock(&mdev->tconn->conf_update);
3368 old_net_conf = mdev->tconn->net_conf;
813472ce
PR
3369 if (get_ldev(mdev)) {
3370 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3371 if (!new_disk_conf) {
3372 put_ldev(mdev);
3373 mutex_unlock(&mdev->tconn->conf_update);
3374 dev_err(DEV, "Allocation of new disk_conf failed\n");
3375 return -ENOMEM;
3376 }
daeda1cc 3377
813472ce
PR
3378 old_disk_conf = mdev->ldev->disk_conf;
3379 *new_disk_conf = *old_disk_conf;
3380
6394b935 3381 new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
813472ce 3382 }
daeda1cc 3383
b411b363
PR
3384 if (apv >= 88) {
3385 if (apv == 88) {
e4bad1bc
PR
3386 if (data_size > SHARED_SECRET_MAX || data_size == 0) {
3387 dev_err(DEV, "verify-alg of wrong size, "
3388 "peer wants %u, accepting only up to %u byte\n",
3389 data_size, SHARED_SECRET_MAX);
813472ce
PR
3390 err = -EIO;
3391 goto reconnect;
b411b363
PR
3392 }
3393
82bc0194 3394 err = drbd_recv_all(mdev->tconn, p->verify_alg, data_size);
813472ce
PR
3395 if (err)
3396 goto reconnect;
b411b363
PR
3397 /* we expect NUL terminated string */
3398 /* but just in case someone tries to be evil */
3399 D_ASSERT(p->verify_alg[data_size-1] == 0);
3400 p->verify_alg[data_size-1] = 0;
3401
3402 } else /* apv >= 89 */ {
3403 /* we still expect NUL terminated strings */
3404 /* but just in case someone tries to be evil */
3405 D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
3406 D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
3407 p->verify_alg[SHARED_SECRET_MAX-1] = 0;
3408 p->csums_alg[SHARED_SECRET_MAX-1] = 0;
3409 }
3410
2ec91e0e 3411 if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
b411b363
PR
3412 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
3413 dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2ec91e0e 3414 old_net_conf->verify_alg, p->verify_alg);
b411b363
PR
3415 goto disconnect;
3416 }
3417 verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
3418 p->verify_alg, "verify-alg");
3419 if (IS_ERR(verify_tfm)) {
3420 verify_tfm = NULL;
3421 goto disconnect;
3422 }
3423 }
3424
2ec91e0e 3425 if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
b411b363
PR
3426 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
3427 dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2ec91e0e 3428 old_net_conf->csums_alg, p->csums_alg);
b411b363
PR
3429 goto disconnect;
3430 }
3431 csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
3432 p->csums_alg, "csums-alg");
3433 if (IS_ERR(csums_tfm)) {
3434 csums_tfm = NULL;
3435 goto disconnect;
3436 }
3437 }
3438
813472ce 3439 if (apv > 94 && new_disk_conf) {
daeda1cc
PR
3440 new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
3441 new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
3442 new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
3443 new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
778f271d 3444
daeda1cc 3445 fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
9958c857 3446 if (fifo_size != mdev->rs_plan_s->size) {
813472ce
PR
3447 new_plan = fifo_alloc(fifo_size);
3448 if (!new_plan) {
778f271d 3449 dev_err(DEV, "kmalloc of fifo_buffer failed");
f399002e 3450 put_ldev(mdev);
778f271d
PR
3451 goto disconnect;
3452 }
3453 }
8e26f9cc 3454 }
b411b363 3455
91fd4dad 3456 if (verify_tfm || csums_tfm) {
2ec91e0e
PR
3457 new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
3458 if (!new_net_conf) {
91fd4dad
PR
3459 dev_err(DEV, "Allocation of new net_conf failed\n");
3460 goto disconnect;
3461 }
3462
2ec91e0e 3463 *new_net_conf = *old_net_conf;
91fd4dad
PR
3464
3465 if (verify_tfm) {
2ec91e0e
PR
3466 strcpy(new_net_conf->verify_alg, p->verify_alg);
3467 new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
91fd4dad
PR
3468 crypto_free_hash(mdev->tconn->verify_tfm);
3469 mdev->tconn->verify_tfm = verify_tfm;
3470 dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
3471 }
3472 if (csums_tfm) {
2ec91e0e
PR
3473 strcpy(new_net_conf->csums_alg, p->csums_alg);
3474 new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
91fd4dad
PR
3475 crypto_free_hash(mdev->tconn->csums_tfm);
3476 mdev->tconn->csums_tfm = csums_tfm;
3477 dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
3478 }
2ec91e0e 3479 rcu_assign_pointer(tconn->net_conf, new_net_conf);
b411b363 3480 }
daeda1cc 3481 }
91fd4dad 3482
813472ce
PR
3483 if (new_disk_conf) {
3484 rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
3485 put_ldev(mdev);
3486 }
3487
3488 if (new_plan) {
3489 old_plan = mdev->rs_plan_s;
3490 rcu_assign_pointer(mdev->rs_plan_s, new_plan);
b411b363 3491 }
daeda1cc
PR
3492
3493 mutex_unlock(&mdev->tconn->conf_update);
3494 synchronize_rcu();
3495 if (new_net_conf)
3496 kfree(old_net_conf);
3497 kfree(old_disk_conf);
813472ce 3498 kfree(old_plan);
daeda1cc 3499
82bc0194 3500 return 0;
b411b363 3501
813472ce
PR
3502reconnect:
3503 if (new_disk_conf) {
3504 put_ldev(mdev);
3505 kfree(new_disk_conf);
3506 }
3507 mutex_unlock(&mdev->tconn->conf_update);
3508 return -EIO;
3509
b411b363 3510disconnect:
813472ce
PR
3511 kfree(new_plan);
3512 if (new_disk_conf) {
3513 put_ldev(mdev);
3514 kfree(new_disk_conf);
3515 }
a0095508 3516 mutex_unlock(&mdev->tconn->conf_update);
b411b363
PR
3517 /* just for completeness: actually not needed,
3518 * as this is not reached if csums_tfm was ok. */
3519 crypto_free_hash(csums_tfm);
3520 /* but free the verify_tfm again, if csums_tfm did not work out */
3521 crypto_free_hash(verify_tfm);
38fa9988 3522 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3523 return -EIO;
b411b363
PR
3524}
3525
b411b363
PR
3526/* warn if the arguments differ by more than 12.5% */
3527static void warn_if_differ_considerably(struct drbd_conf *mdev,
3528 const char *s, sector_t a, sector_t b)
3529{
3530 sector_t d;
3531 if (a == 0 || b == 0)
3532 return;
3533 d = (a > b) ? (a - b) : (b - a);
3534 if (d > (a>>3) || d > (b>>3))
3535 dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
3536 (unsigned long long)a, (unsigned long long)b);
3537}
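/* Example for the a>>3 threshold above (an eighth, i.e. 12.5%):
 * a = 1000 sectors, b = 1200 sectors gives d = 200, which exceeds
 * 1000>>3 = 125, so the warning is printed. */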
3538
4a76b161 3539static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3540{
4a76b161 3541 struct drbd_conf *mdev;
e658983a 3542 struct p_sizes *p = pi->data;
b411b363 3543 enum determine_dev_size dd = unchanged;
b411b363
PR
3544 sector_t p_size, p_usize, my_usize;
3545 int ldsc = 0; /* local disk size changed */
e89b591c 3546 enum dds_flags ddsf;
b411b363 3547
4a76b161
AG
3548 mdev = vnr_to_mdev(tconn, pi->vnr);
3549 if (!mdev)
3550 return config_unknown_volume(tconn, pi);
3551
b411b363
PR
3552 p_size = be64_to_cpu(p->d_size);
3553 p_usize = be64_to_cpu(p->u_size);
3554
b411b363
PR
3555 /* just store the peer's disk size for now.
3556 * we still need to figure out whether we accept that. */
3557 mdev->p_size = p_size;
3558
b411b363 3559 if (get_ldev(mdev)) {
daeda1cc
PR
3560 rcu_read_lock();
3561 my_usize = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
3562 rcu_read_unlock();
3563
b411b363
PR
3564 warn_if_differ_considerably(mdev, "lower level device sizes",
3565 p_size, drbd_get_max_capacity(mdev->ldev));
3566 warn_if_differ_considerably(mdev, "user requested size",
daeda1cc 3567 p_usize, my_usize);
b411b363
PR
3568
3569 /* if this is the first connect, or an otherwise expected
3570 * param exchange, choose the minimum */
3571 if (mdev->state.conn == C_WF_REPORT_PARAMS)
daeda1cc 3572 p_usize = min_not_zero(my_usize, p_usize);
b411b363
PR
3573
3574 /* Never shrink a device with usable data during connect.
3575 But allow online shrinking if we are connected. */
ef5e44a6 3576 if (drbd_new_dev_size(mdev, mdev->ldev, p_usize, 0) <
daeda1cc
PR
3577 drbd_get_capacity(mdev->this_bdev) &&
3578 mdev->state.disk >= D_OUTDATED &&
3579 mdev->state.conn < C_CONNECTED) {
b411b363 3580 dev_err(DEV, "The peer's disk size is too small!\n");
38fa9988 3581 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363 3582 put_ldev(mdev);
82bc0194 3583 return -EIO;
b411b363 3584 }
daeda1cc
PR
3585
3586 if (my_usize != p_usize) {
3587 struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
3588
3589 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3590 if (!new_disk_conf) {
3591 dev_err(DEV, "Allocation of new disk_conf failed\n");
3592 put_ldev(mdev);
3593 return -ENOMEM;
3594 }
3595
3596 mutex_lock(&mdev->tconn->conf_update);
3597 old_disk_conf = mdev->ldev->disk_conf;
3598 *new_disk_conf = *old_disk_conf;
3599 new_disk_conf->disk_size = p_usize;
3600
3601 rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
3602 mutex_unlock(&mdev->tconn->conf_update);
3603 synchronize_rcu();
3604 kfree(old_disk_conf);
3605
3606 dev_info(DEV, "Peer sets u_size to %lu sectors\n",
 3607 (unsigned long)p_usize);
3608 }
3609
b411b363
PR
3610 put_ldev(mdev);
3611 }
b411b363 3612
e89b591c 3613 ddsf = be16_to_cpu(p->dds_flags);
b411b363 3614 if (get_ldev(mdev)) {
24c4830c 3615 dd = drbd_determine_dev_size(mdev, ddsf);
b411b363
PR
3616 put_ldev(mdev);
3617 if (dd == dev_size_error)
82bc0194 3618 return -EIO;
b411b363
PR
3619 drbd_md_sync(mdev);
3620 } else {
3621 /* I am diskless, need to accept the peer's size. */
3622 drbd_set_my_capacity(mdev, p_size);
3623 }
3624
99432fcc
PR
3625 mdev->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
3626 drbd_reconsider_max_bio_size(mdev);
3627
b411b363
PR
3628 if (get_ldev(mdev)) {
3629 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
3630 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
3631 ldsc = 1;
3632 }
3633
b411b363
PR
3634 put_ldev(mdev);
3635 }
3636
3637 if (mdev->state.conn > C_WF_REPORT_PARAMS) {
3638 if (be64_to_cpu(p->c_size) !=
3639 drbd_get_capacity(mdev->this_bdev) || ldsc) {
3640 /* we have different sizes, probably peer
3641 * needs to know my new size... */
e89b591c 3642 drbd_send_sizes(mdev, 0, ddsf);
b411b363
PR
3643 }
3644 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
3645 (dd == grew && mdev->state.conn == C_CONNECTED)) {
3646 if (mdev->state.pdsk >= D_INCONSISTENT &&
e89b591c
PR
3647 mdev->state.disk >= D_INCONSISTENT) {
3648 if (ddsf & DDSF_NO_RESYNC)
3649 dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
3650 else
3651 resync_after_online_grow(mdev);
3652 } else
b411b363
PR
3653 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
3654 }
3655 }
3656
82bc0194 3657 return 0;
b411b363
PR
3658}
3659
4a76b161 3660static int receive_uuids(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3661{
4a76b161 3662 struct drbd_conf *mdev;
e658983a 3663 struct p_uuids *p = pi->data;
b411b363 3664 u64 *p_uuid;
62b0da3a 3665 int i, updated_uuids = 0;
b411b363 3666
4a76b161
AG
3667 mdev = vnr_to_mdev(tconn, pi->vnr);
3668 if (!mdev)
3669 return config_unknown_volume(tconn, pi);
3670
b411b363
PR
3671 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
3672
3673 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
3674 p_uuid[i] = be64_to_cpu(p->uuid[i]);
3675
3676 kfree(mdev->p_uuid);
3677 mdev->p_uuid = p_uuid;
3678
3679 if (mdev->state.conn < C_CONNECTED &&
3680 mdev->state.disk < D_INCONSISTENT &&
3681 mdev->state.role == R_PRIMARY &&
3682 (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
3683 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
3684 (unsigned long long)mdev->ed_uuid);
38fa9988 3685 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3686 return -EIO;
b411b363
PR
3687 }
3688
3689 if (get_ldev(mdev)) {
3690 int skip_initial_sync =
3691 mdev->state.conn == C_CONNECTED &&
31890f4a 3692 mdev->tconn->agreed_pro_version >= 90 &&
b411b363
PR
3693 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
3694 (p_uuid[UI_FLAGS] & 8);
3695 if (skip_initial_sync) {
3696 dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
3697 drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
20ceb2b2
LE
3698 "clear_n_write from receive_uuids",
3699 BM_LOCKED_TEST_ALLOWED);
b411b363
PR
3700 _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
3701 _drbd_uuid_set(mdev, UI_BITMAP, 0);
3702 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3703 CS_VERBOSE, NULL);
3704 drbd_md_sync(mdev);
62b0da3a 3705 updated_uuids = 1;
b411b363
PR
3706 }
3707 put_ldev(mdev);
18a50fa2
PR
3708 } else if (mdev->state.disk < D_INCONSISTENT &&
3709 mdev->state.role == R_PRIMARY) {
3710 /* I am a diskless primary, the peer just created a new current UUID
3711 for me. */
62b0da3a 3712 updated_uuids = drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
b411b363
PR
3713 }
3714
 3715 /* Before we test for the disk state, we should wait until a possibly
 3716 ongoing cluster-wide state change is finished. That is important if
3717 we are primary and are detaching from our disk. We need to see the
3718 new disk state... */
8410da8f
PR
3719 mutex_lock(mdev->state_mutex);
3720 mutex_unlock(mdev->state_mutex);
b411b363 3721 if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
62b0da3a
LE
3722 updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3723
3724 if (updated_uuids)
3725 drbd_print_uuids(mdev, "receiver updated UUIDs to");
b411b363 3726
82bc0194 3727 return 0;
b411b363
PR
3728}
3729
3730/**
3731 * convert_state() - Converts the peer's view of the cluster state to our point of view
3732 * @ps: The state as seen by the peer.
3733 */
3734static union drbd_state convert_state(union drbd_state ps)
3735{
3736 union drbd_state ms;
3737
3738 static enum drbd_conns c_tab[] = {
369bea63 3739 [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
b411b363
PR
3740 [C_CONNECTED] = C_CONNECTED,
3741
3742 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3743 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3744 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3745 [C_VERIFY_S] = C_VERIFY_T,
3746 [C_MASK] = C_MASK,
3747 };
3748
3749 ms.i = ps.i;
3750
3751 ms.conn = c_tab[ps.conn];
3752 ms.peer = ps.role;
3753 ms.role = ps.peer;
3754 ms.pdsk = ps.disk;
3755 ms.disk = ps.pdsk;
3756 ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3757
3758 return ms;
3759}
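/* Illustration of the conversion above: a peer-reported state of
 * { role=Primary, peer=Secondary, disk=UpToDate, pdsk=Inconsistent,
 *   conn=C_STARTING_SYNC_S }
 * becomes, from our point of view,
 * { role=Secondary, peer=Primary, disk=Inconsistent, pdsk=UpToDate,
 *   conn=C_STARTING_SYNC_T },
 * while ms.peer_isp is the OR of the peer's aftr_isp and user_isp flags. */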
3760
4a76b161 3761static int receive_req_state(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3762{
4a76b161 3763 struct drbd_conf *mdev;
e658983a 3764 struct p_req_state *p = pi->data;
b411b363 3765 union drbd_state mask, val;
bf885f8a 3766 enum drbd_state_rv rv;
b411b363 3767
4a76b161
AG
3768 mdev = vnr_to_mdev(tconn, pi->vnr);
3769 if (!mdev)
3770 return -EIO;
3771
b411b363
PR
3772 mask.i = be32_to_cpu(p->mask);
3773 val.i = be32_to_cpu(p->val);
3774
25703f83 3775 if (test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags) &&
8410da8f 3776 mutex_is_locked(mdev->state_mutex)) {
b411b363 3777 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
82bc0194 3778 return 0;
b411b363
PR
3779 }
3780
3781 mask = convert_state(mask);
3782 val = convert_state(val);
3783
dfafcc8a
PR
3784 rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
3785 drbd_send_sr_reply(mdev, rv);
b411b363 3786
b411b363
PR
3787 drbd_md_sync(mdev);
3788
82bc0194 3789 return 0;
b411b363
PR
3790}
3791
e2857216 3792static int receive_req_conn_state(struct drbd_tconn *tconn, struct packet_info *pi)
dfafcc8a 3793{
e658983a 3794 struct p_req_state *p = pi->data;
dfafcc8a
PR
3795 union drbd_state mask, val;
3796 enum drbd_state_rv rv;
3797
3798 mask.i = be32_to_cpu(p->mask);
3799 val.i = be32_to_cpu(p->val);
3800
3801 if (test_bit(DISCARD_CONCURRENT, &tconn->flags) &&
3802 mutex_is_locked(&tconn->cstate_mutex)) {
3803 conn_send_sr_reply(tconn, SS_CONCURRENT_ST_CHG);
82bc0194 3804 return 0;
dfafcc8a
PR
3805 }
3806
3807 mask = convert_state(mask);
3808 val = convert_state(val);
3809
778bcf2e 3810 rv = conn_request_state(tconn, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
dfafcc8a
PR
3811 conn_send_sr_reply(tconn, rv);
3812
82bc0194 3813 return 0;
dfafcc8a
PR
3814}
3815
4a76b161 3816static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3817{
4a76b161 3818 struct drbd_conf *mdev;
e658983a 3819 struct p_state *p = pi->data;
4ac4aada 3820 union drbd_state os, ns, peer_state;
b411b363 3821 enum drbd_disk_state real_peer_disk;
65d922c3 3822 enum chg_state_flags cs_flags;
b411b363
PR
3823 int rv;
3824
4a76b161
AG
3825 mdev = vnr_to_mdev(tconn, pi->vnr);
3826 if (!mdev)
3827 return config_unknown_volume(tconn, pi);
3828
b411b363
PR
3829 peer_state.i = be32_to_cpu(p->state);
3830
3831 real_peer_disk = peer_state.disk;
3832 if (peer_state.disk == D_NEGOTIATING) {
3833 real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
3834 dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
3835 }
3836
87eeee41 3837 spin_lock_irq(&mdev->tconn->req_lock);
b411b363 3838 retry:
78bae59b 3839 os = ns = drbd_read_state(mdev);
87eeee41 3840 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 3841
b8853dbd
PR
3842 /* If some other part of the code (asender thread, timeout)
3843 * already decided to close the connection again,
3844 * we must not "re-establish" it here. */
3845 if (os.conn <= C_TEAR_DOWN)
58ffa580 3846 return -ECONNRESET;
b8853dbd 3847
9bcd2521
PR
3848 /* If this is the "end of sync" confirmation, usually the peer disk
 3849 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For an empty resync
 3850 * (0 bits set) that started in PausedSyncT, or if the timing of pause-/
3851 * unpause-sync events has been "just right", the peer disk may
3852 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
3853 */
3854 if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
3855 real_peer_disk == D_UP_TO_DATE &&
e9ef7bb6
LE
3856 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
3857 /* If we are (becoming) SyncSource, but peer is still in sync
3858 * preparation, ignore its uptodate-ness to avoid flapping, it
3859 * will change to inconsistent once the peer reaches active
3860 * syncing states.
3861 * It may have changed syncer-paused flags, however, so we
3862 * cannot ignore this completely. */
3863 if (peer_state.conn > C_CONNECTED &&
3864 peer_state.conn < C_SYNC_SOURCE)
3865 real_peer_disk = D_INCONSISTENT;
3866
3867 /* if peer_state changes to connected at the same time,
3868 * it explicitly notifies us that it finished resync.
3869 * Maybe we should finish it up, too? */
3870 else if (os.conn >= C_SYNC_SOURCE &&
3871 peer_state.conn == C_CONNECTED) {
3872 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
3873 drbd_resync_finished(mdev);
82bc0194 3874 return 0;
e9ef7bb6
LE
3875 }
3876 }
3877
58ffa580
LE
3878 /* explicit verify finished notification, stop sector reached. */
3879 if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
3880 peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
3881 ov_out_of_sync_print(mdev);
3882 drbd_resync_finished(mdev);
3883 return 0;
3884 }
3885
e9ef7bb6
LE
3886 /* peer says his disk is inconsistent, while we think it is uptodate,
3887 * and this happens while the peer still thinks we have a sync going on,
3888 * but we think we are already done with the sync.
3889 * We ignore this to avoid flapping pdsk.
 3890 * This should not happen if the peer is a recent version of drbd. */
3891 if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
3892 os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
3893 real_peer_disk = D_UP_TO_DATE;
3894
4ac4aada
LE
3895 if (ns.conn == C_WF_REPORT_PARAMS)
3896 ns.conn = C_CONNECTED;
b411b363 3897
67531718
PR
3898 if (peer_state.conn == C_AHEAD)
3899 ns.conn = C_BEHIND;
3900
b411b363
PR
3901 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3902 get_ldev_if_state(mdev, D_NEGOTIATING)) {
3903 int cr; /* consider resync */
3904
3905 /* if we established a new connection */
4ac4aada 3906 cr = (os.conn < C_CONNECTED);
b411b363
PR
3907 /* if we had an established connection
3908 * and one of the nodes newly attaches a disk */
4ac4aada 3909 cr |= (os.conn == C_CONNECTED &&
b411b363 3910 (peer_state.disk == D_NEGOTIATING ||
4ac4aada 3911 os.disk == D_NEGOTIATING));
b411b363
PR
3912 /* if we have both been inconsistent, and the peer has been
3913 * forced to be UpToDate with --overwrite-data */
3914 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3915 /* if we had been plain connected, and the admin requested to
3916 * start a sync by "invalidate" or "invalidate-remote" */
4ac4aada 3917 cr |= (os.conn == C_CONNECTED &&
b411b363
PR
3918 (peer_state.conn >= C_STARTING_SYNC_S &&
3919 peer_state.conn <= C_WF_BITMAP_T));
3920
3921 if (cr)
4ac4aada 3922 ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
b411b363
PR
3923
3924 put_ldev(mdev);
4ac4aada
LE
3925 if (ns.conn == C_MASK) {
3926 ns.conn = C_CONNECTED;
b411b363 3927 if (mdev->state.disk == D_NEGOTIATING) {
82f59cc6 3928 drbd_force_state(mdev, NS(disk, D_FAILED));
b411b363
PR
3929 } else if (peer_state.disk == D_NEGOTIATING) {
3930 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3931 peer_state.disk = D_DISKLESS;
580b9767 3932 real_peer_disk = D_DISKLESS;
b411b363 3933 } else {
8169e41b 3934 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->tconn->flags))
82bc0194 3935 return -EIO;
4ac4aada 3936 D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
38fa9988 3937 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3938 return -EIO;
b411b363
PR
3939 }
3940 }
3941 }
3942
87eeee41 3943 spin_lock_irq(&mdev->tconn->req_lock);
78bae59b 3944 if (os.i != drbd_read_state(mdev).i)
b411b363
PR
3945 goto retry;
3946 clear_bit(CONSIDER_RESYNC, &mdev->flags);
b411b363
PR
3947 ns.peer = peer_state.role;
3948 ns.pdsk = real_peer_disk;
3949 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
4ac4aada 3950 if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
b411b363 3951 ns.disk = mdev->new_state_tmp.disk;
4ac4aada 3952 cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
2aebfabb 3953 if (ns.pdsk == D_CONSISTENT && drbd_suspended(mdev) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
481c6f50 3954 test_bit(NEW_CUR_UUID, &mdev->flags)) {
8554df1c 3955 /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
481c6f50 3956 for temporary network outages! */
87eeee41 3957 spin_unlock_irq(&mdev->tconn->req_lock);
481c6f50 3958 dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
2f5cdd0b 3959 tl_clear(mdev->tconn);
481c6f50
PR
3960 drbd_uuid_new_current(mdev);
3961 clear_bit(NEW_CUR_UUID, &mdev->flags);
38fa9988 3962 conn_request_state(mdev->tconn, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
82bc0194 3963 return -EIO;
481c6f50 3964 }
65d922c3 3965 rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
78bae59b 3966 ns = drbd_read_state(mdev);
87eeee41 3967 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
3968
3969 if (rv < SS_SUCCESS) {
38fa9988 3970 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3971 return -EIO;
b411b363
PR
3972 }
3973
4ac4aada
LE
3974 if (os.conn > C_WF_REPORT_PARAMS) {
3975 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
b411b363
PR
3976 peer_state.disk != D_NEGOTIATING ) {
3977 /* we want resync, peer has not yet decided to sync... */
3978 /* Nowadays only used when forcing a node into primary role and
3979 setting its disk to UpToDate with that */
3980 drbd_send_uuids(mdev);
43de7c85 3981 drbd_send_current_state(mdev);
b411b363
PR
3982 }
3983 }
3984
08b165ba 3985 clear_bit(DISCARD_MY_DATA, &mdev->flags);
b411b363
PR
3986
3987 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3988
82bc0194 3989 return 0;
b411b363
PR
3990}
3991
4a76b161 3992static int receive_sync_uuid(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3993{
4a76b161 3994 struct drbd_conf *mdev;
e658983a 3995 struct p_rs_uuid *p = pi->data;
4a76b161
AG
3996
3997 mdev = vnr_to_mdev(tconn, pi->vnr);
3998 if (!mdev)
3999 return -EIO;
b411b363
PR
4000
4001 wait_event(mdev->misc_wait,
4002 mdev->state.conn == C_WF_SYNC_UUID ||
c4752ef1 4003 mdev->state.conn == C_BEHIND ||
b411b363
PR
4004 mdev->state.conn < C_CONNECTED ||
4005 mdev->state.disk < D_NEGOTIATING);
4006
4007 /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
4008
b411b363
PR
4009 /* Here the _drbd_uuid_ functions are right, current should
4010 _not_ be rotated into the history */
4011 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
4012 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
4013 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
4014
62b0da3a 4015 drbd_print_uuids(mdev, "updated sync uuid");
b411b363
PR
4016 drbd_start_resync(mdev, C_SYNC_TARGET);
4017
4018 put_ldev(mdev);
4019 } else
4020 dev_err(DEV, "Ignoring SyncUUID packet!\n");
4021
82bc0194 4022 return 0;
b411b363
PR
4023}
4024
2c46407d
AG
4025/**
4026 * receive_bitmap_plain
4027 *
4028 * Return 0 when done, 1 when another iteration is needed, and a negative error
4029 * code upon failure.
4030 */
4031static int
50d0b1ad 4032receive_bitmap_plain(struct drbd_conf *mdev, unsigned int size,
e658983a 4033 unsigned long *p, struct bm_xfer_ctx *c)
b411b363 4034{
50d0b1ad
AG
4035 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
4036 drbd_header_size(mdev->tconn);
e658983a 4037 unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
50d0b1ad 4038 c->bm_words - c->word_offset);
e658983a 4039 unsigned int want = num_words * sizeof(*p);
2c46407d 4040 int err;
b411b363 4041
50d0b1ad
AG
4042 if (want != size) {
4043 dev_err(DEV, "%s:want (%u) != size (%u)\n", __func__, want, size);
2c46407d 4044 return -EIO;
b411b363
PR
4045 }
4046 if (want == 0)
2c46407d 4047 return 0;
e658983a 4048 err = drbd_recv_all(mdev->tconn, p, want);
82bc0194 4049 if (err)
2c46407d 4050 return err;
b411b363 4051
e658983a 4052 drbd_bm_merge_lel(mdev, c->word_offset, num_words, p);
b411b363
PR
4053
4054 c->word_offset += num_words;
4055 c->bit_offset = c->word_offset * BITS_PER_LONG;
4056 if (c->bit_offset > c->bm_bits)
4057 c->bit_offset = c->bm_bits;
4058
2c46407d 4059 return 1;
b411b363
PR
4060}
4061
a02d1240
AG
4062static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
4063{
4064 return (enum drbd_bitmap_code)(p->encoding & 0x0f);
4065}
4066
4067static int dcbp_get_start(struct p_compressed_bm *p)
4068{
4069 return (p->encoding & 0x80) != 0;
4070}
4071
4072static int dcbp_get_pad_bits(struct p_compressed_bm *p)
4073{
4074 return (p->encoding >> 4) & 0x7;
4075}
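/* The three helpers above decode the single "encoding" byte of a
 * p_compressed_bm packet:
 *   bits 0-3: the bitmap encoding (enum drbd_bitmap_code)
 *   bits 4-6: number of pad bits at the end of the bit stream
 *   bit 7:    value of the first run, i.e. the start toggle */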
4076
2c46407d
AG
4077/**
4078 * recv_bm_rle_bits
4079 *
4080 * Return 0 when done, 1 when another iteration is needed, and a negative error
4081 * code upon failure.
4082 */
4083static int
b411b363
PR
4084recv_bm_rle_bits(struct drbd_conf *mdev,
4085 struct p_compressed_bm *p,
c6d25cfe
PR
4086 struct bm_xfer_ctx *c,
4087 unsigned int len)
b411b363
PR
4088{
4089 struct bitstream bs;
4090 u64 look_ahead;
4091 u64 rl;
4092 u64 tmp;
4093 unsigned long s = c->bit_offset;
4094 unsigned long e;
a02d1240 4095 int toggle = dcbp_get_start(p);
b411b363
PR
4096 int have;
4097 int bits;
4098
a02d1240 4099 bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
b411b363
PR
4100
4101 bits = bitstream_get_bits(&bs, &look_ahead, 64);
4102 if (bits < 0)
2c46407d 4103 return -EIO;
b411b363
PR
4104
4105 for (have = bits; have > 0; s += rl, toggle = !toggle) {
4106 bits = vli_decode_bits(&rl, look_ahead);
4107 if (bits <= 0)
2c46407d 4108 return -EIO;
b411b363
PR
4109
4110 if (toggle) {
4111 e = s + rl -1;
4112 if (e >= c->bm_bits) {
4113 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
2c46407d 4114 return -EIO;
b411b363
PR
4115 }
4116 _drbd_bm_set_bits(mdev, s, e);
4117 }
4118
4119 if (have < bits) {
4120 dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
4121 have, bits, look_ahead,
4122 (unsigned int)(bs.cur.b - p->code),
4123 (unsigned int)bs.buf_len);
2c46407d 4124 return -EIO;
b411b363
PR
4125 }
4126 look_ahead >>= bits;
4127 have -= bits;
4128
4129 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
4130 if (bits < 0)
2c46407d 4131 return -EIO;
b411b363
PR
4132 look_ahead |= tmp << have;
4133 have += bits;
4134 }
4135
4136 c->bit_offset = s;
4137 bm_xfer_ctx_bit_to_word_offset(c);
4138
2c46407d 4139 return (s != c->bm_bits);
b411b363
PR
4140}
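/* Note on the decode loop above: the stream describes alternating runs of
 * clear and set bits, each run length VLI encoded; only runs with the
 * toggle set are applied via _drbd_bm_set_bits(), and look_ahead is
 * refilled from the bit stream after every decoded run. */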
4141
2c46407d
AG
4142/**
4143 * decode_bitmap_c
4144 *
4145 * Return 0 when done, 1 when another iteration is needed, and a negative error
4146 * code upon failure.
4147 */
4148static int
b411b363
PR
4149decode_bitmap_c(struct drbd_conf *mdev,
4150 struct p_compressed_bm *p,
c6d25cfe
PR
4151 struct bm_xfer_ctx *c,
4152 unsigned int len)
b411b363 4153{
a02d1240 4154 if (dcbp_get_code(p) == RLE_VLI_Bits)
e658983a 4155 return recv_bm_rle_bits(mdev, p, c, len - sizeof(*p));
b411b363
PR
4156
4157 /* other variants had been implemented for evaluation,
4158 * but have been dropped as this one turned out to be "best"
4159 * during all our tests. */
4160
4161 dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
38fa9988 4162 conn_request_state(mdev->tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
2c46407d 4163 return -EIO;
b411b363
PR
4164}
4165
4166void INFO_bm_xfer_stats(struct drbd_conf *mdev,
4167 const char *direction, struct bm_xfer_ctx *c)
4168{
4169 /* what would it take to transfer it "plaintext" */
50d0b1ad
AG
4170 unsigned int header_size = drbd_header_size(mdev->tconn);
4171 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
4172 unsigned int plain =
4173 header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
4174 c->bm_words * sizeof(unsigned long);
4175 unsigned int total = c->bytes[0] + c->bytes[1];
4176 unsigned int r;
b411b363
PR
4177
4178 /* total can not be zero. but just in case: */
4179 if (total == 0)
4180 return;
4181
4182 /* don't report if not compressed */
4183 if (total >= plain)
4184 return;
4185
4186 /* total < plain. check for overflow, still */
4187 r = (total > UINT_MAX/1000) ? (total / (plain/1000))
4188 : (1000 * total / plain);
4189
4190 if (r > 1000)
4191 r = 1000;
4192
4193 r = 1000 - r;
4194 dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
4195 "total %u; compression: %u.%u%%\n",
4196 direction,
4197 c->bytes[1], c->packets[1],
4198 c->bytes[0], c->packets[0],
4199 total, r/10, r % 10);
4200}
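/* Worked example for the per-mille math above: plain = 4096 bytes,
 * compressed total = 512 bytes gives r = 1000 - (1000 * 512 / 4096) = 875,
 * which is reported as "compression: 87.5%". */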
4201
4202/* Since we are processing the bitfield from lower addresses to higher,
 4203 it does not matter whether we process it in 32 bit or 64 bit chunks,
 4204 as long as it is little endian. (Understand it as a byte stream,
 4205 beginning with the lowest byte...) If we used big endian,
 4206 we would need to process it from the highest address to the lowest,
 4207 in order to be agnostic to the 32 vs 64 bit issue.
 4208
 4209 Returns 0 on success, a negative error code otherwise. */
4a76b161 4210static int receive_bitmap(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4211{
4a76b161 4212 struct drbd_conf *mdev;
b411b363 4213 struct bm_xfer_ctx c;
2c46407d 4214 int err;
4a76b161
AG
4215
4216 mdev = vnr_to_mdev(tconn, pi->vnr);
4217 if (!mdev)
4218 return -EIO;
b411b363 4219
20ceb2b2
LE
4220 drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED);
4221 /* you are supposed to send additional out-of-sync information
4222 * if you actually set bits during this phase */
b411b363 4223
b411b363
PR
4224 c = (struct bm_xfer_ctx) {
4225 .bm_bits = drbd_bm_bits(mdev),
4226 .bm_words = drbd_bm_words(mdev),
4227 };
4228
2c46407d 4229 for(;;) {
e658983a
AG
4230 if (pi->cmd == P_BITMAP)
4231 err = receive_bitmap_plain(mdev, pi->size, pi->data, &c);
4232 else if (pi->cmd == P_COMPRESSED_BITMAP) {
b411b363
PR
4233 /* MAYBE: sanity check that we speak proto >= 90,
4234 * and the feature is enabled! */
e658983a 4235 struct p_compressed_bm *p = pi->data;
b411b363 4236
50d0b1ad 4237 if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(tconn)) {
b411b363 4238 dev_err(DEV, "ReportCBitmap packet too large\n");
82bc0194 4239 err = -EIO;
b411b363
PR
4240 goto out;
4241 }
e658983a 4242 if (pi->size <= sizeof(*p)) {
e2857216 4243 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", pi->size);
82bc0194 4244 err = -EIO;
78fcbdae 4245 goto out;
b411b363 4246 }
e658983a
AG
4247 err = drbd_recv_all(mdev->tconn, p, pi->size);
4248 if (err)
4249 goto out;
e2857216 4250 err = decode_bitmap_c(mdev, p, &c, pi->size);
b411b363 4251 } else {
e2857216 4252 dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
82bc0194 4253 err = -EIO;
b411b363
PR
4254 goto out;
4255 }
4256
e2857216 4257 c.packets[pi->cmd == P_BITMAP]++;
50d0b1ad 4258 c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(tconn) + pi->size;
b411b363 4259
2c46407d
AG
4260 if (err <= 0) {
4261 if (err < 0)
4262 goto out;
b411b363 4263 break;
2c46407d 4264 }
e2857216 4265 err = drbd_recv_header(mdev->tconn, pi);
82bc0194 4266 if (err)
b411b363 4267 goto out;
2c46407d 4268 }
b411b363
PR
4269
4270 INFO_bm_xfer_stats(mdev, "receive", &c);
4271
4272 if (mdev->state.conn == C_WF_BITMAP_T) {
de1f8e4a
AG
4273 enum drbd_state_rv rv;
4274
82bc0194
AG
4275 err = drbd_send_bitmap(mdev);
4276 if (err)
b411b363
PR
4277 goto out;
4278 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
de1f8e4a
AG
4279 rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
4280 D_ASSERT(rv == SS_SUCCESS);
b411b363
PR
4281 } else if (mdev->state.conn != C_WF_BITMAP_S) {
4282 /* admin may have requested C_DISCONNECTING,
4283 * other threads may have noticed network errors */
4284 dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
4285 drbd_conn_str(mdev->state.conn));
4286 }
82bc0194 4287 err = 0;
b411b363 4288
b411b363 4289 out:
20ceb2b2 4290 drbd_bm_unlock(mdev);
82bc0194 4291 if (!err && mdev->state.conn == C_WF_BITMAP_S)
b411b363 4292 drbd_start_resync(mdev, C_SYNC_SOURCE);
82bc0194 4293 return err;
b411b363
PR
4294}
4295
4a76b161 4296static int receive_skip(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4297{
4a76b161 4298 conn_warn(tconn, "skipping unknown optional packet type %d, l: %d!\n",
e2857216 4299 pi->cmd, pi->size);
2de876ef 4300
4a76b161 4301 return ignore_remaining_packet(tconn, pi);
2de876ef
PR
4302}
4303
4a76b161 4304static int receive_UnplugRemote(struct drbd_tconn *tconn, struct packet_info *pi)
0ced55a3 4305{
e7f52dfb
LE
4306 /* Make sure we've acked all the TCP data associated
4307 * with the data requests being unplugged */
4a76b161 4308 drbd_tcp_quickack(tconn->data.socket);
0ced55a3 4309
82bc0194 4310 return 0;
0ced55a3
PR
4311}
4312
4a76b161 4313static int receive_out_of_sync(struct drbd_tconn *tconn, struct packet_info *pi)
73a01a18 4314{
4a76b161 4315 struct drbd_conf *mdev;
e658983a 4316 struct p_block_desc *p = pi->data;
4a76b161
AG
4317
4318 mdev = vnr_to_mdev(tconn, pi->vnr);
4319 if (!mdev)
4320 return -EIO;
73a01a18 4321
f735e363
LE
4322 switch (mdev->state.conn) {
4323 case C_WF_SYNC_UUID:
4324 case C_WF_BITMAP_T:
4325 case C_BEHIND:
4326 break;
4327 default:
4328 dev_err(DEV, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
4329 drbd_conn_str(mdev->state.conn));
4330 }
4331
73a01a18
PR
4332 drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
4333
82bc0194 4334 return 0;
73a01a18
PR
4335}
4336
02918be2
PR
4337struct data_cmd {
4338 int expect_payload;
4339 size_t pkt_size;
4a76b161 4340 int (*fn)(struct drbd_tconn *, struct packet_info *);
02918be2
PR
4341};
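/* expect_payload: nonzero if the command may carry payload beyond the fixed
 * sub-header; pkt_size: size of that fixed sub-header; fn: handler called
 * with the sub-header already received into pi->data. */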
4342
4343static struct data_cmd drbd_cmd_handler[] = {
4a76b161
AG
4344 [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
4345 [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
4346 [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
4347 [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
e658983a
AG
4348 [P_BITMAP] = { 1, 0, receive_bitmap } ,
4349 [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
4350 [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote },
4a76b161
AG
4351 [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
4352 [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
e658983a
AG
4353 [P_SYNC_PARAM] = { 1, 0, receive_SyncParam },
4354 [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam },
4a76b161
AG
4355 [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
4356 [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
4357 [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
4358 [P_STATE] = { 0, sizeof(struct p_state), receive_state },
4359 [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
4360 [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
4361 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
4362 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
4363 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
4364 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
4365 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
4366 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
036b17ea 4367 [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
b411b363
PR
4368};
4369
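/* Receiver main loop: read a packet header, look up the handler in
 * drbd_cmd_handler[] above, receive the fixed-size sub-header
 * (cmd->pkt_size bytes) into pi.data, and let the handler consume any
 * remaining payload. Unknown commands or handler failures force the
 * connection into C_PROTOCOL_ERROR. */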
eefc2f7d 4370static void drbdd(struct drbd_tconn *tconn)
b411b363 4371{
77351055 4372 struct packet_info pi;
02918be2 4373 size_t shs; /* sub header size */
82bc0194 4374 int err;
b411b363 4375
eefc2f7d 4376 while (get_t_state(&tconn->receiver) == RUNNING) {
deebe195
AG
4377 struct data_cmd *cmd;
4378
eefc2f7d 4379 drbd_thread_current_set_cpu(&tconn->receiver);
69bc7bc3 4380 if (drbd_recv_header(tconn, &pi))
02918be2 4381 goto err_out;
b411b363 4382
deebe195 4383 cmd = &drbd_cmd_handler[pi.cmd];
4a76b161 4384 if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
2fcb8f30
AG
4385 conn_err(tconn, "Unexpected data packet %s (0x%04x)",
4386 cmdname(pi.cmd), pi.cmd);
02918be2 4387 goto err_out;
0b33a916 4388 }
b411b363 4389
e658983a
AG
4390 shs = cmd->pkt_size;
4391 if (pi.size > shs && !cmd->expect_payload) {
2fcb8f30
AG
4392 conn_err(tconn, "No payload expected %s l:%d\n",
4393 cmdname(pi.cmd), pi.size);
02918be2 4394 goto err_out;
b411b363 4395 }
b411b363 4396
c13f7e1a 4397 if (shs) {
e658983a 4398 err = drbd_recv_all_warn(tconn, pi.data, shs);
a5c31904 4399 if (err)
c13f7e1a 4400 goto err_out;
e2857216 4401 pi.size -= shs;
c13f7e1a
LE
4402 }
4403
4a76b161
AG
4404 err = cmd->fn(tconn, &pi);
4405 if (err) {
9f5bdc33
AG
4406 conn_err(tconn, "error receiving %s, e: %d l: %d!\n",
4407 cmdname(pi.cmd), err, pi.size);
02918be2 4408 goto err_out;
b411b363
PR
4409 }
4410 }
82bc0194 4411 return;
b411b363 4412
82bc0194
AG
4413 err_out:
4414 conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
b411b363
PR
4415}
4416
0e29d163 4417void conn_flush_workqueue(struct drbd_tconn *tconn)
b411b363
PR
4418{
4419 struct drbd_wq_barrier barr;
4420
4421 barr.w.cb = w_prev_work_done;
0e29d163 4422 barr.w.tconn = tconn;
b411b363 4423 init_completion(&barr.done);
d5b27b01 4424 drbd_queue_work(&tconn->sender_work, &barr.w);
b411b363
PR
4425 wait_for_completion(&barr.done);
4426}
4427
81fa2e67 4428static void conn_disconnect(struct drbd_tconn *tconn)
b411b363 4429{
c141ebda 4430 struct drbd_conf *mdev;
bbeb641c 4431 enum drbd_conns oc;
376694a0 4432 int vnr;
b411b363 4433
bbeb641c 4434 if (tconn->cstate == C_STANDALONE)
b411b363 4435 return;
b411b363 4436
b8853dbd
PR
4437 /* We are about to start the cleanup after connection loss.
4438 * Make sure drbd_make_request knows about that.
4439 * Usually we should be in some network failure state already,
4440 * but just in case we are not, we fix it up here.
4441 */
4442 conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
4443
b411b363 4444 /* asender does not clean up anything. it must not interfere, either */
360cc740
PR
4445 drbd_thread_stop(&tconn->asender);
4446 drbd_free_sock(tconn);
4447
c141ebda
PR
4448 rcu_read_lock();
4449 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
4450 kref_get(&mdev->kref);
4451 rcu_read_unlock();
4452 drbd_disconnected(mdev);
4453 kref_put(&mdev->kref, &drbd_minor_destroy);
4454 rcu_read_lock();
4455 }
4456 rcu_read_unlock();
4457
12038a3a
PR
4458 if (!list_empty(&tconn->current_epoch->list))
4459 conn_err(tconn, "ASSERTION FAILED: tconn->current_epoch->list not empty\n");
4460 /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
4461 atomic_set(&tconn->current_epoch->epoch_size, 0);
b6dd1a89 4462 tconn->send.seen_any_write_yet = false;
12038a3a 4463
360cc740
PR
4464 conn_info(tconn, "Connection closed\n");
4465
cb703454
PR
4466 if (conn_highest_role(tconn) == R_PRIMARY && conn_highest_pdsk(tconn) >= D_UNKNOWN)
4467 conn_try_outdate_peer_async(tconn);
4468
360cc740 4469 spin_lock_irq(&tconn->req_lock);
bbeb641c
PR
4470 oc = tconn->cstate;
4471 if (oc >= C_UNCONNECTED)
376694a0 4472 _conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE);
bbeb641c 4473
360cc740
PR
4474 spin_unlock_irq(&tconn->req_lock);
4475
f3dfa40a 4476 if (oc == C_DISCONNECTING)
d9cc6e23 4477 conn_request_state(tconn, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
360cc740
PR
4478}
4479
c141ebda 4480static int drbd_disconnected(struct drbd_conf *mdev)
360cc740 4481{
360cc740 4482 unsigned int i;
b411b363 4483
85719573 4484 /* wait for current activity to cease. */
87eeee41 4485 spin_lock_irq(&mdev->tconn->req_lock);
b411b363
PR
4486 _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
4487 _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
4488 _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
87eeee41 4489 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
4490
4491 /* We do not have data structures that would allow us to
4492 * get the rs_pending_cnt down to 0 again.
4493 * * On C_SYNC_TARGET we do not have any data structures describing
4494 * the pending RSDataRequest's we have sent.
4495 * * On C_SYNC_SOURCE there is no data structure that tracks
4496 * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
4497 * And no, it is not the sum of the reference counts in the
4498 * resync_LRU. The resync_LRU tracks the whole operation including
4499 * the disk-IO, while the rs_pending_cnt only tracks the blocks
4500 * on the fly. */
4501 drbd_rs_cancel_all(mdev);
4502 mdev->rs_total = 0;
4503 mdev->rs_failed = 0;
4504 atomic_set(&mdev->rs_pending_cnt, 0);
4505 wake_up(&mdev->misc_wait);
4506
b411b363 4507 del_timer_sync(&mdev->resync_timer);
b411b363
PR
4508 resync_timer_fn((unsigned long)mdev);
4509
b411b363
PR
4510 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
4511 * w_make_resync_request etc. which may still be on the worker queue
4512 * to be "canceled" */
a21e9298 4513 drbd_flush_workqueue(mdev);
b411b363 4514
a990be46 4515 drbd_finish_peer_reqs(mdev);
b411b363 4516
d10b4ea3
PR
4517 /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
 4518 might have queued new work. The one before drbd_finish_peer_reqs() is
 4519 necessary to reclaim net_ee in drbd_finish_peer_reqs(). */
4520 drbd_flush_workqueue(mdev);
4521
b411b363
PR
4522 kfree(mdev->p_uuid);
4523 mdev->p_uuid = NULL;
4524
2aebfabb 4525 if (!drbd_suspended(mdev))
2f5cdd0b 4526 tl_clear(mdev->tconn);
b411b363 4527
b411b363
PR
4528 drbd_md_sync(mdev);
4529
20ceb2b2
LE
4530 /* serialize with bitmap writeout triggered by the state change,
4531 * if any. */
4532 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
4533
b411b363
PR
4534 /* tcp_close and release of sendpage pages can be deferred. I don't
4535 * want to use SO_LINGER, because apparently it can be deferred for
4536 * more than 20 seconds (longest time I checked).
4537 *
 4538 * Actually we don't care exactly when the network stack does its
4539 * put_page(), but release our reference on these pages right here.
4540 */
7721f567 4541 i = drbd_free_peer_reqs(mdev, &mdev->net_ee);
b411b363
PR
4542 if (i)
4543 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
435f0740
LE
4544 i = atomic_read(&mdev->pp_in_use_by_net);
4545 if (i)
4546 dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
b411b363
PR
4547 i = atomic_read(&mdev->pp_in_use);
4548 if (i)
45bb912b 4549 dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
b411b363
PR
4550
4551 D_ASSERT(list_empty(&mdev->read_ee));
4552 D_ASSERT(list_empty(&mdev->active_ee));
4553 D_ASSERT(list_empty(&mdev->sync_ee));
4554 D_ASSERT(list_empty(&mdev->done_ee));
4555
360cc740 4556 return 0;
b411b363
PR
4557}
4558
4559/*
4560 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
4561 * we can agree on is stored in agreed_pro_version.
4562 *
4563 * feature flags and the reserved array should be enough room for future
4564 * enhancements of the handshake protocol, and possible plugins...
4565 *
4566 * for now, they are expected to be zero, but ignored.
4567 */
6038178e 4568static int drbd_send_features(struct drbd_tconn *tconn)
b411b363 4569{
9f5bdc33
AG
4570 struct drbd_socket *sock;
4571 struct p_connection_features *p;
b411b363 4572
9f5bdc33
AG
4573 sock = &tconn->data;
4574 p = conn_prepare_command(tconn, sock);
4575 if (!p)
e8d17b01 4576 return -EIO;
b411b363
PR
4577 memset(p, 0, sizeof(*p));
4578 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
4579 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
9f5bdc33 4580 return conn_send_command(tconn, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
b411b363
PR
4581}
4582
4583/*
4584 * return values:
4585 * 1 yes, we have a valid connection
4586 * 0 oops, did not work out, please try again
4587 * -1 peer talks different language,
4588 * no point in trying again, please go standalone.
4589 */
6038178e 4590static int drbd_do_features(struct drbd_tconn *tconn)
b411b363 4591{
65d11ed6 4592 /* ASSERT current == tconn->receiver ... */
e658983a
AG
4593 struct p_connection_features *p;
4594 const int expect = sizeof(struct p_connection_features);
77351055 4595 struct packet_info pi;
a5c31904 4596 int err;
b411b363 4597
6038178e 4598 err = drbd_send_features(tconn);
e8d17b01 4599 if (err)
b411b363
PR
4600 return 0;
4601
69bc7bc3
AG
4602 err = drbd_recv_header(tconn, &pi);
4603 if (err)
b411b363
PR
4604 return 0;
4605
6038178e
AG
4606 if (pi.cmd != P_CONNECTION_FEATURES) {
4607 conn_err(tconn, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
2fcb8f30 4608 cmdname(pi.cmd), pi.cmd);
b411b363
PR
4609 return -1;
4610 }
4611
77351055 4612 if (pi.size != expect) {
6038178e 4613 conn_err(tconn, "expected ConnectionFeatures length: %u, received: %u\n",
77351055 4614 expect, pi.size);
b411b363
PR
4615 return -1;
4616 }
4617
e658983a
AG
4618 p = pi.data;
4619 err = drbd_recv_all_warn(tconn, p, expect);
a5c31904 4620 if (err)
b411b363 4621 return 0;
b411b363 4622
b411b363
PR
4623 p->protocol_min = be32_to_cpu(p->protocol_min);
4624 p->protocol_max = be32_to_cpu(p->protocol_max);
4625 if (p->protocol_max == 0)
4626 p->protocol_max = p->protocol_min;
4627
4628 if (PRO_VERSION_MAX < p->protocol_min ||
4629 PRO_VERSION_MIN > p->protocol_max)
4630 goto incompat;
4631
65d11ed6 4632 tconn->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
b411b363 4633
65d11ed6
PR
4634 conn_info(tconn, "Handshake successful: "
4635 "Agreed network protocol version %d\n", tconn->agreed_pro_version);
b411b363
PR
4636
4637 return 1;
4638
4639 incompat:
65d11ed6 4640 conn_err(tconn, "incompatible DRBD dialects: "
b411b363
PR
4641 "I support %d-%d, peer supports %d-%d\n",
4642 PRO_VERSION_MIN, PRO_VERSION_MAX,
4643 p->protocol_min, p->protocol_max);
4644 return -1;
4645}
4646
4647#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
13e6037d 4648static int drbd_do_auth(struct drbd_tconn *tconn)
b411b363
PR
4649{
4650 dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
4651 dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
b10d96cb 4652 return -1;
b411b363
PR
4653}
4654#else
4655#define CHALLENGE_LEN 64
b10d96cb
JT
4656
4657/* Return value:
4658 1 - auth succeeded,
4659 0 - failed, try again (network error),
4660 -1 - auth failed, don't try again.
4661*/
4662
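/* Challenge/response handshake over the shared secret: send our 64 byte
 * random challenge, receive the peer's challenge, reply with
 * HMAC(secret, peer's challenge), then verify that the peer's response
 * equals HMAC(secret, our challenge). */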
13e6037d 4663static int drbd_do_auth(struct drbd_tconn *tconn)
b411b363 4664{
9f5bdc33 4665 struct drbd_socket *sock;
b411b363
PR
4666 char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
4667 struct scatterlist sg;
4668 char *response = NULL;
4669 char *right_response = NULL;
4670 char *peers_ch = NULL;
44ed167d
PR
4671 unsigned int key_len;
4672 char secret[SHARED_SECRET_MAX]; /* 64 byte */
b411b363
PR
4673 unsigned int resp_size;
4674 struct hash_desc desc;
77351055 4675 struct packet_info pi;
44ed167d 4676 struct net_conf *nc;
69bc7bc3 4677 int err, rv;
b411b363 4678
9f5bdc33
AG
4679 /* FIXME: Put the challenge/response into the preallocated socket buffer. */
4680
44ed167d
PR
4681 rcu_read_lock();
4682 nc = rcu_dereference(tconn->net_conf);
4683 key_len = strlen(nc->shared_secret);
4684 memcpy(secret, nc->shared_secret, key_len);
4685 rcu_read_unlock();
4686
13e6037d 4687 desc.tfm = tconn->cram_hmac_tfm;
b411b363
PR
4688 desc.flags = 0;
4689
44ed167d 4690 rv = crypto_hash_setkey(tconn->cram_hmac_tfm, (u8 *)secret, key_len);
b411b363 4691 if (rv) {
13e6037d 4692 conn_err(tconn, "crypto_hash_setkey() failed with %d\n", rv);
b10d96cb 4693 rv = -1;
b411b363
PR
4694 goto fail;
4695 }
4696
4697 get_random_bytes(my_challenge, CHALLENGE_LEN);
4698
9f5bdc33
AG
4699 sock = &tconn->data;
4700 if (!conn_prepare_command(tconn, sock)) {
4701 rv = 0;
4702 goto fail;
4703 }
e658983a 4704 rv = !conn_send_command(tconn, sock, P_AUTH_CHALLENGE, 0,
9f5bdc33 4705 my_challenge, CHALLENGE_LEN);
b411b363
PR
4706 if (!rv)
4707 goto fail;
4708
69bc7bc3
AG
4709 err = drbd_recv_header(tconn, &pi);
4710 if (err) {
4711 rv = 0;
b411b363 4712 goto fail;
69bc7bc3 4713 }
b411b363 4714
77351055 4715 if (pi.cmd != P_AUTH_CHALLENGE) {
13e6037d 4716 conn_err(tconn, "expected AuthChallenge packet, received: %s (0x%04x)\n",
2fcb8f30 4717 cmdname(pi.cmd), pi.cmd);
b411b363
PR
4718 rv = 0;
4719 goto fail;
4720 }
4721
77351055 4722 if (pi.size > CHALLENGE_LEN * 2) {
13e6037d 4723 conn_err(tconn, "expected AuthChallenge payload too big.\n");
b10d96cb 4724 rv = -1;
b411b363
PR
4725 goto fail;
4726 }
4727
77351055 4728 peers_ch = kmalloc(pi.size, GFP_NOIO);
b411b363 4729 if (peers_ch == NULL) {
13e6037d 4730 conn_err(tconn, "kmalloc of peers_ch failed\n");
b10d96cb 4731 rv = -1;
b411b363
PR
4732 goto fail;
4733 }
4734
a5c31904
AG
4735 err = drbd_recv_all_warn(tconn, peers_ch, pi.size);
4736 if (err) {
b411b363
PR
4737 rv = 0;
4738 goto fail;
4739 }
4740
13e6037d 4741 resp_size = crypto_hash_digestsize(tconn->cram_hmac_tfm);
b411b363
PR
4742 response = kmalloc(resp_size, GFP_NOIO);
4743 if (response == NULL) {
13e6037d 4744 conn_err(tconn, "kmalloc of response failed\n");
b10d96cb 4745 rv = -1;
b411b363
PR
4746 goto fail;
4747 }
4748
4749 sg_init_table(&sg, 1);
77351055 4750 sg_set_buf(&sg, peers_ch, pi.size);
b411b363
PR
4751
4752 rv = crypto_hash_digest(&desc, &sg, sg.length, response);
4753 if (rv) {
13e6037d 4754 conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv);
b10d96cb 4755 rv = -1;
b411b363
PR
4756 goto fail;
4757 }
4758
9f5bdc33
AG
4759 if (!conn_prepare_command(tconn, sock)) {
4760 rv = 0;
4761 goto fail;
4762 }
e658983a 4763 rv = !conn_send_command(tconn, sock, P_AUTH_RESPONSE, 0,
9f5bdc33 4764 response, resp_size);
b411b363
PR
4765 if (!rv)
4766 goto fail;
4767
69bc7bc3
AG
4768 err = drbd_recv_header(tconn, &pi);
4769 if (err) {
4770 rv = 0;
b411b363 4771 goto fail;
69bc7bc3 4772 }
b411b363 4773
77351055 4774 if (pi.cmd != P_AUTH_RESPONSE) {
13e6037d 4775 conn_err(tconn, "expected AuthResponse packet, received: %s (0x%04x)\n",
2fcb8f30 4776 cmdname(pi.cmd), pi.cmd);
b411b363
PR
4777 rv = 0;
4778 goto fail;
4779 }
4780
77351055 4781 if (pi.size != resp_size) {
13e6037d 4782 conn_err(tconn, "expected AuthResponse payload of wrong size\n");
b411b363
PR
4783 rv = 0;
4784 goto fail;
4785 }
4786
a5c31904
AG
4787 err = drbd_recv_all_warn(tconn, response , resp_size);
4788 if (err) {
b411b363
PR
4789 rv = 0;
4790 goto fail;
4791 }
4792
4793 right_response = kmalloc(resp_size, GFP_NOIO);
2d1ee87d 4794 if (right_response == NULL) {
13e6037d 4795 conn_err(tconn, "kmalloc of right_response failed\n");
b10d96cb 4796 rv = -1;
b411b363
PR
4797 goto fail;
4798 }
4799
4800 sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
4801
4802 rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
4803 if (rv) {
13e6037d 4804 conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv);
b10d96cb 4805 rv = -1;
b411b363
PR
4806 goto fail;
4807 }
4808
4809 rv = !memcmp(response, right_response, resp_size);
4810
4811 if (rv)
44ed167d
PR
4812 conn_info(tconn, "Peer authenticated using %d bytes HMAC\n",
4813 resp_size);
b10d96cb
JT
4814 else
4815 rv = -1;
b411b363
PR
4816
4817 fail:
4818 kfree(peers_ch);
4819 kfree(response);
4820 kfree(right_response);
4821
4822 return rv;
4823}
4824#endif
4825
4826int drbdd_init(struct drbd_thread *thi)
4827{
392c8801 4828 struct drbd_tconn *tconn = thi->tconn;
b411b363
PR
4829 int h;
4830
4d641dd7 4831 conn_info(tconn, "receiver (re)started\n");
b411b363
PR
4832
4833 do {
81fa2e67 4834 h = conn_connect(tconn);
b411b363 4835 if (h == 0) {
81fa2e67 4836 conn_disconnect(tconn);
20ee6390 4837 schedule_timeout_interruptible(HZ);
b411b363
PR
4838 }
4839 if (h == -1) {
4d641dd7 4840 conn_warn(tconn, "Discarding network configuration.\n");
bbeb641c 4841 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363
PR
4842 }
4843 } while (h == 0);
4844
91fd4dad
PR
4845 if (h > 0)
4846 drbdd(tconn);
b411b363 4847
81fa2e67 4848 conn_disconnect(tconn);
b411b363 4849
4d641dd7 4850 conn_info(tconn, "receiver terminated\n");
b411b363
PR
4851 return 0;
4852}
4853
4854/* ********* acknowledge sender ******** */
4855
e05e1e59 4856static int got_conn_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi)
e4f78ede 4857{
e658983a 4858 struct p_req_state_reply *p = pi->data;
e4f78ede
PR
4859 int retcode = be32_to_cpu(p->retcode);
4860
4861 if (retcode >= SS_SUCCESS) {
4862 set_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags);
4863 } else {
4864 set_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags);
4865 conn_err(tconn, "Requested state change failed by peer: %s (%d)\n",
4866 drbd_set_st_err_str(retcode), retcode);
4867 }
4868 wake_up(&tconn->ping_wait);
4869
2735a594 4870 return 0;
e4f78ede
PR
4871}
4872
1952e916 4873static int got_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4874{
1952e916 4875 struct drbd_conf *mdev;
e658983a 4876 struct p_req_state_reply *p = pi->data;
b411b363
PR
4877 int retcode = be32_to_cpu(p->retcode);
4878
1952e916
AG
4879 mdev = vnr_to_mdev(tconn, pi->vnr);
4880 if (!mdev)
2735a594 4881 return -EIO;
1952e916 4882
4d0fc3fd
PR
4883 if (test_bit(CONN_WD_ST_CHG_REQ, &tconn->flags)) {
4884 D_ASSERT(tconn->agreed_pro_version < 100);
4885 return got_conn_RqSReply(tconn, pi);
4886 }
4887
e4f78ede
PR
4888 if (retcode >= SS_SUCCESS) {
4889 set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
4890 } else {
4891 set_bit(CL_ST_CHG_FAIL, &mdev->flags);
4892 dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
4893 drbd_set_st_err_str(retcode), retcode);
b411b363 4894 }
e4f78ede
PR
4895 wake_up(&mdev->state_wait);
4896
2735a594 4897 return 0;
b411b363
PR
4898}
4899
e05e1e59 4900static int got_Ping(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4901{
2735a594 4902 return drbd_send_ping_ack(tconn);
b411b363
PR
4903
4904}
4905
e05e1e59 4906static int got_PingAck(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363
PR
4907{
4908 /* restore idle timeout */
2a67d8b9
PR
4909 tconn->meta.socket->sk->sk_rcvtimeo = tconn->net_conf->ping_int*HZ;
4910 if (!test_and_set_bit(GOT_PING_ACK, &tconn->flags))
4911 wake_up(&tconn->ping_wait);
b411b363 4912
2735a594 4913 return 0;
b411b363
PR
4914}
4915
1952e916 4916static int got_IsInSync(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4917{
1952e916 4918 struct drbd_conf *mdev;
e658983a 4919 struct p_block_ack *p = pi->data;
b411b363
PR
4920 sector_t sector = be64_to_cpu(p->sector);
4921 int blksize = be32_to_cpu(p->blksize);
4922
1952e916
AG
4923 mdev = vnr_to_mdev(tconn, pi->vnr);
4924 if (!mdev)
2735a594 4925 return -EIO;
1952e916 4926
31890f4a 4927 D_ASSERT(mdev->tconn->agreed_pro_version >= 89);
b411b363
PR
4928
4929 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4930
1d53f09e
LE
4931 if (get_ldev(mdev)) {
4932 drbd_rs_complete_io(mdev, sector);
4933 drbd_set_in_sync(mdev, sector, blksize);
4934 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4935 mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4936 put_ldev(mdev);
4937 }
b411b363 4938 dec_rs_pending(mdev);
778f271d 4939 atomic_add(blksize >> 9, &mdev->rs_sect_in);
b411b363 4940
2735a594 4941 return 0;
b411b363
PR
4942}
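/*
 * Unit bookkeeping in got_IsInSync() above, made explicit (BM_BLOCK_SHIFT is
 * assumed to be 12 here, i.e. one bitmap bit per 4 KiB; check drbd_int.h):
 *
 *	blksize >> BM_BLOCK_SHIFT	bytes -> 4 KiB bitmap blocks (rs_same_csum)
 *	blksize >> 9			bytes -> 512-byte sectors    (rs_sect_in)
 *
 * e.g. a 32 KiB ack (blksize = 32768) accounts for 8 bitmap blocks and
 * 64 sectors.
 */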
4943
bc9c5c41
AG
4944static int
4945validate_req_change_req_state(struct drbd_conf *mdev, u64 id, sector_t sector,
4946 struct rb_root *root, const char *func,
4947 enum drbd_req_event what, bool missing_ok)
b411b363
PR
4948{
4949 struct drbd_request *req;
4950 struct bio_and_error m;
4951
87eeee41 4952 spin_lock_irq(&mdev->tconn->req_lock);
bc9c5c41 4953 req = find_request(mdev, root, id, sector, missing_ok, func);
b411b363 4954 if (unlikely(!req)) {
87eeee41 4955 spin_unlock_irq(&mdev->tconn->req_lock);
85997675 4956 return -EIO;
b411b363
PR
4957 }
4958 __req_mod(req, what, &m);
87eeee41 4959 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
4960
4961 if (m.bio)
4962 complete_master_bio(mdev, &m);
85997675 4963 return 0;
b411b363
PR
4964}
4965
1952e916 4966static int got_BlockAck(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4967{
1952e916 4968 struct drbd_conf *mdev;
e658983a 4969 struct p_block_ack *p = pi->data;
b411b363
PR
4970 sector_t sector = be64_to_cpu(p->sector);
4971 int blksize = be32_to_cpu(p->blksize);
4972 enum drbd_req_event what;
4973
1952e916
AG
4974 mdev = vnr_to_mdev(tconn, pi->vnr);
4975 if (!mdev)
2735a594 4976 return -EIO;
1952e916 4977
b411b363
PR
4978 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4979
579b57ed 4980 if (p->block_id == ID_SYNCER) {
b411b363
PR
4981 drbd_set_in_sync(mdev, sector, blksize);
4982 dec_rs_pending(mdev);
2735a594 4983 return 0;
b411b363 4984 }
e05e1e59 4985 switch (pi->cmd) {
b411b363 4986 case P_RS_WRITE_ACK:
8554df1c 4987 what = WRITE_ACKED_BY_PEER_AND_SIS;
b411b363
PR
4988 break;
4989 case P_WRITE_ACK:
8554df1c 4990 what = WRITE_ACKED_BY_PEER;
b411b363
PR
4991 break;
4992 case P_RECV_ACK:
8554df1c 4993 what = RECV_ACKED_BY_PEER;
b411b363 4994 break;
7be8da07 4995 case P_DISCARD_WRITE:
7be8da07
AG
4996 what = DISCARD_WRITE;
4997 break;
4998 case P_RETRY_WRITE:
7be8da07 4999 what = POSTPONE_WRITE;
b411b363
PR
5000 break;
5001 default:
2735a594 5002 BUG();
b411b363
PR
5003 }
5004
2735a594
AG
5005 return validate_req_change_req_state(mdev, p->block_id, sector,
5006 &mdev->write_requests, __func__,
5007 what, false);
b411b363
PR
5008}
5009
1952e916 5010static int got_NegAck(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 5011{
1952e916 5012 struct drbd_conf *mdev;
e658983a 5013 struct p_block_ack *p = pi->data;
b411b363 5014 sector_t sector = be64_to_cpu(p->sector);
2deb8336 5015 int size = be32_to_cpu(p->blksize);
85997675 5016 int err;
b411b363 5017
1952e916
AG
5018 mdev = vnr_to_mdev(tconn, pi->vnr);
5019 if (!mdev)
2735a594 5020 return -EIO;
1952e916 5021
b411b363
PR
5022 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
5023
579b57ed 5024 if (p->block_id == ID_SYNCER) {
b411b363
PR
5025 dec_rs_pending(mdev);
5026 drbd_rs_failed_io(mdev, sector, size);
2735a594 5027 return 0;
b411b363 5028 }
2deb8336 5029
85997675
AG
5030 err = validate_req_change_req_state(mdev, p->block_id, sector,
5031 &mdev->write_requests, __func__,
303d1448 5032 NEG_ACKED, true);
85997675 5033 if (err) {
c3afd8f5
AG
5034 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
5035 The master bio might already be completed, therefore the
5036 request is no longer in the collision hash. */
5037 /* In Protocol B we might already have got a P_RECV_ACK
5038 but then get a P_NEG_ACK afterwards. */
c3afd8f5 5039 drbd_set_out_of_sync(mdev, sector, size);
2deb8336 5040 }
2735a594 5041 return 0;
b411b363
PR
5042}
5043
1952e916 5044static int got_NegDReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 5045{
1952e916 5046 struct drbd_conf *mdev;
e658983a 5047 struct p_block_ack *p = pi->data;
b411b363
PR
5048 sector_t sector = be64_to_cpu(p->sector);
5049
1952e916
AG
5050 mdev = vnr_to_mdev(tconn, pi->vnr);
5051 if (!mdev)
2735a594 5052 return -EIO;
1952e916 5053
b411b363 5054 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
7be8da07 5055
380207d0 5056 dev_err(DEV, "Got NegDReply; Sector %llus, len %u.\n",
b411b363
PR
5057 (unsigned long long)sector, be32_to_cpu(p->blksize));
5058
2735a594
AG
5059 return validate_req_change_req_state(mdev, p->block_id, sector,
5060 &mdev->read_requests, __func__,
5061 NEG_ACKED, false);
b411b363
PR
5062}
5063
1952e916 5064static int got_NegRSDReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 5065{
1952e916 5066 struct drbd_conf *mdev;
b411b363
PR
5067 sector_t sector;
5068 int size;
e658983a 5069 struct p_block_ack *p = pi->data;
1952e916
AG
5070
5071 mdev = vnr_to_mdev(tconn, pi->vnr);
5072 if (!mdev)
2735a594 5073 return -EIO;
b411b363
PR
5074
5075 sector = be64_to_cpu(p->sector);
5076 size = be32_to_cpu(p->blksize);
b411b363
PR
5077
5078 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
5079
5080 dec_rs_pending(mdev);
5081
5082 if (get_ldev_if_state(mdev, D_FAILED)) {
5083 drbd_rs_complete_io(mdev, sector);
e05e1e59 5084 switch (pi->cmd) {
d612d309
PR
5085 case P_NEG_RS_DREPLY:
5086 drbd_rs_failed_io(mdev, sector, size);
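			/* fall through: P_RS_CANCEL skips the failed-io accounting */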
5087 case P_RS_CANCEL:
5088 break;
5089 default:
2735a594 5090 BUG();
d612d309 5091 }
b411b363
PR
5092 put_ldev(mdev);
5093 }
5094
2735a594 5095 return 0;
b411b363
PR
5096}
5097
1952e916 5098static int got_BarrierAck(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 5099{
e658983a 5100 struct p_barrier_ack *p = pi->data;
9ed57dcb
LE
5101 struct drbd_conf *mdev;
5102 int vnr;
1952e916 5103
9ed57dcb 5104 tl_release(tconn, p->barrier, be32_to_cpu(p->set_size));
b411b363 5105
9ed57dcb
LE
5106 rcu_read_lock();
5107 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
5108 if (mdev->state.conn == C_AHEAD &&
5109 atomic_read(&mdev->ap_in_flight) == 0 &&
5110 !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) {
5111 mdev->start_resync_timer.expires = jiffies + HZ;
5112 add_timer(&mdev->start_resync_timer);
5113 }
c4752ef1 5114 }
9ed57dcb 5115 rcu_read_unlock();
c4752ef1 5116
2735a594 5117 return 0;
b411b363
PR
5118}
5119
1952e916 5120static int got_OVResult(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 5121{
1952e916 5122 struct drbd_conf *mdev;
e658983a 5123 struct p_block_ack *p = pi->data;
b411b363
PR
5124 struct drbd_work *w;
5125 sector_t sector;
5126 int size;
5127
1952e916
AG
5128 mdev = vnr_to_mdev(tconn, pi->vnr);
5129 if (!mdev)
2735a594 5130 return -EIO;
1952e916 5131
b411b363
PR
5132 sector = be64_to_cpu(p->sector);
5133 size = be32_to_cpu(p->blksize);
5134
5135 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
5136
5137 if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
8f7bed77 5138 drbd_ov_out_of_sync_found(mdev, sector, size);
b411b363 5139 else
8f7bed77 5140 ov_out_of_sync_print(mdev);
b411b363 5141
1d53f09e 5142 if (!get_ldev(mdev))
2735a594 5143 return 0;
1d53f09e 5144
b411b363
PR
5145 drbd_rs_complete_io(mdev, sector);
5146 dec_rs_pending(mdev);
5147
ea5442af
LE
5148 --mdev->ov_left;
5149
5150 /* let's advance progress step marks only for every other megabyte */
5151 if ((mdev->ov_left & 0x200) == 0x200)
5152 drbd_advance_rs_marks(mdev, mdev->ov_left);
5153
5154 if (mdev->ov_left == 0) {
b411b363
PR
5155 w = kmalloc(sizeof(*w), GFP_NOIO);
5156 if (w) {
5157 w->cb = w_ov_finished;
a21e9298 5158 w->mdev = mdev;
d5b27b01 5159 drbd_queue_work(&mdev->tconn->sender_work, w);
b411b363
PR
5160 } else {
5161 dev_err(DEV, "kmalloc(w) failed.");
8f7bed77 5162 ov_out_of_sync_print(mdev);
b411b363
PR
5163 drbd_resync_finished(mdev);
5164 }
5165 }
1d53f09e 5166 put_ldev(mdev);
2735a594 5167 return 0;
b411b363
PR
5168}
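/* About the "every other megabyte" test above: (ov_left & 0x200) checks bit 9
 * of ov_left, which toggles every 0x200 (512) decrements.  Assuming ov_left
 * counts 4 KiB bitmap blocks (BM_BLOCK_SHIFT == 12), that is a 2 MiB cadence
 * for advancing the resync step marks. */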
5169
1952e916 5170static int got_skip(struct drbd_tconn *tconn, struct packet_info *pi)
0ced55a3 5171{
2735a594 5172 return 0;
0ced55a3
PR
5173}
5174
a990be46 5175static int tconn_finish_peer_reqs(struct drbd_tconn *tconn)
32862ec7 5176{
082a3439 5177 struct drbd_conf *mdev;
c141ebda 5178 int vnr, not_empty = 0;
32862ec7
PR
5179
5180 do {
5181 clear_bit(SIGNAL_ASENDER, &tconn->flags);
5182 flush_signals(current);
c141ebda
PR
5183
5184 rcu_read_lock();
5185 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
5186 kref_get(&mdev->kref);
5187 rcu_read_unlock();
d3fcb490 5188 if (drbd_finish_peer_reqs(mdev)) {
c141ebda
PR
5189 kref_put(&mdev->kref, &drbd_minor_destroy);
5190 return 1;
d3fcb490 5191 }
c141ebda
PR
5192 kref_put(&mdev->kref, &drbd_minor_destroy);
5193 rcu_read_lock();
082a3439 5194 }
32862ec7 5195 set_bit(SIGNAL_ASENDER, &tconn->flags);
082a3439
PR
5196
5197 spin_lock_irq(&tconn->req_lock);
c141ebda 5198 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
082a3439
PR
5199 not_empty = !list_empty(&mdev->done_ee);
5200 if (not_empty)
5201 break;
5202 }
5203 spin_unlock_irq(&tconn->req_lock);
c141ebda 5204 rcu_read_unlock();
32862ec7
PR
5205 } while (not_empty);
5206
5207 return 0;
5208}
5209
7201b972
AG
5210struct asender_cmd {
5211 size_t pkt_size;
1952e916 5212 int (*fn)(struct drbd_tconn *tconn, struct packet_info *);
7201b972
AG
5213};
5214
5215static struct asender_cmd asender_tbl[] = {
e658983a
AG
5216 [P_PING] = { 0, got_Ping },
5217 [P_PING_ACK] = { 0, got_PingAck },
1952e916
AG
5218 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
5219 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
5220 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
5221 [P_DISCARD_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
5222 [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
5223 [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
5224 [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply },
5225 [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
5226 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
5227 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
5228 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
5229 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
5230 [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply },
5231 [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
5232 [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
7201b972
AG
5233};
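/* The designated initializers above leave gaps: any packet type without an
 * entry has .fn == NULL, which drbd_asender() below rejects as an unexpected
 * meta packet. */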
5234
b411b363
PR
5235int drbd_asender(struct drbd_thread *thi)
5236{
392c8801 5237 struct drbd_tconn *tconn = thi->tconn;
b411b363 5238 struct asender_cmd *cmd = NULL;
77351055 5239 struct packet_info pi;
257d0af6 5240 int rv;
e658983a 5241 void *buf = tconn->meta.rbuf;
b411b363 5242 int received = 0;
52b061a4
AG
5243 unsigned int header_size = drbd_header_size(tconn);
5244 int expect = header_size;
44ed167d
PR
5245 bool ping_timeout_active = false;
5246 struct net_conf *nc;
bb77d34e 5247 int ping_timeo, tcp_cork, ping_int;
b411b363 5248
b411b363
PR
5249 current->policy = SCHED_RR; /* Make this a realtime task! */
5250 current->rt_priority = 2; /* more important than all other tasks */
5251
e77a0a5c 5252 while (get_t_state(thi) == RUNNING) {
80822284 5253 drbd_thread_current_set_cpu(thi);
44ed167d
PR
5254
5255 rcu_read_lock();
5256 nc = rcu_dereference(tconn->net_conf);
5257 ping_timeo = nc->ping_timeo;
bb77d34e 5258 tcp_cork = nc->tcp_cork;
44ed167d
PR
5259 ping_int = nc->ping_int;
5260 rcu_read_unlock();
5261
32862ec7 5262 if (test_and_clear_bit(SEND_PING, &tconn->flags)) {
a17647aa 5263 if (drbd_send_ping(tconn)) {
32862ec7 5264 conn_err(tconn, "drbd_send_ping has failed\n");
841ce241
AG
5265 goto reconnect;
5266 }
44ed167d
PR
5267 tconn->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10;
5268 ping_timeout_active = true;
b411b363
PR
5269 }
5270
32862ec7
PR
5271 /* TODO: conditionally cork; it may hurt latency if we cork without
5272 much to send */
bb77d34e 5273 if (tcp_cork)
32862ec7 5274 drbd_tcp_cork(tconn->meta.socket);
a990be46
AG
5275 if (tconn_finish_peer_reqs(tconn)) {
5276 conn_err(tconn, "tconn_finish_peer_reqs() failed\n");
32862ec7 5277 goto reconnect;
082a3439 5278 }
b411b363 5279 /* but unconditionally uncork unless disabled */
bb77d34e 5280 if (tcp_cork)
32862ec7 5281 drbd_tcp_uncork(tconn->meta.socket);
b411b363
PR
5282
5283 /* short circuit, recv_msg would return EINTR anyways. */
5284 if (signal_pending(current))
5285 continue;
5286
32862ec7
PR
5287 rv = drbd_recv_short(tconn->meta.socket, buf, expect-received, 0);
5288 clear_bit(SIGNAL_ASENDER, &tconn->flags);
b411b363
PR
5289
5290 flush_signals(current);
5291
5292 /* Note:
5293 * -EINTR (on meta) we got a signal
5294 * -EAGAIN (on meta) rcvtimeo expired
5295 * -ECONNRESET other side closed the connection
5296 * -ERESTARTSYS (on data) we got a signal
5297 * rv < 0 other than above: unexpected error!
5298 * rv == expected: full header or command
5299 * rv < expected: "woken" by signal during receive
5300 * rv == 0 : "connection shut down by peer"
5301 */
5302 if (likely(rv > 0)) {
5303 received += rv;
5304 buf += rv;
5305 } else if (rv == 0) {
32862ec7 5306 conn_err(tconn, "meta connection shut down by peer.\n");
b411b363
PR
5307 goto reconnect;
5308 } else if (rv == -EAGAIN) {
cb6518cb
LE
5309 /* If the data socket received something meanwhile,
5310 * that is good enough: peer is still alive. */
32862ec7
PR
5311 if (time_after(tconn->last_received,
5312 jiffies - tconn->meta.socket->sk->sk_rcvtimeo))
cb6518cb 5313 continue;
f36af18c 5314 if (ping_timeout_active) {
32862ec7 5315 conn_err(tconn, "PingAck did not arrive in time.\n");
b411b363
PR
5316 goto reconnect;
5317 }
32862ec7 5318 set_bit(SEND_PING, &tconn->flags);
b411b363
PR
5319 continue;
5320 } else if (rv == -EINTR) {
5321 continue;
5322 } else {
32862ec7 5323 conn_err(tconn, "sock_recvmsg returned %d\n", rv);
b411b363
PR
5324 goto reconnect;
5325 }
5326
5327 if (received == expect && cmd == NULL) {
e658983a 5328 if (decode_header(tconn, tconn->meta.rbuf, &pi))
b411b363 5329 goto reconnect;
7201b972 5330 cmd = &asender_tbl[pi.cmd];
1952e916 5331 if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) {
2fcb8f30
AG
5332 conn_err(tconn, "Unexpected meta packet %s (0x%04x)\n",
5333 cmdname(pi.cmd), pi.cmd);
b411b363
PR
5334 goto disconnect;
5335 }
e658983a 5336 expect = header_size + cmd->pkt_size;
52b061a4 5337 if (pi.size != expect - header_size) {
32862ec7 5338 conn_err(tconn, "Wrong packet size on meta (c: %d, l: %d)\n",
77351055 5339 pi.cmd, pi.size);
b411b363 5340 goto reconnect;
257d0af6 5341 }
b411b363
PR
5342 }
5343 if (received == expect) {
2735a594 5344 bool err;
a4fbda8e 5345
2735a594
AG
5346 err = cmd->fn(tconn, &pi);
5347 if (err) {
1952e916 5348 conn_err(tconn, "%pf failed\n", cmd->fn);
b411b363 5349 goto reconnect;
1952e916 5350 }
b411b363 5351
a4fbda8e
PR
5352 tconn->last_received = jiffies;
5353
44ed167d
PR
5354 if (cmd == &asender_tbl[P_PING_ACK]) {
5355 /* restore idle timeout */
5356 tconn->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
5357 ping_timeout_active = false;
5358 }
f36af18c 5359
e658983a 5360 buf = tconn->meta.rbuf;
b411b363 5361 received = 0;
52b061a4 5362 expect = header_size;
b411b363
PR
5363 cmd = NULL;
5364 }
5365 }
5366
5367 if (0) {
5368reconnect:
bbeb641c 5369 conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
b411b363
PR
5370 }
5371 if (0) {
5372disconnect:
bbeb641c 5373 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363 5374 }
32862ec7 5375 clear_bit(SIGNAL_ASENDER, &tconn->flags);
b411b363 5376
32862ec7 5377 conn_info(tconn, "asender terminated\n");
b411b363
PR
5378
5379 return 0;
5380}