drbd: pass some more information to userspace.
[deliverable/linux.git] / drivers / block / drbd / drbd_receiver.c
CommitLineData
b411b363
PR
1/*
2 drbd_receiver.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25
b411b363
PR
26#include <linux/module.h>
27
28#include <asm/uaccess.h>
29#include <net/sock.h>
30
b411b363
PR
31#include <linux/drbd.h>
32#include <linux/fs.h>
33#include <linux/file.h>
34#include <linux/in.h>
35#include <linux/mm.h>
36#include <linux/memcontrol.h>
37#include <linux/mm_inline.h>
38#include <linux/slab.h>
b411b363
PR
39#include <linux/pkt_sched.h>
40#define __KERNEL_SYSCALLS__
41#include <linux/unistd.h>
42#include <linux/vmalloc.h>
43#include <linux/random.h>
b411b363
PR
44#include <linux/string.h>
45#include <linux/scatterlist.h>
46#include "drbd_int.h"
b411b363
PR
47#include "drbd_req.h"
48
49#include "drbd_vli.h"
50
77351055
PR
51struct packet_info {
52 enum drbd_packet cmd;
e2857216
AG
53 unsigned int size;
54 unsigned int vnr;
e658983a 55 void *data;
77351055
PR
56};
57
b411b363
PR
58enum finish_epoch {
59 FE_STILL_LIVE,
60 FE_DESTROYED,
61 FE_RECYCLED,
62};
63
6038178e 64static int drbd_do_features(struct drbd_tconn *tconn);
13e6037d 65static int drbd_do_auth(struct drbd_tconn *tconn);
c141ebda 66static int drbd_disconnected(struct drbd_conf *mdev);
b411b363 67
1e9dd291 68static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *, struct drbd_epoch *, enum epoch_event);
99920dc5 69static int e_end_block(struct drbd_work *, int);
b411b363 70
b411b363
PR
71
72#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
73
45bb912b
LE
74/*
75 * some helper functions to deal with single linked page lists,
76 * page->private being our "next" pointer.
77 */
78
79/* If at least n pages are linked at head, get n pages off.
80 * Otherwise, don't modify head, and return NULL.
81 * Locking is the responsibility of the caller.
82 */
83static struct page *page_chain_del(struct page **head, int n)
84{
85 struct page *page;
86 struct page *tmp;
87
88 BUG_ON(!n);
89 BUG_ON(!head);
90
91 page = *head;
23ce4227
PR
92
93 if (!page)
94 return NULL;
95
45bb912b
LE
96 while (page) {
97 tmp = page_chain_next(page);
98 if (--n == 0)
99 break; /* found sufficient pages */
100 if (tmp == NULL)
101 /* insufficient pages, don't use any of them. */
102 return NULL;
103 page = tmp;
104 }
105
106 /* add end of list marker for the returned list */
107 set_page_private(page, 0);
108 /* actual return value, and adjustment of head */
109 page = *head;
110 *head = tmp;
111 return page;
112}
113
114/* may be used outside of locks to find the tail of a (usually short)
115 * "private" page chain, before adding it back to a global chain head
116 * with page_chain_add() under a spinlock. */
117static struct page *page_chain_tail(struct page *page, int *len)
118{
119 struct page *tmp;
120 int i = 1;
121 while ((tmp = page_chain_next(page)))
122 ++i, page = tmp;
123 if (len)
124 *len = i;
125 return page;
126}
127
128static int page_chain_free(struct page *page)
129{
130 struct page *tmp;
131 int i = 0;
132 page_chain_for_each_safe(page, tmp) {
133 put_page(page);
134 ++i;
135 }
136 return i;
137}
138
139static void page_chain_add(struct page **head,
140 struct page *chain_first, struct page *chain_last)
141{
142#if 1
143 struct page *tmp;
144 tmp = page_chain_tail(chain_first, NULL);
145 BUG_ON(tmp != chain_last);
146#endif
147
148 /* add chain to head */
149 set_page_private(chain_last, (unsigned long)*head);
150 *head = chain_first;
151}
152
18c2d522
AG
153static struct page *__drbd_alloc_pages(struct drbd_conf *mdev,
154 unsigned int number)
b411b363
PR
155{
156 struct page *page = NULL;
45bb912b 157 struct page *tmp = NULL;
18c2d522 158 unsigned int i = 0;
b411b363
PR
159
160 /* Yes, testing drbd_pp_vacant outside the lock is racy.
161 * So what. It saves a spin_lock. */
45bb912b 162 if (drbd_pp_vacant >= number) {
b411b363 163 spin_lock(&drbd_pp_lock);
45bb912b
LE
164 page = page_chain_del(&drbd_pp_pool, number);
165 if (page)
166 drbd_pp_vacant -= number;
b411b363 167 spin_unlock(&drbd_pp_lock);
45bb912b
LE
168 if (page)
169 return page;
b411b363 170 }
45bb912b 171
b411b363
PR
172 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
173 * "criss-cross" setup, that might cause write-out on some other DRBD,
174 * which in turn might block on the other node at this very place. */
45bb912b
LE
175 for (i = 0; i < number; i++) {
176 tmp = alloc_page(GFP_TRY);
177 if (!tmp)
178 break;
179 set_page_private(tmp, (unsigned long)page);
180 page = tmp;
181 }
182
183 if (i == number)
184 return page;
185
186 /* Not enough pages immediately available this time.
c37c8ecf 187 * No need to jump around here, drbd_alloc_pages will retry this
45bb912b
LE
188 * function "soon". */
189 if (page) {
190 tmp = page_chain_tail(page, NULL);
191 spin_lock(&drbd_pp_lock);
192 page_chain_add(&drbd_pp_pool, page, tmp);
193 drbd_pp_vacant += i;
194 spin_unlock(&drbd_pp_lock);
195 }
196 return NULL;
b411b363
PR
197}
198
a990be46
AG
199static void reclaim_finished_net_peer_reqs(struct drbd_conf *mdev,
200 struct list_head *to_be_freed)
b411b363 201{
db830c46 202 struct drbd_peer_request *peer_req;
b411b363
PR
203 struct list_head *le, *tle;
204
205 /* The EEs are always appended to the end of the list. Since
206 they are sent in order over the wire, they have to finish
207 in order. As soon as we see the first not finished we can
208 stop to examine the list... */
209
210 list_for_each_safe(le, tle, &mdev->net_ee) {
db830c46 211 peer_req = list_entry(le, struct drbd_peer_request, w.list);
045417f7 212 if (drbd_peer_req_has_active_page(peer_req))
b411b363
PR
213 break;
214 list_move(le, to_be_freed);
215 }
216}
217
218static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
219{
220 LIST_HEAD(reclaimed);
db830c46 221 struct drbd_peer_request *peer_req, *t;
b411b363 222
87eeee41 223 spin_lock_irq(&mdev->tconn->req_lock);
a990be46 224 reclaim_finished_net_peer_reqs(mdev, &reclaimed);
87eeee41 225 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 226
db830c46 227 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
3967deb1 228 drbd_free_net_peer_req(mdev, peer_req);
b411b363
PR
229}
230
231/**
c37c8ecf 232 * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
b411b363 233 * @mdev: DRBD device.
45bb912b
LE
234 * @number: number of pages requested
235 * @retry: whether to retry, if not enough pages are available right now
236 *
237 * Tries to allocate number pages, first from our own page pool, then from
238 * the kernel, unless this allocation would exceed the max_buffers setting.
239 * Possibly retry until DRBD frees sufficient pages somewhere else.
b411b363 240 *
45bb912b 241 * Returns a page chain linked via page->private.
b411b363 242 */
c37c8ecf
AG
243struct page *drbd_alloc_pages(struct drbd_conf *mdev, unsigned int number,
244 bool retry)
b411b363
PR
245{
246 struct page *page = NULL;
44ed167d 247 struct net_conf *nc;
b411b363 248 DEFINE_WAIT(wait);
44ed167d 249 int mxb;
b411b363 250
45bb912b
LE
251 /* Yes, we may run up to @number over max_buffers. If we
252 * follow it strictly, the admin will get it wrong anyways. */
44ed167d
PR
253 rcu_read_lock();
254 nc = rcu_dereference(mdev->tconn->net_conf);
255 mxb = nc ? nc->max_buffers : 1000000;
256 rcu_read_unlock();
257
258 if (atomic_read(&mdev->pp_in_use) < mxb)
18c2d522 259 page = __drbd_alloc_pages(mdev, number);
b411b363 260
45bb912b 261 while (page == NULL) {
b411b363
PR
262 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
263
264 drbd_kick_lo_and_reclaim_net(mdev);
265
44ed167d 266 if (atomic_read(&mdev->pp_in_use) < mxb) {
18c2d522 267 page = __drbd_alloc_pages(mdev, number);
b411b363
PR
268 if (page)
269 break;
270 }
271
272 if (!retry)
273 break;
274
275 if (signal_pending(current)) {
c37c8ecf 276 dev_warn(DEV, "drbd_alloc_pages interrupted!\n");
b411b363
PR
277 break;
278 }
279
280 schedule();
281 }
282 finish_wait(&drbd_pp_wait, &wait);
283
45bb912b
LE
284 if (page)
285 atomic_add(number, &mdev->pp_in_use);
b411b363
PR
286 return page;
287}
288
c37c8ecf 289/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
87eeee41 290 * Is also used from inside an other spin_lock_irq(&mdev->tconn->req_lock);
45bb912b
LE
291 * Either links the page chain back to the global pool,
292 * or returns all pages to the system. */
5cc287e0 293static void drbd_free_pages(struct drbd_conf *mdev, struct page *page, int is_net)
b411b363 294{
435f0740 295 atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
b411b363 296 int i;
435f0740 297
81a3537a
LE
298 if (page == NULL)
299 return;
300
81a5d60e 301 if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
45bb912b
LE
302 i = page_chain_free(page);
303 else {
304 struct page *tmp;
305 tmp = page_chain_tail(page, &i);
306 spin_lock(&drbd_pp_lock);
307 page_chain_add(&drbd_pp_pool, page, tmp);
308 drbd_pp_vacant += i;
309 spin_unlock(&drbd_pp_lock);
b411b363 310 }
435f0740 311 i = atomic_sub_return(i, a);
45bb912b 312 if (i < 0)
435f0740
LE
313 dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
314 is_net ? "pp_in_use_by_net" : "pp_in_use", i);
b411b363
PR
315 wake_up(&drbd_pp_wait);
316}
317
318/*
319You need to hold the req_lock:
320 _drbd_wait_ee_list_empty()
321
322You must not have the req_lock:
3967deb1 323 drbd_free_peer_req()
0db55363 324 drbd_alloc_peer_req()
7721f567 325 drbd_free_peer_reqs()
b411b363 326 drbd_ee_fix_bhs()
a990be46 327 drbd_finish_peer_reqs()
b411b363
PR
328 drbd_clear_done_ee()
329 drbd_wait_ee_list_empty()
330*/
331
f6ffca9f 332struct drbd_peer_request *
0db55363
AG
333drbd_alloc_peer_req(struct drbd_conf *mdev, u64 id, sector_t sector,
334 unsigned int data_size, gfp_t gfp_mask) __must_hold(local)
b411b363 335{
db830c46 336 struct drbd_peer_request *peer_req;
81a3537a 337 struct page *page = NULL;
45bb912b 338 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
b411b363 339
0cf9d27e 340 if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
b411b363
PR
341 return NULL;
342
db830c46
AG
343 peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
344 if (!peer_req) {
b411b363 345 if (!(gfp_mask & __GFP_NOWARN))
0db55363 346 dev_err(DEV, "%s: allocation failed\n", __func__);
b411b363
PR
347 return NULL;
348 }
349
81a3537a
LE
350 if (data_size) {
351 page = drbd_alloc_pages(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
352 if (!page)
353 goto fail;
354 }
b411b363 355
db830c46
AG
356 drbd_clear_interval(&peer_req->i);
357 peer_req->i.size = data_size;
358 peer_req->i.sector = sector;
359 peer_req->i.local = false;
360 peer_req->i.waiting = false;
361
362 peer_req->epoch = NULL;
a21e9298 363 peer_req->w.mdev = mdev;
db830c46
AG
364 peer_req->pages = page;
365 atomic_set(&peer_req->pending_bios, 0);
366 peer_req->flags = 0;
9a8e7753
AG
367 /*
368 * The block_id is opaque to the receiver. It is not endianness
369 * converted, and sent back to the sender unchanged.
370 */
db830c46 371 peer_req->block_id = id;
b411b363 372
db830c46 373 return peer_req;
b411b363 374
45bb912b 375 fail:
db830c46 376 mempool_free(peer_req, drbd_ee_mempool);
b411b363
PR
377 return NULL;
378}
379
3967deb1 380void __drbd_free_peer_req(struct drbd_conf *mdev, struct drbd_peer_request *peer_req,
f6ffca9f 381 int is_net)
b411b363 382{
db830c46
AG
383 if (peer_req->flags & EE_HAS_DIGEST)
384 kfree(peer_req->digest);
5cc287e0 385 drbd_free_pages(mdev, peer_req->pages, is_net);
db830c46
AG
386 D_ASSERT(atomic_read(&peer_req->pending_bios) == 0);
387 D_ASSERT(drbd_interval_empty(&peer_req->i));
388 mempool_free(peer_req, drbd_ee_mempool);
b411b363
PR
389}
390
7721f567 391int drbd_free_peer_reqs(struct drbd_conf *mdev, struct list_head *list)
b411b363
PR
392{
393 LIST_HEAD(work_list);
db830c46 394 struct drbd_peer_request *peer_req, *t;
b411b363 395 int count = 0;
435f0740 396 int is_net = list == &mdev->net_ee;
b411b363 397
87eeee41 398 spin_lock_irq(&mdev->tconn->req_lock);
b411b363 399 list_splice_init(list, &work_list);
87eeee41 400 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 401
db830c46 402 list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
3967deb1 403 __drbd_free_peer_req(mdev, peer_req, is_net);
b411b363
PR
404 count++;
405 }
406 return count;
407}
408
a990be46
AG
409/*
410 * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
b411b363 411 */
a990be46 412static int drbd_finish_peer_reqs(struct drbd_conf *mdev)
b411b363
PR
413{
414 LIST_HEAD(work_list);
415 LIST_HEAD(reclaimed);
db830c46 416 struct drbd_peer_request *peer_req, *t;
e2b3032b 417 int err = 0;
b411b363 418
87eeee41 419 spin_lock_irq(&mdev->tconn->req_lock);
a990be46 420 reclaim_finished_net_peer_reqs(mdev, &reclaimed);
b411b363 421 list_splice_init(&mdev->done_ee, &work_list);
87eeee41 422 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 423
db830c46 424 list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
3967deb1 425 drbd_free_net_peer_req(mdev, peer_req);
b411b363
PR
426
427 /* possible callbacks here:
7be8da07 428 * e_end_block, and e_end_resync_block, e_send_discard_write.
b411b363
PR
429 * all ignore the last argument.
430 */
db830c46 431 list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
e2b3032b
AG
432 int err2;
433
b411b363 434 /* list_del not necessary, next/prev members not touched */
e2b3032b
AG
435 err2 = peer_req->w.cb(&peer_req->w, !!err);
436 if (!err)
437 err = err2;
3967deb1 438 drbd_free_peer_req(mdev, peer_req);
b411b363
PR
439 }
440 wake_up(&mdev->ee_wait);
441
e2b3032b 442 return err;
b411b363
PR
443}
444
d4da1537
AG
445static void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
446 struct list_head *head)
b411b363
PR
447{
448 DEFINE_WAIT(wait);
449
450 /* avoids spin_lock/unlock
451 * and calling prepare_to_wait in the fast path */
452 while (!list_empty(head)) {
453 prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
87eeee41 454 spin_unlock_irq(&mdev->tconn->req_lock);
7eaceacc 455 io_schedule();
b411b363 456 finish_wait(&mdev->ee_wait, &wait);
87eeee41 457 spin_lock_irq(&mdev->tconn->req_lock);
b411b363
PR
458 }
459}
460
d4da1537
AG
461static void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
462 struct list_head *head)
b411b363 463{
87eeee41 464 spin_lock_irq(&mdev->tconn->req_lock);
b411b363 465 _drbd_wait_ee_list_empty(mdev, head);
87eeee41 466 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
467}
468
dbd9eea0 469static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
b411b363
PR
470{
471 mm_segment_t oldfs;
472 struct kvec iov = {
473 .iov_base = buf,
474 .iov_len = size,
475 };
476 struct msghdr msg = {
477 .msg_iovlen = 1,
478 .msg_iov = (struct iovec *)&iov,
479 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
480 };
481 int rv;
482
483 oldfs = get_fs();
484 set_fs(KERNEL_DS);
485 rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
486 set_fs(oldfs);
487
488 return rv;
489}
490
de0ff338 491static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size)
b411b363
PR
492{
493 mm_segment_t oldfs;
494 struct kvec iov = {
495 .iov_base = buf,
496 .iov_len = size,
497 };
498 struct msghdr msg = {
499 .msg_iovlen = 1,
500 .msg_iov = (struct iovec *)&iov,
501 .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
502 };
503 int rv;
504
505 oldfs = get_fs();
506 set_fs(KERNEL_DS);
507
508 for (;;) {
de0ff338 509 rv = sock_recvmsg(tconn->data.socket, &msg, size, msg.msg_flags);
b411b363
PR
510 if (rv == size)
511 break;
512
513 /* Note:
514 * ECONNRESET other side closed the connection
515 * ERESTARTSYS (on sock) we got a signal
516 */
517
518 if (rv < 0) {
519 if (rv == -ECONNRESET)
de0ff338 520 conn_info(tconn, "sock was reset by peer\n");
b411b363 521 else if (rv != -ERESTARTSYS)
de0ff338 522 conn_err(tconn, "sock_recvmsg returned %d\n", rv);
b411b363
PR
523 break;
524 } else if (rv == 0) {
de0ff338 525 conn_info(tconn, "sock was shut down by peer\n");
b411b363
PR
526 break;
527 } else {
528 /* signal came in, or peer/link went down,
529 * after we read a partial message
530 */
531 /* D_ASSERT(signal_pending(current)); */
532 break;
533 }
534 };
535
536 set_fs(oldfs);
537
538 if (rv != size)
bbeb641c 539 conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
b411b363
PR
540
541 return rv;
542}
543
c6967746
AG
544static int drbd_recv_all(struct drbd_tconn *tconn, void *buf, size_t size)
545{
546 int err;
547
548 err = drbd_recv(tconn, buf, size);
549 if (err != size) {
550 if (err >= 0)
551 err = -EIO;
552 } else
553 err = 0;
554 return err;
555}
556
a5c31904
AG
557static int drbd_recv_all_warn(struct drbd_tconn *tconn, void *buf, size_t size)
558{
559 int err;
560
561 err = drbd_recv_all(tconn, buf, size);
562 if (err && !signal_pending(current))
563 conn_warn(tconn, "short read (expected size %d)\n", (int)size);
564 return err;
565}
566
5dbf1673
LE
567/* quoting tcp(7):
568 * On individual connections, the socket buffer size must be set prior to the
569 * listen(2) or connect(2) calls in order to have it take effect.
570 * This is our wrapper to do so.
571 */
572static void drbd_setbufsize(struct socket *sock, unsigned int snd,
573 unsigned int rcv)
574{
575 /* open coded SO_SNDBUF, SO_RCVBUF */
576 if (snd) {
577 sock->sk->sk_sndbuf = snd;
578 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
579 }
580 if (rcv) {
581 sock->sk->sk_rcvbuf = rcv;
582 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
583 }
584}
585
eac3e990 586static struct socket *drbd_try_connect(struct drbd_tconn *tconn)
b411b363
PR
587{
588 const char *what;
589 struct socket *sock;
590 struct sockaddr_in6 src_in6;
44ed167d
PR
591 struct sockaddr_in6 peer_in6;
592 struct net_conf *nc;
593 int err, peer_addr_len, my_addr_len;
69ef82de 594 int sndbuf_size, rcvbuf_size, connect_int;
b411b363
PR
595 int disconnect_on_error = 1;
596
44ed167d
PR
597 rcu_read_lock();
598 nc = rcu_dereference(tconn->net_conf);
599 if (!nc) {
600 rcu_read_unlock();
b411b363 601 return NULL;
44ed167d 602 }
44ed167d
PR
603 sndbuf_size = nc->sndbuf_size;
604 rcvbuf_size = nc->rcvbuf_size;
69ef82de 605 connect_int = nc->connect_int;
089c075d 606 rcu_read_unlock();
44ed167d 607
089c075d
AG
608 my_addr_len = min_t(int, tconn->my_addr_len, sizeof(src_in6));
609 memcpy(&src_in6, &tconn->my_addr, my_addr_len);
44ed167d 610
089c075d 611 if (((struct sockaddr *)&tconn->my_addr)->sa_family == AF_INET6)
44ed167d
PR
612 src_in6.sin6_port = 0;
613 else
614 ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
615
089c075d
AG
616 peer_addr_len = min_t(int, tconn->peer_addr_len, sizeof(src_in6));
617 memcpy(&peer_in6, &tconn->peer_addr, peer_addr_len);
b411b363
PR
618
619 what = "sock_create_kern";
44ed167d
PR
620 err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family,
621 SOCK_STREAM, IPPROTO_TCP, &sock);
b411b363
PR
622 if (err < 0) {
623 sock = NULL;
624 goto out;
625 }
626
627 sock->sk->sk_rcvtimeo =
69ef82de 628 sock->sk->sk_sndtimeo = connect_int * HZ;
44ed167d 629 drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
b411b363
PR
630
631 /* explicitly bind to the configured IP as source IP
632 * for the outgoing connections.
633 * This is needed for multihomed hosts and to be
634 * able to use lo: interfaces for drbd.
635 * Make sure to use 0 as port number, so linux selects
636 * a free one dynamically.
637 */
b411b363 638 what = "bind before connect";
44ed167d 639 err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
b411b363
PR
640 if (err < 0)
641 goto out;
642
643 /* connect may fail, peer not yet available.
644 * stay C_WF_CONNECTION, don't go Disconnecting! */
645 disconnect_on_error = 0;
646 what = "connect";
44ed167d 647 err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
b411b363
PR
648
649out:
650 if (err < 0) {
651 if (sock) {
652 sock_release(sock);
653 sock = NULL;
654 }
655 switch (-err) {
656 /* timeout, busy, signal pending */
657 case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
658 case EINTR: case ERESTARTSYS:
659 /* peer not (yet) available, network problem */
660 case ECONNREFUSED: case ENETUNREACH:
661 case EHOSTDOWN: case EHOSTUNREACH:
662 disconnect_on_error = 0;
663 break;
664 default:
eac3e990 665 conn_err(tconn, "%s failed, err = %d\n", what, err);
b411b363
PR
666 }
667 if (disconnect_on_error)
bbeb641c 668 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363 669 }
44ed167d 670
b411b363
PR
671 return sock;
672}
673
7a426fd8
PR
674struct accept_wait_data {
675 struct drbd_tconn *tconn;
676 struct socket *s_listen;
677 struct completion door_bell;
678 void (*original_sk_state_change)(struct sock *sk);
679
680};
681
682static void incomming_connection(struct sock *sk)
683{
684 struct accept_wait_data *ad = sk->sk_user_data;
685 struct drbd_tconn *tconn = ad->tconn;
686
687 if (sk->sk_state != TCP_ESTABLISHED)
688 conn_warn(tconn, "unexpected tcp state change. sk_state = %d\n", sk->sk_state);
689
690 write_lock_bh(&sk->sk_callback_lock);
691 sk->sk_state_change = ad->original_sk_state_change;
692 sk->sk_user_data = NULL;
693 write_unlock_bh(&sk->sk_callback_lock);
694
695 sk->sk_state_change(sk);
696 complete(&ad->door_bell);
697}
698
699static int prepare_listen_socket(struct drbd_tconn *tconn, struct accept_wait_data *ad)
b411b363 700{
1f3e509b 701 int err, sndbuf_size, rcvbuf_size, my_addr_len;
44ed167d 702 struct sockaddr_in6 my_addr;
1f3e509b 703 struct socket *s_listen;
44ed167d 704 struct net_conf *nc;
b411b363
PR
705 const char *what;
706
44ed167d
PR
707 rcu_read_lock();
708 nc = rcu_dereference(tconn->net_conf);
709 if (!nc) {
710 rcu_read_unlock();
7a426fd8 711 return -EIO;
44ed167d 712 }
44ed167d
PR
713 sndbuf_size = nc->sndbuf_size;
714 rcvbuf_size = nc->rcvbuf_size;
44ed167d 715 rcu_read_unlock();
b411b363 716
089c075d
AG
717 my_addr_len = min_t(int, tconn->my_addr_len, sizeof(struct sockaddr_in6));
718 memcpy(&my_addr, &tconn->my_addr, my_addr_len);
719
b411b363 720 what = "sock_create_kern";
44ed167d 721 err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family,
1f3e509b 722 SOCK_STREAM, IPPROTO_TCP, &s_listen);
b411b363
PR
723 if (err) {
724 s_listen = NULL;
725 goto out;
726 }
727
1f3e509b 728 s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
44ed167d 729 drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
b411b363
PR
730
731 what = "bind before listen";
44ed167d 732 err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
b411b363
PR
733 if (err < 0)
734 goto out;
735
7a426fd8
PR
736 ad->s_listen = s_listen;
737 write_lock_bh(&s_listen->sk->sk_callback_lock);
738 ad->original_sk_state_change = s_listen->sk->sk_state_change;
739 s_listen->sk->sk_state_change = incomming_connection;
740 s_listen->sk->sk_user_data = ad;
741 write_unlock_bh(&s_listen->sk->sk_callback_lock);
742
2820fd39
PR
743 what = "listen";
744 err = s_listen->ops->listen(s_listen, 5);
745 if (err < 0)
746 goto out;
747
7a426fd8 748 return 0;
1f3e509b
PR
749out:
750 if (s_listen)
751 sock_release(s_listen);
752 if (err < 0) {
753 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
754 conn_err(tconn, "%s failed, err = %d\n", what, err);
755 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
756 }
757 }
758
7a426fd8 759 return -EIO;
1f3e509b
PR
760}
761
7a426fd8 762static struct socket *drbd_wait_for_connect(struct drbd_tconn *tconn, struct accept_wait_data *ad)
1f3e509b
PR
763{
764 int timeo, connect_int, err = 0;
765 struct socket *s_estab = NULL;
1f3e509b
PR
766 struct net_conf *nc;
767
768 rcu_read_lock();
769 nc = rcu_dereference(tconn->net_conf);
770 if (!nc) {
771 rcu_read_unlock();
772 return NULL;
773 }
774 connect_int = nc->connect_int;
775 rcu_read_unlock();
776
777 timeo = connect_int * HZ;
778 timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
779
7a426fd8
PR
780 err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
781 if (err <= 0)
782 return NULL;
b411b363 783
7a426fd8 784 err = kernel_accept(ad->s_listen, &s_estab, 0);
b411b363
PR
785 if (err < 0) {
786 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
1f3e509b 787 conn_err(tconn, "accept failed, err = %d\n", err);
bbeb641c 788 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363
PR
789 }
790 }
b411b363
PR
791
792 return s_estab;
793}
794
e658983a 795static int decode_header(struct drbd_tconn *, void *, struct packet_info *);
b411b363 796
9f5bdc33
AG
797static int send_first_packet(struct drbd_tconn *tconn, struct drbd_socket *sock,
798 enum drbd_packet cmd)
799{
800 if (!conn_prepare_command(tconn, sock))
801 return -EIO;
e658983a 802 return conn_send_command(tconn, sock, cmd, 0, NULL, 0);
b411b363
PR
803}
804
9f5bdc33 805static int receive_first_packet(struct drbd_tconn *tconn, struct socket *sock)
b411b363 806{
9f5bdc33
AG
807 unsigned int header_size = drbd_header_size(tconn);
808 struct packet_info pi;
809 int err;
b411b363 810
9f5bdc33
AG
811 err = drbd_recv_short(sock, tconn->data.rbuf, header_size, 0);
812 if (err != header_size) {
813 if (err >= 0)
814 err = -EIO;
815 return err;
816 }
817 err = decode_header(tconn, tconn->data.rbuf, &pi);
818 if (err)
819 return err;
820 return pi.cmd;
b411b363
PR
821}
822
823/**
824 * drbd_socket_okay() - Free the socket if its connection is not okay
b411b363
PR
825 * @sock: pointer to the pointer to the socket.
826 */
dbd9eea0 827static int drbd_socket_okay(struct socket **sock)
b411b363
PR
828{
829 int rr;
830 char tb[4];
831
832 if (!*sock)
81e84650 833 return false;
b411b363 834
dbd9eea0 835 rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
b411b363
PR
836
837 if (rr > 0 || rr == -EAGAIN) {
81e84650 838 return true;
b411b363
PR
839 } else {
840 sock_release(*sock);
841 *sock = NULL;
81e84650 842 return false;
b411b363
PR
843 }
844}
2325eb66
PR
845/* Gets called if a connection is established, or if a new minor gets created
846 in a connection */
c141ebda 847int drbd_connected(struct drbd_conf *mdev)
907599e0 848{
0829f5ed 849 int err;
907599e0
PR
850
851 atomic_set(&mdev->packet_seq, 0);
852 mdev->peer_seq = 0;
853
8410da8f
PR
854 mdev->state_mutex = mdev->tconn->agreed_pro_version < 100 ?
855 &mdev->tconn->cstate_mutex :
856 &mdev->own_state_mutex;
857
0829f5ed
AG
858 err = drbd_send_sync_param(mdev);
859 if (!err)
860 err = drbd_send_sizes(mdev, 0, 0);
861 if (!err)
862 err = drbd_send_uuids(mdev);
863 if (!err)
43de7c85 864 err = drbd_send_current_state(mdev);
907599e0
PR
865 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
866 clear_bit(RESIZE_PENDING, &mdev->flags);
8b924f1d 867 mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
0829f5ed 868 return err;
907599e0
PR
869}
870
b411b363
PR
871/*
872 * return values:
873 * 1 yes, we have a valid connection
874 * 0 oops, did not work out, please try again
875 * -1 peer talks different language,
876 * no point in trying again, please go standalone.
877 * -2 We do not have a network config...
878 */
81fa2e67 879static int conn_connect(struct drbd_tconn *tconn)
b411b363 880{
7da35862 881 struct drbd_socket sock, msock;
c141ebda 882 struct drbd_conf *mdev;
44ed167d 883 struct net_conf *nc;
c141ebda 884 int vnr, timeout, try, h, ok;
08b165ba 885 bool discard_my_data;
a1096a6e 886 enum drbd_state_rv rv;
7a426fd8
PR
887 struct accept_wait_data ad = {
888 .tconn = tconn,
889 .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
890 };
b411b363 891
bbeb641c 892 if (conn_request_state(tconn, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
b411b363
PR
893 return -2;
894
7da35862
PR
895 mutex_init(&sock.mutex);
896 sock.sbuf = tconn->data.sbuf;
897 sock.rbuf = tconn->data.rbuf;
898 sock.socket = NULL;
899 mutex_init(&msock.mutex);
900 msock.sbuf = tconn->meta.sbuf;
901 msock.rbuf = tconn->meta.rbuf;
902 msock.socket = NULL;
903
907599e0 904 clear_bit(DISCARD_CONCURRENT, &tconn->flags);
0916e0e3
AG
905
906 /* Assume that the peer only understands protocol 80 until we know better. */
907 tconn->agreed_pro_version = 80;
b411b363 908
7a426fd8
PR
909 if (prepare_listen_socket(tconn, &ad))
910 return 0;
911
b411b363 912 do {
2bf89621
AG
913 struct socket *s;
914
b411b363
PR
915 for (try = 0;;) {
916 /* 3 tries, this should take less than a second! */
907599e0 917 s = drbd_try_connect(tconn);
b411b363
PR
918 if (s || ++try >= 3)
919 break;
920 /* give the other side time to call bind() & listen() */
20ee6390 921 schedule_timeout_interruptible(HZ / 10);
b411b363
PR
922 }
923
924 if (s) {
7da35862
PR
925 if (!sock.socket) {
926 sock.socket = s;
927 send_first_packet(tconn, &sock, P_INITIAL_DATA);
928 } else if (!msock.socket) {
929 msock.socket = s;
930 send_first_packet(tconn, &msock, P_INITIAL_META);
b411b363 931 } else {
81fa2e67 932 conn_err(tconn, "Logic error in conn_connect()\n");
b411b363
PR
933 goto out_release_sockets;
934 }
935 }
936
7da35862
PR
937 if (sock.socket && msock.socket) {
938 rcu_read_lock();
939 nc = rcu_dereference(tconn->net_conf);
940 timeout = nc->ping_timeo * HZ / 10;
941 rcu_read_unlock();
942 schedule_timeout_interruptible(timeout);
943 ok = drbd_socket_okay(&sock.socket);
944 ok = drbd_socket_okay(&msock.socket) && ok;
b411b363
PR
945 if (ok)
946 break;
947 }
948
949retry:
7a426fd8 950 s = drbd_wait_for_connect(tconn, &ad);
b411b363 951 if (s) {
9f5bdc33 952 try = receive_first_packet(tconn, s);
7da35862
PR
953 drbd_socket_okay(&sock.socket);
954 drbd_socket_okay(&msock.socket);
b411b363 955 switch (try) {
e5d6f33a 956 case P_INITIAL_DATA:
7da35862 957 if (sock.socket) {
907599e0 958 conn_warn(tconn, "initial packet S crossed\n");
7da35862 959 sock_release(sock.socket);
b411b363 960 }
7da35862 961 sock.socket = s;
b411b363 962 break;
e5d6f33a 963 case P_INITIAL_META:
7da35862 964 if (msock.socket) {
907599e0 965 conn_warn(tconn, "initial packet M crossed\n");
7da35862 966 sock_release(msock.socket);
b411b363 967 }
7da35862 968 msock.socket = s;
907599e0 969 set_bit(DISCARD_CONCURRENT, &tconn->flags);
b411b363
PR
970 break;
971 default:
907599e0 972 conn_warn(tconn, "Error receiving initial packet\n");
b411b363
PR
973 sock_release(s);
974 if (random32() & 1)
975 goto retry;
976 }
977 }
978
bbeb641c 979 if (tconn->cstate <= C_DISCONNECTING)
b411b363
PR
980 goto out_release_sockets;
981 if (signal_pending(current)) {
982 flush_signals(current);
983 smp_rmb();
907599e0 984 if (get_t_state(&tconn->receiver) == EXITING)
b411b363
PR
985 goto out_release_sockets;
986 }
987
7da35862
PR
988 if (sock.socket && &msock.socket) {
989 ok = drbd_socket_okay(&sock.socket);
990 ok = drbd_socket_okay(&msock.socket) && ok;
b411b363
PR
991 if (ok)
992 break;
993 }
994 } while (1);
995
7a426fd8
PR
996 if (ad.s_listen)
997 sock_release(ad.s_listen);
998
7da35862
PR
999 sock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */
1000 msock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */
b411b363 1001
7da35862
PR
1002 sock.socket->sk->sk_allocation = GFP_NOIO;
1003 msock.socket->sk->sk_allocation = GFP_NOIO;
b411b363 1004
7da35862
PR
1005 sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
1006 msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
b411b363 1007
b411b363 1008 /* NOT YET ...
7da35862
PR
1009 * sock.socket->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10;
1010 * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
6038178e 1011 * first set it to the P_CONNECTION_FEATURES timeout,
b411b363 1012 * which we set to 4x the configured ping_timeout. */
44ed167d
PR
1013 rcu_read_lock();
1014 nc = rcu_dereference(tconn->net_conf);
1015
7da35862
PR
1016 sock.socket->sk->sk_sndtimeo =
1017 sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
44ed167d 1018
7da35862 1019 msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
44ed167d 1020 timeout = nc->timeout * HZ / 10;
08b165ba 1021 discard_my_data = nc->discard_my_data;
44ed167d 1022 rcu_read_unlock();
b411b363 1023
7da35862 1024 msock.socket->sk->sk_sndtimeo = timeout;
b411b363
PR
1025
1026 /* we don't want delays.
25985edc 1027 * we use TCP_CORK where appropriate, though */
7da35862
PR
1028 drbd_tcp_nodelay(sock.socket);
1029 drbd_tcp_nodelay(msock.socket);
b411b363 1030
7da35862
PR
1031 tconn->data.socket = sock.socket;
1032 tconn->meta.socket = msock.socket;
907599e0 1033 tconn->last_received = jiffies;
b411b363 1034
6038178e 1035 h = drbd_do_features(tconn);
b411b363
PR
1036 if (h <= 0)
1037 return h;
1038
907599e0 1039 if (tconn->cram_hmac_tfm) {
b411b363 1040 /* drbd_request_state(mdev, NS(conn, WFAuth)); */
907599e0 1041 switch (drbd_do_auth(tconn)) {
b10d96cb 1042 case -1:
907599e0 1043 conn_err(tconn, "Authentication of peer failed\n");
b411b363 1044 return -1;
b10d96cb 1045 case 0:
907599e0 1046 conn_err(tconn, "Authentication of peer failed, trying again.\n");
b10d96cb 1047 return 0;
b411b363
PR
1048 }
1049 }
1050
7da35862
PR
1051 tconn->data.socket->sk->sk_sndtimeo = timeout;
1052 tconn->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
b411b363 1053
387eb308 1054 if (drbd_send_protocol(tconn) == -EOPNOTSUPP)
7e2455c1 1055 return -1;
b411b363 1056
a1096a6e
PR
1057 set_bit(STATE_SENT, &tconn->flags);
1058
c141ebda
PR
1059 rcu_read_lock();
1060 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
1061 kref_get(&mdev->kref);
1062 rcu_read_unlock();
08b165ba
PR
1063
1064 if (discard_my_data)
1065 set_bit(DISCARD_MY_DATA, &mdev->flags);
1066 else
1067 clear_bit(DISCARD_MY_DATA, &mdev->flags);
1068
c141ebda
PR
1069 drbd_connected(mdev);
1070 kref_put(&mdev->kref, &drbd_minor_destroy);
1071 rcu_read_lock();
1072 }
1073 rcu_read_unlock();
1074
a1096a6e
PR
1075 rv = conn_request_state(tconn, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
1076 if (rv < SS_SUCCESS) {
1077 clear_bit(STATE_SENT, &tconn->flags);
823bd832 1078 return 0;
a1096a6e 1079 }
823bd832
PR
1080
1081 drbd_thread_start(&tconn->asender);
1082
08b165ba
PR
1083 mutex_lock(&tconn->conf_update);
1084 /* The discard_my_data flag is a single-shot modifier to the next
1085 * connection attempt, the handshake of which is now well underway.
1086 * No need for rcu style copying of the whole struct
1087 * just to clear a single value. */
1088 tconn->net_conf->discard_my_data = 0;
1089 mutex_unlock(&tconn->conf_update);
1090
d3fcb490 1091 return h;
b411b363
PR
1092
1093out_release_sockets:
7a426fd8
PR
1094 if (ad.s_listen)
1095 sock_release(ad.s_listen);
7da35862
PR
1096 if (sock.socket)
1097 sock_release(sock.socket);
1098 if (msock.socket)
1099 sock_release(msock.socket);
b411b363
PR
1100 return -1;
1101}
1102
e658983a 1103static int decode_header(struct drbd_tconn *tconn, void *header, struct packet_info *pi)
b411b363 1104{
e658983a
AG
1105 unsigned int header_size = drbd_header_size(tconn);
1106
0c8e36d9
AG
1107 if (header_size == sizeof(struct p_header100) &&
1108 *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
1109 struct p_header100 *h = header;
1110 if (h->pad != 0) {
1111 conn_err(tconn, "Header padding is not zero\n");
1112 return -EINVAL;
1113 }
1114 pi->vnr = be16_to_cpu(h->volume);
1115 pi->cmd = be16_to_cpu(h->command);
1116 pi->size = be32_to_cpu(h->length);
1117 } else if (header_size == sizeof(struct p_header95) &&
1118 *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
e658983a 1119 struct p_header95 *h = header;
e658983a 1120 pi->cmd = be16_to_cpu(h->command);
b55d84ba
AG
1121 pi->size = be32_to_cpu(h->length);
1122 pi->vnr = 0;
e658983a
AG
1123 } else if (header_size == sizeof(struct p_header80) &&
1124 *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
1125 struct p_header80 *h = header;
1126 pi->cmd = be16_to_cpu(h->command);
1127 pi->size = be16_to_cpu(h->length);
77351055 1128 pi->vnr = 0;
02918be2 1129 } else {
e658983a
AG
1130 conn_err(tconn, "Wrong magic value 0x%08x in protocol version %d\n",
1131 be32_to_cpu(*(__be32 *)header),
1132 tconn->agreed_pro_version);
8172f3e9 1133 return -EINVAL;
b411b363 1134 }
e658983a 1135 pi->data = header + header_size;
8172f3e9 1136 return 0;
257d0af6
PR
1137}
1138
9ba7aa00 1139static int drbd_recv_header(struct drbd_tconn *tconn, struct packet_info *pi)
257d0af6 1140{
e658983a 1141 void *buffer = tconn->data.rbuf;
69bc7bc3 1142 int err;
257d0af6 1143
e658983a 1144 err = drbd_recv_all_warn(tconn, buffer, drbd_header_size(tconn));
a5c31904 1145 if (err)
69bc7bc3 1146 return err;
257d0af6 1147
e658983a 1148 err = decode_header(tconn, buffer, pi);
9ba7aa00 1149 tconn->last_received = jiffies;
b411b363 1150
69bc7bc3 1151 return err;
b411b363
PR
1152}
1153
4b0007c0 1154static void drbd_flush(struct drbd_tconn *tconn)
b411b363
PR
1155{
1156 int rv;
4b0007c0
PR
1157 struct drbd_conf *mdev;
1158 int vnr;
1159
1160 if (tconn->write_ordering >= WO_bdev_flush) {
615e087f 1161 rcu_read_lock();
4b0007c0 1162 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
615e087f
LE
1163 if (!get_ldev(mdev))
1164 continue;
1165 kref_get(&mdev->kref);
1166 rcu_read_unlock();
1167
1168 rv = blkdev_issue_flush(mdev->ldev->backing_bdev,
1169 GFP_NOIO, NULL);
1170 if (rv) {
1171 dev_info(DEV, "local disk flush failed with status %d\n", rv);
1172 /* would rather check on EOPNOTSUPP, but that is not reliable.
1173 * don't try again for ANY return value != 0
1174 * if (rv == -EOPNOTSUPP) */
1175 drbd_bump_write_ordering(tconn, WO_drain_io);
4b0007c0 1176 }
615e087f
LE
1177 put_ldev(mdev);
1178 kref_put(&mdev->kref, &drbd_minor_destroy);
1179
1180 rcu_read_lock();
1181 if (rv)
1182 break;
b411b363 1183 }
615e087f 1184 rcu_read_unlock();
b411b363 1185 }
b411b363
PR
1186}
1187
1188/**
1189 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
1190 * @mdev: DRBD device.
1191 * @epoch: Epoch object.
1192 * @ev: Epoch event.
1193 */
1e9dd291 1194static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn,
b411b363
PR
1195 struct drbd_epoch *epoch,
1196 enum epoch_event ev)
1197{
2451fc3b 1198 int epoch_size;
b411b363 1199 struct drbd_epoch *next_epoch;
b411b363
PR
1200 enum finish_epoch rv = FE_STILL_LIVE;
1201
12038a3a 1202 spin_lock(&tconn->epoch_lock);
b411b363
PR
1203 do {
1204 next_epoch = NULL;
b411b363
PR
1205
1206 epoch_size = atomic_read(&epoch->epoch_size);
1207
1208 switch (ev & ~EV_CLEANUP) {
1209 case EV_PUT:
1210 atomic_dec(&epoch->active);
1211 break;
1212 case EV_GOT_BARRIER_NR:
1213 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
b411b363
PR
1214 break;
1215 case EV_BECAME_LAST:
1216 /* nothing to do*/
1217 break;
1218 }
1219
b411b363
PR
1220 if (epoch_size != 0 &&
1221 atomic_read(&epoch->active) == 0 &&
85d73513 1222 (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
b411b363 1223 if (!(ev & EV_CLEANUP)) {
12038a3a 1224 spin_unlock(&tconn->epoch_lock);
9ed57dcb 1225 drbd_send_b_ack(epoch->tconn, epoch->barrier_nr, epoch_size);
12038a3a 1226 spin_lock(&tconn->epoch_lock);
b411b363 1227 }
9ed57dcb
LE
1228#if 0
1229 /* FIXME: dec unacked on connection, once we have
1230 * something to count pending connection packets in. */
85d73513 1231 if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
9ed57dcb
LE
1232 dec_unacked(epoch->tconn);
1233#endif
b411b363 1234
12038a3a 1235 if (tconn->current_epoch != epoch) {
b411b363
PR
1236 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1237 list_del(&epoch->list);
1238 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
12038a3a 1239 tconn->epochs--;
b411b363
PR
1240 kfree(epoch);
1241
1242 if (rv == FE_STILL_LIVE)
1243 rv = FE_DESTROYED;
1244 } else {
1245 epoch->flags = 0;
1246 atomic_set(&epoch->epoch_size, 0);
698f9315 1247 /* atomic_set(&epoch->active, 0); is already zero */
b411b363
PR
1248 if (rv == FE_STILL_LIVE)
1249 rv = FE_RECYCLED;
1250 }
1251 }
1252
1253 if (!next_epoch)
1254 break;
1255
1256 epoch = next_epoch;
1257 } while (1);
1258
12038a3a 1259 spin_unlock(&tconn->epoch_lock);
b411b363 1260
b411b363
PR
1261 return rv;
1262}
1263
1264/**
1265 * drbd_bump_write_ordering() - Fall back to an other write ordering method
4b0007c0 1266 * @tconn: DRBD connection.
b411b363
PR
1267 * @wo: Write ordering method to try.
1268 */
4b0007c0 1269void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo)
b411b363 1270{
daeda1cc 1271 struct disk_conf *dc;
4b0007c0 1272 struct drbd_conf *mdev;
b411b363 1273 enum write_ordering_e pwo;
4b0007c0 1274 int vnr;
b411b363
PR
1275 static char *write_ordering_str[] = {
1276 [WO_none] = "none",
1277 [WO_drain_io] = "drain",
1278 [WO_bdev_flush] = "flush",
b411b363
PR
1279 };
1280
4b0007c0 1281 pwo = tconn->write_ordering;
b411b363 1282 wo = min(pwo, wo);
daeda1cc 1283 rcu_read_lock();
4b0007c0 1284 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
27eb13e9 1285 if (!get_ldev_if_state(mdev, D_ATTACHING))
4b0007c0
PR
1286 continue;
1287 dc = rcu_dereference(mdev->ldev->disk_conf);
1288
1289 if (wo == WO_bdev_flush && !dc->disk_flushes)
1290 wo = WO_drain_io;
1291 if (wo == WO_drain_io && !dc->disk_drain)
1292 wo = WO_none;
1293 put_ldev(mdev);
1294 }
daeda1cc 1295 rcu_read_unlock();
4b0007c0
PR
1296 tconn->write_ordering = wo;
1297 if (pwo != tconn->write_ordering || wo == WO_bdev_flush)
1298 conn_info(tconn, "Method to ensure write ordering: %s\n", write_ordering_str[tconn->write_ordering]);
b411b363
PR
1299}
1300
45bb912b 1301/**
fbe29dec 1302 * drbd_submit_peer_request()
45bb912b 1303 * @mdev: DRBD device.
db830c46 1304 * @peer_req: peer request
45bb912b 1305 * @rw: flag field, see bio->bi_rw
10f6d992
LE
1306 *
1307 * May spread the pages to multiple bios,
1308 * depending on bio_add_page restrictions.
1309 *
1310 * Returns 0 if all bios have been submitted,
1311 * -ENOMEM if we could not allocate enough bios,
1312 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
1313 * single page to an empty bio (which should never happen and likely indicates
1314 * that the lower level IO stack is in some way broken). This has been observed
1315 * on certain Xen deployments.
45bb912b
LE
1316 */
1317/* TODO allocate from our own bio_set. */
fbe29dec
AG
1318int drbd_submit_peer_request(struct drbd_conf *mdev,
1319 struct drbd_peer_request *peer_req,
1320 const unsigned rw, const int fault_type)
45bb912b
LE
1321{
1322 struct bio *bios = NULL;
1323 struct bio *bio;
db830c46
AG
1324 struct page *page = peer_req->pages;
1325 sector_t sector = peer_req->i.sector;
1326 unsigned ds = peer_req->i.size;
45bb912b
LE
1327 unsigned n_bios = 0;
1328 unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
10f6d992 1329 int err = -ENOMEM;
45bb912b
LE
1330
1331 /* In most cases, we will only need one bio. But in case the lower
1332 * level restrictions happen to be different at this offset on this
1333 * side than those of the sending peer, we may need to submit the
da4a75d2
LE
1334 * request in more than one bio.
1335 *
1336 * Plain bio_alloc is good enough here, this is no DRBD internally
1337 * generated bio, but a bio allocated on behalf of the peer.
1338 */
45bb912b
LE
1339next_bio:
1340 bio = bio_alloc(GFP_NOIO, nr_pages);
1341 if (!bio) {
1342 dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
1343 goto fail;
1344 }
db830c46 1345 /* > peer_req->i.sector, unless this is the first bio */
45bb912b
LE
1346 bio->bi_sector = sector;
1347 bio->bi_bdev = mdev->ldev->backing_bdev;
45bb912b 1348 bio->bi_rw = rw;
db830c46 1349 bio->bi_private = peer_req;
fcefa62e 1350 bio->bi_end_io = drbd_peer_request_endio;
45bb912b
LE
1351
1352 bio->bi_next = bios;
1353 bios = bio;
1354 ++n_bios;
1355
1356 page_chain_for_each(page) {
1357 unsigned len = min_t(unsigned, ds, PAGE_SIZE);
1358 if (!bio_add_page(bio, page, len, 0)) {
10f6d992
LE
1359 /* A single page must always be possible!
1360 * But in case it fails anyways,
1361 * we deal with it, and complain (below). */
1362 if (bio->bi_vcnt == 0) {
1363 dev_err(DEV,
1364 "bio_add_page failed for len=%u, "
1365 "bi_vcnt=0 (bi_sector=%llu)\n",
1366 len, (unsigned long long)bio->bi_sector);
1367 err = -ENOSPC;
1368 goto fail;
1369 }
45bb912b
LE
1370 goto next_bio;
1371 }
1372 ds -= len;
1373 sector += len >> 9;
1374 --nr_pages;
1375 }
1376 D_ASSERT(page == NULL);
1377 D_ASSERT(ds == 0);
1378
db830c46 1379 atomic_set(&peer_req->pending_bios, n_bios);
45bb912b
LE
1380 do {
1381 bio = bios;
1382 bios = bios->bi_next;
1383 bio->bi_next = NULL;
1384
45bb912b 1385 drbd_generic_make_request(mdev, fault_type, bio);
45bb912b 1386 } while (bios);
45bb912b
LE
1387 return 0;
1388
1389fail:
1390 while (bios) {
1391 bio = bios;
1392 bios = bios->bi_next;
1393 bio_put(bio);
1394 }
10f6d992 1395 return err;
45bb912b
LE
1396}
1397
53840641 1398static void drbd_remove_epoch_entry_interval(struct drbd_conf *mdev,
db830c46 1399 struct drbd_peer_request *peer_req)
53840641 1400{
db830c46 1401 struct drbd_interval *i = &peer_req->i;
53840641
AG
1402
1403 drbd_remove_interval(&mdev->write_requests, i);
1404 drbd_clear_interval(i);
1405
6c852bec 1406 /* Wake up any processes waiting for this peer request to complete. */
53840641
AG
1407 if (i->waiting)
1408 wake_up(&mdev->misc_wait);
1409}
1410
77fede51
PR
1411void conn_wait_active_ee_empty(struct drbd_tconn *tconn)
1412{
1413 struct drbd_conf *mdev;
1414 int vnr;
1415
1416 rcu_read_lock();
1417 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
1418 kref_get(&mdev->kref);
1419 rcu_read_unlock();
1420 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1421 kref_put(&mdev->kref, &drbd_minor_destroy);
1422 rcu_read_lock();
1423 }
1424 rcu_read_unlock();
1425}
1426
4a76b161 1427static int receive_Barrier(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 1428{
2451fc3b 1429 int rv;
e658983a 1430 struct p_barrier *p = pi->data;
b411b363
PR
1431 struct drbd_epoch *epoch;
1432
9ed57dcb
LE
1433 /* FIXME these are unacked on connection,
1434 * not a specific (peer)device.
1435 */
12038a3a 1436 tconn->current_epoch->barrier_nr = p->barrier;
9ed57dcb 1437 tconn->current_epoch->tconn = tconn;
1e9dd291 1438 rv = drbd_may_finish_epoch(tconn, tconn->current_epoch, EV_GOT_BARRIER_NR);
b411b363
PR
1439
1440 /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1441 * the activity log, which means it would not be resynced in case the
1442 * R_PRIMARY crashes now.
1443 * Therefore we must send the barrier_ack after the barrier request was
1444 * completed. */
4b0007c0 1445 switch (tconn->write_ordering) {
b411b363
PR
1446 case WO_none:
1447 if (rv == FE_RECYCLED)
82bc0194 1448 return 0;
2451fc3b
PR
1449
1450 /* receiver context, in the writeout path of the other node.
1451 * avoid potential distributed deadlock */
1452 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1453 if (epoch)
1454 break;
1455 else
9ed57dcb 1456 conn_warn(tconn, "Allocation of an epoch failed, slowing down\n");
2451fc3b 1457 /* Fall through */
b411b363
PR
1458
1459 case WO_bdev_flush:
1460 case WO_drain_io:
77fede51 1461 conn_wait_active_ee_empty(tconn);
4b0007c0 1462 drbd_flush(tconn);
2451fc3b 1463
12038a3a 1464 if (atomic_read(&tconn->current_epoch->epoch_size)) {
2451fc3b
PR
1465 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1466 if (epoch)
1467 break;
b411b363
PR
1468 }
1469
82bc0194 1470 return 0;
2451fc3b 1471 default:
9ed57dcb 1472 conn_err(tconn, "Strangeness in tconn->write_ordering %d\n", tconn->write_ordering);
82bc0194 1473 return -EIO;
b411b363
PR
1474 }
1475
1476 epoch->flags = 0;
1477 atomic_set(&epoch->epoch_size, 0);
1478 atomic_set(&epoch->active, 0);
1479
12038a3a
PR
1480 spin_lock(&tconn->epoch_lock);
1481 if (atomic_read(&tconn->current_epoch->epoch_size)) {
1482 list_add(&epoch->list, &tconn->current_epoch->list);
1483 tconn->current_epoch = epoch;
1484 tconn->epochs++;
b411b363
PR
1485 } else {
1486 /* The current_epoch got recycled while we allocated this one... */
1487 kfree(epoch);
1488 }
12038a3a 1489 spin_unlock(&tconn->epoch_lock);
b411b363 1490
82bc0194 1491 return 0;
b411b363
PR
1492}
1493
1494/* used from receive_RSDataReply (recv_resync_read)
1495 * and from receive_Data */
f6ffca9f
AG
1496static struct drbd_peer_request *
1497read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector,
1498 int data_size) __must_hold(local)
b411b363 1499{
6666032a 1500 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
db830c46 1501 struct drbd_peer_request *peer_req;
b411b363 1502 struct page *page;
a5c31904 1503 int dgs, ds, err;
a0638456
PR
1504 void *dig_in = mdev->tconn->int_dig_in;
1505 void *dig_vv = mdev->tconn->int_dig_vv;
6b4388ac 1506 unsigned long *data;
b411b363 1507
88104ca4
AG
1508 dgs = 0;
1509 if (mdev->tconn->peer_integrity_tfm) {
1510 dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
9f5bdc33
AG
1511 /*
1512 * FIXME: Receive the incoming digest into the receive buffer
1513 * here, together with its struct p_data?
1514 */
a5c31904
AG
1515 err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs);
1516 if (err)
b411b363 1517 return NULL;
88104ca4 1518 data_size -= dgs;
b411b363
PR
1519 }
1520
841ce241
AG
1521 if (!expect(IS_ALIGNED(data_size, 512)))
1522 return NULL;
1523 if (!expect(data_size <= DRBD_MAX_BIO_SIZE))
1524 return NULL;
b411b363 1525
6666032a
LE
1526 /* even though we trust out peer,
1527 * we sometimes have to double check. */
1528 if (sector + (data_size>>9) > capacity) {
fdda6544
LE
1529 dev_err(DEV, "request from peer beyond end of local disk: "
1530 "capacity: %llus < sector: %llus + size: %u\n",
6666032a
LE
1531 (unsigned long long)capacity,
1532 (unsigned long long)sector, data_size);
1533 return NULL;
1534 }
1535
b411b363
PR
1536 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1537 * "criss-cross" setup, that might cause write-out on some other DRBD,
1538 * which in turn might block on the other node at this very place. */
0db55363 1539 peer_req = drbd_alloc_peer_req(mdev, id, sector, data_size, GFP_NOIO);
db830c46 1540 if (!peer_req)
b411b363 1541 return NULL;
45bb912b 1542
81a3537a
LE
1543 if (!data_size)
1544 return peer_req;
1545
b411b363 1546 ds = data_size;
db830c46 1547 page = peer_req->pages;
45bb912b
LE
1548 page_chain_for_each(page) {
1549 unsigned len = min_t(int, ds, PAGE_SIZE);
6b4388ac 1550 data = kmap(page);
a5c31904 1551 err = drbd_recv_all_warn(mdev->tconn, data, len);
0cf9d27e 1552 if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) {
6b4388ac
PR
1553 dev_err(DEV, "Fault injection: Corrupting data on receive\n");
1554 data[0] = data[0] ^ (unsigned long)-1;
1555 }
b411b363 1556 kunmap(page);
a5c31904 1557 if (err) {
3967deb1 1558 drbd_free_peer_req(mdev, peer_req);
b411b363
PR
1559 return NULL;
1560 }
a5c31904 1561 ds -= len;
b411b363
PR
1562 }
1563
1564 if (dgs) {
5b614abe 1565 drbd_csum_ee(mdev, mdev->tconn->peer_integrity_tfm, peer_req, dig_vv);
b411b363 1566 if (memcmp(dig_in, dig_vv, dgs)) {
470be44a
LE
1567 dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n",
1568 (unsigned long long)sector, data_size);
3967deb1 1569 drbd_free_peer_req(mdev, peer_req);
b411b363
PR
1570 return NULL;
1571 }
1572 }
1573 mdev->recv_cnt += data_size>>9;
db830c46 1574 return peer_req;
b411b363
PR
1575}
1576
1577/* drbd_drain_block() just takes a data block
1578 * out of the socket input buffer, and discards it.
1579 */
1580static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1581{
1582 struct page *page;
a5c31904 1583 int err = 0;
b411b363
PR
1584 void *data;
1585
c3470cde 1586 if (!data_size)
fc5be839 1587 return 0;
c3470cde 1588
c37c8ecf 1589 page = drbd_alloc_pages(mdev, 1, 1);
b411b363
PR
1590
1591 data = kmap(page);
1592 while (data_size) {
fc5be839
AG
1593 unsigned int len = min_t(int, data_size, PAGE_SIZE);
1594
a5c31904
AG
1595 err = drbd_recv_all_warn(mdev->tconn, data, len);
1596 if (err)
b411b363 1597 break;
a5c31904 1598 data_size -= len;
b411b363
PR
1599 }
1600 kunmap(page);
5cc287e0 1601 drbd_free_pages(mdev, page, 0);
fc5be839 1602 return err;
b411b363
PR
1603}
1604
1605static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1606 sector_t sector, int data_size)
1607{
1608 struct bio_vec *bvec;
1609 struct bio *bio;
a5c31904 1610 int dgs, err, i, expect;
a0638456
PR
1611 void *dig_in = mdev->tconn->int_dig_in;
1612 void *dig_vv = mdev->tconn->int_dig_vv;
b411b363 1613
88104ca4
AG
1614 dgs = 0;
1615 if (mdev->tconn->peer_integrity_tfm) {
1616 dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
a5c31904
AG
1617 err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs);
1618 if (err)
1619 return err;
88104ca4 1620 data_size -= dgs;
b411b363
PR
1621 }
1622
b411b363
PR
1623 /* optimistically update recv_cnt. if receiving fails below,
1624 * we disconnect anyways, and counters will be reset. */
1625 mdev->recv_cnt += data_size>>9;
1626
1627 bio = req->master_bio;
1628 D_ASSERT(sector == bio->bi_sector);
1629
1630 bio_for_each_segment(bvec, bio, i) {
a5c31904 1631 void *mapped = kmap(bvec->bv_page) + bvec->bv_offset;
b411b363 1632 expect = min_t(int, data_size, bvec->bv_len);
a5c31904 1633 err = drbd_recv_all_warn(mdev->tconn, mapped, expect);
b411b363 1634 kunmap(bvec->bv_page);
a5c31904
AG
1635 if (err)
1636 return err;
1637 data_size -= expect;
b411b363
PR
1638 }
1639
1640 if (dgs) {
5b614abe 1641 drbd_csum_bio(mdev, mdev->tconn->peer_integrity_tfm, bio, dig_vv);
b411b363
PR
1642 if (memcmp(dig_in, dig_vv, dgs)) {
1643 dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
28284cef 1644 return -EINVAL;
b411b363
PR
1645 }
1646 }
1647
1648 D_ASSERT(data_size == 0);
28284cef 1649 return 0;
b411b363
PR
1650}
1651
a990be46
AG
1652/*
1653 * e_end_resync_block() is called in asender context via
1654 * drbd_finish_peer_reqs().
1655 */
99920dc5 1656static int e_end_resync_block(struct drbd_work *w, int unused)
b411b363 1657{
8050e6d0
AG
1658 struct drbd_peer_request *peer_req =
1659 container_of(w, struct drbd_peer_request, w);
00d56944 1660 struct drbd_conf *mdev = w->mdev;
db830c46 1661 sector_t sector = peer_req->i.sector;
99920dc5 1662 int err;
b411b363 1663
db830c46 1664 D_ASSERT(drbd_interval_empty(&peer_req->i));
b411b363 1665
db830c46
AG
1666 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1667 drbd_set_in_sync(mdev, sector, peer_req->i.size);
99920dc5 1668 err = drbd_send_ack(mdev, P_RS_WRITE_ACK, peer_req);
b411b363
PR
1669 } else {
1670 /* Record failure to sync */
db830c46 1671 drbd_rs_failed_io(mdev, sector, peer_req->i.size);
b411b363 1672
99920dc5 1673 err = drbd_send_ack(mdev, P_NEG_ACK, peer_req);
b411b363
PR
1674 }
1675 dec_unacked(mdev);
1676
99920dc5 1677 return err;
b411b363
PR
1678}
1679
1680static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
1681{
db830c46 1682 struct drbd_peer_request *peer_req;
b411b363 1683
db830c46
AG
1684 peer_req = read_in_block(mdev, ID_SYNCER, sector, data_size);
1685 if (!peer_req)
45bb912b 1686 goto fail;
b411b363
PR
1687
1688 dec_rs_pending(mdev);
1689
b411b363
PR
1690 inc_unacked(mdev);
1691 /* corresponding dec_unacked() in e_end_resync_block()
1692 * respective _drbd_clear_done_ee */
1693
db830c46 1694 peer_req->w.cb = e_end_resync_block;
45bb912b 1695
87eeee41 1696 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 1697 list_add(&peer_req->w.list, &mdev->sync_ee);
87eeee41 1698 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 1699
0f0601f4 1700 atomic_add(data_size >> 9, &mdev->rs_sect_ev);
fbe29dec 1701 if (drbd_submit_peer_request(mdev, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
e1c1b0fc 1702 return 0;
b411b363 1703
10f6d992
LE
1704 /* don't care for the reason here */
1705 dev_err(DEV, "submit failed, triggering re-connect\n");
87eeee41 1706 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 1707 list_del(&peer_req->w.list);
87eeee41 1708 spin_unlock_irq(&mdev->tconn->req_lock);
22cc37a9 1709
3967deb1 1710 drbd_free_peer_req(mdev, peer_req);
45bb912b
LE
1711fail:
1712 put_ldev(mdev);
e1c1b0fc 1713 return -EIO;
b411b363
PR
1714}
1715
668eebc6 1716static struct drbd_request *
bc9c5c41
AG
1717find_request(struct drbd_conf *mdev, struct rb_root *root, u64 id,
1718 sector_t sector, bool missing_ok, const char *func)
51624585 1719{
51624585
AG
1720 struct drbd_request *req;
1721
bc9c5c41
AG
1722 /* Request object according to our peer */
1723 req = (struct drbd_request *)(unsigned long)id;
5e472264 1724 if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
668eebc6 1725 return req;
c3afd8f5 1726 if (!missing_ok) {
5af172ed 1727 dev_err(DEV, "%s: failed to find request 0x%lx, sector %llus\n", func,
c3afd8f5
AG
1728 (unsigned long)id, (unsigned long long)sector);
1729 }
51624585
AG
1730 return NULL;
1731}
1732
4a76b161 1733static int receive_DataReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 1734{
4a76b161 1735 struct drbd_conf *mdev;
b411b363
PR
1736 struct drbd_request *req;
1737 sector_t sector;
82bc0194 1738 int err;
e658983a 1739 struct p_data *p = pi->data;
4a76b161
AG
1740
1741 mdev = vnr_to_mdev(tconn, pi->vnr);
1742 if (!mdev)
1743 return -EIO;
b411b363
PR
1744
1745 sector = be64_to_cpu(p->sector);
1746
87eeee41 1747 spin_lock_irq(&mdev->tconn->req_lock);
bc9c5c41 1748 req = find_request(mdev, &mdev->read_requests, p->block_id, sector, false, __func__);
87eeee41 1749 spin_unlock_irq(&mdev->tconn->req_lock);
c3afd8f5 1750 if (unlikely(!req))
82bc0194 1751 return -EIO;
b411b363 1752
24c4830c 1753 /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
b411b363
PR
1754 * special casing it there for the various failure cases.
1755 * still no race with drbd_fail_pending_reads */
e2857216 1756 err = recv_dless_read(mdev, req, sector, pi->size);
82bc0194 1757 if (!err)
8554df1c 1758 req_mod(req, DATA_RECEIVED);
b411b363
PR
1759 /* else: nothing. handled from drbd_disconnect...
1760 * I don't think we may complete this just yet
1761 * in case we are "on-disconnect: freeze" */
1762
82bc0194 1763 return err;
b411b363
PR
1764}
1765
4a76b161 1766static int receive_RSDataReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 1767{
4a76b161 1768 struct drbd_conf *mdev;
b411b363 1769 sector_t sector;
82bc0194 1770 int err;
e658983a 1771 struct p_data *p = pi->data;
4a76b161
AG
1772
1773 mdev = vnr_to_mdev(tconn, pi->vnr);
1774 if (!mdev)
1775 return -EIO;
b411b363
PR
1776
1777 sector = be64_to_cpu(p->sector);
1778 D_ASSERT(p->block_id == ID_SYNCER);
1779
1780 if (get_ldev(mdev)) {
1781 /* data is submitted to disk within recv_resync_read.
1782 * corresponding put_ldev done below on error,
fcefa62e 1783 * or in drbd_peer_request_endio. */
e2857216 1784 err = recv_resync_read(mdev, sector, pi->size);
b411b363
PR
1785 } else {
1786 if (__ratelimit(&drbd_ratelimit_state))
1787 dev_err(DEV, "Can not write resync data to local disk.\n");
1788
e2857216 1789 err = drbd_drain_block(mdev, pi->size);
b411b363 1790
e2857216 1791 drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size);
b411b363
PR
1792 }
1793
e2857216 1794 atomic_add(pi->size >> 9, &mdev->rs_sect_in);
778f271d 1795
82bc0194 1796 return err;
b411b363
PR
1797}
1798
7be8da07
AG
1799static void restart_conflicting_writes(struct drbd_conf *mdev,
1800 sector_t sector, int size)
1801{
1802 struct drbd_interval *i;
1803 struct drbd_request *req;
1804
1805 drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
1806 if (!i->local)
1807 continue;
1808 req = container_of(i, struct drbd_request, i);
1809 if (req->rq_state & RQ_LOCAL_PENDING ||
1810 !(req->rq_state & RQ_POSTPONED))
1811 continue;
2312f0b3
LE
1812 /* as it is RQ_POSTPONED, this will cause it to
1813 * be queued on the retry workqueue. */
1814 __req_mod(req, DISCARD_WRITE, NULL);
7be8da07
AG
1815 }
1816}
1817
a990be46
AG
1818/*
1819 * e_end_block() is called in asender context via drbd_finish_peer_reqs().
b411b363 1820 */
99920dc5 1821static int e_end_block(struct drbd_work *w, int cancel)
b411b363 1822{
8050e6d0
AG
1823 struct drbd_peer_request *peer_req =
1824 container_of(w, struct drbd_peer_request, w);
00d56944 1825 struct drbd_conf *mdev = w->mdev;
db830c46 1826 sector_t sector = peer_req->i.sector;
99920dc5 1827 int err = 0, pcmd;
b411b363 1828
303d1448 1829 if (peer_req->flags & EE_SEND_WRITE_ACK) {
db830c46 1830 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
b411b363
PR
1831 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1832 mdev->state.conn <= C_PAUSED_SYNC_T &&
db830c46 1833 peer_req->flags & EE_MAY_SET_IN_SYNC) ?
b411b363 1834 P_RS_WRITE_ACK : P_WRITE_ACK;
99920dc5 1835 err = drbd_send_ack(mdev, pcmd, peer_req);
b411b363 1836 if (pcmd == P_RS_WRITE_ACK)
db830c46 1837 drbd_set_in_sync(mdev, sector, peer_req->i.size);
b411b363 1838 } else {
99920dc5 1839 err = drbd_send_ack(mdev, P_NEG_ACK, peer_req);
b411b363
PR
 1840 /* we expect it to be marked out of sync anyway...
1841 * maybe assert this? */
1842 }
1843 dec_unacked(mdev);
1844 }
1845 /* we delete from the conflict detection hash _after_ we sent out the
1846 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
302bdeae 1847 if (peer_req->flags & EE_IN_INTERVAL_TREE) {
87eeee41 1848 spin_lock_irq(&mdev->tconn->req_lock);
db830c46
AG
1849 D_ASSERT(!drbd_interval_empty(&peer_req->i));
1850 drbd_remove_epoch_entry_interval(mdev, peer_req);
7be8da07
AG
1851 if (peer_req->flags & EE_RESTART_REQUESTS)
1852 restart_conflicting_writes(mdev, sector, peer_req->i.size);
87eeee41 1853 spin_unlock_irq(&mdev->tconn->req_lock);
bb3bfe96 1854 } else
db830c46 1855 D_ASSERT(drbd_interval_empty(&peer_req->i));
b411b363 1856
1e9dd291 1857 drbd_may_finish_epoch(mdev->tconn, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
b411b363 1858
99920dc5 1859 return err;
b411b363
PR
1860}
1861
7be8da07 1862static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
b411b363 1863{
7be8da07 1864 struct drbd_conf *mdev = w->mdev;
8050e6d0
AG
1865 struct drbd_peer_request *peer_req =
1866 container_of(w, struct drbd_peer_request, w);
99920dc5 1867 int err;
b411b363 1868
99920dc5 1869 err = drbd_send_ack(mdev, ack, peer_req);
b411b363
PR
1870 dec_unacked(mdev);
1871
99920dc5 1872 return err;
b411b363
PR
1873}
1874
99920dc5 1875static int e_send_discard_write(struct drbd_work *w, int unused)
7be8da07
AG
1876{
1877 return e_send_ack(w, P_DISCARD_WRITE);
1878}
1879
99920dc5 1880static int e_send_retry_write(struct drbd_work *w, int unused)
7be8da07
AG
1881{
1882 struct drbd_tconn *tconn = w->mdev->tconn;
1883
1884 return e_send_ack(w, tconn->agreed_pro_version >= 100 ?
1885 P_RETRY_WRITE : P_DISCARD_WRITE);
1886}
1887
3e394da1
AG
1888static bool seq_greater(u32 a, u32 b)
1889{
1890 /*
1891 * We assume 32-bit wrap-around here.
1892 * For 24-bit wrap-around, we would have to shift:
1893 * a <<= 8; b <<= 8;
1894 */
1895 return (s32)a - (s32)b > 0;
1896}
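/*
 * Illustrative sketch only, not part of the driver: a hypothetical self-test
 * of the wrap-around comparison above.  The sequence numbers are made up and
 * chosen to straddle the 32-bit wrap point.
 */
static void __maybe_unused seq_greater_example(void)
{
	u32 before_wrap = 0xfffffffdU;	/* last seq seen shortly before the wrap */
	u32 after_wrap = 2;		/* next seq shortly after the wrap */

	/* 2 - 0xfffffffd == 5 as u32, and (s32)5 > 0, so the newer number
	 * still compares as "greater" despite being numerically smaller */
	BUG_ON(!seq_greater(after_wrap, before_wrap));
	BUG_ON(seq_greater(before_wrap, after_wrap));
}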
1897
1898static u32 seq_max(u32 a, u32 b)
1899{
1900 return seq_greater(a, b) ? a : b;
1901}
1902
7be8da07
AG
1903static bool need_peer_seq(struct drbd_conf *mdev)
1904{
1905 struct drbd_tconn *tconn = mdev->tconn;
302bdeae 1906 int tp;
7be8da07
AG
1907
1908 /*
1909 * We only need to keep track of the last packet_seq number of our peer
1910 * if we are in dual-primary mode and we have the discard flag set; see
1911 * handle_write_conflicts().
1912 */
302bdeae
PR
1913
1914 rcu_read_lock();
1915 tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
1916 rcu_read_unlock();
1917
1918 return tp && test_bit(DISCARD_CONCURRENT, &tconn->flags);
7be8da07
AG
1919}
1920
43ae077d 1921static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq)
3e394da1 1922{
3c13b680 1923 unsigned int newest_peer_seq;
3e394da1 1924
7be8da07
AG
1925 if (need_peer_seq(mdev)) {
1926 spin_lock(&mdev->peer_seq_lock);
3c13b680
LE
1927 newest_peer_seq = seq_max(mdev->peer_seq, peer_seq);
1928 mdev->peer_seq = newest_peer_seq;
7be8da07 1929 spin_unlock(&mdev->peer_seq_lock);
3c13b680
LE
1930 /* wake up only if we actually changed mdev->peer_seq */
1931 if (peer_seq == newest_peer_seq)
7be8da07
AG
1932 wake_up(&mdev->seq_wait);
1933 }
3e394da1
AG
1934}
1935
d93f6302
LE
1936static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
1937{
1938 return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
1939}
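/*
 * Illustrative sketch only, not part of the driver: overlaps() mixes units
 * on purpose -- s1/s2 are 512-byte sector numbers, l1/l2 are byte counts.
 * The request sizes below are hypothetical.
 */
static void __maybe_unused overlaps_example(void)
{
	/* 4096 bytes starting at sector 8 cover sectors [8, 16) */
	BUG_ON(!overlaps(8, 4096, 15, 512));	/* sector 15 lies inside      */
	BUG_ON(overlaps(8, 4096, 16, 512));	/* sector 16 is only adjacent */
}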
1940
1941/* maybe change sync_ee into interval trees as well? */
3ea35df8 1942static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_peer_request *peer_req)
d93f6302
LE
1943{
1944 struct drbd_peer_request *rs_req;
1945 bool rv = 0;
1946
1947 spin_lock_irq(&mdev->tconn->req_lock);
1948 list_for_each_entry(rs_req, &mdev->sync_ee, w.list) {
1949 if (overlaps(peer_req->i.sector, peer_req->i.size,
1950 rs_req->i.sector, rs_req->i.size)) {
1951 rv = 1;
1952 break;
1953 }
1954 }
1955 spin_unlock_irq(&mdev->tconn->req_lock);
1956
d93f6302
LE
1957 return rv;
1958}
1959
b411b363
PR
1960/* Called from receive_Data.
1961 * Synchronize packets on sock with packets on msock.
1962 *
1963 * This is here so even when a P_DATA packet traveling via sock overtook an Ack
1964 * packet traveling on msock, they are still processed in the order they have
1965 * been sent.
1966 *
1967 * Note: we don't care for Ack packets overtaking P_DATA packets.
1968 *
1969 * In case packet_seq is larger than mdev->peer_seq number, there are
1970 * outstanding packets on the msock. We wait for them to arrive.
1971 * In case we are the logically next packet, we update mdev->peer_seq
1972 * ourselves. Correctly handles 32bit wrap around.
1973 *
 1974 * Assume we have a 10 GBit connection, that is about 1<<30 bytes per second,
 1975 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
 1976 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
 1977 * 1<<11 == 2048 seconds aka ages for the 32bit wrap around...
1978 *
1979 * returns 0 if we may process the packet,
1980 * -ERESTARTSYS if we were interrupted (by disconnect signal). */
7be8da07 1981static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_seq)
b411b363
PR
1982{
1983 DEFINE_WAIT(wait);
b411b363 1984 long timeout;
7be8da07
AG
1985 int ret;
1986
1987 if (!need_peer_seq(mdev))
1988 return 0;
1989
b411b363
PR
1990 spin_lock(&mdev->peer_seq_lock);
1991 for (;;) {
7be8da07
AG
1992 if (!seq_greater(peer_seq - 1, mdev->peer_seq)) {
1993 mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq);
1994 ret = 0;
b411b363 1995 break;
7be8da07 1996 }
b411b363
PR
1997 if (signal_pending(current)) {
1998 ret = -ERESTARTSYS;
1999 break;
2000 }
7be8da07 2001 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
b411b363 2002 spin_unlock(&mdev->peer_seq_lock);
44ed167d
PR
2003 rcu_read_lock();
2004 timeout = rcu_dereference(mdev->tconn->net_conf)->ping_timeo*HZ/10;
2005 rcu_read_unlock();
71b1c1eb 2006 timeout = schedule_timeout(timeout);
b411b363 2007 spin_lock(&mdev->peer_seq_lock);
7be8da07 2008 if (!timeout) {
b411b363 2009 ret = -ETIMEDOUT;
71b1c1eb 2010 dev_err(DEV, "Timed out waiting for missing ack packets; disconnecting\n");
b411b363
PR
2011 break;
2012 }
2013 }
b411b363 2014 spin_unlock(&mdev->peer_seq_lock);
7be8da07 2015 finish_wait(&mdev->seq_wait, &wait);
b411b363
PR
2016 return ret;
2017}
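/*
 * Illustrative sketch only, not part of the driver: the wait loop above
 * accepts a data packet as soon as its sequence number is at most one ahead
 * of the last one we have seen.  The numbers below are hypothetical.
 */
static void __maybe_unused peer_seq_wait_example(void)
{
	u32 last_seen_peer_seq = 41;

	/* seq 42 is the logically next packet: processed immediately */
	BUG_ON(seq_greater(42 - 1, last_seen_peer_seq));
	/* seq 44 would wait until the acks for 42 and 43 arrived on msock */
	BUG_ON(!seq_greater(44 - 1, last_seen_peer_seq));
}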
2018
688593c5
LE
 2019/* see also bio_flags_to_wire():
 2020 * DRBD_REQ_* is used because we need to semantically map the flags to data
 2021 * packet flags and back; we may replicate to other kernel versions. */
2022static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
76d2e7ec 2023{
688593c5
LE
2024 return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
2025 (dpf & DP_FUA ? REQ_FUA : 0) |
2026 (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
2027 (dpf & DP_DISCARD ? REQ_DISCARD : 0);
76d2e7ec
PR
2028}
2029
7be8da07
AG
2030static void fail_postponed_requests(struct drbd_conf *mdev, sector_t sector,
2031 unsigned int size)
2032{
2033 struct drbd_interval *i;
2034
2035 repeat:
2036 drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
2037 struct drbd_request *req;
2038 struct bio_and_error m;
2039
2040 if (!i->local)
2041 continue;
2042 req = container_of(i, struct drbd_request, i);
2043 if (!(req->rq_state & RQ_POSTPONED))
2044 continue;
2045 req->rq_state &= ~RQ_POSTPONED;
2046 __req_mod(req, NEG_ACKED, &m);
2047 spin_unlock_irq(&mdev->tconn->req_lock);
2048 if (m.bio)
2049 complete_master_bio(mdev, &m);
2050 spin_lock_irq(&mdev->tconn->req_lock);
2051 goto repeat;
2052 }
2053}
2054
2055static int handle_write_conflicts(struct drbd_conf *mdev,
2056 struct drbd_peer_request *peer_req)
2057{
2058 struct drbd_tconn *tconn = mdev->tconn;
2059 bool resolve_conflicts = test_bit(DISCARD_CONCURRENT, &tconn->flags);
2060 sector_t sector = peer_req->i.sector;
2061 const unsigned int size = peer_req->i.size;
2062 struct drbd_interval *i;
2063 bool equal;
2064 int err;
2065
2066 /*
2067 * Inserting the peer request into the write_requests tree will prevent
2068 * new conflicting local requests from being added.
2069 */
2070 drbd_insert_interval(&mdev->write_requests, &peer_req->i);
2071
2072 repeat:
2073 drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
2074 if (i == &peer_req->i)
2075 continue;
2076
2077 if (!i->local) {
2078 /*
2079 * Our peer has sent a conflicting remote request; this
2080 * should not happen in a two-node setup. Wait for the
2081 * earlier peer request to complete.
2082 */
2083 err = drbd_wait_misc(mdev, i);
2084 if (err)
2085 goto out;
2086 goto repeat;
2087 }
2088
2089 equal = i->sector == sector && i->size == size;
2090 if (resolve_conflicts) {
2091 /*
2092 * If the peer request is fully contained within the
2093 * overlapping request, it can be discarded; otherwise,
2094 * it will be retried once all overlapping requests
2095 * have completed.
2096 */
2097 bool discard = i->sector <= sector && i->sector +
2098 (i->size >> 9) >= sector + (size >> 9);
2099
2100 if (!equal)
2101 dev_alert(DEV, "Concurrent writes detected: "
2102 "local=%llus +%u, remote=%llus +%u, "
2103 "assuming %s came first\n",
2104 (unsigned long long)i->sector, i->size,
2105 (unsigned long long)sector, size,
2106 discard ? "local" : "remote");
2107
2108 inc_unacked(mdev);
2109 peer_req->w.cb = discard ? e_send_discard_write :
2110 e_send_retry_write;
2111 list_add_tail(&peer_req->w.list, &mdev->done_ee);
2112 wake_asender(mdev->tconn);
2113
2114 err = -ENOENT;
2115 goto out;
2116 } else {
2117 struct drbd_request *req =
2118 container_of(i, struct drbd_request, i);
2119
2120 if (!equal)
2121 dev_alert(DEV, "Concurrent writes detected: "
2122 "local=%llus +%u, remote=%llus +%u\n",
2123 (unsigned long long)i->sector, i->size,
2124 (unsigned long long)sector, size);
2125
2126 if (req->rq_state & RQ_LOCAL_PENDING ||
2127 !(req->rq_state & RQ_POSTPONED)) {
2128 /*
2129 * Wait for the node with the discard flag to
2130 * decide if this request will be discarded or
2131 * retried. Requests that are discarded will
2132 * disappear from the write_requests tree.
2133 *
2134 * In addition, wait for the conflicting
2135 * request to finish locally before submitting
2136 * the conflicting peer request.
2137 */
2138 err = drbd_wait_misc(mdev, &req->i);
2139 if (err) {
2140 _conn_request_state(mdev->tconn,
2141 NS(conn, C_TIMEOUT),
2142 CS_HARD);
2143 fail_postponed_requests(mdev, sector, size);
2144 goto out;
2145 }
2146 goto repeat;
2147 }
2148 /*
2149 * Remember to restart the conflicting requests after
2150 * the new peer request has completed.
2151 */
2152 peer_req->flags |= EE_RESTART_REQUESTS;
2153 }
2154 }
2155 err = 0;
2156
2157 out:
2158 if (err)
2159 drbd_remove_epoch_entry_interval(mdev, peer_req);
2160 return err;
2161}
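/*
 * Illustrative sketch only, not part of the driver: the "discard" decision
 * in handle_write_conflicts() is a plain containment test in sector units.
 * The request geometry below is hypothetical.
 */
static bool __maybe_unused peer_req_fully_contained_example(void)
{
	sector_t i_sector = 0;
	unsigned int i_size = 8192;	/* local request: sectors [0, 16) */
	sector_t sector = 4;
	unsigned int size = 2048;	/* peer request:  sectors [4, 8)  */

	/* same test as above: [4, 8) lies within [0, 16), so the peer
	 * request would be discarded; a peer request covering [12, 20)
	 * would merely overlap and would be retried instead */
	return i_sector <= sector &&
	       i_sector + (i_size >> 9) >= sector + (size >> 9);
}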
2162
b411b363 2163/* mirrored write */
4a76b161 2164static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 2165{
4a76b161 2166 struct drbd_conf *mdev;
b411b363 2167 sector_t sector;
db830c46 2168 struct drbd_peer_request *peer_req;
e658983a 2169 struct p_data *p = pi->data;
7be8da07 2170 u32 peer_seq = be32_to_cpu(p->seq_num);
b411b363
PR
2171 int rw = WRITE;
2172 u32 dp_flags;
302bdeae 2173 int err, tp;
b411b363 2174
4a76b161
AG
2175 mdev = vnr_to_mdev(tconn, pi->vnr);
2176 if (!mdev)
2177 return -EIO;
2178
7be8da07 2179 if (!get_ldev(mdev)) {
82bc0194
AG
2180 int err2;
2181
7be8da07 2182 err = wait_for_and_update_peer_seq(mdev, peer_seq);
e2857216 2183 drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size);
12038a3a 2184 atomic_inc(&tconn->current_epoch->epoch_size);
e2857216 2185 err2 = drbd_drain_block(mdev, pi->size);
82bc0194
AG
2186 if (!err)
2187 err = err2;
2188 return err;
b411b363
PR
2189 }
2190
fcefa62e
AG
2191 /*
2192 * Corresponding put_ldev done either below (on various errors), or in
2193 * drbd_peer_request_endio, if we successfully submit the data at the
2194 * end of this function.
2195 */
b411b363
PR
2196
2197 sector = be64_to_cpu(p->sector);
e2857216 2198 peer_req = read_in_block(mdev, p->block_id, sector, pi->size);
db830c46 2199 if (!peer_req) {
b411b363 2200 put_ldev(mdev);
82bc0194 2201 return -EIO;
b411b363
PR
2202 }
2203
db830c46 2204 peer_req->w.cb = e_end_block;
b411b363 2205
688593c5
LE
2206 dp_flags = be32_to_cpu(p->dp_flags);
2207 rw |= wire_flags_to_bio(mdev, dp_flags);
81a3537a
LE
2208 if (peer_req->pages == NULL) {
2209 D_ASSERT(peer_req->i.size == 0);
2210 D_ASSERT(dp_flags & DP_FLUSH);
2211 }
688593c5
LE
2212
2213 if (dp_flags & DP_MAY_SET_IN_SYNC)
db830c46 2214 peer_req->flags |= EE_MAY_SET_IN_SYNC;
688593c5 2215
12038a3a
PR
2216 spin_lock(&tconn->epoch_lock);
2217 peer_req->epoch = tconn->current_epoch;
db830c46
AG
2218 atomic_inc(&peer_req->epoch->epoch_size);
2219 atomic_inc(&peer_req->epoch->active);
12038a3a 2220 spin_unlock(&tconn->epoch_lock);
b411b363 2221
302bdeae
PR
2222 rcu_read_lock();
2223 tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
2224 rcu_read_unlock();
2225 if (tp) {
2226 peer_req->flags |= EE_IN_INTERVAL_TREE;
7be8da07
AG
2227 err = wait_for_and_update_peer_seq(mdev, peer_seq);
2228 if (err)
b411b363 2229 goto out_interrupted;
87eeee41 2230 spin_lock_irq(&mdev->tconn->req_lock);
7be8da07
AG
2231 err = handle_write_conflicts(mdev, peer_req);
2232 if (err) {
2233 spin_unlock_irq(&mdev->tconn->req_lock);
2234 if (err == -ENOENT) {
b411b363 2235 put_ldev(mdev);
82bc0194 2236 return 0;
b411b363 2237 }
7be8da07 2238 goto out_interrupted;
b411b363 2239 }
7be8da07
AG
2240 } else
2241 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 2242 list_add(&peer_req->w.list, &mdev->active_ee);
87eeee41 2243 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 2244
d93f6302 2245 if (mdev->state.conn == C_SYNC_TARGET)
3ea35df8 2246 wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, peer_req));
d93f6302 2247
303d1448 2248 if (mdev->tconn->agreed_pro_version < 100) {
44ed167d
PR
2249 rcu_read_lock();
2250 switch (rcu_dereference(mdev->tconn->net_conf)->wire_protocol) {
303d1448
PR
2251 case DRBD_PROT_C:
2252 dp_flags |= DP_SEND_WRITE_ACK;
2253 break;
2254 case DRBD_PROT_B:
2255 dp_flags |= DP_SEND_RECEIVE_ACK;
2256 break;
2257 }
44ed167d 2258 rcu_read_unlock();
303d1448
PR
2259 }
2260
2261 if (dp_flags & DP_SEND_WRITE_ACK) {
2262 peer_req->flags |= EE_SEND_WRITE_ACK;
b411b363
PR
2263 inc_unacked(mdev);
2264 /* corresponding dec_unacked() in e_end_block()
 2265 * or in _drbd_clear_done_ee, respectively */
303d1448
PR
2266 }
2267
2268 if (dp_flags & DP_SEND_RECEIVE_ACK) {
b411b363
PR
2269 /* I really don't like it that the receiver thread
 2270 * sends on the msock, but anyway */
db830c46 2271 drbd_send_ack(mdev, P_RECV_ACK, peer_req);
b411b363
PR
2272 }
2273
6719fb03 2274 if (mdev->state.pdsk < D_INCONSISTENT) {
b411b363 2275 /* In case we have the only disk of the cluster, */
db830c46
AG
2276 drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size);
2277 peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
2278 peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
181286ad 2279 drbd_al_begin_io(mdev, &peer_req->i);
b411b363
PR
2280 }
2281
82bc0194
AG
2282 err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR);
2283 if (!err)
2284 return 0;
b411b363 2285
10f6d992
LE
2286 /* don't care for the reason here */
2287 dev_err(DEV, "submit failed, triggering re-connect\n");
87eeee41 2288 spin_lock_irq(&mdev->tconn->req_lock);
db830c46
AG
2289 list_del(&peer_req->w.list);
2290 drbd_remove_epoch_entry_interval(mdev, peer_req);
87eeee41 2291 spin_unlock_irq(&mdev->tconn->req_lock);
db830c46 2292 if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
181286ad 2293 drbd_al_complete_io(mdev, &peer_req->i);
22cc37a9 2294
b411b363 2295out_interrupted:
1e9dd291 2296 drbd_may_finish_epoch(tconn, peer_req->epoch, EV_PUT + EV_CLEANUP);
b411b363 2297 put_ldev(mdev);
3967deb1 2298 drbd_free_peer_req(mdev, peer_req);
82bc0194 2299 return err;
b411b363
PR
2300}
2301
0f0601f4
LE
2302/* We may throttle resync, if the lower device seems to be busy,
2303 * and current sync rate is above c_min_rate.
2304 *
2305 * To decide whether or not the lower device is busy, we use a scheme similar
 2306 * to MD RAID is_mddev_idle(): if the partition stats reveal a "significant"
 2307 * amount (more than 64 sectors) of activity that we cannot account for with our
 2308 * own resync activity, the device obviously is "busy".
2309 *
2310 * The current sync rate used here uses only the most recent two step marks,
2311 * to have a short time average so we can react faster.
2312 */
e3555d85 2313int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector)
0f0601f4
LE
2314{
2315 struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
2316 unsigned long db, dt, dbdt;
e3555d85 2317 struct lc_element *tmp;
0f0601f4
LE
2318 int curr_events;
2319 int throttle = 0;
daeda1cc
PR
2320 unsigned int c_min_rate;
2321
2322 rcu_read_lock();
2323 c_min_rate = rcu_dereference(mdev->ldev->disk_conf)->c_min_rate;
2324 rcu_read_unlock();
0f0601f4
LE
2325
2326 /* feature disabled? */
daeda1cc 2327 if (c_min_rate == 0)
0f0601f4
LE
2328 return 0;
2329
e3555d85
PR
2330 spin_lock_irq(&mdev->al_lock);
2331 tmp = lc_find(mdev->resync, BM_SECT_TO_EXT(sector));
2332 if (tmp) {
2333 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
2334 if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
2335 spin_unlock_irq(&mdev->al_lock);
2336 return 0;
2337 }
2338 /* Do not slow down if app IO is already waiting for this extent */
2339 }
2340 spin_unlock_irq(&mdev->al_lock);
2341
0f0601f4
LE
2342 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
2343 (int)part_stat_read(&disk->part0, sectors[1]) -
2344 atomic_read(&mdev->rs_sect_ev);
e3555d85 2345
0f0601f4
LE
2346 if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
2347 unsigned long rs_left;
2348 int i;
2349
2350 mdev->rs_last_events = curr_events;
2351
2352 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2353 * approx. */
2649f080
LE
2354 i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
2355
2356 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
2357 rs_left = mdev->ov_left;
2358 else
2359 rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
0f0601f4
LE
2360
2361 dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
2362 if (!dt)
2363 dt++;
2364 db = mdev->rs_mark_left[i] - rs_left;
2365 dbdt = Bit2KB(db/dt);
2366
daeda1cc 2367 if (dbdt > c_min_rate)
0f0601f4
LE
2368 throttle = 1;
2369 }
2370 return throttle;
2371}
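/*
 * Illustrative sketch only, not part of the driver, with made-up numbers:
 * how the short-term resync rate above relates to c_min_rate.  It assumes
 * the usual 4 KiB of replicated data per bitmap bit.
 */
static unsigned long __maybe_unused resync_rate_example(void)
{
	unsigned long dt = 6;		/* seconds between the two sync marks  */
	unsigned long db = 15000;	/* bitmap bits cleared in that interval */

	/* same computation as in drbd_rs_should_slow_down(): under the
	 * assumption above this comes out to roughly 10000 KiB/s; only if
	 * this already exceeds the configured c_min_rate may unexplained
	 * backing-device activity throttle the resync */
	return Bit2KB(db / dt);
}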
2372
2373
4a76b161 2374static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 2375{
4a76b161 2376 struct drbd_conf *mdev;
b411b363 2377 sector_t sector;
4a76b161 2378 sector_t capacity;
db830c46 2379 struct drbd_peer_request *peer_req;
b411b363 2380 struct digest_info *di = NULL;
b18b37be 2381 int size, verb;
b411b363 2382 unsigned int fault_type;
e658983a 2383 struct p_block_req *p = pi->data;
4a76b161
AG
2384
2385 mdev = vnr_to_mdev(tconn, pi->vnr);
2386 if (!mdev)
2387 return -EIO;
2388 capacity = drbd_get_capacity(mdev->this_bdev);
b411b363
PR
2389
2390 sector = be64_to_cpu(p->sector);
2391 size = be32_to_cpu(p->blksize);
2392
c670a398 2393 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
b411b363
PR
2394 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2395 (unsigned long long)sector, size);
82bc0194 2396 return -EINVAL;
b411b363
PR
2397 }
2398 if (sector + (size>>9) > capacity) {
2399 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2400 (unsigned long long)sector, size);
82bc0194 2401 return -EINVAL;
b411b363
PR
2402 }
2403
2404 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
b18b37be 2405 verb = 1;
e2857216 2406 switch (pi->cmd) {
b18b37be
PR
2407 case P_DATA_REQUEST:
2408 drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
2409 break;
2410 case P_RS_DATA_REQUEST:
2411 case P_CSUM_RS_REQUEST:
2412 case P_OV_REQUEST:
2413 drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
2414 break;
2415 case P_OV_REPLY:
2416 verb = 0;
2417 dec_rs_pending(mdev);
2418 drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
2419 break;
2420 default:
49ba9b1b 2421 BUG();
b18b37be
PR
2422 }
2423 if (verb && __ratelimit(&drbd_ratelimit_state))
b411b363
PR
2424 dev_err(DEV, "Can not satisfy peer's read request, "
2425 "no local data.\n");
b18b37be 2426
a821cc4a 2427 /* drain possible payload */
e2857216 2428 return drbd_drain_block(mdev, pi->size);
b411b363
PR
2429 }
2430
2431 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2432 * "criss-cross" setup, that might cause write-out on some other DRBD,
2433 * which in turn might block on the other node at this very place. */
0db55363 2434 peer_req = drbd_alloc_peer_req(mdev, p->block_id, sector, size, GFP_NOIO);
db830c46 2435 if (!peer_req) {
b411b363 2436 put_ldev(mdev);
82bc0194 2437 return -ENOMEM;
b411b363
PR
2438 }
2439
e2857216 2440 switch (pi->cmd) {
b411b363 2441 case P_DATA_REQUEST:
db830c46 2442 peer_req->w.cb = w_e_end_data_req;
b411b363 2443 fault_type = DRBD_FAULT_DT_RD;
80a40e43
LE
2444 /* application IO, don't drbd_rs_begin_io */
2445 goto submit;
2446
b411b363 2447 case P_RS_DATA_REQUEST:
db830c46 2448 peer_req->w.cb = w_e_end_rsdata_req;
b411b363 2449 fault_type = DRBD_FAULT_RS_RD;
5f9915bb
LE
2450 /* used in the sector offset progress display */
2451 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
b411b363
PR
2452 break;
2453
2454 case P_OV_REPLY:
2455 case P_CSUM_RS_REQUEST:
2456 fault_type = DRBD_FAULT_RS_RD;
e2857216 2457 di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
b411b363
PR
2458 if (!di)
2459 goto out_free_e;
2460
e2857216 2461 di->digest_size = pi->size;
b411b363
PR
2462 di->digest = (((char *)di)+sizeof(struct digest_info));
2463
db830c46
AG
2464 peer_req->digest = di;
2465 peer_req->flags |= EE_HAS_DIGEST;
c36c3ced 2466
e2857216 2467 if (drbd_recv_all(mdev->tconn, di->digest, pi->size))
b411b363
PR
2468 goto out_free_e;
2469
e2857216 2470 if (pi->cmd == P_CSUM_RS_REQUEST) {
31890f4a 2471 D_ASSERT(mdev->tconn->agreed_pro_version >= 89);
db830c46 2472 peer_req->w.cb = w_e_end_csum_rs_req;
5f9915bb
LE
2473 /* used in the sector offset progress display */
2474 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
e2857216 2475 } else if (pi->cmd == P_OV_REPLY) {
2649f080
LE
2476 /* track progress, we may need to throttle */
2477 atomic_add(size >> 9, &mdev->rs_sect_in);
db830c46 2478 peer_req->w.cb = w_e_end_ov_reply;
b411b363 2479 dec_rs_pending(mdev);
0f0601f4
LE
2480 /* drbd_rs_begin_io done when we sent this request,
2481 * but accounting still needs to be done. */
2482 goto submit_for_resync;
b411b363
PR
2483 }
2484 break;
2485
2486 case P_OV_REQUEST:
b411b363 2487 if (mdev->ov_start_sector == ~(sector_t)0 &&
31890f4a 2488 mdev->tconn->agreed_pro_version >= 90) {
de228bba
LE
2489 unsigned long now = jiffies;
2490 int i;
b411b363
PR
2491 mdev->ov_start_sector = sector;
2492 mdev->ov_position = sector;
30b743a2
LE
2493 mdev->ov_left = drbd_bm_bits(mdev) - BM_SECT_TO_BIT(sector);
2494 mdev->rs_total = mdev->ov_left;
de228bba
LE
2495 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2496 mdev->rs_mark_left[i] = mdev->ov_left;
2497 mdev->rs_mark_time[i] = now;
2498 }
b411b363
PR
2499 dev_info(DEV, "Online Verify start sector: %llu\n",
2500 (unsigned long long)sector);
2501 }
db830c46 2502 peer_req->w.cb = w_e_end_ov_req;
b411b363 2503 fault_type = DRBD_FAULT_RS_RD;
b411b363
PR
2504 break;
2505
b411b363 2506 default:
49ba9b1b 2507 BUG();
b411b363
PR
2508 }
2509
0f0601f4
LE
2510 /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2511 * wrt the receiver, but it is not as straightforward as it may seem.
2512 * Various places in the resync start and stop logic assume resync
2513 * requests are processed in order, requeuing this on the worker thread
2514 * introduces a bunch of new code for synchronization between threads.
2515 *
2516 * Unlimited throttling before drbd_rs_begin_io may stall the resync
2517 * "forever", throttling after drbd_rs_begin_io will lock that extent
2518 * for application writes for the same time. For now, just throttle
2519 * here, where the rest of the code expects the receiver to sleep for
 2520 * a while, anyway.
2521 */
2522
2523 /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2524 * this defers syncer requests for some time, before letting at least
 2525 * one request through. The resync controller on the receiving side
2526 * will adapt to the incoming rate accordingly.
2527 *
2528 * We cannot throttle here if remote is Primary/SyncTarget:
2529 * we would also throttle its application reads.
2530 * In that case, throttling is done on the SyncTarget only.
2531 */
e3555d85
PR
2532 if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev, sector))
2533 schedule_timeout_uninterruptible(HZ/10);
2534 if (drbd_rs_begin_io(mdev, sector))
80a40e43 2535 goto out_free_e;
b411b363 2536
0f0601f4
LE
2537submit_for_resync:
2538 atomic_add(size >> 9, &mdev->rs_sect_ev);
2539
80a40e43 2540submit:
b411b363 2541 inc_unacked(mdev);
87eeee41 2542 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 2543 list_add_tail(&peer_req->w.list, &mdev->read_ee);
87eeee41 2544 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 2545
fbe29dec 2546 if (drbd_submit_peer_request(mdev, peer_req, READ, fault_type) == 0)
82bc0194 2547 return 0;
b411b363 2548
10f6d992
LE
2549 /* don't care for the reason here */
2550 dev_err(DEV, "submit failed, triggering re-connect\n");
87eeee41 2551 spin_lock_irq(&mdev->tconn->req_lock);
db830c46 2552 list_del(&peer_req->w.list);
87eeee41 2553 spin_unlock_irq(&mdev->tconn->req_lock);
22cc37a9
LE
2554 /* no drbd_rs_complete_io(), we are dropping the connection anyways */
2555
b411b363 2556out_free_e:
b411b363 2557 put_ldev(mdev);
3967deb1 2558 drbd_free_peer_req(mdev, peer_req);
82bc0194 2559 return -EIO;
b411b363
PR
2560}
2561
2562static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2563{
2564 int self, peer, rv = -100;
2565 unsigned long ch_self, ch_peer;
44ed167d 2566 enum drbd_after_sb_p after_sb_0p;
b411b363
PR
2567
2568 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2569 peer = mdev->p_uuid[UI_BITMAP] & 1;
2570
2571 ch_peer = mdev->p_uuid[UI_SIZE];
2572 ch_self = mdev->comm_bm_set;
2573
44ed167d
PR
2574 rcu_read_lock();
2575 after_sb_0p = rcu_dereference(mdev->tconn->net_conf)->after_sb_0p;
2576 rcu_read_unlock();
2577 switch (after_sb_0p) {
b411b363
PR
2578 case ASB_CONSENSUS:
2579 case ASB_DISCARD_SECONDARY:
2580 case ASB_CALL_HELPER:
44ed167d 2581 case ASB_VIOLENTLY:
b411b363
PR
2582 dev_err(DEV, "Configuration error.\n");
2583 break;
2584 case ASB_DISCONNECT:
2585 break;
2586 case ASB_DISCARD_YOUNGER_PRI:
2587 if (self == 0 && peer == 1) {
2588 rv = -1;
2589 break;
2590 }
2591 if (self == 1 && peer == 0) {
2592 rv = 1;
2593 break;
2594 }
2595 /* Else fall through to one of the other strategies... */
2596 case ASB_DISCARD_OLDER_PRI:
2597 if (self == 0 && peer == 1) {
2598 rv = 1;
2599 break;
2600 }
2601 if (self == 1 && peer == 0) {
2602 rv = -1;
2603 break;
2604 }
2605 /* Else fall through to one of the other strategies... */
ad19bf6e 2606 dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
b411b363
PR
2607 "Using discard-least-changes instead\n");
2608 case ASB_DISCARD_ZERO_CHG:
2609 if (ch_peer == 0 && ch_self == 0) {
25703f83 2610 rv = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags)
b411b363
PR
2611 ? -1 : 1;
2612 break;
2613 } else {
2614 if (ch_peer == 0) { rv = 1; break; }
2615 if (ch_self == 0) { rv = -1; break; }
2616 }
44ed167d 2617 if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
b411b363
PR
2618 break;
2619 case ASB_DISCARD_LEAST_CHG:
2620 if (ch_self < ch_peer)
2621 rv = -1;
2622 else if (ch_self > ch_peer)
2623 rv = 1;
2624 else /* ( ch_self == ch_peer ) */
2625 /* Well, then use something else. */
25703f83 2626 rv = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags)
b411b363
PR
2627 ? -1 : 1;
2628 break;
2629 case ASB_DISCARD_LOCAL:
2630 rv = -1;
2631 break;
2632 case ASB_DISCARD_REMOTE:
2633 rv = 1;
2634 }
2635
2636 return rv;
2637}
2638
2639static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2640{
6184ea21 2641 int hg, rv = -100;
44ed167d 2642 enum drbd_after_sb_p after_sb_1p;
b411b363 2643
44ed167d
PR
2644 rcu_read_lock();
2645 after_sb_1p = rcu_dereference(mdev->tconn->net_conf)->after_sb_1p;
2646 rcu_read_unlock();
2647 switch (after_sb_1p) {
b411b363
PR
2648 case ASB_DISCARD_YOUNGER_PRI:
2649 case ASB_DISCARD_OLDER_PRI:
2650 case ASB_DISCARD_LEAST_CHG:
2651 case ASB_DISCARD_LOCAL:
2652 case ASB_DISCARD_REMOTE:
44ed167d 2653 case ASB_DISCARD_ZERO_CHG:
b411b363
PR
2654 dev_err(DEV, "Configuration error.\n");
2655 break;
2656 case ASB_DISCONNECT:
2657 break;
2658 case ASB_CONSENSUS:
2659 hg = drbd_asb_recover_0p(mdev);
2660 if (hg == -1 && mdev->state.role == R_SECONDARY)
2661 rv = hg;
2662 if (hg == 1 && mdev->state.role == R_PRIMARY)
2663 rv = hg;
2664 break;
2665 case ASB_VIOLENTLY:
2666 rv = drbd_asb_recover_0p(mdev);
2667 break;
2668 case ASB_DISCARD_SECONDARY:
2669 return mdev->state.role == R_PRIMARY ? 1 : -1;
2670 case ASB_CALL_HELPER:
2671 hg = drbd_asb_recover_0p(mdev);
2672 if (hg == -1 && mdev->state.role == R_PRIMARY) {
bb437946
AG
2673 enum drbd_state_rv rv2;
2674
2675 drbd_set_role(mdev, R_SECONDARY, 0);
b411b363
PR
2676 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2677 * we might be here in C_WF_REPORT_PARAMS which is transient.
2678 * we do not need to wait for the after state change work either. */
bb437946
AG
2679 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2680 if (rv2 != SS_SUCCESS) {
b411b363
PR
2681 drbd_khelper(mdev, "pri-lost-after-sb");
2682 } else {
2683 dev_warn(DEV, "Successfully gave up primary role.\n");
2684 rv = hg;
2685 }
2686 } else
2687 rv = hg;
2688 }
2689
2690 return rv;
2691}
2692
2693static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2694{
6184ea21 2695 int hg, rv = -100;
44ed167d 2696 enum drbd_after_sb_p after_sb_2p;
b411b363 2697
44ed167d
PR
2698 rcu_read_lock();
2699 after_sb_2p = rcu_dereference(mdev->tconn->net_conf)->after_sb_2p;
2700 rcu_read_unlock();
2701 switch (after_sb_2p) {
b411b363
PR
2702 case ASB_DISCARD_YOUNGER_PRI:
2703 case ASB_DISCARD_OLDER_PRI:
2704 case ASB_DISCARD_LEAST_CHG:
2705 case ASB_DISCARD_LOCAL:
2706 case ASB_DISCARD_REMOTE:
2707 case ASB_CONSENSUS:
2708 case ASB_DISCARD_SECONDARY:
44ed167d 2709 case ASB_DISCARD_ZERO_CHG:
b411b363
PR
2710 dev_err(DEV, "Configuration error.\n");
2711 break;
2712 case ASB_VIOLENTLY:
2713 rv = drbd_asb_recover_0p(mdev);
2714 break;
2715 case ASB_DISCONNECT:
2716 break;
2717 case ASB_CALL_HELPER:
2718 hg = drbd_asb_recover_0p(mdev);
2719 if (hg == -1) {
bb437946
AG
2720 enum drbd_state_rv rv2;
2721
b411b363
PR
2722 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2723 * we might be here in C_WF_REPORT_PARAMS which is transient.
2724 * we do not need to wait for the after state change work either. */
bb437946
AG
2725 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2726 if (rv2 != SS_SUCCESS) {
b411b363
PR
2727 drbd_khelper(mdev, "pri-lost-after-sb");
2728 } else {
2729 dev_warn(DEV, "Successfully gave up primary role.\n");
2730 rv = hg;
2731 }
2732 } else
2733 rv = hg;
2734 }
2735
2736 return rv;
2737}
2738
2739static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2740 u64 bits, u64 flags)
2741{
2742 if (!uuid) {
2743 dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
2744 return;
2745 }
2746 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2747 text,
2748 (unsigned long long)uuid[UI_CURRENT],
2749 (unsigned long long)uuid[UI_BITMAP],
2750 (unsigned long long)uuid[UI_HISTORY_START],
2751 (unsigned long long)uuid[UI_HISTORY_END],
2752 (unsigned long long)bits,
2753 (unsigned long long)flags);
2754}
2755
2756/*
2757 100 after split brain try auto recover
2758 2 C_SYNC_SOURCE set BitMap
2759 1 C_SYNC_SOURCE use BitMap
2760 0 no Sync
2761 -1 C_SYNC_TARGET use BitMap
2762 -2 C_SYNC_TARGET set BitMap
2763 -100 after split brain, disconnect
 2764 -1000 unrelated data
4a23f264
PR
 2765 -1091 requires proto 91
 2766 -1096 requires proto 96
b411b363
PR
2767 */
2768static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2769{
2770 u64 self, peer;
2771 int i, j;
2772
2773 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2774 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2775
2776 *rule_nr = 10;
2777 if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2778 return 0;
2779
2780 *rule_nr = 20;
2781 if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2782 peer != UUID_JUST_CREATED)
2783 return -2;
2784
2785 *rule_nr = 30;
2786 if (self != UUID_JUST_CREATED &&
2787 (peer == UUID_JUST_CREATED || peer == (u64)0))
2788 return 2;
2789
2790 if (self == peer) {
2791 int rct, dc; /* roles at crash time */
2792
2793 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2794
31890f4a 2795 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2796 return -1091;
b411b363
PR
2797
2798 if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2799 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2800 dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
2801 drbd_uuid_set_bm(mdev, 0UL);
2802
2803 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2804 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2805 *rule_nr = 34;
2806 } else {
2807 dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
2808 *rule_nr = 36;
2809 }
2810
2811 return 1;
2812 }
2813
2814 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2815
31890f4a 2816 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2817 return -1091;
b411b363
PR
2818
2819 if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2820 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2821 dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2822
2823 mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
2824 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
2825 mdev->p_uuid[UI_BITMAP] = 0UL;
2826
2827 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2828 *rule_nr = 35;
2829 } else {
2830 dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
2831 *rule_nr = 37;
2832 }
2833
2834 return -1;
2835 }
2836
2837 /* Common power [off|failure] */
2838 rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
2839 (mdev->p_uuid[UI_FLAGS] & 2);
2840 /* lowest bit is set when we were primary,
2841 * next bit (weight 2) is set when peer was primary */
2842 *rule_nr = 40;
2843
2844 switch (rct) {
2845 case 0: /* !self_pri && !peer_pri */ return 0;
2846 case 1: /* self_pri && !peer_pri */ return 1;
2847 case 2: /* !self_pri && peer_pri */ return -1;
2848 case 3: /* self_pri && peer_pri */
25703f83 2849 dc = test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags);
b411b363
PR
2850 return dc ? -1 : 1;
2851 }
2852 }
2853
2854 *rule_nr = 50;
2855 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2856 if (self == peer)
2857 return -1;
2858
2859 *rule_nr = 51;
2860 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2861 if (self == peer) {
31890f4a 2862 if (mdev->tconn->agreed_pro_version < 96 ?
4a23f264
PR
2863 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
2864 (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
2865 peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) {
b411b363
PR
 2866 /* The last P_SYNC_UUID did not get through. Undo the last start of
2867 resync as sync source modifications of the peer's UUIDs. */
2868
31890f4a 2869 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2870 return -1091;
b411b363
PR
2871
2872 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2873 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
4a23f264 2874
1882e22d 2875 dev_info(DEV, "Lost last syncUUID packet, corrected:\n");
4a23f264
PR
2876 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2877
b411b363
PR
2878 return -1;
2879 }
2880 }
2881
2882 *rule_nr = 60;
2883 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2884 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2885 peer = mdev->p_uuid[i] & ~((u64)1);
2886 if (self == peer)
2887 return -2;
2888 }
2889
2890 *rule_nr = 70;
2891 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2892 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2893 if (self == peer)
2894 return 1;
2895
2896 *rule_nr = 71;
2897 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2898 if (self == peer) {
31890f4a 2899 if (mdev->tconn->agreed_pro_version < 96 ?
4a23f264
PR
2900 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
2901 (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
2902 self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
b411b363
PR
 2903 /* The last P_SYNC_UUID did not get through. Undo the last start of
2904 resync as sync source modifications of our UUIDs. */
2905
31890f4a 2906 if (mdev->tconn->agreed_pro_version < 91)
4a23f264 2907 return -1091;
b411b363
PR
2908
2909 _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2910 _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
2911
4a23f264 2912 dev_info(DEV, "Last syncUUID did not get through, corrected:\n");
b411b363
PR
2913 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2914 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2915
2916 return 1;
2917 }
2918 }
2919
2920
2921 *rule_nr = 80;
d8c2a36b 2922 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
b411b363
PR
2923 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2924 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2925 if (self == peer)
2926 return 2;
2927 }
2928
2929 *rule_nr = 90;
2930 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2931 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2932 if (self == peer && self != ((u64)0))
2933 return 100;
2934
2935 *rule_nr = 100;
2936 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2937 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2938 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
2939 peer = mdev->p_uuid[j] & ~((u64)1);
2940 if (self == peer)
2941 return -100;
2942 }
2943 }
2944
2945 return -1000;
2946}
2947
2948/* drbd_sync_handshake() returns the new conn state on success, or
2949 CONN_MASK (-1) on failure.
2950 */
2951static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
2952 enum drbd_disk_state peer_disk) __must_hold(local)
2953{
b411b363
PR
2954 enum drbd_conns rv = C_MASK;
2955 enum drbd_disk_state mydisk;
44ed167d 2956 struct net_conf *nc;
6dff2902 2957 int hg, rule_nr, rr_conflict, tentative;
b411b363
PR
2958
2959 mydisk = mdev->state.disk;
2960 if (mydisk == D_NEGOTIATING)
2961 mydisk = mdev->new_state_tmp.disk;
2962
2963 dev_info(DEV, "drbd_sync_handshake:\n");
2964 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
2965 drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
2966 mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2967
2968 hg = drbd_uuid_compare(mdev, &rule_nr);
2969
2970 dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
2971
2972 if (hg == -1000) {
2973 dev_alert(DEV, "Unrelated data, aborting!\n");
2974 return C_MASK;
2975 }
4a23f264
PR
2976 if (hg < -1000) {
 2977 dev_alert(DEV, "To resolve this, both sides have to support at least protocol %d\n", -hg - 1000);
b411b363
PR
2978 return C_MASK;
2979 }
2980
2981 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
2982 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
2983 int f = (hg == -100) || abs(hg) == 2;
2984 hg = mydisk > D_INCONSISTENT ? 1 : -1;
2985 if (f)
2986 hg = hg*2;
2987 dev_info(DEV, "Becoming sync %s due to disk states.\n",
2988 hg > 0 ? "source" : "target");
2989 }
2990
3a11a487
AG
2991 if (abs(hg) == 100)
2992 drbd_khelper(mdev, "initial-split-brain");
2993
44ed167d
PR
2994 rcu_read_lock();
2995 nc = rcu_dereference(mdev->tconn->net_conf);
2996
2997 if (hg == 100 || (hg == -100 && nc->always_asbp)) {
b411b363
PR
2998 int pcount = (mdev->state.role == R_PRIMARY)
2999 + (peer_role == R_PRIMARY);
3000 int forced = (hg == -100);
3001
3002 switch (pcount) {
3003 case 0:
3004 hg = drbd_asb_recover_0p(mdev);
3005 break;
3006 case 1:
3007 hg = drbd_asb_recover_1p(mdev);
3008 break;
3009 case 2:
3010 hg = drbd_asb_recover_2p(mdev);
3011 break;
3012 }
3013 if (abs(hg) < 100) {
3014 dev_warn(DEV, "Split-Brain detected, %d primaries, "
3015 "automatically solved. Sync from %s node\n",
3016 pcount, (hg < 0) ? "peer" : "this");
3017 if (forced) {
3018 dev_warn(DEV, "Doing a full sync, since"
3019 " UUIDs where ambiguous.\n");
3020 hg = hg*2;
3021 }
3022 }
3023 }
3024
3025 if (hg == -100) {
08b165ba 3026 if (test_bit(DISCARD_MY_DATA, &mdev->flags) && !(mdev->p_uuid[UI_FLAGS]&1))
b411b363 3027 hg = -1;
08b165ba 3028 if (!test_bit(DISCARD_MY_DATA, &mdev->flags) && (mdev->p_uuid[UI_FLAGS]&1))
b411b363
PR
3029 hg = 1;
3030
3031 if (abs(hg) < 100)
3032 dev_warn(DEV, "Split-Brain detected, manually solved. "
3033 "Sync from %s node\n",
3034 (hg < 0) ? "peer" : "this");
3035 }
44ed167d 3036 rr_conflict = nc->rr_conflict;
6dff2902 3037 tentative = nc->tentative;
44ed167d 3038 rcu_read_unlock();
b411b363
PR
3039
3040 if (hg == -100) {
580b9767
LE
3041 /* FIXME this log message is not correct if we end up here
3042 * after an attempted attach on a diskless node.
3043 * We just refuse to attach -- well, we drop the "connection"
3044 * to that disk, in a way... */
3a11a487 3045 dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
b411b363
PR
3046 drbd_khelper(mdev, "split-brain");
3047 return C_MASK;
3048 }
3049
3050 if (hg > 0 && mydisk <= D_INCONSISTENT) {
3051 dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
3052 return C_MASK;
3053 }
3054
3055 if (hg < 0 && /* by intention we do not use mydisk here. */
3056 mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
44ed167d 3057 switch (rr_conflict) {
b411b363
PR
3058 case ASB_CALL_HELPER:
3059 drbd_khelper(mdev, "pri-lost");
3060 /* fall through */
3061 case ASB_DISCONNECT:
3062 dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
3063 return C_MASK;
3064 case ASB_VIOLENTLY:
3065 dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
3066 "assumption\n");
3067 }
3068 }
3069
6dff2902 3070 if (tentative || test_bit(CONN_DRY_RUN, &mdev->tconn->flags)) {
cf14c2e9
PR
3071 if (hg == 0)
3072 dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
3073 else
3074 dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.",
3075 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
3076 abs(hg) >= 2 ? "full" : "bit-map based");
3077 return C_MASK;
3078 }
3079
b411b363
PR
3080 if (abs(hg) >= 2) {
3081 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
20ceb2b2
LE
3082 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
3083 BM_LOCKED_SET_ALLOWED))
b411b363
PR
3084 return C_MASK;
3085 }
3086
3087 if (hg > 0) { /* become sync source. */
3088 rv = C_WF_BITMAP_S;
3089 } else if (hg < 0) { /* become sync target */
3090 rv = C_WF_BITMAP_T;
3091 } else {
3092 rv = C_CONNECTED;
3093 if (drbd_bm_total_weight(mdev)) {
3094 dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
3095 drbd_bm_total_weight(mdev));
3096 }
3097 }
3098
3099 return rv;
3100}
3101
f179d76d 3102static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
b411b363
PR
3103{
3104 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
f179d76d
PR
3105 if (peer == ASB_DISCARD_REMOTE)
3106 return ASB_DISCARD_LOCAL;
b411b363
PR
3107
3108 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
f179d76d
PR
3109 if (peer == ASB_DISCARD_LOCAL)
3110 return ASB_DISCARD_REMOTE;
b411b363
PR
3111
3112 /* everything else is valid if they are equal on both sides. */
f179d76d 3113 return peer;
b411b363
PR
3114}
3115
e2857216 3116static int receive_protocol(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3117{
e658983a 3118 struct p_protocol *p = pi->data;
036b17ea
PR
3119 enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
3120 int p_proto, p_discard_my_data, p_two_primaries, cf;
3121 struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
3122 char integrity_alg[SHARED_SECRET_MAX] = "";
accdbcc5 3123 struct crypto_hash *peer_integrity_tfm = NULL;
7aca6c75 3124 void *int_dig_in = NULL, *int_dig_vv = NULL;
b411b363 3125
b411b363
PR
3126 p_proto = be32_to_cpu(p->protocol);
3127 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
3128 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
3129 p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
b411b363 3130 p_two_primaries = be32_to_cpu(p->two_primaries);
cf14c2e9 3131 cf = be32_to_cpu(p->conn_flags);
6139f60d 3132 p_discard_my_data = cf & CF_DISCARD_MY_DATA;
cf14c2e9 3133
86db0618
AG
3134 if (tconn->agreed_pro_version >= 87) {
3135 int err;
3136
88104ca4 3137 if (pi->size > sizeof(integrity_alg))
86db0618 3138 return -EIO;
88104ca4 3139 err = drbd_recv_all(tconn, integrity_alg, pi->size);
86db0618
AG
3140 if (err)
3141 return err;
036b17ea
PR
3142 integrity_alg[SHARED_SECRET_MAX - 1] = 0;
3143 }
88104ca4 3144
7d4c782c 3145 if (pi->cmd != P_PROTOCOL_UPDATE) {
fbc12f45 3146 clear_bit(CONN_DRY_RUN, &tconn->flags);
036b17ea 3147
fbc12f45
AG
3148 if (cf & CF_DRY_RUN)
3149 set_bit(CONN_DRY_RUN, &tconn->flags);
cf14c2e9 3150
fbc12f45
AG
3151 rcu_read_lock();
3152 nc = rcu_dereference(tconn->net_conf);
b411b363 3153
fbc12f45 3154 if (p_proto != nc->wire_protocol) {
d505d9be 3155 conn_err(tconn, "incompatible %s settings\n", "protocol");
fbc12f45
AG
3156 goto disconnect_rcu_unlock;
3157 }
44ed167d 3158
fbc12f45 3159 if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
d505d9be 3160 conn_err(tconn, "incompatible %s settings\n", "after-sb-0pri");
fbc12f45
AG
3161 goto disconnect_rcu_unlock;
3162 }
b411b363 3163
fbc12f45 3164 if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
d505d9be 3165 conn_err(tconn, "incompatible %s settings\n", "after-sb-1pri");
fbc12f45
AG
3166 goto disconnect_rcu_unlock;
3167 }
b411b363 3168
fbc12f45 3169 if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
d505d9be 3170 conn_err(tconn, "incompatible %s settings\n", "after-sb-2pri");
fbc12f45
AG
3171 goto disconnect_rcu_unlock;
3172 }
b411b363 3173
fbc12f45 3174 if (p_discard_my_data && nc->discard_my_data) {
d505d9be 3175 conn_err(tconn, "incompatible %s settings\n", "discard-my-data");
fbc12f45
AG
3176 goto disconnect_rcu_unlock;
3177 }
b411b363 3178
fbc12f45 3179 if (p_two_primaries != nc->two_primaries) {
d505d9be 3180 conn_err(tconn, "incompatible %s settings\n", "allow-two-primaries");
fbc12f45
AG
3181 goto disconnect_rcu_unlock;
3182 }
b411b363 3183
fbc12f45 3184 if (strcmp(integrity_alg, nc->integrity_alg)) {
d505d9be 3185 conn_err(tconn, "incompatible %s settings\n", "data-integrity-alg");
fbc12f45
AG
3186 goto disconnect_rcu_unlock;
3187 }
b411b363 3188
fbc12f45 3189 rcu_read_unlock();
036b17ea 3190 }
7d4c782c
AG
3191
3192 if (integrity_alg[0]) {
3193 int hash_size;
3194
3195 /*
3196 * We can only change the peer data integrity algorithm
3197 * here. Changing our own data integrity algorithm
3198 * requires that we send a P_PROTOCOL_UPDATE packet at
3199 * the same time; otherwise, the peer has no way to
 3200 * tell at which packet boundary the algorithm is
 3201 * supposed to change.
3202 */
3203
3204 peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
3205 if (!peer_integrity_tfm) {
3206 conn_err(tconn, "peer data-integrity-alg %s not supported\n",
3207 integrity_alg);
3208 goto disconnect;
3209 }
3210
3211 hash_size = crypto_hash_digestsize(peer_integrity_tfm);
3212 int_dig_in = kmalloc(hash_size, GFP_KERNEL);
3213 int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
3214 if (!(int_dig_in && int_dig_vv)) {
3215 conn_err(tconn, "Allocation of buffers for data integrity checking failed\n");
3216 goto disconnect;
3217 }
3218 }
3219
3220 new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
3221 if (!new_net_conf) {
3222 conn_err(tconn, "Allocation of new net_conf failed\n");
3223 goto disconnect;
3224 }
3225
3226 mutex_lock(&tconn->data.mutex);
3227 mutex_lock(&tconn->conf_update);
3228 old_net_conf = tconn->net_conf;
3229 *new_net_conf = *old_net_conf;
3230
3231 new_net_conf->wire_protocol = p_proto;
3232 new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
3233 new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
3234 new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
3235 new_net_conf->two_primaries = p_two_primaries;
3236
3237 rcu_assign_pointer(tconn->net_conf, new_net_conf);
3238 mutex_unlock(&tconn->conf_update);
3239 mutex_unlock(&tconn->data.mutex);
3240
3241 crypto_free_hash(tconn->peer_integrity_tfm);
3242 kfree(tconn->int_dig_in);
3243 kfree(tconn->int_dig_vv);
3244 tconn->peer_integrity_tfm = peer_integrity_tfm;
3245 tconn->int_dig_in = int_dig_in;
3246 tconn->int_dig_vv = int_dig_vv;
3247
3248 if (strcmp(old_net_conf->integrity_alg, integrity_alg))
3249 conn_info(tconn, "peer data-integrity-alg: %s\n",
3250 integrity_alg[0] ? integrity_alg : "(none)");
3251
3252 synchronize_rcu();
3253 kfree(old_net_conf);
82bc0194 3254 return 0;
b411b363 3255
44ed167d
PR
3256disconnect_rcu_unlock:
3257 rcu_read_unlock();
b411b363 3258disconnect:
b792c35c 3259 crypto_free_hash(peer_integrity_tfm);
036b17ea
PR
3260 kfree(int_dig_in);
3261 kfree(int_dig_vv);
7204624c 3262 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3263 return -EIO;
b411b363
PR
3264}
3265
3266/* helper function
3267 * input: alg name, feature name
3268 * return: NULL (alg name was "")
3269 * ERR_PTR(error) if something goes wrong
3270 * or the crypto hash ptr, if it worked out ok. */
3271struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
3272 const char *alg, const char *name)
3273{
3274 struct crypto_hash *tfm;
3275
3276 if (!alg[0])
3277 return NULL;
3278
3279 tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
3280 if (IS_ERR(tfm)) {
3281 dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
3282 alg, name, PTR_ERR(tfm));
3283 return tfm;
3284 }
b411b363
PR
3285 return tfm;
3286}
3287
4a76b161
AG
3288static int ignore_remaining_packet(struct drbd_tconn *tconn, struct packet_info *pi)
3289{
3290 void *buffer = tconn->data.rbuf;
3291 int size = pi->size;
3292
3293 while (size) {
3294 int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
3295 s = drbd_recv(tconn, buffer, s);
3296 if (s <= 0) {
3297 if (s < 0)
3298 return s;
3299 break;
3300 }
3301 size -= s;
3302 }
3303 if (size)
3304 return -EIO;
3305 return 0;
3306}
3307
3308/*
3309 * config_unknown_volume - device configuration command for unknown volume
3310 *
3311 * When a device is added to an existing connection, the node on which the
3312 * device is added first will send configuration commands to its peer but the
3313 * peer will not know about the device yet. It will warn and ignore these
3314 * commands. Once the device is added on the second node, the second node will
3315 * send the same device configuration commands, but in the other direction.
3316 *
3317 * (We can also end up here if drbd is misconfigured.)
3318 */
3319static int config_unknown_volume(struct drbd_tconn *tconn, struct packet_info *pi)
3320{
2fcb8f30
AG
3321 conn_warn(tconn, "%s packet received for volume %u, which is not configured locally\n",
3322 cmdname(pi->cmd), pi->vnr);
4a76b161
AG
3323 return ignore_remaining_packet(tconn, pi);
3324}
3325
3326static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3327{
4a76b161 3328 struct drbd_conf *mdev;
e658983a 3329 struct p_rs_param_95 *p;
b411b363
PR
3330 unsigned int header_size, data_size, exp_max_sz;
3331 struct crypto_hash *verify_tfm = NULL;
3332 struct crypto_hash *csums_tfm = NULL;
2ec91e0e 3333 struct net_conf *old_net_conf, *new_net_conf = NULL;
813472ce 3334 struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
4a76b161 3335 const int apv = tconn->agreed_pro_version;
813472ce 3336 struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
778f271d 3337 int fifo_size = 0;
82bc0194 3338 int err;
b411b363 3339
4a76b161
AG
3340 mdev = vnr_to_mdev(tconn, pi->vnr);
3341 if (!mdev)
3342 return config_unknown_volume(tconn, pi);
3343
b411b363
PR
3344 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
3345 : apv == 88 ? sizeof(struct p_rs_param)
3346 + SHARED_SECRET_MAX
8e26f9cc
PR
3347 : apv <= 94 ? sizeof(struct p_rs_param_89)
3348 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
b411b363 3349
e2857216 3350 if (pi->size > exp_max_sz) {
b411b363 3351 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
e2857216 3352 pi->size, exp_max_sz);
82bc0194 3353 return -EIO;
b411b363
PR
3354 }
3355
3356 if (apv <= 88) {
e658983a 3357 header_size = sizeof(struct p_rs_param);
e2857216 3358 data_size = pi->size - header_size;
8e26f9cc 3359 } else if (apv <= 94) {
e658983a 3360 header_size = sizeof(struct p_rs_param_89);
e2857216 3361 data_size = pi->size - header_size;
b411b363 3362 D_ASSERT(data_size == 0);
8e26f9cc 3363 } else {
e658983a 3364 header_size = sizeof(struct p_rs_param_95);
e2857216 3365 data_size = pi->size - header_size;
b411b363
PR
3366 D_ASSERT(data_size == 0);
3367 }
3368
3369 /* initialize verify_alg and csums_alg */
e658983a 3370 p = pi->data;
b411b363
PR
3371 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
3372
e658983a 3373 err = drbd_recv_all(mdev->tconn, p, header_size);
82bc0194
AG
3374 if (err)
3375 return err;
b411b363 3376
daeda1cc
PR
3377 mutex_lock(&mdev->tconn->conf_update);
3378 old_net_conf = mdev->tconn->net_conf;
813472ce
PR
3379 if (get_ldev(mdev)) {
3380 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3381 if (!new_disk_conf) {
3382 put_ldev(mdev);
3383 mutex_unlock(&mdev->tconn->conf_update);
3384 dev_err(DEV, "Allocation of new disk_conf failed\n");
3385 return -ENOMEM;
3386 }
daeda1cc 3387
813472ce
PR
3388 old_disk_conf = mdev->ldev->disk_conf;
3389 *new_disk_conf = *old_disk_conf;
3390
6394b935 3391 new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
813472ce 3392 }
daeda1cc 3393
b411b363
PR
3394 if (apv >= 88) {
3395 if (apv == 88) {
e4bad1bc
PR
3396 if (data_size > SHARED_SECRET_MAX || data_size == 0) {
3397 dev_err(DEV, "verify-alg of wrong size, "
3398 "peer wants %u, accepting only up to %u byte\n",
3399 data_size, SHARED_SECRET_MAX);
813472ce
PR
3400 err = -EIO;
3401 goto reconnect;
b411b363
PR
3402 }
3403
82bc0194 3404 err = drbd_recv_all(mdev->tconn, p->verify_alg, data_size);
813472ce
PR
3405 if (err)
3406 goto reconnect;
b411b363
PR
3407 /* we expect NUL terminated string */
3408 /* but just in case someone tries to be evil */
3409 D_ASSERT(p->verify_alg[data_size-1] == 0);
3410 p->verify_alg[data_size-1] = 0;
3411
3412 } else /* apv >= 89 */ {
3413 /* we still expect NUL terminated strings */
3414 /* but just in case someone tries to be evil */
3415 D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
3416 D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
3417 p->verify_alg[SHARED_SECRET_MAX-1] = 0;
3418 p->csums_alg[SHARED_SECRET_MAX-1] = 0;
3419 }
3420
2ec91e0e 3421 if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
b411b363
PR
3422 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
3423 dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2ec91e0e 3424 old_net_conf->verify_alg, p->verify_alg);
b411b363
PR
3425 goto disconnect;
3426 }
3427 verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
3428 p->verify_alg, "verify-alg");
3429 if (IS_ERR(verify_tfm)) {
3430 verify_tfm = NULL;
3431 goto disconnect;
3432 }
3433 }
3434
2ec91e0e 3435 if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
b411b363
PR
3436 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
3437 dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2ec91e0e 3438 old_net_conf->csums_alg, p->csums_alg);
b411b363
PR
3439 goto disconnect;
3440 }
3441 csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
3442 p->csums_alg, "csums-alg");
3443 if (IS_ERR(csums_tfm)) {
3444 csums_tfm = NULL;
3445 goto disconnect;
3446 }
3447 }
3448
813472ce 3449 if (apv > 94 && new_disk_conf) {
daeda1cc
PR
3450 new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
3451 new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
3452 new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
3453 new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
778f271d 3454
daeda1cc 3455 fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
9958c857 3456 if (fifo_size != mdev->rs_plan_s->size) {
813472ce
PR
3457 new_plan = fifo_alloc(fifo_size);
3458 if (!new_plan) {
 778f271d 3459 dev_err(DEV, "kmalloc of fifo_buffer failed\n");
f399002e 3460 put_ldev(mdev);
778f271d
PR
3461 goto disconnect;
3462 }
3463 }
8e26f9cc 3464 }
b411b363 3465
91fd4dad 3466 if (verify_tfm || csums_tfm) {
2ec91e0e
PR
3467 new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
3468 if (!new_net_conf) {
91fd4dad
PR
3469 dev_err(DEV, "Allocation of new net_conf failed\n");
3470 goto disconnect;
3471 }
3472
2ec91e0e 3473 *new_net_conf = *old_net_conf;
91fd4dad
PR
3474
3475 if (verify_tfm) {
2ec91e0e
PR
3476 strcpy(new_net_conf->verify_alg, p->verify_alg);
3477 new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
91fd4dad
PR
3478 crypto_free_hash(mdev->tconn->verify_tfm);
3479 mdev->tconn->verify_tfm = verify_tfm;
3480 dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
3481 }
3482 if (csums_tfm) {
2ec91e0e
PR
3483 strcpy(new_net_conf->csums_alg, p->csums_alg);
3484 new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
91fd4dad
PR
3485 crypto_free_hash(mdev->tconn->csums_tfm);
3486 mdev->tconn->csums_tfm = csums_tfm;
3487 dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
3488 }
2ec91e0e 3489 rcu_assign_pointer(tconn->net_conf, new_net_conf);
b411b363 3490 }
daeda1cc 3491 }
91fd4dad 3492
813472ce
PR
3493 if (new_disk_conf) {
3494 rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
3495 put_ldev(mdev);
3496 }
3497
3498 if (new_plan) {
3499 old_plan = mdev->rs_plan_s;
3500 rcu_assign_pointer(mdev->rs_plan_s, new_plan);
b411b363 3501 }
daeda1cc
PR
3502
3503 mutex_unlock(&mdev->tconn->conf_update);
3504 synchronize_rcu();
3505 if (new_net_conf)
3506 kfree(old_net_conf);
3507 kfree(old_disk_conf);
813472ce 3508 kfree(old_plan);
daeda1cc 3509
82bc0194 3510 return 0;
b411b363 3511
813472ce
PR
3512reconnect:
3513 if (new_disk_conf) {
3514 put_ldev(mdev);
3515 kfree(new_disk_conf);
3516 }
3517 mutex_unlock(&mdev->tconn->conf_update);
3518 return -EIO;
3519
b411b363 3520disconnect:
813472ce
PR
3521 kfree(new_plan);
3522 if (new_disk_conf) {
3523 put_ldev(mdev);
3524 kfree(new_disk_conf);
3525 }
a0095508 3526 mutex_unlock(&mdev->tconn->conf_update);
b411b363
PR
3527 /* just for completeness: actually not needed,
3528 * as this is not reached if csums_tfm was ok. */
3529 crypto_free_hash(csums_tfm);
3530 /* but free the verify_tfm again, if csums_tfm did not work out */
3531 crypto_free_hash(verify_tfm);
38fa9988 3532 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3533 return -EIO;
b411b363
PR
3534}
3535
b411b363
PR
3536/* warn if the arguments differ by more than 12.5% */
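/* (Worked example with made-up numbers: a = 1000 sectors, b = 880 sectors
 * gives d = 120; a>>3 = 125, b>>3 = 110, and since d > b>>3 we warn.
 * The warning fires once the difference exceeds 1/8 of either value.) */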
3537static void warn_if_differ_considerably(struct drbd_conf *mdev,
3538 const char *s, sector_t a, sector_t b)
3539{
3540 sector_t d;
3541 if (a == 0 || b == 0)
3542 return;
3543 d = (a > b) ? (a - b) : (b - a);
3544 if (d > (a>>3) || d > (b>>3))
3545 dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
3546 (unsigned long long)a, (unsigned long long)b);
3547}
3548
4a76b161 3549static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3550{
4a76b161 3551 struct drbd_conf *mdev;
e658983a 3552 struct p_sizes *p = pi->data;
b411b363 3553 enum determine_dev_size dd = unchanged;
b411b363
PR
3554 sector_t p_size, p_usize, my_usize;
3555 int ldsc = 0; /* local disk size changed */
e89b591c 3556 enum dds_flags ddsf;
b411b363 3557
4a76b161
AG
3558 mdev = vnr_to_mdev(tconn, pi->vnr);
3559 if (!mdev)
3560 return config_unknown_volume(tconn, pi);
3561
b411b363
PR
3562 p_size = be64_to_cpu(p->d_size);
3563 p_usize = be64_to_cpu(p->u_size);
3564
b411b363
PR
3565 /* just store the peer's disk size for now.
3566 * we still need to figure out whether we accept that. */
3567 mdev->p_size = p_size;
3568
b411b363 3569 if (get_ldev(mdev)) {
daeda1cc
PR
3570 rcu_read_lock();
3571 my_usize = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
3572 rcu_read_unlock();
3573
b411b363
PR
3574 warn_if_differ_considerably(mdev, "lower level device sizes",
3575 p_size, drbd_get_max_capacity(mdev->ldev));
3576 warn_if_differ_considerably(mdev, "user requested size",
daeda1cc 3577 p_usize, my_usize);
b411b363
PR
3578
3579 /* if this is the first connect, or an otherwise expected
3580 * param exchange, choose the minimum */
3581 if (mdev->state.conn == C_WF_REPORT_PARAMS)
daeda1cc 3582 p_usize = min_not_zero(my_usize, p_usize);
b411b363
PR
3583
3584 /* Never shrink a device with usable data during connect.
3585 But allow online shrinking if we are connected. */
ef5e44a6 3586 if (drbd_new_dev_size(mdev, mdev->ldev, p_usize, 0) <
daeda1cc
PR
3587 drbd_get_capacity(mdev->this_bdev) &&
3588 mdev->state.disk >= D_OUTDATED &&
3589 mdev->state.conn < C_CONNECTED) {
b411b363 3590 dev_err(DEV, "The peer's disk size is too small!\n");
38fa9988 3591 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363 3592 put_ldev(mdev);
82bc0194 3593 return -EIO;
b411b363 3594 }
daeda1cc
PR
3595
3596 if (my_usize != p_usize) {
3597 struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
3598
3599 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3600 if (!new_disk_conf) {
3601 dev_err(DEV, "Allocation of new disk_conf failed\n");
3602 put_ldev(mdev);
3603 return -ENOMEM;
3604 }
3605
3606 mutex_lock(&mdev->tconn->conf_update);
3607 old_disk_conf = mdev->ldev->disk_conf;
3608 *new_disk_conf = *old_disk_conf;
3609 new_disk_conf->disk_size = p_usize;
3610
3611 rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
3612 mutex_unlock(&mdev->tconn->conf_update);
3613 synchronize_rcu();
3614 kfree(old_disk_conf);
3615
3616 dev_info(DEV, "Peer sets u_size to %lu sectors\n",
 3617 (unsigned long)p_usize);
3618 }
3619
b411b363
PR
3620 put_ldev(mdev);
3621 }
b411b363 3622
e89b591c 3623 ddsf = be16_to_cpu(p->dds_flags);
b411b363 3624 if (get_ldev(mdev)) {
24c4830c 3625 dd = drbd_determine_dev_size(mdev, ddsf);
b411b363
PR
3626 put_ldev(mdev);
3627 if (dd == dev_size_error)
82bc0194 3628 return -EIO;
b411b363
PR
3629 drbd_md_sync(mdev);
3630 } else {
3631 /* I am diskless, need to accept the peer's size. */
3632 drbd_set_my_capacity(mdev, p_size);
3633 }
3634
99432fcc
PR
3635 mdev->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
3636 drbd_reconsider_max_bio_size(mdev);
3637
b411b363
PR
3638 if (get_ldev(mdev)) {
3639 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
3640 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
3641 ldsc = 1;
3642 }
3643
b411b363
PR
3644 put_ldev(mdev);
3645 }
3646
3647 if (mdev->state.conn > C_WF_REPORT_PARAMS) {
3648 if (be64_to_cpu(p->c_size) !=
3649 drbd_get_capacity(mdev->this_bdev) || ldsc) {
3650 /* we have different sizes, probably peer
3651 * needs to know my new size... */
e89b591c 3652 drbd_send_sizes(mdev, 0, ddsf);
b411b363
PR
3653 }
3654 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
3655 (dd == grew && mdev->state.conn == C_CONNECTED)) {
3656 if (mdev->state.pdsk >= D_INCONSISTENT &&
e89b591c
PR
3657 mdev->state.disk >= D_INCONSISTENT) {
3658 if (ddsf & DDSF_NO_RESYNC)
3659 dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
3660 else
3661 resync_after_online_grow(mdev);
3662 } else
b411b363
PR
3663 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
3664 }
3665 }
3666
82bc0194 3667 return 0;
b411b363
PR
3668}
3669
4a76b161 3670static int receive_uuids(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3671{
4a76b161 3672 struct drbd_conf *mdev;
e658983a 3673 struct p_uuids *p = pi->data;
b411b363 3674 u64 *p_uuid;
62b0da3a 3675 int i, updated_uuids = 0;
b411b363 3676
4a76b161
AG
3677 mdev = vnr_to_mdev(tconn, pi->vnr);
3678 if (!mdev)
3679 return config_unknown_volume(tconn, pi);
3680
b411b363
PR
 3681 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
 if (!p_uuid) {
	dev_err(DEV, "kmalloc of p_uuid failed\n");
	return false;
 }
 3682
 3683 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
 3684 p_uuid[i] = be64_to_cpu(p->uuid[i]);
3685
3686 kfree(mdev->p_uuid);
3687 mdev->p_uuid = p_uuid;
3688
3689 if (mdev->state.conn < C_CONNECTED &&
3690 mdev->state.disk < D_INCONSISTENT &&
3691 mdev->state.role == R_PRIMARY &&
3692 (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
3693 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
3694 (unsigned long long)mdev->ed_uuid);
38fa9988 3695 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3696 return -EIO;
b411b363
PR
3697 }
3698
3699 if (get_ldev(mdev)) {
3700 int skip_initial_sync =
3701 mdev->state.conn == C_CONNECTED &&
31890f4a 3702 mdev->tconn->agreed_pro_version >= 90 &&
b411b363
PR
3703 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
3704 (p_uuid[UI_FLAGS] & 8);
3705 if (skip_initial_sync) {
3706 dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
3707 drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
20ceb2b2
LE
3708 "clear_n_write from receive_uuids",
3709 BM_LOCKED_TEST_ALLOWED);
b411b363
PR
3710 _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
3711 _drbd_uuid_set(mdev, UI_BITMAP, 0);
3712 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3713 CS_VERBOSE, NULL);
3714 drbd_md_sync(mdev);
62b0da3a 3715 updated_uuids = 1;
b411b363
PR
3716 }
3717 put_ldev(mdev);
18a50fa2
PR
3718 } else if (mdev->state.disk < D_INCONSISTENT &&
3719 mdev->state.role == R_PRIMARY) {
3720 /* I am a diskless primary, the peer just created a new current UUID
3721 for me. */
62b0da3a 3722 updated_uuids = drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
b411b363
PR
3723 }
3724
 3725 /* Before we test for the disk state, we should wait until any potentially
 3726 ongoing cluster wide state change has finished. That is important if
 3727 we are primary and are detaching from our disk. We need to see the
 3728 new disk state... */
8410da8f
PR
3729 mutex_lock(mdev->state_mutex);
3730 mutex_unlock(mdev->state_mutex);
b411b363 3731 if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
62b0da3a
LE
3732 updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3733
3734 if (updated_uuids)
3735 drbd_print_uuids(mdev, "receiver updated UUIDs to");
b411b363 3736
82bc0194 3737 return 0;
b411b363
PR
3738}
3739
3740/**
3741 * convert_state() - Converts the peer's view of the cluster state to our point of view
3742 * @ps: The state as seen by the peer.
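 *
 * Example: a peer that reports conn=StartingSyncS, role=Primary,
 * disk=UpToDate, pdsk=Inconsistent is recorded locally as
 * conn=StartingSyncT, peer=Primary, pdsk=UpToDate, disk=Inconsistent
 * (and our own role mirrors the peer's view of us).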
3743 */
3744static union drbd_state convert_state(union drbd_state ps)
3745{
3746 union drbd_state ms;
3747
3748 static enum drbd_conns c_tab[] = {
369bea63 3749 [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
b411b363
PR
3750 [C_CONNECTED] = C_CONNECTED,
3751
3752 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3753 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3754 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3755 [C_VERIFY_S] = C_VERIFY_T,
3756 [C_MASK] = C_MASK,
3757 };
3758
3759 ms.i = ps.i;
3760
3761 ms.conn = c_tab[ps.conn];
3762 ms.peer = ps.role;
3763 ms.role = ps.peer;
3764 ms.pdsk = ps.disk;
3765 ms.disk = ps.pdsk;
3766 ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3767
3768 return ms;
3769}
3770
4a76b161 3771static int receive_req_state(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3772{
4a76b161 3773 struct drbd_conf *mdev;
e658983a 3774 struct p_req_state *p = pi->data;
b411b363 3775 union drbd_state mask, val;
bf885f8a 3776 enum drbd_state_rv rv;
b411b363 3777
4a76b161
AG
3778 mdev = vnr_to_mdev(tconn, pi->vnr);
3779 if (!mdev)
3780 return -EIO;
3781
b411b363
PR
3782 mask.i = be32_to_cpu(p->mask);
3783 val.i = be32_to_cpu(p->val);
3784
25703f83 3785 if (test_bit(DISCARD_CONCURRENT, &mdev->tconn->flags) &&
8410da8f 3786 mutex_is_locked(mdev->state_mutex)) {
b411b363 3787 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
82bc0194 3788 return 0;
b411b363
PR
3789 }
3790
3791 mask = convert_state(mask);
3792 val = convert_state(val);
3793
dfafcc8a
PR
3794 rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
3795 drbd_send_sr_reply(mdev, rv);
b411b363 3796
b411b363
PR
3797 drbd_md_sync(mdev);
3798
82bc0194 3799 return 0;
b411b363
PR
3800}
3801
e2857216 3802static int receive_req_conn_state(struct drbd_tconn *tconn, struct packet_info *pi)
dfafcc8a 3803{
e658983a 3804 struct p_req_state *p = pi->data;
dfafcc8a
PR
3805 union drbd_state mask, val;
3806 enum drbd_state_rv rv;
3807
3808 mask.i = be32_to_cpu(p->mask);
3809 val.i = be32_to_cpu(p->val);
3810
3811 if (test_bit(DISCARD_CONCURRENT, &tconn->flags) &&
3812 mutex_is_locked(&tconn->cstate_mutex)) {
3813 conn_send_sr_reply(tconn, SS_CONCURRENT_ST_CHG);
82bc0194 3814 return 0;
dfafcc8a
PR
3815 }
3816
3817 mask = convert_state(mask);
3818 val = convert_state(val);
3819
778bcf2e 3820 rv = conn_request_state(tconn, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
dfafcc8a
PR
3821 conn_send_sr_reply(tconn, rv);
3822
82bc0194 3823 return 0;
dfafcc8a
PR
3824}
3825
4a76b161 3826static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 3827{
4a76b161 3828 struct drbd_conf *mdev;
e658983a 3829 struct p_state *p = pi->data;
4ac4aada 3830 union drbd_state os, ns, peer_state;
b411b363 3831 enum drbd_disk_state real_peer_disk;
65d922c3 3832 enum chg_state_flags cs_flags;
b411b363
PR
3833 int rv;
3834
4a76b161
AG
3835 mdev = vnr_to_mdev(tconn, pi->vnr);
3836 if (!mdev)
3837 return config_unknown_volume(tconn, pi);
3838
b411b363
PR
3839 peer_state.i = be32_to_cpu(p->state);
3840
3841 real_peer_disk = peer_state.disk;
3842 if (peer_state.disk == D_NEGOTIATING) {
3843 real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
3844 dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
3845 }
3846
87eeee41 3847 spin_lock_irq(&mdev->tconn->req_lock);
b411b363 3848 retry:
78bae59b 3849 os = ns = drbd_read_state(mdev);
87eeee41 3850 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 3851
b8853dbd
PR
3852 /* If some other part of the code (asender thread, timeout)
3853 * already decided to close the connection again,
3854 * we must not "re-establish" it here. */
3855 if (os.conn <= C_TEAR_DOWN)
58ffa580 3856 return -ECONNRESET;
b8853dbd 3857
9bcd2521
PR
3858 /* If this is the "end of sync" confirmation, usually the peer disk
3859 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
3860 * set) resync started in PausedSyncT, or if the timing of pause-/
3861 * unpause-sync events has been "just right", the peer disk may
3862 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
3863 */
3864 if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
3865 real_peer_disk == D_UP_TO_DATE &&
e9ef7bb6
LE
3866 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
3867 /* If we are (becoming) SyncSource, but peer is still in sync
3868 * preparation, ignore its uptodate-ness to avoid flapping, it
3869 * will change to inconsistent once the peer reaches active
3870 * syncing states.
3871 * It may have changed syncer-paused flags, however, so we
3872 * cannot ignore this completely. */
3873 if (peer_state.conn > C_CONNECTED &&
3874 peer_state.conn < C_SYNC_SOURCE)
3875 real_peer_disk = D_INCONSISTENT;
3876
3877 /* if peer_state changes to connected at the same time,
3878 * it explicitly notifies us that it finished resync.
3879 * Maybe we should finish it up, too? */
3880 else if (os.conn >= C_SYNC_SOURCE &&
3881 peer_state.conn == C_CONNECTED) {
3882 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
3883 drbd_resync_finished(mdev);
82bc0194 3884 return 0;
e9ef7bb6
LE
3885 }
3886 }
3887
58ffa580
LE
3888 /* explicit verify finished notification, stop sector reached. */
3889 if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
3890 peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
3891 ov_out_of_sync_print(mdev);
3892 drbd_resync_finished(mdev);
3893 return 0;
3894 }
3895
e9ef7bb6
LE
 3896 /* peer says its disk is inconsistent, while we think it is uptodate,
 3897 * and this happens while the peer still thinks we have a sync going on,
 3898 * but we think we are already done with the sync.
 3899 * We ignore this to avoid flapping pdsk.
 3900 * This should not happen if the peer is a recent version of drbd. */
3901 if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
3902 os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
3903 real_peer_disk = D_UP_TO_DATE;
3904
4ac4aada
LE
3905 if (ns.conn == C_WF_REPORT_PARAMS)
3906 ns.conn = C_CONNECTED;
b411b363 3907
67531718
PR
3908 if (peer_state.conn == C_AHEAD)
3909 ns.conn = C_BEHIND;
3910
b411b363
PR
3911 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3912 get_ldev_if_state(mdev, D_NEGOTIATING)) {
3913 int cr; /* consider resync */
3914
3915 /* if we established a new connection */
4ac4aada 3916 cr = (os.conn < C_CONNECTED);
b411b363
PR
3917 /* if we had an established connection
3918 * and one of the nodes newly attaches a disk */
4ac4aada 3919 cr |= (os.conn == C_CONNECTED &&
b411b363 3920 (peer_state.disk == D_NEGOTIATING ||
4ac4aada 3921 os.disk == D_NEGOTIATING));
b411b363
PR
3922 /* if we have both been inconsistent, and the peer has been
3923 * forced to be UpToDate with --overwrite-data */
3924 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3925 /* if we had been plain connected, and the admin requested to
3926 * start a sync by "invalidate" or "invalidate-remote" */
4ac4aada 3927 cr |= (os.conn == C_CONNECTED &&
b411b363
PR
3928 (peer_state.conn >= C_STARTING_SYNC_S &&
3929 peer_state.conn <= C_WF_BITMAP_T));
3930
3931 if (cr)
4ac4aada 3932 ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
b411b363
PR
3933
3934 put_ldev(mdev);
4ac4aada
LE
3935 if (ns.conn == C_MASK) {
3936 ns.conn = C_CONNECTED;
b411b363 3937 if (mdev->state.disk == D_NEGOTIATING) {
82f59cc6 3938 drbd_force_state(mdev, NS(disk, D_FAILED));
b411b363
PR
3939 } else if (peer_state.disk == D_NEGOTIATING) {
3940 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3941 peer_state.disk = D_DISKLESS;
580b9767 3942 real_peer_disk = D_DISKLESS;
b411b363 3943 } else {
8169e41b 3944 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->tconn->flags))
82bc0194 3945 return -EIO;
4ac4aada 3946 D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
38fa9988 3947 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3948 return -EIO;
b411b363
PR
3949 }
3950 }
3951 }
3952
87eeee41 3953 spin_lock_irq(&mdev->tconn->req_lock);
78bae59b 3954 if (os.i != drbd_read_state(mdev).i)
b411b363
PR
3955 goto retry;
3956 clear_bit(CONSIDER_RESYNC, &mdev->flags);
b411b363
PR
3957 ns.peer = peer_state.role;
3958 ns.pdsk = real_peer_disk;
3959 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
4ac4aada 3960 if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
b411b363 3961 ns.disk = mdev->new_state_tmp.disk;
4ac4aada 3962 cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
2aebfabb 3963 if (ns.pdsk == D_CONSISTENT && drbd_suspended(mdev) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
481c6f50 3964 test_bit(NEW_CUR_UUID, &mdev->flags)) {
8554df1c 3965 /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
481c6f50 3966 for temporary network outages! */
87eeee41 3967 spin_unlock_irq(&mdev->tconn->req_lock);
481c6f50 3968 dev_err(DEV, "Aborting Connect, cannot thaw IO with a peer that is only Consistent\n");
2f5cdd0b 3969 tl_clear(mdev->tconn);
481c6f50
PR
3970 drbd_uuid_new_current(mdev);
3971 clear_bit(NEW_CUR_UUID, &mdev->flags);
38fa9988 3972 conn_request_state(mdev->tconn, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
82bc0194 3973 return -EIO;
481c6f50 3974 }
65d922c3 3975 rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
78bae59b 3976 ns = drbd_read_state(mdev);
87eeee41 3977 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
3978
3979 if (rv < SS_SUCCESS) {
38fa9988 3980 conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3981 return -EIO;
b411b363
PR
3982 }
3983
4ac4aada
LE
3984 if (os.conn > C_WF_REPORT_PARAMS) {
3985 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
b411b363
PR
3986 peer_state.disk != D_NEGOTIATING ) {
3987 /* we want resync, peer has not yet decided to sync... */
3988 /* Nowadays only used when forcing a node into primary role and
3989 setting its disk to UpToDate with that */
3990 drbd_send_uuids(mdev);
43de7c85 3991 drbd_send_current_state(mdev);
b411b363
PR
3992 }
3993 }
3994
08b165ba 3995 clear_bit(DISCARD_MY_DATA, &mdev->flags);
b411b363
PR
3996
3997 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3998
82bc0194 3999 return 0;
b411b363
PR
4000}
4001
4a76b161 4002static int receive_sync_uuid(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4003{
4a76b161 4004 struct drbd_conf *mdev;
e658983a 4005 struct p_rs_uuid *p = pi->data;
4a76b161
AG
4006
4007 mdev = vnr_to_mdev(tconn, pi->vnr);
4008 if (!mdev)
4009 return -EIO;
b411b363
PR
4010
4011 wait_event(mdev->misc_wait,
4012 mdev->state.conn == C_WF_SYNC_UUID ||
c4752ef1 4013 mdev->state.conn == C_BEHIND ||
b411b363
PR
4014 mdev->state.conn < C_CONNECTED ||
4015 mdev->state.disk < D_NEGOTIATING);
4016
4017 /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
4018
b411b363
PR
4019 /* Here the _drbd_uuid_ functions are right, current should
4020 _not_ be rotated into the history */
4021 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
4022 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
4023 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
4024
62b0da3a 4025 drbd_print_uuids(mdev, "updated sync uuid");
b411b363
PR
4026 drbd_start_resync(mdev, C_SYNC_TARGET);
4027
4028 put_ldev(mdev);
4029 } else
4030 dev_err(DEV, "Ignoring SyncUUID packet!\n");
4031
82bc0194 4032 return 0;
b411b363
PR
4033}
4034
2c46407d
AG
4035/**
4036 * receive_bitmap_plain
4037 *
4038 * Return 0 when done, 1 when another iteration is needed, and a negative error
4039 * code upon failure.
4040 */
4041static int
50d0b1ad 4042receive_bitmap_plain(struct drbd_conf *mdev, unsigned int size,
e658983a 4043 unsigned long *p, struct bm_xfer_ctx *c)
b411b363 4044{
50d0b1ad
AG
4045 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
4046 drbd_header_size(mdev->tconn);
e658983a 4047 unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
50d0b1ad 4048 c->bm_words - c->word_offset);
e658983a 4049 unsigned int want = num_words * sizeof(*p);
2c46407d 4050 int err;
b411b363 4051
50d0b1ad
AG
4052 if (want != size) {
4053 dev_err(DEV, "%s:want (%u) != size (%u)\n", __func__, want, size);
2c46407d 4054 return -EIO;
b411b363
PR
4055 }
4056 if (want == 0)
2c46407d 4057 return 0;
e658983a 4058 err = drbd_recv_all(mdev->tconn, p, want);
82bc0194 4059 if (err)
2c46407d 4060 return err;
b411b363 4061
e658983a 4062 drbd_bm_merge_lel(mdev, c->word_offset, num_words, p);
b411b363
PR
4063
4064 c->word_offset += num_words;
4065 c->bit_offset = c->word_offset * BITS_PER_LONG;
4066 if (c->bit_offset > c->bm_bits)
4067 c->bit_offset = c->bm_bits;
4068
2c46407d 4069 return 1;
b411b363
PR
4070}
4071
a02d1240
AG
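/* p->encoding packs three fields into one byte: bits 0-3 carry the bitmap
 * code, bits 4-6 the number of pad bits in the last VLI chunk, and bit 7
 * whether the first run describes set bits. */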
4072static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
4073{
4074 return (enum drbd_bitmap_code)(p->encoding & 0x0f);
4075}
4076
4077static int dcbp_get_start(struct p_compressed_bm *p)
4078{
4079 return (p->encoding & 0x80) != 0;
4080}
4081
4082static int dcbp_get_pad_bits(struct p_compressed_bm *p)
4083{
4084 return (p->encoding >> 4) & 0x7;
4085}
4086
2c46407d
AG
4087/**
4088 * recv_bm_rle_bits
4089 *
4090 * Return 0 when done, 1 when another iteration is needed, and a negative error
4091 * code upon failure.
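 *
 * The payload is a sequence of VLI-encoded run lengths.  Runs alternately
 * describe ranges of clear and set bits, starting with a "set" run iff the
 * start flag in p->encoding is set; only the "set" runs are applied via
 * _drbd_bm_set_bits(), "clear" runs merely advance the bit offset.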
4092 */
4093static int
b411b363
PR
4094recv_bm_rle_bits(struct drbd_conf *mdev,
4095 struct p_compressed_bm *p,
c6d25cfe
PR
4096 struct bm_xfer_ctx *c,
4097 unsigned int len)
b411b363
PR
4098{
4099 struct bitstream bs;
4100 u64 look_ahead;
4101 u64 rl;
4102 u64 tmp;
4103 unsigned long s = c->bit_offset;
4104 unsigned long e;
a02d1240 4105 int toggle = dcbp_get_start(p);
b411b363
PR
4106 int have;
4107 int bits;
4108
a02d1240 4109 bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
b411b363
PR
4110
4111 bits = bitstream_get_bits(&bs, &look_ahead, 64);
4112 if (bits < 0)
2c46407d 4113 return -EIO;
b411b363
PR
4114
4115 for (have = bits; have > 0; s += rl, toggle = !toggle) {
4116 bits = vli_decode_bits(&rl, look_ahead);
4117 if (bits <= 0)
2c46407d 4118 return -EIO;
b411b363
PR
4119
4120 if (toggle) {
4121 e = s + rl -1;
4122 if (e >= c->bm_bits) {
4123 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
2c46407d 4124 return -EIO;
b411b363
PR
4125 }
4126 _drbd_bm_set_bits(mdev, s, e);
4127 }
4128
4129 if (have < bits) {
4130 dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
4131 have, bits, look_ahead,
4132 (unsigned int)(bs.cur.b - p->code),
4133 (unsigned int)bs.buf_len);
2c46407d 4134 return -EIO;
b411b363
PR
4135 }
4136 look_ahead >>= bits;
4137 have -= bits;
4138
4139 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
4140 if (bits < 0)
2c46407d 4141 return -EIO;
b411b363
PR
4142 look_ahead |= tmp << have;
4143 have += bits;
4144 }
4145
4146 c->bit_offset = s;
4147 bm_xfer_ctx_bit_to_word_offset(c);
4148
2c46407d 4149 return (s != c->bm_bits);
b411b363
PR
4150}
4151
2c46407d
AG
4152/**
4153 * decode_bitmap_c
4154 *
4155 * Return 0 when done, 1 when another iteration is needed, and a negative error
4156 * code upon failure.
4157 */
4158static int
b411b363
PR
4159decode_bitmap_c(struct drbd_conf *mdev,
4160 struct p_compressed_bm *p,
c6d25cfe
PR
4161 struct bm_xfer_ctx *c,
4162 unsigned int len)
b411b363 4163{
a02d1240 4164 if (dcbp_get_code(p) == RLE_VLI_Bits)
e658983a 4165 return recv_bm_rle_bits(mdev, p, c, len - sizeof(*p));
b411b363
PR
4166
4167 /* other variants had been implemented for evaluation,
4168 * but have been dropped as this one turned out to be "best"
4169 * during all our tests. */
4170
4171 dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
38fa9988 4172 conn_request_state(mdev->tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
2c46407d 4173 return -EIO;
b411b363
PR
4174}
4175
4176void INFO_bm_xfer_stats(struct drbd_conf *mdev,
4177 const char *direction, struct bm_xfer_ctx *c)
4178{
4179 /* what would it take to transfer it "plaintext" */
50d0b1ad
AG
4180 unsigned int header_size = drbd_header_size(mdev->tconn);
4181 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
4182 unsigned int plain =
4183 header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
4184 c->bm_words * sizeof(unsigned long);
4185 unsigned int total = c->bytes[0] + c->bytes[1];
4186 unsigned int r;
b411b363
PR
4187
 4188 /* total cannot be zero. But just in case: */
4189 if (total == 0)
4190 return;
4191
4192 /* don't report if not compressed */
4193 if (total >= plain)
4194 return;
4195
4196 /* total < plain. check for overflow, still */
4197 r = (total > UINT_MAX/1000) ? (total / (plain/1000))
4198 : (1000 * total / plain);
4199
4200 if (r > 1000)
4201 r = 1000;
4202
4203 r = 1000 - r;
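	/* r now holds the saved fraction in per mille, e.g. plain = 100000 and
	 * total = 25000 bytes give r = 750, reported as "compression: 75.0%"
	 * below (numbers purely illustrative). */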
4204 dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
4205 "total %u; compression: %u.%u%%\n",
4206 direction,
4207 c->bytes[1], c->packets[1],
4208 c->bytes[0], c->packets[0],
4209 total, r/10, r % 10);
4210}
4211
4212/* Since we are processing the bitfield from lower addresses to higher,
 4213 it does not matter whether we process it in 32 bit chunks or 64 bit
 4214 chunks as long as it is little endian. (Understand it as a byte stream,
 4215 beginning with the lowest byte...) If we used big endian
 4216 we would need to process it from the highest address to the lowest,
 4217 in order to be agnostic to the 32 vs 64 bit issue.
 4218
 4219 Returns 0 on success, a negative error code otherwise. */
4a76b161 4220static int receive_bitmap(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4221{
4a76b161 4222 struct drbd_conf *mdev;
b411b363 4223 struct bm_xfer_ctx c;
2c46407d 4224 int err;
4a76b161
AG
4225
4226 mdev = vnr_to_mdev(tconn, pi->vnr);
4227 if (!mdev)
4228 return -EIO;
b411b363 4229
20ceb2b2
LE
4230 drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED);
4231 /* you are supposed to send additional out-of-sync information
4232 * if you actually set bits during this phase */
b411b363 4233
b411b363
PR
4234 c = (struct bm_xfer_ctx) {
4235 .bm_bits = drbd_bm_bits(mdev),
4236 .bm_words = drbd_bm_words(mdev),
4237 };
4238
2c46407d 4239 for(;;) {
e658983a
AG
4240 if (pi->cmd == P_BITMAP)
4241 err = receive_bitmap_plain(mdev, pi->size, pi->data, &c);
4242 else if (pi->cmd == P_COMPRESSED_BITMAP) {
b411b363
PR
4243 /* MAYBE: sanity check that we speak proto >= 90,
4244 * and the feature is enabled! */
e658983a 4245 struct p_compressed_bm *p = pi->data;
b411b363 4246
50d0b1ad 4247 if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(tconn)) {
b411b363 4248 dev_err(DEV, "ReportCBitmap packet too large\n");
82bc0194 4249 err = -EIO;
b411b363
PR
4250 goto out;
4251 }
e658983a 4252 if (pi->size <= sizeof(*p)) {
e2857216 4253 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", pi->size);
82bc0194 4254 err = -EIO;
78fcbdae 4255 goto out;
b411b363 4256 }
e658983a
AG
4257 err = drbd_recv_all(mdev->tconn, p, pi->size);
4258 if (err)
4259 goto out;
e2857216 4260 err = decode_bitmap_c(mdev, p, &c, pi->size);
b411b363 4261 } else {
e2857216 4262 dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
82bc0194 4263 err = -EIO;
b411b363
PR
4264 goto out;
4265 }
4266
e2857216 4267 c.packets[pi->cmd == P_BITMAP]++;
50d0b1ad 4268 c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(tconn) + pi->size;
b411b363 4269
2c46407d
AG
4270 if (err <= 0) {
4271 if (err < 0)
4272 goto out;
b411b363 4273 break;
2c46407d 4274 }
e2857216 4275 err = drbd_recv_header(mdev->tconn, pi);
82bc0194 4276 if (err)
b411b363 4277 goto out;
2c46407d 4278 }
b411b363
PR
4279
4280 INFO_bm_xfer_stats(mdev, "receive", &c);
4281
4282 if (mdev->state.conn == C_WF_BITMAP_T) {
de1f8e4a
AG
4283 enum drbd_state_rv rv;
4284
82bc0194
AG
4285 err = drbd_send_bitmap(mdev);
4286 if (err)
b411b363
PR
4287 goto out;
4288 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
de1f8e4a
AG
4289 rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
4290 D_ASSERT(rv == SS_SUCCESS);
b411b363
PR
4291 } else if (mdev->state.conn != C_WF_BITMAP_S) {
4292 /* admin may have requested C_DISCONNECTING,
4293 * other threads may have noticed network errors */
4294 dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
4295 drbd_conn_str(mdev->state.conn));
4296 }
82bc0194 4297 err = 0;
b411b363 4298
b411b363 4299 out:
20ceb2b2 4300 drbd_bm_unlock(mdev);
82bc0194 4301 if (!err && mdev->state.conn == C_WF_BITMAP_S)
b411b363 4302 drbd_start_resync(mdev, C_SYNC_SOURCE);
82bc0194 4303 return err;
b411b363
PR
4304}
4305
4a76b161 4306static int receive_skip(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4307{
4a76b161 4308 conn_warn(tconn, "skipping unknown optional packet type %d, l: %d!\n",
e2857216 4309 pi->cmd, pi->size);
2de876ef 4310
4a76b161 4311 return ignore_remaining_packet(tconn, pi);
2de876ef
PR
4312}
4313
4a76b161 4314static int receive_UnplugRemote(struct drbd_tconn *tconn, struct packet_info *pi)
0ced55a3 4315{
e7f52dfb
LE
4316 /* Make sure we've acked all the TCP data associated
4317 * with the data requests being unplugged */
4a76b161 4318 drbd_tcp_quickack(tconn->data.socket);
0ced55a3 4319
82bc0194 4320 return 0;
0ced55a3
PR
4321}
4322
4a76b161 4323static int receive_out_of_sync(struct drbd_tconn *tconn, struct packet_info *pi)
73a01a18 4324{
4a76b161 4325 struct drbd_conf *mdev;
e658983a 4326 struct p_block_desc *p = pi->data;
4a76b161
AG
4327
4328 mdev = vnr_to_mdev(tconn, pi->vnr);
4329 if (!mdev)
4330 return -EIO;
73a01a18 4331
f735e363
LE
4332 switch (mdev->state.conn) {
4333 case C_WF_SYNC_UUID:
4334 case C_WF_BITMAP_T:
4335 case C_BEHIND:
4336 break;
4337 default:
4338 dev_err(DEV, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
4339 drbd_conn_str(mdev->state.conn));
4340 }
4341
73a01a18
PR
4342 drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
4343
82bc0194 4344 return 0;
73a01a18
PR
4345}
4346
02918be2
PR
4347struct data_cmd {
4348 int expect_payload;
4349 size_t pkt_size;
4a76b161 4350 int (*fn)(struct drbd_tconn *, struct packet_info *);
02918be2
PR
4351};
4352
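/* Table driving drbdd() below: pkt_size is the fixed (sub-)header that is
 * read into the pre-allocated receive buffer before fn() is called; only
 * packets marked with expect_payload may carry additional data beyond that. */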
4353static struct data_cmd drbd_cmd_handler[] = {
4a76b161
AG
4354 [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
4355 [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
4356 [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
4357 [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
e658983a
AG
4358 [P_BITMAP] = { 1, 0, receive_bitmap } ,
4359 [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
4360 [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote },
4a76b161
AG
4361 [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
4362 [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
e658983a
AG
4363 [P_SYNC_PARAM] = { 1, 0, receive_SyncParam },
4364 [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam },
4a76b161
AG
4365 [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
4366 [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
4367 [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
4368 [P_STATE] = { 0, sizeof(struct p_state), receive_state },
4369 [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
4370 [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
4371 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
4372 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
4373 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
4374 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
4375 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
4376 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
036b17ea 4377 [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
b411b363
PR
4378};
4379
eefc2f7d 4380static void drbdd(struct drbd_tconn *tconn)
b411b363 4381{
77351055 4382 struct packet_info pi;
02918be2 4383 size_t shs; /* sub header size */
82bc0194 4384 int err;
b411b363 4385
eefc2f7d 4386 while (get_t_state(&tconn->receiver) == RUNNING) {
deebe195
AG
4387 struct data_cmd *cmd;
4388
eefc2f7d 4389 drbd_thread_current_set_cpu(&tconn->receiver);
69bc7bc3 4390 if (drbd_recv_header(tconn, &pi))
02918be2 4391 goto err_out;
b411b363 4392
deebe195 4393 cmd = &drbd_cmd_handler[pi.cmd];
4a76b161 4394 if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
2fcb8f30
AG
 4395 conn_err(tconn, "Unexpected data packet %s (0x%04x)\n",
4396 cmdname(pi.cmd), pi.cmd);
02918be2 4397 goto err_out;
0b33a916 4398 }
b411b363 4399
e658983a
AG
4400 shs = cmd->pkt_size;
4401 if (pi.size > shs && !cmd->expect_payload) {
2fcb8f30
AG
4402 conn_err(tconn, "No payload expected %s l:%d\n",
4403 cmdname(pi.cmd), pi.size);
02918be2 4404 goto err_out;
b411b363 4405 }
b411b363 4406
c13f7e1a 4407 if (shs) {
e658983a 4408 err = drbd_recv_all_warn(tconn, pi.data, shs);
a5c31904 4409 if (err)
c13f7e1a 4410 goto err_out;
e2857216 4411 pi.size -= shs;
c13f7e1a
LE
4412 }
4413
4a76b161
AG
4414 err = cmd->fn(tconn, &pi);
4415 if (err) {
9f5bdc33
AG
4416 conn_err(tconn, "error receiving %s, e: %d l: %d!\n",
4417 cmdname(pi.cmd), err, pi.size);
02918be2 4418 goto err_out;
b411b363
PR
4419 }
4420 }
82bc0194 4421 return;
b411b363 4422
82bc0194
AG
4423 err_out:
4424 conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
b411b363
PR
4425}
4426
0e29d163 4427void conn_flush_workqueue(struct drbd_tconn *tconn)
b411b363
PR
4428{
4429 struct drbd_wq_barrier barr;
4430
4431 barr.w.cb = w_prev_work_done;
0e29d163 4432 barr.w.tconn = tconn;
b411b363 4433 init_completion(&barr.done);
d5b27b01 4434 drbd_queue_work(&tconn->sender_work, &barr.w);
b411b363
PR
4435 wait_for_completion(&barr.done);
4436}
4437
81fa2e67 4438static void conn_disconnect(struct drbd_tconn *tconn)
b411b363 4439{
c141ebda 4440 struct drbd_conf *mdev;
bbeb641c 4441 enum drbd_conns oc;
376694a0 4442 int vnr;
b411b363 4443
bbeb641c 4444 if (tconn->cstate == C_STANDALONE)
b411b363 4445 return;
b411b363 4446
b8853dbd
PR
4447 /* We are about to start the cleanup after connection loss.
4448 * Make sure drbd_make_request knows about that.
4449 * Usually we should be in some network failure state already,
4450 * but just in case we are not, we fix it up here.
4451 */
4452 conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
4453
b411b363 4454 /* asender does not clean up anything. it must not interfere, either */
360cc740
PR
4455 drbd_thread_stop(&tconn->asender);
4456 drbd_free_sock(tconn);
4457
c141ebda
PR
4458 rcu_read_lock();
4459 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
4460 kref_get(&mdev->kref);
4461 rcu_read_unlock();
4462 drbd_disconnected(mdev);
4463 kref_put(&mdev->kref, &drbd_minor_destroy);
4464 rcu_read_lock();
4465 }
4466 rcu_read_unlock();
4467
12038a3a
PR
4468 if (!list_empty(&tconn->current_epoch->list))
4469 conn_err(tconn, "ASSERTION FAILED: tconn->current_epoch->list not empty\n");
4470 /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
4471 atomic_set(&tconn->current_epoch->epoch_size, 0);
b6dd1a89 4472 tconn->send.seen_any_write_yet = false;
12038a3a 4473
360cc740
PR
4474 conn_info(tconn, "Connection closed\n");
4475
cb703454
PR
4476 if (conn_highest_role(tconn) == R_PRIMARY && conn_highest_pdsk(tconn) >= D_UNKNOWN)
4477 conn_try_outdate_peer_async(tconn);
4478
360cc740 4479 spin_lock_irq(&tconn->req_lock);
bbeb641c
PR
4480 oc = tconn->cstate;
4481 if (oc >= C_UNCONNECTED)
376694a0 4482 _conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE);
bbeb641c 4483
360cc740
PR
4484 spin_unlock_irq(&tconn->req_lock);
4485
f3dfa40a 4486 if (oc == C_DISCONNECTING)
d9cc6e23 4487 conn_request_state(tconn, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
360cc740
PR
4488}
4489
c141ebda 4490static int drbd_disconnected(struct drbd_conf *mdev)
360cc740 4491{
360cc740 4492 unsigned int i;
b411b363 4493
85719573 4494 /* wait for current activity to cease. */
87eeee41 4495 spin_lock_irq(&mdev->tconn->req_lock);
b411b363
PR
4496 _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
4497 _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
4498 _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
87eeee41 4499 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
4500
4501 /* We do not have data structures that would allow us to
4502 * get the rs_pending_cnt down to 0 again.
4503 * * On C_SYNC_TARGET we do not have any data structures describing
4504 * the pending RSDataRequest's we have sent.
4505 * * On C_SYNC_SOURCE there is no data structure that tracks
4506 * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
4507 * And no, it is not the sum of the reference counts in the
4508 * resync_LRU. The resync_LRU tracks the whole operation including
4509 * the disk-IO, while the rs_pending_cnt only tracks the blocks
4510 * on the fly. */
4511 drbd_rs_cancel_all(mdev);
4512 mdev->rs_total = 0;
4513 mdev->rs_failed = 0;
4514 atomic_set(&mdev->rs_pending_cnt, 0);
4515 wake_up(&mdev->misc_wait);
4516
b411b363 4517 del_timer_sync(&mdev->resync_timer);
b411b363
PR
4518 resync_timer_fn((unsigned long)mdev);
4519
b411b363
PR
4520 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
4521 * w_make_resync_request etc. which may still be on the worker queue
4522 * to be "canceled" */
a21e9298 4523 drbd_flush_workqueue(mdev);
b411b363 4524
a990be46 4525 drbd_finish_peer_reqs(mdev);
b411b363 4526
d10b4ea3
PR
 4527 /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
 4528 might have queued new work again. The one before drbd_finish_peer_reqs() is
 4529 necessary to reclaim net_ee in drbd_finish_peer_reqs(). */
4530 drbd_flush_workqueue(mdev);
4531
b411b363
PR
4532 kfree(mdev->p_uuid);
4533 mdev->p_uuid = NULL;
4534
2aebfabb 4535 if (!drbd_suspended(mdev))
2f5cdd0b 4536 tl_clear(mdev->tconn);
b411b363 4537
b411b363
PR
4538 drbd_md_sync(mdev);
4539
20ceb2b2
LE
4540 /* serialize with bitmap writeout triggered by the state change,
4541 * if any. */
4542 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
4543
b411b363
PR
4544 /* tcp_close and release of sendpage pages can be deferred. I don't
4545 * want to use SO_LINGER, because apparently it can be deferred for
4546 * more than 20 seconds (longest time I checked).
4547 *
 4548 * Actually we don't care exactly when the network stack does its
4549 * put_page(), but release our reference on these pages right here.
4550 */
7721f567 4551 i = drbd_free_peer_reqs(mdev, &mdev->net_ee);
b411b363
PR
4552 if (i)
4553 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
435f0740
LE
4554 i = atomic_read(&mdev->pp_in_use_by_net);
4555 if (i)
4556 dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
b411b363
PR
4557 i = atomic_read(&mdev->pp_in_use);
4558 if (i)
45bb912b 4559 dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
b411b363
PR
4560
4561 D_ASSERT(list_empty(&mdev->read_ee));
4562 D_ASSERT(list_empty(&mdev->active_ee));
4563 D_ASSERT(list_empty(&mdev->sync_ee));
4564 D_ASSERT(list_empty(&mdev->done_ee));
4565
360cc740 4566 return 0;
b411b363
PR
4567}
4568
4569/*
4570 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
4571 * we can agree on is stored in agreed_pro_version.
4572 *
4573 * feature flags and the reserved array should be enough room for future
4574 * enhancements of the handshake protocol, and possible plugins...
4575 *
4576 * for now, they are expected to be zero, but ignored.
4577 */
6038178e 4578static int drbd_send_features(struct drbd_tconn *tconn)
b411b363 4579{
9f5bdc33
AG
4580 struct drbd_socket *sock;
4581 struct p_connection_features *p;
b411b363 4582
9f5bdc33
AG
4583 sock = &tconn->data;
4584 p = conn_prepare_command(tconn, sock);
4585 if (!p)
e8d17b01 4586 return -EIO;
b411b363
PR
4587 memset(p, 0, sizeof(*p));
4588 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
4589 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
9f5bdc33 4590 return conn_send_command(tconn, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
b411b363
PR
4591}
4592
4593/*
4594 * return values:
4595 * 1 yes, we have a valid connection
4596 * 0 oops, did not work out, please try again
4597 * -1 peer talks different language,
4598 * no point in trying again, please go standalone.
4599 */
6038178e 4600static int drbd_do_features(struct drbd_tconn *tconn)
b411b363 4601{
65d11ed6 4602 /* ASSERT current == tconn->receiver ... */
e658983a
AG
4603 struct p_connection_features *p;
4604 const int expect = sizeof(struct p_connection_features);
77351055 4605 struct packet_info pi;
a5c31904 4606 int err;
b411b363 4607
6038178e 4608 err = drbd_send_features(tconn);
e8d17b01 4609 if (err)
b411b363
PR
4610 return 0;
4611
69bc7bc3
AG
4612 err = drbd_recv_header(tconn, &pi);
4613 if (err)
b411b363
PR
4614 return 0;
4615
6038178e
AG
4616 if (pi.cmd != P_CONNECTION_FEATURES) {
4617 conn_err(tconn, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
2fcb8f30 4618 cmdname(pi.cmd), pi.cmd);
b411b363
PR
4619 return -1;
4620 }
4621
77351055 4622 if (pi.size != expect) {
6038178e 4623 conn_err(tconn, "expected ConnectionFeatures length: %u, received: %u\n",
77351055 4624 expect, pi.size);
b411b363
PR
4625 return -1;
4626 }
4627
e658983a
AG
4628 p = pi.data;
4629 err = drbd_recv_all_warn(tconn, p, expect);
a5c31904 4630 if (err)
b411b363 4631 return 0;
b411b363 4632
b411b363
PR
4633 p->protocol_min = be32_to_cpu(p->protocol_min);
4634 p->protocol_max = be32_to_cpu(p->protocol_max);
4635 if (p->protocol_max == 0)
4636 p->protocol_max = p->protocol_min;
4637
4638 if (PRO_VERSION_MAX < p->protocol_min ||
4639 PRO_VERSION_MIN > p->protocol_max)
4640 goto incompat;
4641
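	/* The version ranges overlap; both nodes independently pick
	 * min(own max, peer's max), e.g. a node supporting up to 100 talking
	 * to a peer supporting up to 96 agrees on 96 (illustrative numbers). */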
65d11ed6 4642 tconn->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
b411b363 4643
65d11ed6
PR
4644 conn_info(tconn, "Handshake successful: "
4645 "Agreed network protocol version %d\n", tconn->agreed_pro_version);
b411b363
PR
4646
4647 return 1;
4648
4649 incompat:
65d11ed6 4650 conn_err(tconn, "incompatible DRBD dialects: "
b411b363
PR
4651 "I support %d-%d, peer supports %d-%d\n",
4652 PRO_VERSION_MIN, PRO_VERSION_MAX,
4653 p->protocol_min, p->protocol_max);
4654 return -1;
4655}
4656
4657#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
13e6037d 4658static int drbd_do_auth(struct drbd_tconn *tconn)
b411b363
PR
4659{
 4660 conn_err(tconn, "This kernel was built without CONFIG_CRYPTO_HMAC.\n");
 4661 conn_err(tconn, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
b10d96cb 4662 return -1;
b411b363
PR
4663}
4664#else
4665#define CHALLENGE_LEN 64
b10d96cb
JT
4666
4667/* Return value:
4668 1 - auth succeeded,
4669 0 - failed, try again (network error),
4670 -1 - auth failed, don't try again.
4671*/
4672
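/* Sketch of the exchange below: both sides send a random challenge, answer
 * the peer's challenge with HMAC(shared secret, peers_ch), and accept the
 * peer only if its answer matches HMAC(shared secret, my_challenge). */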
13e6037d 4673static int drbd_do_auth(struct drbd_tconn *tconn)
b411b363 4674{
9f5bdc33 4675 struct drbd_socket *sock;
b411b363
PR
4676 char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
4677 struct scatterlist sg;
4678 char *response = NULL;
4679 char *right_response = NULL;
4680 char *peers_ch = NULL;
44ed167d
PR
4681 unsigned int key_len;
4682 char secret[SHARED_SECRET_MAX]; /* 64 byte */
b411b363
PR
4683 unsigned int resp_size;
4684 struct hash_desc desc;
77351055 4685 struct packet_info pi;
44ed167d 4686 struct net_conf *nc;
69bc7bc3 4687 int err, rv;
b411b363 4688
9f5bdc33
AG
4689 /* FIXME: Put the challenge/response into the preallocated socket buffer. */
4690
44ed167d
PR
4691 rcu_read_lock();
4692 nc = rcu_dereference(tconn->net_conf);
4693 key_len = strlen(nc->shared_secret);
4694 memcpy(secret, nc->shared_secret, key_len);
4695 rcu_read_unlock();
4696
13e6037d 4697 desc.tfm = tconn->cram_hmac_tfm;
b411b363
PR
4698 desc.flags = 0;
4699
44ed167d 4700 rv = crypto_hash_setkey(tconn->cram_hmac_tfm, (u8 *)secret, key_len);
b411b363 4701 if (rv) {
13e6037d 4702 conn_err(tconn, "crypto_hash_setkey() failed with %d\n", rv);
b10d96cb 4703 rv = -1;
b411b363
PR
4704 goto fail;
4705 }
4706
4707 get_random_bytes(my_challenge, CHALLENGE_LEN);
4708
9f5bdc33
AG
4709 sock = &tconn->data;
4710 if (!conn_prepare_command(tconn, sock)) {
4711 rv = 0;
4712 goto fail;
4713 }
e658983a 4714 rv = !conn_send_command(tconn, sock, P_AUTH_CHALLENGE, 0,
9f5bdc33 4715 my_challenge, CHALLENGE_LEN);
b411b363
PR
4716 if (!rv)
4717 goto fail;
4718
69bc7bc3
AG
4719 err = drbd_recv_header(tconn, &pi);
4720 if (err) {
4721 rv = 0;
b411b363 4722 goto fail;
69bc7bc3 4723 }
b411b363 4724
77351055 4725 if (pi.cmd != P_AUTH_CHALLENGE) {
13e6037d 4726 conn_err(tconn, "expected AuthChallenge packet, received: %s (0x%04x)\n",
2fcb8f30 4727 cmdname(pi.cmd), pi.cmd);
b411b363
PR
4728 rv = 0;
4729 goto fail;
4730 }
4731
77351055 4732 if (pi.size > CHALLENGE_LEN * 2) {
13e6037d 4733 conn_err(tconn, "AuthChallenge payload too big.\n");
b10d96cb 4734 rv = -1;
b411b363
PR
4735 goto fail;
4736 }
4737
77351055 4738 peers_ch = kmalloc(pi.size, GFP_NOIO);
b411b363 4739 if (peers_ch == NULL) {
13e6037d 4740 conn_err(tconn, "kmalloc of peers_ch failed\n");
b10d96cb 4741 rv = -1;
b411b363
PR
4742 goto fail;
4743 }
4744
a5c31904
AG
4745 err = drbd_recv_all_warn(tconn, peers_ch, pi.size);
4746 if (err) {
b411b363
PR
4747 rv = 0;
4748 goto fail;
4749 }
4750
13e6037d 4751 resp_size = crypto_hash_digestsize(tconn->cram_hmac_tfm);
b411b363
PR
4752 response = kmalloc(resp_size, GFP_NOIO);
4753 if (response == NULL) {
13e6037d 4754 conn_err(tconn, "kmalloc of response failed\n");
b10d96cb 4755 rv = -1;
b411b363
PR
4756 goto fail;
4757 }
4758
4759 sg_init_table(&sg, 1);
77351055 4760 sg_set_buf(&sg, peers_ch, pi.size);
b411b363
PR
4761
4762 rv = crypto_hash_digest(&desc, &sg, sg.length, response);
4763 if (rv) {
13e6037d 4764 conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv);
b10d96cb 4765 rv = -1;
b411b363
PR
4766 goto fail;
4767 }
4768
9f5bdc33
AG
4769 if (!conn_prepare_command(tconn, sock)) {
4770 rv = 0;
4771 goto fail;
4772 }
e658983a 4773 rv = !conn_send_command(tconn, sock, P_AUTH_RESPONSE, 0,
9f5bdc33 4774 response, resp_size);
b411b363
PR
4775 if (!rv)
4776 goto fail;
4777
69bc7bc3
AG
4778 err = drbd_recv_header(tconn, &pi);
4779 if (err) {
4780 rv = 0;
b411b363 4781 goto fail;
69bc7bc3 4782 }
b411b363 4783
77351055 4784 if (pi.cmd != P_AUTH_RESPONSE) {
13e6037d 4785 conn_err(tconn, "expected AuthResponse packet, received: %s (0x%04x)\n",
2fcb8f30 4786 cmdname(pi.cmd), pi.cmd);
b411b363
PR
4787 rv = 0;
4788 goto fail;
4789 }
4790
77351055 4791 if (pi.size != resp_size) {
13e6037d 4792 conn_err(tconn, "AuthResponse payload of wrong size\n");
b411b363
PR
4793 rv = 0;
4794 goto fail;
4795 }
4796
a5c31904
AG
 4797 err = drbd_recv_all_warn(tconn, response, resp_size);
4798 if (err) {
b411b363
PR
4799 rv = 0;
4800 goto fail;
4801 }
4802
4803 right_response = kmalloc(resp_size, GFP_NOIO);
2d1ee87d 4804 if (right_response == NULL) {
13e6037d 4805 conn_err(tconn, "kmalloc of right_response failed\n");
b10d96cb 4806 rv = -1;
b411b363
PR
4807 goto fail;
4808 }
4809
4810 sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
4811
4812 rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
4813 if (rv) {
13e6037d 4814 conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv);
b10d96cb 4815 rv = -1;
b411b363
PR
4816 goto fail;
4817 }
4818
4819 rv = !memcmp(response, right_response, resp_size);
4820
4821 if (rv)
44ed167d
PR
 4822 conn_info(tconn, "Peer authenticated using %d bytes of HMAC\n",
4823 resp_size);
b10d96cb
JT
4824 else
4825 rv = -1;
b411b363
PR
4826
4827 fail:
4828 kfree(peers_ch);
4829 kfree(response);
4830 kfree(right_response);
4831
4832 return rv;
4833}
4834#endif
4835
4836int drbdd_init(struct drbd_thread *thi)
4837{
392c8801 4838 struct drbd_tconn *tconn = thi->tconn;
b411b363
PR
4839 int h;
4840
4d641dd7 4841 conn_info(tconn, "receiver (re)started\n");
b411b363
PR
4842
4843 do {
81fa2e67 4844 h = conn_connect(tconn);
b411b363 4845 if (h == 0) {
81fa2e67 4846 conn_disconnect(tconn);
20ee6390 4847 schedule_timeout_interruptible(HZ);
b411b363
PR
4848 }
4849 if (h == -1) {
4d641dd7 4850 conn_warn(tconn, "Discarding network configuration.\n");
bbeb641c 4851 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363
PR
4852 }
4853 } while (h == 0);
4854
91fd4dad
PR
4855 if (h > 0)
4856 drbdd(tconn);
b411b363 4857
81fa2e67 4858 conn_disconnect(tconn);
b411b363 4859
4d641dd7 4860 conn_info(tconn, "receiver terminated\n");
b411b363
PR
4861 return 0;
4862}
4863
4864/* ********* acknowledge sender ******** */
4865
e05e1e59 4866static int got_conn_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi)
e4f78ede 4867{
e658983a 4868 struct p_req_state_reply *p = pi->data;
e4f78ede
PR
4869 int retcode = be32_to_cpu(p->retcode);
4870
4871 if (retcode >= SS_SUCCESS) {
4872 set_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags);
4873 } else {
4874 set_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags);
4875 conn_err(tconn, "Requested state change failed by peer: %s (%d)\n",
4876 drbd_set_st_err_str(retcode), retcode);
4877 }
4878 wake_up(&tconn->ping_wait);
4879
2735a594 4880 return 0;
e4f78ede
PR
4881}
4882
1952e916 4883static int got_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4884{
1952e916 4885 struct drbd_conf *mdev;
e658983a 4886 struct p_req_state_reply *p = pi->data;
b411b363
PR
4887 int retcode = be32_to_cpu(p->retcode);
4888
1952e916
AG
4889 mdev = vnr_to_mdev(tconn, pi->vnr);
4890 if (!mdev)
2735a594 4891 return -EIO;
1952e916 4892
4d0fc3fd
PR
4893 if (test_bit(CONN_WD_ST_CHG_REQ, &tconn->flags)) {
4894 D_ASSERT(tconn->agreed_pro_version < 100);
4895 return got_conn_RqSReply(tconn, pi);
4896 }
4897
e4f78ede
PR
4898 if (retcode >= SS_SUCCESS) {
4899 set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
4900 } else {
4901 set_bit(CL_ST_CHG_FAIL, &mdev->flags);
4902 dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
4903 drbd_set_st_err_str(retcode), retcode);
b411b363 4904 }
e4f78ede
PR
4905 wake_up(&mdev->state_wait);
4906
2735a594 4907 return 0;
b411b363
PR
4908}
4909
e05e1e59 4910static int got_Ping(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4911{
2735a594 4912 return drbd_send_ping_ack(tconn);
b411b363
PR
4913
4914}
4915
e05e1e59 4916static int got_PingAck(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363
PR
4917{
4918 /* restore idle timeout */
2a67d8b9
PR
4919 tconn->meta.socket->sk->sk_rcvtimeo = tconn->net_conf->ping_int*HZ;
4920 if (!test_and_set_bit(GOT_PING_ACK, &tconn->flags))
4921 wake_up(&tconn->ping_wait);
b411b363 4922
2735a594 4923 return 0;
b411b363
PR
4924}
4925
1952e916 4926static int got_IsInSync(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4927{
1952e916 4928 struct drbd_conf *mdev;
e658983a 4929 struct p_block_ack *p = pi->data;
b411b363
PR
4930 sector_t sector = be64_to_cpu(p->sector);
4931 int blksize = be32_to_cpu(p->blksize);
4932
1952e916
AG
4933 mdev = vnr_to_mdev(tconn, pi->vnr);
4934 if (!mdev)
2735a594 4935 return -EIO;
1952e916 4936
31890f4a 4937 D_ASSERT(mdev->tconn->agreed_pro_version >= 89);
b411b363
PR
4938
4939 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4940
1d53f09e
LE
4941 if (get_ldev(mdev)) {
4942 drbd_rs_complete_io(mdev, sector);
4943 drbd_set_in_sync(mdev, sector, blksize);
4944 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4945 mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4946 put_ldev(mdev);
4947 }
b411b363 4948 dec_rs_pending(mdev);
778f271d 4949 atomic_add(blksize >> 9, &mdev->rs_sect_in);
b411b363 4950
2735a594 4951 return 0;
b411b363
PR
4952}
4953
bc9c5c41
AG
4954static int
4955validate_req_change_req_state(struct drbd_conf *mdev, u64 id, sector_t sector,
4956 struct rb_root *root, const char *func,
4957 enum drbd_req_event what, bool missing_ok)
b411b363
PR
4958{
4959 struct drbd_request *req;
4960 struct bio_and_error m;
4961
87eeee41 4962 spin_lock_irq(&mdev->tconn->req_lock);
bc9c5c41 4963 req = find_request(mdev, root, id, sector, missing_ok, func);
b411b363 4964 if (unlikely(!req)) {
87eeee41 4965 spin_unlock_irq(&mdev->tconn->req_lock);
85997675 4966 return -EIO;
b411b363
PR
4967 }
4968 __req_mod(req, what, &m);
87eeee41 4969 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
4970
4971 if (m.bio)
4972 complete_master_bio(mdev, &m);
85997675 4973 return 0;
b411b363
PR
4974}
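/*
 * The ack handlers below use this helper with &mdev->write_requests for
 * (negative) write acks and with &mdev->read_requests for data replies.
 * Only got_NegAck() passes missing_ok: with protocol A the master bio
 * may already have completed, so the request can legitimately be gone
 * from the tree by the time the P_NEG_ACK arrives.
 */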
4975
1952e916 4976static int got_BlockAck(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 4977{
1952e916 4978 struct drbd_conf *mdev;
e658983a 4979 struct p_block_ack *p = pi->data;
b411b363
PR
4980 sector_t sector = be64_to_cpu(p->sector);
4981 int blksize = be32_to_cpu(p->blksize);
4982 enum drbd_req_event what;
4983
1952e916
AG
4984 mdev = vnr_to_mdev(tconn, pi->vnr);
4985 if (!mdev)
2735a594 4986 return -EIO;
1952e916 4987
b411b363
PR
4988 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4989
579b57ed 4990 if (p->block_id == ID_SYNCER) {
b411b363
PR
4991 drbd_set_in_sync(mdev, sector, blksize);
4992 dec_rs_pending(mdev);
2735a594 4993 return 0;
b411b363 4994 }
e05e1e59 4995 switch (pi->cmd) {
b411b363 4996 case P_RS_WRITE_ACK:
8554df1c 4997 what = WRITE_ACKED_BY_PEER_AND_SIS;
b411b363
PR
4998 break;
4999 case P_WRITE_ACK:
8554df1c 5000 what = WRITE_ACKED_BY_PEER;
b411b363
PR
5001 break;
5002 case P_RECV_ACK:
8554df1c 5003 what = RECV_ACKED_BY_PEER;
b411b363 5004 break;
7be8da07 5005 case P_DISCARD_WRITE:
7be8da07
AG
5006 what = DISCARD_WRITE;
5007 break;
5008 case P_RETRY_WRITE:
7be8da07 5009 what = POSTPONE_WRITE;
b411b363
PR
5010 break;
5011 default:
2735a594 5012 BUG();
b411b363
PR
5013 }
5014
2735a594
AG
5015 return validate_req_change_req_state(mdev, p->block_id, sector,
5016 &mdev->write_requests, __func__,
5017 what, false);
b411b363
PR
5018}
5019
1952e916 5020static int got_NegAck(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 5021{
1952e916 5022 struct drbd_conf *mdev;
e658983a 5023 struct p_block_ack *p = pi->data;
b411b363 5024 sector_t sector = be64_to_cpu(p->sector);
2deb8336 5025 int size = be32_to_cpu(p->blksize);
85997675 5026 int err;
b411b363 5027
1952e916
AG
5028 mdev = vnr_to_mdev(tconn, pi->vnr);
5029 if (!mdev)
2735a594 5030 return -EIO;
1952e916 5031
b411b363
PR
5032 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
5033
579b57ed 5034 if (p->block_id == ID_SYNCER) {
b411b363
PR
5035 dec_rs_pending(mdev);
5036 drbd_rs_failed_io(mdev, sector, size);
2735a594 5037 return 0;
b411b363 5038 }
2deb8336 5039
85997675
AG
5040 err = validate_req_change_req_state(mdev, p->block_id, sector,
5041 &mdev->write_requests, __func__,
303d1448 5042 NEG_ACKED, true);
85997675 5043 if (err) {
c3afd8f5
AG
5044 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
5045 The master bio might already be completed, therefore the
5046 request is no longer in the collision hash. */
5047 /* In Protocol B we might already have got a P_RECV_ACK
5048 but then get a P_NEG_ACK afterwards. */
c3afd8f5 5049 drbd_set_out_of_sync(mdev, sector, size);
2deb8336 5050 }
2735a594 5051 return 0;
b411b363
PR
5052}
5053
1952e916 5054static int got_NegDReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 5055{
1952e916 5056 struct drbd_conf *mdev;
e658983a 5057 struct p_block_ack *p = pi->data;
b411b363
PR
5058 sector_t sector = be64_to_cpu(p->sector);
5059
1952e916
AG
5060 mdev = vnr_to_mdev(tconn, pi->vnr);
5061 if (!mdev)
2735a594 5062 return -EIO;
1952e916 5063
b411b363 5064 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
7be8da07 5065
380207d0 5066 dev_err(DEV, "Got NegDReply; Sector %llus, len %u.\n",
b411b363
PR
5067 (unsigned long long)sector, be32_to_cpu(p->blksize));
5068
2735a594
AG
5069 return validate_req_change_req_state(mdev, p->block_id, sector,
5070 &mdev->read_requests, __func__,
5071 NEG_ACKED, false);
b411b363
PR
5072}
5073
1952e916 5074static int got_NegRSDReply(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 5075{
1952e916 5076 struct drbd_conf *mdev;
b411b363
PR
5077 sector_t sector;
5078 int size;
e658983a 5079 struct p_block_ack *p = pi->data;
1952e916
AG
5080
5081 mdev = vnr_to_mdev(tconn, pi->vnr);
5082 if (!mdev)
2735a594 5083 return -EIO;
b411b363
PR
5084
5085 sector = be64_to_cpu(p->sector);
5086 size = be32_to_cpu(p->blksize);
b411b363
PR
5087
5088 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
5089
5090 dec_rs_pending(mdev);
5091
5092 if (get_ldev_if_state(mdev, D_FAILED)) {
5093 drbd_rs_complete_io(mdev, sector);
e05e1e59 5094 switch (pi->cmd) {
d612d309
PR
5095 case P_NEG_RS_DREPLY:
5096 drbd_rs_failed_io(mdev, sector, size);
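 /* fall through: P_RS_CANCEL skips the failed-io accounting */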
5097 case P_RS_CANCEL:
5098 break;
5099 default:
2735a594 5100 BUG();
d612d309 5101 }
b411b363
PR
5102 put_ldev(mdev);
5103 }
5104
2735a594 5105 return 0;
b411b363
PR
5106}
5107
1952e916 5108static int got_BarrierAck(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 5109{
e658983a 5110 struct p_barrier_ack *p = pi->data;
9ed57dcb
LE
5111 struct drbd_conf *mdev;
5112 int vnr;
1952e916 5113
9ed57dcb 5114 tl_release(tconn, p->barrier, be32_to_cpu(p->set_size));
b411b363 5115
9ed57dcb
LE
5116 rcu_read_lock();
5117 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
5118 if (mdev->state.conn == C_AHEAD &&
5119 atomic_read(&mdev->ap_in_flight) == 0 &&
5120 !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) {
5121 mdev->start_resync_timer.expires = jiffies + HZ;
5122 add_timer(&mdev->start_resync_timer);
5123 }
c4752ef1 5124 }
9ed57dcb 5125 rcu_read_unlock();
c4752ef1 5126
2735a594 5127 return 0;
b411b363
PR
5128}
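/*
 * Besides releasing the barrier epoch, a barrier ack is also the point
 * where volumes in Ahead mode check whether all application requests
 * have drained (ap_in_flight == 0); if so, start_resync_timer is armed
 * to fire in one second, and the AHEAD_TO_SYNC_SOURCE test_and_set_bit
 * keeps the timer from being armed more than once.
 */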
5129
1952e916 5130static int got_OVResult(struct drbd_tconn *tconn, struct packet_info *pi)
b411b363 5131{
1952e916 5132 struct drbd_conf *mdev;
e658983a 5133 struct p_block_ack *p = pi->data;
b411b363
PR
5134 struct drbd_work *w;
5135 sector_t sector;
5136 int size;
5137
1952e916
AG
5138 mdev = vnr_to_mdev(tconn, pi->vnr);
5139 if (!mdev)
2735a594 5140 return -EIO;
1952e916 5141
b411b363
PR
5142 sector = be64_to_cpu(p->sector);
5143 size = be32_to_cpu(p->blksize);
5144
5145 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
5146
5147 if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
8f7bed77 5148 drbd_ov_out_of_sync_found(mdev, sector, size);
b411b363 5149 else
8f7bed77 5150 ov_out_of_sync_print(mdev);
b411b363 5151
1d53f09e 5152 if (!get_ldev(mdev))
2735a594 5153 return 0;
1d53f09e 5154
b411b363
PR
5155 drbd_rs_complete_io(mdev, sector);
5156 dec_rs_pending(mdev);
5157
ea5442af
LE
5158 --mdev->ov_left;
5159
5160 /* let's advance progress step marks only for every other megabyte */
5161 if ((mdev->ov_left & 0x200) == 0x200)
5162 drbd_advance_rs_marks(mdev, mdev->ov_left);
5163
5164 if (mdev->ov_left == 0) {
b411b363
PR
5165 w = kmalloc(sizeof(*w), GFP_NOIO);
5166 if (w) {
5167 w->cb = w_ov_finished;
a21e9298 5168 w->mdev = mdev;
d5b27b01 5169 drbd_queue_work(&mdev->tconn->sender_work, w);
b411b363
PR
5170 } else {
5171 dev_err(DEV, "kmalloc(w) failed.");
8f7bed77 5172 ov_out_of_sync_print(mdev);
b411b363
PR
5173 drbd_resync_finished(mdev);
5174 }
5175 }
1d53f09e 5176 put_ldev(mdev);
2735a594 5177 return 0;
b411b363
PR
5178}
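/*
 * Online-verify bookkeeping above: ov_left is decremented once per
 * reply, progress marks are advanced only every 0x200 decrements (the
 * "every other megabyte" of the comment), and when ov_left reaches
 * zero a w_ov_finished work item is queued to the sender; if that
 * allocation fails, the result is printed and the resync is finished
 * synchronously as a fallback.
 */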
5179
1952e916 5180static int got_skip(struct drbd_tconn *tconn, struct packet_info *pi)
0ced55a3 5181{
2735a594 5182 return 0;
0ced55a3
PR
5183}
5184
a990be46 5185static int tconn_finish_peer_reqs(struct drbd_tconn *tconn)
32862ec7 5186{
082a3439 5187 struct drbd_conf *mdev;
c141ebda 5188 int vnr, not_empty = 0;
32862ec7
PR
5189
5190 do {
5191 clear_bit(SIGNAL_ASENDER, &tconn->flags);
5192 flush_signals(current);
c141ebda
PR
5193
5194 rcu_read_lock();
5195 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
5196 kref_get(&mdev->kref);
5197 rcu_read_unlock();
d3fcb490 5198 if (drbd_finish_peer_reqs(mdev)) {
c141ebda
PR
5199 kref_put(&mdev->kref, &drbd_minor_destroy);
5200 return 1;
d3fcb490 5201 }
c141ebda
PR
5202 kref_put(&mdev->kref, &drbd_minor_destroy);
5203 rcu_read_lock();
082a3439 5204 }
32862ec7 5205 set_bit(SIGNAL_ASENDER, &tconn->flags);
082a3439
PR
5206
5207 spin_lock_irq(&tconn->req_lock);
c141ebda 5208 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
082a3439
PR
5209 not_empty = !list_empty(&mdev->done_ee);
5210 if (not_empty)
5211 break;
5212 }
5213 spin_unlock_irq(&tconn->req_lock);
c141ebda 5214 rcu_read_unlock();
32862ec7
PR
5215 } while (not_empty);
5216
5217 return 0;
5218}
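/*
 * tconn_finish_peer_reqs() keeps iterating over all volumes of the
 * connection until none of them has entries left on done_ee; a failure
 * from drbd_finish_peer_reqs() is returned as 1 and, in drbd_asender()
 * below, forces a reconnect.
 */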
5219
7201b972
AG
5220struct asender_cmd {
5221 size_t pkt_size;
1952e916 5222 int (*fn)(struct drbd_tconn *tconn, struct packet_info *);
7201b972
AG
5223};
5224
5225static struct asender_cmd asender_tbl[] = {
e658983a
AG
5226 [P_PING] = { 0, got_Ping },
5227 [P_PING_ACK] = { 0, got_PingAck },
1952e916
AG
5228 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
5229 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
5230 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
5231 [P_DISCARD_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
5232 [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
5233 [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
5234 [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply },
5235 [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
5236 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
5237 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
5238 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
5239 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
5240 [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply },
5241 [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
5242 [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
7201b972
AG
5243};
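/*
 * drbd_asender() below indexes this table directly by packet command:
 * pkt_size is the payload expected after the header and is checked
 * against pi.size before the handler runs; a handler returning nonzero
 * forces a reconnect.  The dispatch boils down to:
 *
 *	cmd = &asender_tbl[pi.cmd];
 *	expect = header_size + cmd->pkt_size;
 *	...
 *	err = cmd->fn(tconn, &pi);
 *	if (err)
 *		goto reconnect;
 */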
5244
b411b363
PR
5245int drbd_asender(struct drbd_thread *thi)
5246{
392c8801 5247 struct drbd_tconn *tconn = thi->tconn;
b411b363 5248 struct asender_cmd *cmd = NULL;
77351055 5249 struct packet_info pi;
257d0af6 5250 int rv;
e658983a 5251 void *buf = tconn->meta.rbuf;
b411b363 5252 int received = 0;
52b061a4
AG
5253 unsigned int header_size = drbd_header_size(tconn);
5254 int expect = header_size;
44ed167d
PR
5255 bool ping_timeout_active = false;
5256 struct net_conf *nc;
bb77d34e 5257 int ping_timeo, tcp_cork, ping_int;
b411b363 5258
b411b363
PR
5259 current->policy = SCHED_RR; /* Make this a realtime task! */
5260 current->rt_priority = 2; /* more important than all other tasks */
5261
e77a0a5c 5262 while (get_t_state(thi) == RUNNING) {
80822284 5263 drbd_thread_current_set_cpu(thi);
44ed167d
PR
5264
5265 rcu_read_lock();
5266 nc = rcu_dereference(tconn->net_conf);
5267 ping_timeo = nc->ping_timeo;
bb77d34e 5268 tcp_cork = nc->tcp_cork;
44ed167d
PR
5269 ping_int = nc->ping_int;
5270 rcu_read_unlock();
5271
32862ec7 5272 if (test_and_clear_bit(SEND_PING, &tconn->flags)) {
a17647aa 5273 if (drbd_send_ping(tconn)) {
32862ec7 5274 conn_err(tconn, "drbd_send_ping has failed\n");
841ce241
AG
5275 goto reconnect;
5276 }
44ed167d
PR
5277 tconn->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10;
5278 ping_timeout_active = true;
b411b363
PR
5279 }
5280
32862ec7
PR
5281 /* TODO: conditionally cork; it may hurt latency if we cork without
5282 much to send */
bb77d34e 5283 if (tcp_cork)
32862ec7 5284 drbd_tcp_cork(tconn->meta.socket);
a990be46
AG
5285 if (tconn_finish_peer_reqs(tconn)) {
5286 conn_err(tconn, "tconn_finish_peer_reqs() failed\n");
32862ec7 5287 goto reconnect;
082a3439 5288 }
b411b363 5289 /* but unconditionally uncork unless disabled */
bb77d34e 5290 if (tcp_cork)
32862ec7 5291 drbd_tcp_uncork(tconn->meta.socket);
b411b363
PR
5292
5293 /* short circuit, recv_msg would return EINTR anyways. */
5294 if (signal_pending(current))
5295 continue;
5296
32862ec7
PR
5297 rv = drbd_recv_short(tconn->meta.socket, buf, expect-received, 0);
5298 clear_bit(SIGNAL_ASENDER, &tconn->flags);
b411b363
PR
5299
5300 flush_signals(current);
5301
5302 /* Note:
5303 * -EINTR (on meta) we got a signal
5304 * -EAGAIN (on meta) rcvtimeo expired
5305 * -ECONNRESET other side closed the connection
5306 * -ERESTARTSYS (on data) we got a signal
5307 * rv < 0 other than above: unexpected error!
5308 * rv == expected: full header or command
5309 * rv < expected: "woken" by signal during receive
5310 * rv == 0 : "connection shut down by peer"
5311 */
5312 if (likely(rv > 0)) {
5313 received += rv;
5314 buf += rv;
5315 } else if (rv == 0) {
32862ec7 5316 conn_err(tconn, "meta connection shut down by peer.\n");
b411b363
PR
5317 goto reconnect;
5318 } else if (rv == -EAGAIN) {
cb6518cb
LE
5319 /* If the data socket received something meanwhile,
5320 * that is good enough: peer is still alive. */
32862ec7
PR
5321 if (time_after(tconn->last_received,
5322 jiffies - tconn->meta.socket->sk->sk_rcvtimeo))
cb6518cb 5323 continue;
f36af18c 5324 if (ping_timeout_active) {
32862ec7 5325 conn_err(tconn, "PingAck did not arrive in time.\n");
b411b363
PR
5326 goto reconnect;
5327 }
32862ec7 5328 set_bit(SEND_PING, &tconn->flags);
b411b363
PR
5329 continue;
5330 } else if (rv == -EINTR) {
5331 continue;
5332 } else {
32862ec7 5333 conn_err(tconn, "sock_recvmsg returned %d\n", rv);
b411b363
PR
5334 goto reconnect;
5335 }
5336
5337 if (received == expect && cmd == NULL) {
e658983a 5338 if (decode_header(tconn, tconn->meta.rbuf, &pi))
b411b363 5339 goto reconnect;
7201b972 5340 cmd = &asender_tbl[pi.cmd];
1952e916 5341 if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) {
2fcb8f30
AG
5342 conn_err(tconn, "Unexpected meta packet %s (0x%04x)\n",
5343 cmdname(pi.cmd), pi.cmd);
b411b363
PR
5344 goto disconnect;
5345 }
e658983a 5346 expect = header_size + cmd->pkt_size;
52b061a4 5347 if (pi.size != expect - header_size) {
32862ec7 5348 conn_err(tconn, "Wrong packet size on meta (c: %d, l: %d)\n",
77351055 5349 pi.cmd, pi.size);
b411b363 5350 goto reconnect;
257d0af6 5351 }
b411b363
PR
5352 }
5353 if (received == expect) {
2735a594 5354 bool err;
a4fbda8e 5355
2735a594
AG
5356 err = cmd->fn(tconn, &pi);
5357 if (err) {
1952e916 5358 conn_err(tconn, "%pf failed\n", cmd->fn);
b411b363 5359 goto reconnect;
1952e916 5360 }
b411b363 5361
a4fbda8e
PR
5362 tconn->last_received = jiffies;
5363
44ed167d
PR
5364 if (cmd == &asender_tbl[P_PING_ACK]) {
5365 /* restore idle timeout */
5366 tconn->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
5367 ping_timeout_active = false;
5368 }
f36af18c 5369
e658983a 5370 buf = tconn->meta.rbuf;
b411b363 5371 received = 0;
52b061a4 5372 expect = header_size;
b411b363
PR
5373 cmd = NULL;
5374 }
5375 }
5376
5377 if (0) {
5378reconnect:
bbeb641c 5379 conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
b411b363
PR
5380 }
5381 if (0) {
5382disconnect:
bbeb641c 5383 conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
b411b363 5384 }
32862ec7 5385 clear_bit(SIGNAL_ASENDER, &tconn->flags);
b411b363 5386
32862ec7 5387 conn_info(tconn, "asender terminated\n");
b411b363
PR
5388
5389 return 0;
5390}