/*
   drbd_receiver.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/module.h>

#include <asm/uaccess.h>
#include <net/sock.h>

#include <linux/drbd.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/in.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/pkt_sched.h>
#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/scatterlist.h>
#include "drbd_int.h"
#include "drbd_protocol.h"
#include "drbd_req.h"
#include "drbd_vli.h"

#define PRO_FEATURES (FF_TRIM)

struct packet_info {
	enum drbd_packet cmd;
	unsigned int size;
	unsigned int vnr;
	void *data;
};

enum finish_epoch {
	FE_STILL_LIVE,
	FE_DESTROYED,
	FE_RECYCLED,
};

static int drbd_do_features(struct drbd_connection *connection);
static int drbd_do_auth(struct drbd_connection *connection);
static int drbd_disconnected(struct drbd_peer_device *);
static void conn_wait_active_ee_empty(struct drbd_connection *connection);
static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
static int e_end_block(struct drbd_work *, int);

#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)

/*
 * some helper functions to deal with single linked page lists,
 * page->private being our "next" pointer.
 */

/* If at least n pages are linked at head, get n pages off.
 * Otherwise, don't modify head, and return NULL.
 * Locking is the responsibility of the caller.
 */
static struct page *page_chain_del(struct page **head, int n)
{
	struct page *page;
	struct page *tmp;

	BUG_ON(!n);
	BUG_ON(!head);

	page = *head;

	if (!page)
		return NULL;

	while (page) {
		tmp = page_chain_next(page);
		if (--n == 0)
			break; /* found sufficient pages */
		if (tmp == NULL)
			/* insufficient pages, don't use any of them. */
			return NULL;
		page = tmp;
	}

	/* add end of list marker for the returned list */
	set_page_private(page, 0);
	/* actual return value, and adjustment of head */
	page = *head;
	*head = tmp;
	return page;
}

/* may be used outside of locks to find the tail of a (usually short)
 * "private" page chain, before adding it back to a global chain head
 * with page_chain_add() under a spinlock. */
static struct page *page_chain_tail(struct page *page, int *len)
{
	struct page *tmp;
	int i = 1;
	while ((tmp = page_chain_next(page)))
		++i, page = tmp;
	if (len)
		*len = i;
	return page;
}

static int page_chain_free(struct page *page)
{
	struct page *tmp;
	int i = 0;
	page_chain_for_each_safe(page, tmp) {
		put_page(page);
		++i;
	}
	return i;
}

static void page_chain_add(struct page **head,
		struct page *chain_first, struct page *chain_last)
{
#if 1
	struct page *tmp;
	tmp = page_chain_tail(chain_first, NULL);
	BUG_ON(tmp != chain_last);
#endif

	/* add chain to head */
	set_page_private(chain_last, (unsigned long)*head);
	*head = chain_first;
}

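/* For illustration (not part of the original source): with three pages
 * a -> b -> c linked through page->private, the pool looks like
 *
 *	head == a;  page_private(a) == (unsigned long)b;
 *	page_private(b) == (unsigned long)c;  page_private(c) == 0;
 *
 * so page_chain_del(&head, 2) hands back the two-page chain a -> b
 * (terminated at b by set_page_private(b, 0)) and leaves head == c.
 */
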
static struct page *__drbd_alloc_pages(struct drbd_device *device,
				       unsigned int number)
{
	struct page *page = NULL;
	struct page *tmp = NULL;
	unsigned int i = 0;

	/* Yes, testing drbd_pp_vacant outside the lock is racy.
	 * So what. It saves a spin_lock. */
	if (drbd_pp_vacant >= number) {
		spin_lock(&drbd_pp_lock);
		page = page_chain_del(&drbd_pp_pool, number);
		if (page)
			drbd_pp_vacant -= number;
		spin_unlock(&drbd_pp_lock);
		if (page)
			return page;
	}

	/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
	 * "criss-cross" setup, that might cause write-out on some other DRBD,
	 * which in turn might block on the other node at this very place. */
	for (i = 0; i < number; i++) {
		tmp = alloc_page(GFP_TRY);
		if (!tmp)
			break;
		set_page_private(tmp, (unsigned long)page);
		page = tmp;
	}

	if (i == number)
		return page;

	/* Not enough pages immediately available this time.
	 * No need to jump around here, drbd_alloc_pages will retry this
	 * function "soon". */
	if (page) {
		tmp = page_chain_tail(page, NULL);
		spin_lock(&drbd_pp_lock);
		page_chain_add(&drbd_pp_pool, page, tmp);
		drbd_pp_vacant += i;
		spin_unlock(&drbd_pp_lock);
	}
	return NULL;
}

static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
					   struct list_head *to_be_freed)
{
	struct drbd_peer_request *peer_req, *tmp;

	/* The EEs are always appended to the end of the list. Since
	   they are sent in order over the wire, they have to finish
	   in order. As soon as we see the first one that has not
	   finished, we can stop examining the list... */

	list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
		if (drbd_peer_req_has_active_page(peer_req))
			break;
		list_move(&peer_req->w.list, to_be_freed);
	}
}

static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
{
	LIST_HEAD(reclaimed);
	struct drbd_peer_request *peer_req, *t;

	spin_lock_irq(&device->resource->req_lock);
	reclaim_finished_net_peer_reqs(device, &reclaimed);
	spin_unlock_irq(&device->resource->req_lock);

	list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
		drbd_free_net_peer_req(device, peer_req);
}

/**
 * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
 * @peer_device:	DRBD device.
 * @number:		number of pages requested
 * @retry:		whether to retry, if not enough pages are available right now
 *
 * Tries to allocate number pages, first from our own page pool, then from
 * the kernel.
 * Possibly retry until DRBD frees sufficient pages somewhere else.
 *
 * If this allocation would exceed the max_buffers setting, we throttle
 * allocation (schedule_timeout) to give the system some room to breathe.
 *
 * We do not use max-buffers as hard limit, because it could lead to
 * congestion and further to a distributed deadlock during online-verify or
 * (checksum based) resync, if the max-buffers, socket buffer sizes and
 * resync-rate settings are mis-configured.
 *
 * Returns a page chain linked via page->private.
 */
struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
			      bool retry)
{
	struct drbd_device *device = peer_device->device;
	struct page *page = NULL;
	struct net_conf *nc;
	DEFINE_WAIT(wait);
	unsigned int mxb;

	rcu_read_lock();
	nc = rcu_dereference(peer_device->connection->net_conf);
	mxb = nc ? nc->max_buffers : 1000000;
	rcu_read_unlock();

	if (atomic_read(&device->pp_in_use) < mxb)
		page = __drbd_alloc_pages(device, number);

	while (page == NULL) {
		prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);

		drbd_kick_lo_and_reclaim_net(device);

		if (atomic_read(&device->pp_in_use) < mxb) {
			page = __drbd_alloc_pages(device, number);
			if (page)
				break;
		}

		if (!retry)
			break;

		if (signal_pending(current)) {
			drbd_warn(device, "drbd_alloc_pages interrupted!\n");
			break;
		}

		if (schedule_timeout(HZ/10) == 0)
			mxb = UINT_MAX;
	}
	finish_wait(&drbd_pp_wait, &wait);

	if (page)
		atomic_add(number, &device->pp_in_use);
	return page;
}
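
/* Note on the wait loop above (summary, not new behavior): if
 * schedule_timeout() expires without the allocator being woken, i.e. no
 * pages were freed for a full HZ/10 tick, mxb is raised to UINT_MAX and
 * max-buffers degrades from a throttle to no limit at all.  That is the
 * soft-limit behavior described in the kernel-doc: better to overshoot
 * max-buffers than to risk the distributed deadlock. */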

/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
 * Is also used from inside another spin_lock_irq(&resource->req_lock);
 * Either links the page chain back to the global pool,
 * or returns all pages to the system. */
static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
{
	atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
	int i;

	if (page == NULL)
		return;

	if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
		i = page_chain_free(page);
	else {
		struct page *tmp;
		tmp = page_chain_tail(page, &i);
		spin_lock(&drbd_pp_lock);
		page_chain_add(&drbd_pp_pool, page, tmp);
		drbd_pp_vacant += i;
		spin_unlock(&drbd_pp_lock);
	}
	i = atomic_sub_return(i, a);
	if (i < 0)
		drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
			is_net ? "pp_in_use_by_net" : "pp_in_use", i);
	wake_up(&drbd_pp_wait);
}

/*
You need to hold the req_lock:
 _drbd_wait_ee_list_empty()

You must not have the req_lock:
 drbd_free_peer_req()
 drbd_alloc_peer_req()
 drbd_free_peer_reqs()
 drbd_ee_fix_bhs()
 drbd_finish_peer_reqs()
 drbd_clear_done_ee()
 drbd_wait_ee_list_empty()
*/

struct drbd_peer_request *
drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
		    unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local)
{
	struct drbd_device *device = peer_device->device;
	struct drbd_peer_request *peer_req;
	struct page *page = NULL;
	unsigned nr_pages = (data_size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
		return NULL;

	peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
	if (!peer_req) {
		if (!(gfp_mask & __GFP_NOWARN))
			drbd_err(device, "%s: allocation failed\n", __func__);
		return NULL;
	}

	if (has_payload && data_size) {
		page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT));
		if (!page)
			goto fail;
	}

	memset(peer_req, 0, sizeof(*peer_req));
	INIT_LIST_HEAD(&peer_req->w.list);
	drbd_clear_interval(&peer_req->i);
	peer_req->i.size = data_size;
	peer_req->i.sector = sector;
	peer_req->submit_jif = jiffies;
	peer_req->peer_device = peer_device;
	peer_req->pages = page;
	/*
	 * The block_id is opaque to the receiver.  It is not endianness
	 * converted, and sent back to the sender unchanged.
	 */
	peer_req->block_id = id;

	return peer_req;

 fail:
	mempool_free(peer_req, drbd_ee_mempool);
	return NULL;
}

void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
			  int is_net)
{
	might_sleep();
	if (peer_req->flags & EE_HAS_DIGEST)
		kfree(peer_req->digest);
	drbd_free_pages(device, peer_req->pages, is_net);
	D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
	D_ASSERT(device, drbd_interval_empty(&peer_req->i));
	if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
		drbd_al_complete_io(device, &peer_req->i);
	}
	mempool_free(peer_req, drbd_ee_mempool);
}

int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
{
	LIST_HEAD(work_list);
	struct drbd_peer_request *peer_req, *t;
	int count = 0;
	int is_net = list == &device->net_ee;

	spin_lock_irq(&device->resource->req_lock);
	list_splice_init(list, &work_list);
	spin_unlock_irq(&device->resource->req_lock);

	list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
		__drbd_free_peer_req(device, peer_req, is_net);
		count++;
	}
	return count;
}

/*
 * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
 */
static int drbd_finish_peer_reqs(struct drbd_device *device)
{
	LIST_HEAD(work_list);
	LIST_HEAD(reclaimed);
	struct drbd_peer_request *peer_req, *t;
	int err = 0;

	spin_lock_irq(&device->resource->req_lock);
	reclaim_finished_net_peer_reqs(device, &reclaimed);
	list_splice_init(&device->done_ee, &work_list);
	spin_unlock_irq(&device->resource->req_lock);

	list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
		drbd_free_net_peer_req(device, peer_req);

	/* possible callbacks here:
	 * e_end_block, and e_end_resync_block, e_send_superseded.
	 * all ignore the last argument.
	 */
	list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
		int err2;

		/* list_del not necessary, next/prev members not touched */
		err2 = peer_req->w.cb(&peer_req->w, !!err);
		if (!err)
			err = err2;
		drbd_free_peer_req(device, peer_req);
	}
	wake_up(&device->ee_wait);

	return err;
}

static void _drbd_wait_ee_list_empty(struct drbd_device *device,
				     struct list_head *head)
{
	DEFINE_WAIT(wait);

	/* avoids spin_lock/unlock
	 * and calling prepare_to_wait in the fast path */
	while (!list_empty(head)) {
		prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&device->resource->req_lock);
		io_schedule();
		finish_wait(&device->ee_wait, &wait);
		spin_lock_irq(&device->resource->req_lock);
	}
}

static void drbd_wait_ee_list_empty(struct drbd_device *device,
				    struct list_head *head)
{
	spin_lock_irq(&device->resource->req_lock);
	_drbd_wait_ee_list_empty(device, head);
	spin_unlock_irq(&device->resource->req_lock);
}

static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
{
	struct kvec iov = {
		.iov_base = buf,
		.iov_len = size,
	};
	struct msghdr msg = {
		.msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
	};
	return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags);
}

static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
{
	int rv;

	rv = drbd_recv_short(connection->data.socket, buf, size, 0);

	if (rv < 0) {
		if (rv == -ECONNRESET)
			drbd_info(connection, "sock was reset by peer\n");
		else if (rv != -ERESTARTSYS)
			drbd_err(connection, "sock_recvmsg returned %d\n", rv);
	} else if (rv == 0) {
		if (test_bit(DISCONNECT_SENT, &connection->flags)) {
			long t;
			rcu_read_lock();
			t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
			rcu_read_unlock();

			t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t);

			if (t)
				goto out;
		}
		drbd_info(connection, "sock was shut down by peer\n");
	}

	if (rv != size)
		conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);

out:
	return rv;
}

static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
{
	int err;

	err = drbd_recv(connection, buf, size);
	if (err != size) {
		if (err >= 0)
			err = -EIO;
	} else
		err = 0;
	return err;
}

static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
{
	int err;

	err = drbd_recv_all(connection, buf, size);
	if (err && !signal_pending(current))
		drbd_warn(connection, "short read (expected size %d)\n", (int)size);
	return err;
}

/* quoting tcp(7):
 *   On individual connections, the socket buffer size must be set prior to the
 *   listen(2) or connect(2) calls in order to have it take effect.
 * This is our wrapper to do so.
 */
static void drbd_setbufsize(struct socket *sock, unsigned int snd,
			    unsigned int rcv)
{
	/* open coded SO_SNDBUF, SO_RCVBUF */
	if (snd) {
		sock->sk->sk_sndbuf = snd;
		sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
	}
	if (rcv) {
		sock->sk->sk_rcvbuf = rcv;
		sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
	}
}
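
/* Roughly equivalent (sketch only, modulo the kernel's internal doubling
 * of the requested value) to the user space sequence
 *
 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd));
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv));
 *
 * issued before connect()/listen().  Setting the SOCK_SNDBUF_LOCK and
 * SOCK_RCVBUF_LOCK bits also keeps the kernel from auto-tuning the
 * buffer sizes later on. */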

static struct socket *drbd_try_connect(struct drbd_connection *connection)
{
	const char *what;
	struct socket *sock;
	struct sockaddr_in6 src_in6;
	struct sockaddr_in6 peer_in6;
	struct net_conf *nc;
	int err, peer_addr_len, my_addr_len;
	int sndbuf_size, rcvbuf_size, connect_int;
	int disconnect_on_error = 1;

	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	if (!nc) {
		rcu_read_unlock();
		return NULL;
	}
	sndbuf_size = nc->sndbuf_size;
	rcvbuf_size = nc->rcvbuf_size;
	connect_int = nc->connect_int;
	rcu_read_unlock();

	my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
	memcpy(&src_in6, &connection->my_addr, my_addr_len);

	if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
		src_in6.sin6_port = 0;
	else
		((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */

	peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6));
	memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);

	what = "sock_create_kern";
	err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family,
			       SOCK_STREAM, IPPROTO_TCP, &sock);
	if (err < 0) {
		sock = NULL;
		goto out;
	}

	sock->sk->sk_rcvtimeo =
	sock->sk->sk_sndtimeo = connect_int * HZ;
	drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);

	/* explicitly bind to the configured IP as source IP
	 * for the outgoing connections.
	 * This is needed for multihomed hosts and to be
	 * able to use lo: interfaces for drbd.
	 * Make sure to use 0 as port number, so linux selects
	 * a free one dynamically.
	 */
	what = "bind before connect";
	err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
	if (err < 0)
		goto out;

	/* connect may fail, peer not yet available.
	 * stay C_WF_CONNECTION, don't go Disconnecting! */
	disconnect_on_error = 0;
	what = "connect";
	err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);

out:
	if (err < 0) {
		if (sock) {
			sock_release(sock);
			sock = NULL;
		}
		switch (-err) {
			/* timeout, busy, signal pending */
		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
		case EINTR: case ERESTARTSYS:
			/* peer not (yet) available, network problem */
		case ECONNREFUSED: case ENETUNREACH:
		case EHOSTDOWN: case EHOSTUNREACH:
			disconnect_on_error = 0;
			break;
		default:
			drbd_err(connection, "%s failed, err = %d\n", what, err);
		}
		if (disconnect_on_error)
			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
	}

	return sock;
}

struct accept_wait_data {
	struct drbd_connection *connection;
	struct socket *s_listen;
	struct completion door_bell;
	void (*original_sk_state_change)(struct sock *sk);
};

static void drbd_incoming_connection(struct sock *sk)
{
	struct accept_wait_data *ad = sk->sk_user_data;
	void (*state_change)(struct sock *sk);

	state_change = ad->original_sk_state_change;
	if (sk->sk_state == TCP_ESTABLISHED)
		complete(&ad->door_bell);
	state_change(sk);
}

static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
{
	int err, sndbuf_size, rcvbuf_size, my_addr_len;
	struct sockaddr_in6 my_addr;
	struct socket *s_listen;
	struct net_conf *nc;
	const char *what;

	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	if (!nc) {
		rcu_read_unlock();
		return -EIO;
	}
	sndbuf_size = nc->sndbuf_size;
	rcvbuf_size = nc->rcvbuf_size;
	rcu_read_unlock();

	my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
	memcpy(&my_addr, &connection->my_addr, my_addr_len);

	what = "sock_create_kern";
	err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family,
			       SOCK_STREAM, IPPROTO_TCP, &s_listen);
	if (err) {
		s_listen = NULL;
		goto out;
	}

	s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
	drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);

	what = "bind before listen";
	err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
	if (err < 0)
		goto out;

	ad->s_listen = s_listen;
	write_lock_bh(&s_listen->sk->sk_callback_lock);
	ad->original_sk_state_change = s_listen->sk->sk_state_change;
	s_listen->sk->sk_state_change = drbd_incoming_connection;
	s_listen->sk->sk_user_data = ad;
	write_unlock_bh(&s_listen->sk->sk_callback_lock);

	what = "listen";
	err = s_listen->ops->listen(s_listen, 5);
	if (err < 0)
		goto out;

	return 0;
out:
	if (s_listen)
		sock_release(s_listen);
	if (err < 0) {
		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
			drbd_err(connection, "%s failed, err = %d\n", what, err);
			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
		}
	}

	return -EIO;
}

static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
{
	write_lock_bh(&sk->sk_callback_lock);
	sk->sk_state_change = ad->original_sk_state_change;
	sk->sk_user_data = NULL;
	write_unlock_bh(&sk->sk_callback_lock);
}

static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
{
	int timeo, connect_int, err = 0;
	struct socket *s_estab = NULL;
	struct net_conf *nc;

	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	if (!nc) {
		rcu_read_unlock();
		return NULL;
	}
	connect_int = nc->connect_int;
	rcu_read_unlock();

	timeo = connect_int * HZ;
	/* 28.5% random jitter */
	timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7;

	err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
	if (err <= 0)
		return NULL;

	err = kernel_accept(ad->s_listen, &s_estab, 0);
	if (err < 0) {
		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
			drbd_err(connection, "accept failed, err = %d\n", err);
			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
		}
	}

	if (s_estab)
		unregister_state_change(s_estab->sk, ad);

	return s_estab;
}

static int decode_header(struct drbd_connection *, void *, struct packet_info *);

static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
			     enum drbd_packet cmd)
{
	if (!conn_prepare_command(connection, sock))
		return -EIO;
	return conn_send_command(connection, sock, cmd, 0, NULL, 0);
}

static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
{
	unsigned int header_size = drbd_header_size(connection);
	struct packet_info pi;
	struct net_conf *nc;
	int err;

	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	if (!nc) {
		rcu_read_unlock();
		return -EIO;
	}
	sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
	rcu_read_unlock();

	err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
	if (err != header_size) {
		if (err >= 0)
			err = -EIO;
		return err;
	}
	err = decode_header(connection, connection->data.rbuf, &pi);
	if (err)
		return err;
	return pi.cmd;
}

/**
 * drbd_socket_okay() - Free the socket if its connection is not okay
 * @sock:	pointer to the pointer to the socket.
 */
static bool drbd_socket_okay(struct socket **sock)
{
	int rr;
	char tb[4];

	if (!*sock)
		return false;

	rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);

	if (rr > 0 || rr == -EAGAIN) {
		return true;
	} else {
		sock_release(*sock);
		*sock = NULL;
		return false;
	}
}

static bool connection_established(struct drbd_connection *connection,
				   struct socket **sock1,
				   struct socket **sock2)
{
	struct net_conf *nc;
	int timeout;
	bool ok;

	if (!*sock1 || !*sock2)
		return false;

	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
	rcu_read_unlock();
	schedule_timeout_interruptible(timeout);

	ok = drbd_socket_okay(sock1);
	ok = drbd_socket_okay(sock2) && ok;

	return ok;
}

/* Gets called if a connection is established, or if a new minor gets created
   in a connection */
int drbd_connected(struct drbd_peer_device *peer_device)
{
	struct drbd_device *device = peer_device->device;
	int err;

	atomic_set(&device->packet_seq, 0);
	device->peer_seq = 0;

	device->state_mutex = peer_device->connection->agreed_pro_version < 100 ?
		&peer_device->connection->cstate_mutex :
		&device->own_state_mutex;

	err = drbd_send_sync_param(peer_device);
	if (!err)
		err = drbd_send_sizes(peer_device, 0, 0);
	if (!err)
		err = drbd_send_uuids(peer_device);
	if (!err)
		err = drbd_send_current_state(peer_device);
	clear_bit(USE_DEGR_WFC_T, &device->flags);
	clear_bit(RESIZE_PENDING, &device->flags);
	atomic_set(&device->ap_in_flight, 0);
	mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
	return err;
}
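
/* Overview of the connect handshake implemented below (a summary of the
 * code; see conn_connect() itself for the authoritative details): both
 * nodes simultaneously try to connect() to the peer and accept() from it.
 * The first established socket becomes the data socket, announced with
 * P_INITIAL_DATA; the second becomes the meta socket (P_INITIAL_META).
 * If the initial packets cross, one socket is dropped and a coin flip
 * (prandom_u32() & 1) decides whether to retry immediately, breaking the
 * tie between the two symmetric peers. */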

/*
 * return values:
 *   1 yes, we have a valid connection
 *   0 oops, did not work out, please try again
 *  -1 peer talks different language,
 *     no point in trying again, please go standalone.
 *  -2 We do not have a network config...
 */
static int conn_connect(struct drbd_connection *connection)
{
	struct drbd_socket sock, msock;
	struct drbd_peer_device *peer_device;
	struct net_conf *nc;
	int vnr, timeout, h;
	bool discard_my_data, ok;
	enum drbd_state_rv rv;
	struct accept_wait_data ad = {
		.connection = connection,
		.door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
	};

	clear_bit(DISCONNECT_SENT, &connection->flags);
	if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
		return -2;

	mutex_init(&sock.mutex);
	sock.sbuf = connection->data.sbuf;
	sock.rbuf = connection->data.rbuf;
	sock.socket = NULL;
	mutex_init(&msock.mutex);
	msock.sbuf = connection->meta.sbuf;
	msock.rbuf = connection->meta.rbuf;
	msock.socket = NULL;

	/* Assume that the peer only understands protocol 80 until we know better.  */
	connection->agreed_pro_version = 80;

	if (prepare_listen_socket(connection, &ad))
		return 0;

	do {
		struct socket *s;

		s = drbd_try_connect(connection);
		if (s) {
			if (!sock.socket) {
				sock.socket = s;
				send_first_packet(connection, &sock, P_INITIAL_DATA);
			} else if (!msock.socket) {
				clear_bit(RESOLVE_CONFLICTS, &connection->flags);
				msock.socket = s;
				send_first_packet(connection, &msock, P_INITIAL_META);
			} else {
				drbd_err(connection, "Logic error in conn_connect()\n");
				goto out_release_sockets;
			}
		}

		if (connection_established(connection, &sock.socket, &msock.socket))
			break;

retry:
		s = drbd_wait_for_connect(connection, &ad);
		if (s) {
			int fp = receive_first_packet(connection, s);
			drbd_socket_okay(&sock.socket);
			drbd_socket_okay(&msock.socket);
			switch (fp) {
			case P_INITIAL_DATA:
				if (sock.socket) {
					drbd_warn(connection, "initial packet S crossed\n");
					sock_release(sock.socket);
					sock.socket = s;
					goto randomize;
				}
				sock.socket = s;
				break;
			case P_INITIAL_META:
				set_bit(RESOLVE_CONFLICTS, &connection->flags);
				if (msock.socket) {
					drbd_warn(connection, "initial packet M crossed\n");
					sock_release(msock.socket);
					msock.socket = s;
					goto randomize;
				}
				msock.socket = s;
				break;
			default:
				drbd_warn(connection, "Error receiving initial packet\n");
				sock_release(s);
randomize:
				if (prandom_u32() & 1)
					goto retry;
			}
		}

		if (connection->cstate <= C_DISCONNECTING)
			goto out_release_sockets;
		if (signal_pending(current)) {
			flush_signals(current);
			smp_rmb();
			if (get_t_state(&connection->receiver) == EXITING)
				goto out_release_sockets;
		}

		ok = connection_established(connection, &sock.socket, &msock.socket);
	} while (!ok);

	if (ad.s_listen)
		sock_release(ad.s_listen);

	sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
	msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */

	sock.socket->sk->sk_allocation = GFP_NOIO;
	msock.socket->sk->sk_allocation = GFP_NOIO;

	sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
	msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;

	/* NOT YET ...
	 * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
	 * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	 * first set it to the P_CONNECTION_FEATURES timeout,
	 * which we set to 4x the configured ping_timeout. */
	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);

	sock.socket->sk->sk_sndtimeo =
	sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;

	msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
	timeout = nc->timeout * HZ / 10;
	discard_my_data = nc->discard_my_data;
	rcu_read_unlock();

	msock.socket->sk->sk_sndtimeo = timeout;

	/* we don't want delays.
	 * we use TCP_CORK where appropriate, though */
	drbd_tcp_nodelay(sock.socket);
	drbd_tcp_nodelay(msock.socket);

	connection->data.socket = sock.socket;
	connection->meta.socket = msock.socket;
	connection->last_received = jiffies;

	h = drbd_do_features(connection);
	if (h <= 0)
		return h;

	if (connection->cram_hmac_tfm) {
		/* drbd_request_state(device, NS(conn, WFAuth)); */
		switch (drbd_do_auth(connection)) {
		case -1:
			drbd_err(connection, "Authentication of peer failed\n");
			return -1;
		case 0:
			drbd_err(connection, "Authentication of peer failed, trying again.\n");
			return 0;
		}
	}

	connection->data.socket->sk->sk_sndtimeo = timeout;
	connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;

	if (drbd_send_protocol(connection) == -EOPNOTSUPP)
		return -1;

	/* Prevent a race between resync-handshake and
	 * being promoted to Primary.
	 *
	 * Grab and release the state mutex, so we know that any current
	 * drbd_set_role() is finished, and any incoming drbd_set_role
	 * will see the STATE_SENT flag, and wait for it to be cleared.
	 */
	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
		mutex_lock(peer_device->device->state_mutex);

	set_bit(STATE_SENT, &connection->flags);

	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
		mutex_unlock(peer_device->device->state_mutex);

	rcu_read_lock();
	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
		struct drbd_device *device = peer_device->device;
		kref_get(&device->kref);
		rcu_read_unlock();

		if (discard_my_data)
			set_bit(DISCARD_MY_DATA, &device->flags);
		else
			clear_bit(DISCARD_MY_DATA, &device->flags);

		drbd_connected(peer_device);
		kref_put(&device->kref, drbd_destroy_device);
		rcu_read_lock();
	}
	rcu_read_unlock();

	rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
	if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
		clear_bit(STATE_SENT, &connection->flags);
		return 0;
	}

	drbd_thread_start(&connection->asender);

	mutex_lock(&connection->resource->conf_update);
	/* The discard_my_data flag is a single-shot modifier to the next
	 * connection attempt, the handshake of which is now well underway.
	 * No need for rcu style copying of the whole struct
	 * just to clear a single value. */
	connection->net_conf->discard_my_data = 0;
	mutex_unlock(&connection->resource->conf_update);

	return h;

out_release_sockets:
	if (ad.s_listen)
		sock_release(ad.s_listen);
	if (sock.socket)
		sock_release(sock.socket);
	if (msock.socket)
		sock_release(msock.socket);
	return -1;
}

static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
{
	unsigned int header_size = drbd_header_size(connection);

	if (header_size == sizeof(struct p_header100) &&
	    *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
		struct p_header100 *h = header;
		if (h->pad != 0) {
			drbd_err(connection, "Header padding is not zero\n");
			return -EINVAL;
		}
		pi->vnr = be16_to_cpu(h->volume);
		pi->cmd = be16_to_cpu(h->command);
		pi->size = be32_to_cpu(h->length);
	} else if (header_size == sizeof(struct p_header95) &&
		   *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
		struct p_header95 *h = header;
		pi->cmd = be16_to_cpu(h->command);
		pi->size = be32_to_cpu(h->length);
		pi->vnr = 0;
	} else if (header_size == sizeof(struct p_header80) &&
		   *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
		struct p_header80 *h = header;
		pi->cmd = be16_to_cpu(h->command);
		pi->size = be16_to_cpu(h->length);
		pi->vnr = 0;
	} else {
		drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
			 be32_to_cpu(*(__be32 *)header),
			 connection->agreed_pro_version);
		return -EINVAL;
	}
	pi->data = header + header_size;
	return 0;
}
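
/* The three on-the-wire header layouts decoded above, as implied by the
 * field accesses (see drbd_protocol.h for the authoritative structs):
 *
 *	p_header80:  __be32 magic (DRBD_MAGIC),     __be16 command, __be16 length
 *	p_header95:  __be16 magic (DRBD_MAGIC_BIG), __be16 command, __be32 length
 *	p_header100: __be32 magic (DRBD_MAGIC_100), __be16 volume,  __be16 command,
 *	             __be32 length, pad (must be zero)
 *
 * Only the protocol-100 header carries a volume number; for the older
 * formats pi->vnr is fixed up to 0. */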

static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
{
	void *buffer = connection->data.rbuf;
	int err;

	err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection));
	if (err)
		return err;

	err = decode_header(connection, buffer, pi);
	connection->last_received = jiffies;

	return err;
}

static void drbd_flush(struct drbd_connection *connection)
{
	int rv;
	struct drbd_peer_device *peer_device;
	int vnr;

	if (connection->resource->write_ordering >= WO_bdev_flush) {
		rcu_read_lock();
		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
			struct drbd_device *device = peer_device->device;

			if (!get_ldev(device))
				continue;
			kref_get(&device->kref);
			rcu_read_unlock();

			/* Right now, we have only this one synchronous code path
			 * for flushes between request epochs.
			 * We may want to make those asynchronous,
			 * or at least parallelize the flushes to the volume devices.
			 */
			device->flush_jif = jiffies;
			set_bit(FLUSH_PENDING, &device->flags);
			rv = blkdev_issue_flush(device->ldev->backing_bdev,
					GFP_NOIO, NULL);
			clear_bit(FLUSH_PENDING, &device->flags);
			if (rv) {
				drbd_info(device, "local disk flush failed with status %d\n", rv);
				/* would rather check on EOPNOTSUPP, but that is not reliable.
				 * don't try again for ANY return value != 0
				 * if (rv == -EOPNOTSUPP) */
				drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io);
			}
			put_ldev(device);
			kref_put(&device->kref, drbd_destroy_device);

			rcu_read_lock();
			if (rv)
				break;
		}
		rcu_read_unlock();
	}
}

/**
 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
 * @connection:	DRBD connection.
 * @epoch:	Epoch object.
 * @ev:		Epoch event.
 */
static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection,
					       struct drbd_epoch *epoch,
					       enum epoch_event ev)
{
	int epoch_size;
	struct drbd_epoch *next_epoch;
	enum finish_epoch rv = FE_STILL_LIVE;

	spin_lock(&connection->epoch_lock);
	do {
		next_epoch = NULL;

		epoch_size = atomic_read(&epoch->epoch_size);

		switch (ev & ~EV_CLEANUP) {
		case EV_PUT:
			atomic_dec(&epoch->active);
			break;
		case EV_GOT_BARRIER_NR:
			set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
			break;
		case EV_BECAME_LAST:
			/* nothing to do*/
			break;
		}

		if (epoch_size != 0 &&
		    atomic_read(&epoch->active) == 0 &&
		    (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
			if (!(ev & EV_CLEANUP)) {
				spin_unlock(&connection->epoch_lock);
				drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
				spin_lock(&connection->epoch_lock);
			}
#if 0
			/* FIXME: dec unacked on connection, once we have
			 * something to count pending connection packets in. */
			if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
				dec_unacked(epoch->connection);
#endif

			if (connection->current_epoch != epoch) {
				next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
				list_del(&epoch->list);
				ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
				connection->epochs--;
				kfree(epoch);

				if (rv == FE_STILL_LIVE)
					rv = FE_DESTROYED;
			} else {
				epoch->flags = 0;
				atomic_set(&epoch->epoch_size, 0);
				/* atomic_set(&epoch->active, 0); is already zero */
				if (rv == FE_STILL_LIVE)
					rv = FE_RECYCLED;
			}
		}

		if (!next_epoch)
			break;

		epoch = next_epoch;
	} while (1);

	spin_unlock(&connection->epoch_lock);

	return rv;
}
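
/* How the epoch events above fit together (a summary of the code, not an
 * addition to it): every write request in an epoch holds a reference on
 * epoch->active and drops it with EV_PUT on completion, while
 * receive_Barrier() records the barrier number with EV_GOT_BARRIER_NR.
 * An epoch can only be finished (P_BARRIER_ACK sent via drbd_send_b_ack)
 * once it is non-empty, has no active writes, and has seen its barrier
 * number -- exactly the three-part condition tested in the loop above. */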

static enum write_ordering_e
max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
{
	struct disk_conf *dc;

	dc = rcu_dereference(bdev->disk_conf);

	if (wo == WO_bdev_flush && !dc->disk_flushes)
		wo = WO_drain_io;
	if (wo == WO_drain_io && !dc->disk_drain)
		wo = WO_none;

	return wo;
}

/**
 * drbd_bump_write_ordering() - Fall back to another write ordering method
 * @resource:	DRBD resource.
 * @bdev:	backing device whose limits to honor, or NULL.
 * @wo:		Write ordering method to try.
 */
void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
			      enum write_ordering_e wo)
{
	struct drbd_device *device;
	enum write_ordering_e pwo;
	int vnr;
	static char *write_ordering_str[] = {
		[WO_none] = "none",
		[WO_drain_io] = "drain",
		[WO_bdev_flush] = "flush",
	};

	pwo = resource->write_ordering;
	if (wo != WO_bdev_flush)
		wo = min(pwo, wo);
	rcu_read_lock();
	idr_for_each_entry(&resource->devices, device, vnr) {
		if (get_ldev(device)) {
			wo = max_allowed_wo(device->ldev, wo);
			if (device->ldev == bdev)
				bdev = NULL;
			put_ldev(device);
		}
	}

	if (bdev)
		wo = max_allowed_wo(bdev, wo);

	rcu_read_unlock();

	resource->write_ordering = wo;
	if (pwo != resource->write_ordering || wo == WO_bdev_flush)
		drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
}

/**
 * drbd_submit_peer_request()
 * @device:	DRBD device.
 * @peer_req:	peer request
 * @rw:		flag field, see bio->bi_rw
 *
 * May spread the pages to multiple bios,
 * depending on bio_add_page restrictions.
 *
 * Returns 0 if all bios have been submitted,
 * -ENOMEM if we could not allocate enough bios,
 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
 *  single page to an empty bio (which should never happen and likely indicates
 *  that the lower level IO stack is in some way broken). This has been observed
 *  on certain Xen deployments.
 */
/* TODO allocate from our own bio_set. */
int drbd_submit_peer_request(struct drbd_device *device,
			     struct drbd_peer_request *peer_req,
			     const unsigned rw, const int fault_type)
{
	struct bio *bios = NULL;
	struct bio *bio;
	struct page *page = peer_req->pages;
	sector_t sector = peer_req->i.sector;
	unsigned data_size = peer_req->i.size;
	unsigned n_bios = 0;
	unsigned nr_pages = (data_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	int err = -ENOMEM;

	if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
		/* wait for all pending IO completions, before we start
		 * zeroing things out. */
		conn_wait_active_ee_empty(first_peer_device(device)->connection);
		/* add it to the active list now,
		 * so we can find it to present it in debugfs */
		peer_req->submit_jif = jiffies;
		peer_req->flags |= EE_SUBMITTED;
		spin_lock_irq(&device->resource->req_lock);
		list_add_tail(&peer_req->w.list, &device->active_ee);
		spin_unlock_irq(&device->resource->req_lock);
		if (blkdev_issue_zeroout(device->ldev->backing_bdev,
			sector, data_size >> 9, GFP_NOIO, false))
			peer_req->flags |= EE_WAS_ERROR;
		drbd_endio_write_sec_final(peer_req);
		return 0;
	}

	/* Discards don't have any payload.
	 * But the scsi layer still expects a bio_vec it can use internally,
	 * see sd_setup_discard_cmnd() and blk_add_request_payload(). */
	if (peer_req->flags & EE_IS_TRIM)
		nr_pages = 1;

	/* In most cases, we will only need one bio. But in case the lower
	 * level restrictions happen to be different at this offset on this
	 * side than those of the sending peer, we may need to submit the
	 * request in more than one bio.
	 *
	 * Plain bio_alloc is good enough here, this is no DRBD internally
	 * generated bio, but a bio allocated on behalf of the peer.
	 */
next_bio:
	bio = bio_alloc(GFP_NOIO, nr_pages);
	if (!bio) {
		drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
		goto fail;
	}
	/* > peer_req->i.sector, unless this is the first bio */
	bio->bi_iter.bi_sector = sector;
	bio->bi_bdev = device->ldev->backing_bdev;
	bio->bi_rw = rw;
	bio->bi_private = peer_req;
	bio->bi_end_io = drbd_peer_request_endio;

	bio->bi_next = bios;
	bios = bio;
	++n_bios;

	if (rw & REQ_DISCARD) {
		bio->bi_iter.bi_size = data_size;
		goto submit;
	}

	page_chain_for_each(page) {
		unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
		if (!bio_add_page(bio, page, len, 0)) {
			/* A single page must always be possible!
			 * But in case it fails anyways,
			 * we deal with it, and complain (below). */
			if (bio->bi_vcnt == 0) {
				drbd_err(device,
					"bio_add_page failed for len=%u, "
					"bi_vcnt=0 (bi_sector=%llu)\n",
					len, (uint64_t)bio->bi_iter.bi_sector);
				err = -ENOSPC;
				goto fail;
			}
			goto next_bio;
		}
		data_size -= len;
		sector += len >> 9;
		--nr_pages;
	}
	D_ASSERT(device, data_size == 0);
submit:
	D_ASSERT(device, page == NULL);

	atomic_set(&peer_req->pending_bios, n_bios);
	/* for debugfs: update timestamp, mark as submitted */
	peer_req->submit_jif = jiffies;
	peer_req->flags |= EE_SUBMITTED;
	do {
		bio = bios;
		bios = bios->bi_next;
		bio->bi_next = NULL;

		drbd_generic_make_request(device, fault_type, bio);
	} while (bios);
	return 0;

fail:
	while (bios) {
		bio = bios;
		bios = bios->bi_next;
		bio_put(bio);
	}
	return err;
}

static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
					     struct drbd_peer_request *peer_req)
{
	struct drbd_interval *i = &peer_req->i;

	drbd_remove_interval(&device->write_requests, i);
	drbd_clear_interval(i);

	/* Wake up any processes waiting for this peer request to complete.  */
	if (i->waiting)
		wake_up(&device->misc_wait);
}

static void conn_wait_active_ee_empty(struct drbd_connection *connection)
{
	struct drbd_peer_device *peer_device;
	int vnr;

	rcu_read_lock();
	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
		struct drbd_device *device = peer_device->device;

		kref_get(&device->kref);
		rcu_read_unlock();
		drbd_wait_ee_list_empty(device, &device->active_ee);
		kref_put(&device->kref, drbd_destroy_device);
		rcu_read_lock();
	}
	rcu_read_unlock();
}

static struct drbd_peer_device *
conn_peer_device(struct drbd_connection *connection, int volume_number)
{
	return idr_find(&connection->peer_devices, volume_number);
}

static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
{
	int rv;
	struct p_barrier *p = pi->data;
	struct drbd_epoch *epoch;

	/* FIXME these are unacked on connection,
	 * not a specific (peer)device.
	 */
	connection->current_epoch->barrier_nr = p->barrier;
	connection->current_epoch->connection = connection;
	rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR);

	/* P_BARRIER_ACK may imply that the corresponding extent is dropped from
	 * the activity log, which means it would not be resynced in case the
	 * R_PRIMARY crashes now.
	 * Therefore we must send the barrier_ack after the barrier request was
	 * completed. */
	switch (connection->resource->write_ordering) {
	case WO_none:
		if (rv == FE_RECYCLED)
			return 0;

		/* receiver context, in the writeout path of the other node.
		 * avoid potential distributed deadlock */
		epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
		if (epoch)
			break;
		else
			drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
		/* Fall through */

	case WO_bdev_flush:
	case WO_drain_io:
		conn_wait_active_ee_empty(connection);
		drbd_flush(connection);

		if (atomic_read(&connection->current_epoch->epoch_size)) {
			epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
			if (epoch)
				break;
		}

		return 0;
	default:
		drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
			 connection->resource->write_ordering);
		return -EIO;
	}

	epoch->flags = 0;
	atomic_set(&epoch->epoch_size, 0);
	atomic_set(&epoch->active, 0);

	spin_lock(&connection->epoch_lock);
	if (atomic_read(&connection->current_epoch->epoch_size)) {
		list_add(&epoch->list, &connection->current_epoch->list);
		connection->current_epoch = epoch;
		connection->epochs++;
	} else {
		/* The current_epoch got recycled while we allocated this one... */
		kfree(epoch);
	}
	spin_unlock(&connection->epoch_lock);

	return 0;
}

/* used from receive_RSDataReply (recv_resync_read)
 * and from receive_Data */
static struct drbd_peer_request *
read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
	      struct packet_info *pi) __must_hold(local)
{
	struct drbd_device *device = peer_device->device;
	const sector_t capacity = drbd_get_capacity(device->this_bdev);
	struct drbd_peer_request *peer_req;
	struct page *page;
	int digest_size, err;
	unsigned int data_size = pi->size, ds;
	void *dig_in = peer_device->connection->int_dig_in;
	void *dig_vv = peer_device->connection->int_dig_vv;
	unsigned long *data;
	struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;

	digest_size = 0;
	if (!trim && peer_device->connection->peer_integrity_tfm) {
		digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
		/*
		 * FIXME: Receive the incoming digest into the receive buffer
		 * here, together with its struct p_data?
		 */
		err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
		if (err)
			return NULL;
		data_size -= digest_size;
	}

	if (trim) {
		D_ASSERT(peer_device, data_size == 0);
		data_size = be32_to_cpu(trim->size);
	}

	if (!expect(IS_ALIGNED(data_size, 512)))
		return NULL;
	/* prepare for larger trim requests. */
	if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE))
		return NULL;

	/* even though we trust our peer,
	 * we sometimes have to double check. */
	if (sector + (data_size>>9) > capacity) {
		drbd_err(device, "request from peer beyond end of local disk: "
			"capacity: %llus < sector: %llus + size: %u\n",
			(unsigned long long)capacity,
			(unsigned long long)sector, data_size);
		return NULL;
	}

	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
	 * "criss-cross" setup, that might cause write-out on some other DRBD,
	 * which in turn might block on the other node at this very place. */
	peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO);
	if (!peer_req)
		return NULL;

	peer_req->flags |= EE_WRITE;
	if (trim)
		return peer_req;

	ds = data_size;
	page = peer_req->pages;
	page_chain_for_each(page) {
		unsigned len = min_t(int, ds, PAGE_SIZE);
		data = kmap(page);
		err = drbd_recv_all_warn(peer_device->connection, data, len);
		if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
			drbd_err(device, "Fault injection: Corrupting data on receive\n");
			data[0] = data[0] ^ (unsigned long)-1;
		}
		kunmap(page);
		if (err) {
			drbd_free_peer_req(device, peer_req);
			return NULL;
		}
		ds -= len;
	}

	if (digest_size) {
		drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv);
		if (memcmp(dig_in, dig_vv, digest_size)) {
			drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
				(unsigned long long)sector, data_size);
			drbd_free_peer_req(device, peer_req);
			return NULL;
		}
	}
	device->recv_cnt += data_size >> 9;
	return peer_req;
}
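
/* Illustration of the payload this parses, assuming a 4 KiB write with
 * a 4 byte integrity digest negotiated:
 *
 *	[ digest: 4 bytes ][ data: 4096 bytes ]		pi->size == 4100
 *
 * so data_size = pi->size - digest_size = 4096, which must be 512-byte
 * aligned and at most DRBD_MAX_BIO_SIZE.  A P_TRIM packet carries no
 * data payload at all; its length comes from trim->size instead. */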

/* drbd_drain_block() just takes a data block
 * out of the socket input buffer, and discards it.
 */
static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
{
	struct page *page;
	int err = 0;
	void *data;

	if (!data_size)
		return 0;

	page = drbd_alloc_pages(peer_device, 1, 1);

	data = kmap(page);
	while (data_size) {
		unsigned int len = min_t(int, data_size, PAGE_SIZE);

		err = drbd_recv_all_warn(peer_device->connection, data, len);
		if (err)
			break;
		data_size -= len;
	}
	kunmap(page);
	drbd_free_pages(peer_device->device, page, 0);
	return err;
}

static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
			   sector_t sector, int data_size)
{
	struct bio_vec bvec;
	struct bvec_iter iter;
	struct bio *bio;
	int digest_size, err, expect;
	void *dig_in = peer_device->connection->int_dig_in;
	void *dig_vv = peer_device->connection->int_dig_vv;

	digest_size = 0;
	if (peer_device->connection->peer_integrity_tfm) {
		digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
		err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
		if (err)
			return err;
		data_size -= digest_size;
	}

	/* optimistically update recv_cnt. if receiving fails below,
	 * we disconnect anyways, and counters will be reset. */
	peer_device->device->recv_cnt += data_size>>9;

	bio = req->master_bio;
	D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);

	bio_for_each_segment(bvec, bio, iter) {
		void *mapped = kmap(bvec.bv_page) + bvec.bv_offset;
		expect = min_t(int, data_size, bvec.bv_len);
		err = drbd_recv_all_warn(peer_device->connection, mapped, expect);
		kunmap(bvec.bv_page);
		if (err)
			return err;
		data_size -= expect;
	}

	if (digest_size) {
		drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv);
		if (memcmp(dig_in, dig_vv, digest_size)) {
			drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n");
			return -EINVAL;
		}
	}

	D_ASSERT(peer_device->device, data_size == 0);
	return 0;
}

/*
 * e_end_resync_block() is called in asender context via
 * drbd_finish_peer_reqs().
 */
static int e_end_resync_block(struct drbd_work *w, int unused)
{
	struct drbd_peer_request *peer_req =
		container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	sector_t sector = peer_req->i.sector;
	int err;

	D_ASSERT(device, drbd_interval_empty(&peer_req->i));

	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
		drbd_set_in_sync(device, sector, peer_req->i.size);
		err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
	} else {
		/* Record failure to sync */
		drbd_rs_failed_io(device, sector, peer_req->i.size);

		err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
	}
	dec_unacked(device);

	return err;
}

static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
			    struct packet_info *pi) __releases(local)
{
	struct drbd_device *device = peer_device->device;
	struct drbd_peer_request *peer_req;

	peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
	if (!peer_req)
		goto fail;

	dec_rs_pending(device);

	inc_unacked(device);
	/* corresponding dec_unacked() in e_end_resync_block()
	 * respective _drbd_clear_done_ee */

	peer_req->w.cb = e_end_resync_block;
	peer_req->submit_jif = jiffies;

	spin_lock_irq(&device->resource->req_lock);
	list_add_tail(&peer_req->w.list, &device->sync_ee);
	spin_unlock_irq(&device->resource->req_lock);

	atomic_add(pi->size >> 9, &device->rs_sect_ev);
	if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
		return 0;

	/* don't care for the reason here */
	drbd_err(device, "submit failed, triggering re-connect\n");
	spin_lock_irq(&device->resource->req_lock);
	list_del(&peer_req->w.list);
	spin_unlock_irq(&device->resource->req_lock);

	drbd_free_peer_req(device, peer_req);
fail:
	put_ldev(device);
	return -EIO;
}

static struct drbd_request *
find_request(struct drbd_device *device, struct rb_root *root, u64 id,
	     sector_t sector, bool missing_ok, const char *func)
{
	struct drbd_request *req;

	/* Request object according to our peer */
	req = (struct drbd_request *)(unsigned long)id;
	if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
		return req;
	if (!missing_ok) {
		drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
			(unsigned long)id, (unsigned long long)sector);
	}
	return NULL;
}
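
/* The id round-trips through the peer unchanged: we send our request
 * pointer as the packet's block_id, e.g.
 *
 *	sender:   p->block_id = (unsigned long)req;
 *	receiver: req = (struct drbd_request *)(unsigned long)p->block_id;
 *
 * Casting a value received from the network back to a pointer is only
 * safe because drbd_contains_interval() first verifies that this exact
 * request is still registered in our interval tree. */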

static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
{
	struct drbd_peer_device *peer_device;
	struct drbd_device *device;
	struct drbd_request *req;
	sector_t sector;
	int err;
	struct p_data *p = pi->data;

	peer_device = conn_peer_device(connection, pi->vnr);
	if (!peer_device)
		return -EIO;
	device = peer_device->device;

	sector = be64_to_cpu(p->sector);

	spin_lock_irq(&device->resource->req_lock);
	req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
	spin_unlock_irq(&device->resource->req_lock);
	if (unlikely(!req))
		return -EIO;

	/* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
	 * special casing it there for the various failure cases.
	 * still no race with drbd_fail_pending_reads */
	err = recv_dless_read(peer_device, req, sector, pi->size);
	if (!err)
		req_mod(req, DATA_RECEIVED);
	/* else: nothing. handled from drbd_disconnect...
	 * I don't think we may complete this just yet
	 * in case we are "on-disconnect: freeze" */

	return err;
}

static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
{
	struct drbd_peer_device *peer_device;
	struct drbd_device *device;
	sector_t sector;
	int err;
	struct p_data *p = pi->data;

	peer_device = conn_peer_device(connection, pi->vnr);
	if (!peer_device)
		return -EIO;
	device = peer_device->device;

	sector = be64_to_cpu(p->sector);
	D_ASSERT(device, p->block_id == ID_SYNCER);

	if (get_ldev(device)) {
		/* data is submitted to disk within recv_resync_read.
		 * corresponding put_ldev done below on error,
		 * or in drbd_peer_request_endio. */
		err = recv_resync_read(peer_device, sector, pi);
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			drbd_err(device, "Can not write resync data to local disk.\n");

		err = drbd_drain_block(peer_device, pi->size);

		drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
	}

	atomic_add(pi->size >> 9, &device->rs_sect_in);

	return err;
}

static void restart_conflicting_writes(struct drbd_device *device,
				       sector_t sector, int size)
{
	struct drbd_interval *i;
	struct drbd_request *req;

	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
		if (!i->local)
			continue;
		req = container_of(i, struct drbd_request, i);
		if (req->rq_state & RQ_LOCAL_PENDING ||
		    !(req->rq_state & RQ_POSTPONED))
			continue;
		/* as it is RQ_POSTPONED, this will cause it to
		 * be queued on the retry workqueue. */
		__req_mod(req, CONFLICT_RESOLVED, NULL);
	}
}

/*
 * e_end_block() is called in asender context via drbd_finish_peer_reqs().
 */
static int e_end_block(struct drbd_work *w, int cancel)
{
	struct drbd_peer_request *peer_req =
		container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	struct drbd_device *device = peer_device->device;
	sector_t sector = peer_req->i.sector;
	int err = 0, pcmd;

	if (peer_req->flags & EE_SEND_WRITE_ACK) {
		if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
			pcmd = (device->state.conn >= C_SYNC_SOURCE &&
				device->state.conn <= C_PAUSED_SYNC_T &&
				peer_req->flags & EE_MAY_SET_IN_SYNC) ?
				P_RS_WRITE_ACK : P_WRITE_ACK;
			err = drbd_send_ack(peer_device, pcmd, peer_req);
			if (pcmd == P_RS_WRITE_ACK)
				drbd_set_in_sync(device, sector, peer_req->i.size);
		} else {
			err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
			/* we expect it to be marked out of sync anyways...
			 * maybe assert this? */
		}
		dec_unacked(device);
	}

	/* we delete from the conflict detection hash _after_ we sent out the
	 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
	if (peer_req->flags & EE_IN_INTERVAL_TREE) {
		spin_lock_irq(&device->resource->req_lock);
		D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
		drbd_remove_epoch_entry_interval(device, peer_req);
		if (peer_req->flags & EE_RESTART_REQUESTS)
			restart_conflicting_writes(device, sector, peer_req->i.size);
		spin_unlock_irq(&device->resource->req_lock);
	} else
		D_ASSERT(device, drbd_interval_empty(&peer_req->i));

	drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));

	return err;
}

static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
{
	struct drbd_peer_request *peer_req =
		container_of(w, struct drbd_peer_request, w);
	struct drbd_peer_device *peer_device = peer_req->peer_device;
	int err;

	err = drbd_send_ack(peer_device, ack, peer_req);
	dec_unacked(peer_device->device);

	return err;
}

static int e_send_superseded(struct drbd_work *w, int unused)
{
	return e_send_ack(w, P_SUPERSEDED);
}

static int e_send_retry_write(struct drbd_work *w, int unused)
{
	struct drbd_peer_request *peer_req =
		container_of(w, struct drbd_peer_request, w);
	struct drbd_connection *connection = peer_req->peer_device->connection;

	return e_send_ack(w, connection->agreed_pro_version >= 100 ?
			     P_RETRY_WRITE : P_SUPERSEDED);
}

static bool seq_greater(u32 a, u32 b)
{
	/*
	 * We assume 32-bit wrap-around here.
	 * For 24-bit wrap-around, we would have to shift:
	 * a <<= 8; b <<= 8;
	 */
	return (s32)a - (s32)b > 0;
}

static u32 seq_max(u32 a, u32 b)
{
	return seq_greater(a, b) ? a : b;
}
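
/* Worked example of the wrap-around handling: with a = 2 (just past a
 * 32-bit wrap) and b = 0xfffffffe,
 *
 *	(s32)2 - (s32)0xfffffffe  ==  2 - (-2)  ==  4 > 0
 *
 * so seq_greater(2, 0xfffffffe) is true, although 2 < 0xfffffffe as
 * unsigned values.  This is correct for any two sequence numbers less
 * than 2^31 apart. */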

static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
{
	struct drbd_device *device = peer_device->device;
	unsigned int newest_peer_seq;

	if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) {
		spin_lock(&device->peer_seq_lock);
		newest_peer_seq = seq_max(device->peer_seq, peer_seq);
		device->peer_seq = newest_peer_seq;
		spin_unlock(&device->peer_seq_lock);
		/* wake up only if we actually changed device->peer_seq */
		if (peer_seq == newest_peer_seq)
			wake_up(&device->seq_wait);
	}
}

static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
{
	return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
}

/* maybe change sync_ee into interval trees as well? */
static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
	struct drbd_peer_request *rs_req;
	bool rv = false;

	spin_lock_irq(&device->resource->req_lock);
	list_for_each_entry(rs_req, &device->sync_ee, w.list) {
		if (overlaps(peer_req->i.sector, peer_req->i.size,
			     rs_req->i.sector, rs_req->i.size)) {
			rv = true;
			break;
		}
	}
	spin_unlock_irq(&device->resource->req_lock);

	return rv;
}
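
/* The l1/l2 lengths are in bytes, so l>>9 converts to 512-byte
 * sectors; overlaps() compares the half-open sector ranges
 * [s, s + (l>>9)).  E.g. s1 = 8, l1 = 4096 covers sectors [8, 16) and
 * s2 = 16, l2 = 512 covers [16, 17): since 8 + 8 <= 16 they do not
 * overlap; with s2 = 15 both ranges contain sector 15 and they do. */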

/* Called from receive_Data.
 * Synchronize packets on sock with packets on msock.
 *
 * This is here so even when a P_DATA packet traveling via sock overtook an Ack
 * packet traveling on msock, they are still processed in the order they have
 * been sent.
 *
 * Note: we don't care for Ack packets overtaking P_DATA packets.
 *
 * In case peer_seq is larger than device->peer_seq, there are
 * outstanding packets on the msock. We wait for them to arrive.
 * In case we are the logically next packet, we update device->peer_seq
 * ourselves. Correctly handles 32bit wrap around.
 *
 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
 * 1<<11 == 2048 seconds aka ages for the 32bit wrap around...
 *
 * returns 0 if we may process the packet,
 * -ERESTARTSYS if we were interrupted (by disconnect signal). */
static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
{
	struct drbd_device *device = peer_device->device;
	DEFINE_WAIT(wait);
	long timeout;
	int ret = 0, tp;

	if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
		return 0;

	spin_lock(&device->peer_seq_lock);
	for (;;) {
		if (!seq_greater(peer_seq - 1, device->peer_seq)) {
			device->peer_seq = seq_max(device->peer_seq, peer_seq);
			break;
		}

		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		rcu_read_lock();
		tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries;
		rcu_read_unlock();

		if (!tp)
			break;

		/* Only need to wait if two_primaries is enabled */
		prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
		spin_unlock(&device->peer_seq_lock);
		rcu_read_lock();
		timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10;
		rcu_read_unlock();
		timeout = schedule_timeout(timeout);
		spin_lock(&device->peer_seq_lock);
		if (!timeout) {
			ret = -ETIMEDOUT;
			drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
			break;
		}
	}
	spin_unlock(&device->peer_seq_lock);
	finish_wait(&device->seq_wait, &wait);
	return ret;
}
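
/* Example of the exit condition above: with device->peer_seq == 41, a
 * packet carrying peer_seq == 42 is the logically next one, since
 * seq_greater(41, 41) is false, and is processed immediately; a packet
 * with peer_seq == 44 sees seq_greater(43, 41) and sleeps until the
 * packets numbered 42 and 43 have bumped device->peer_seq. */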

/* see also bio_flags_to_wire()
 * DRBD_REQ_*, because we need to semantically map the flags to data packet
 * flags and back. We may replicate to other kernel versions. */
static unsigned long wire_flags_to_bio(u32 dpf)
{
	return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
		(dpf & DP_FUA ? REQ_FUA : 0) |
		(dpf & DP_FLUSH ? REQ_FLUSH : 0) |
		(dpf & DP_DISCARD ? REQ_DISCARD : 0);
}
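
/* E.g. a peer that submitted a flush+FUA write sends dp_flags with
 * DP_FLUSH and DP_FUA set; wire_flags_to_bio() maps that back to
 * REQ_FLUSH | REQ_FUA here, so the replicated write carries the same
 * ordering and durability semantics on this node as on the sender. */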

static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
				    unsigned int size)
{
	struct drbd_interval *i;

	repeat:
	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
		struct drbd_request *req;
		struct bio_and_error m;

		if (!i->local)
			continue;
		req = container_of(i, struct drbd_request, i);
		if (!(req->rq_state & RQ_POSTPONED))
			continue;
		req->rq_state &= ~RQ_POSTPONED;
		__req_mod(req, NEG_ACKED, &m);
		spin_unlock_irq(&device->resource->req_lock);
		if (m.bio)
			complete_master_bio(device, &m);
		spin_lock_irq(&device->resource->req_lock);
		goto repeat;
	}
}

static int handle_write_conflicts(struct drbd_device *device,
				  struct drbd_peer_request *peer_req)
{
	struct drbd_connection *connection = peer_req->peer_device->connection;
	bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
	sector_t sector = peer_req->i.sector;
	const unsigned int size = peer_req->i.size;
	struct drbd_interval *i;
	bool equal;
	int err;

	/*
	 * Inserting the peer request into the write_requests tree will prevent
	 * new conflicting local requests from being added.
	 */
	drbd_insert_interval(&device->write_requests, &peer_req->i);

	repeat:
	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
		if (i == &peer_req->i)
			continue;
		if (i->completed)
			continue;

		if (!i->local) {
			/*
			 * Our peer has sent a conflicting remote request; this
			 * should not happen in a two-node setup. Wait for the
			 * earlier peer request to complete.
			 */
			err = drbd_wait_misc(device, i);
			if (err)
				goto out;
			goto repeat;
		}

		equal = i->sector == sector && i->size == size;
		if (resolve_conflicts) {
			/*
			 * If the peer request is fully contained within the
			 * overlapping request, it can be considered overwritten
			 * and thus superseded; otherwise, it will be retried
			 * once all overlapping requests have completed.
			 */
			bool superseded = i->sector <= sector && i->sector +
				       (i->size >> 9) >= sector + (size >> 9);

			if (!equal)
				drbd_alert(device, "Concurrent writes detected: "
					       "local=%llus +%u, remote=%llus +%u, "
					       "assuming %s came first\n",
					  (unsigned long long)i->sector, i->size,
					  (unsigned long long)sector, size,
					  superseded ? "local" : "remote");

			peer_req->w.cb = superseded ? e_send_superseded :
						   e_send_retry_write;
			list_add_tail(&peer_req->w.list, &device->done_ee);
			wake_asender(connection);

			err = -ENOENT;
			goto out;
		} else {
			struct drbd_request *req =
				container_of(i, struct drbd_request, i);

			if (!equal)
				drbd_alert(device, "Concurrent writes detected: "
					       "local=%llus +%u, remote=%llus +%u\n",
					  (unsigned long long)i->sector, i->size,
					  (unsigned long long)sector, size);

			if (req->rq_state & RQ_LOCAL_PENDING ||
			    !(req->rq_state & RQ_POSTPONED)) {
				/*
				 * Wait for the node with the discard flag to
				 * decide if this request has been superseded
				 * or needs to be retried.
				 * Requests that have been superseded will
				 * disappear from the write_requests tree.
				 *
				 * In addition, wait for the conflicting
				 * request to finish locally before submitting
				 * the conflicting peer request.
				 */
				err = drbd_wait_misc(device, &req->i);
				if (err) {
					_conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
					fail_postponed_requests(device, sector, size);
					goto out;
				}
				goto repeat;
			}
			/*
			 * Remember to restart the conflicting requests after
			 * the new peer request has completed.
			 */
			peer_req->flags |= EE_RESTART_REQUESTS;
		}
	}
	err = 0;

 out:
	if (err)
		drbd_remove_epoch_entry_interval(device, peer_req);
	return err;
}
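
/* The "superseded" test above is plain interval containment, with
 * i->size in bytes and sectors of 512 bytes.  E.g. a local write i
 * covering sectors [100, 108) (i->sector = 100, i->size = 4096)
 * supersedes a peer write of sectors [102, 104), since 100 <= 102 and
 * 100 + 8 >= 102 + 2; a peer write of [96, 104) is not contained and
 * is answered with P_RETRY_WRITE instead of P_SUPERSEDED. */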

/* mirrored write */
static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
{
	struct drbd_peer_device *peer_device;
	struct drbd_device *device;
	struct net_conf *nc;
	sector_t sector;
	struct drbd_peer_request *peer_req;
	struct p_data *p = pi->data;
	u32 peer_seq = be32_to_cpu(p->seq_num);
	int rw = WRITE;
	u32 dp_flags;
	int err, tp;

	peer_device = conn_peer_device(connection, pi->vnr);
	if (!peer_device)
		return -EIO;
	device = peer_device->device;

	if (!get_ldev(device)) {
		int err2;

		err = wait_for_and_update_peer_seq(peer_device, peer_seq);
		drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
		atomic_inc(&connection->current_epoch->epoch_size);
		err2 = drbd_drain_block(peer_device, pi->size);
		if (!err)
			err = err2;
		return err;
	}

	/*
	 * Corresponding put_ldev done either below (on various errors), or in
	 * drbd_peer_request_endio, if we successfully submit the data at the
	 * end of this function.
	 */

	sector = be64_to_cpu(p->sector);
	peer_req = read_in_block(peer_device, p->block_id, sector, pi);
	if (!peer_req) {
		put_ldev(device);
		return -EIO;
	}

	peer_req->w.cb = e_end_block;
	peer_req->submit_jif = jiffies;
	peer_req->flags |= EE_APPLICATION;

	dp_flags = be32_to_cpu(p->dp_flags);
	rw |= wire_flags_to_bio(dp_flags);
	if (pi->cmd == P_TRIM) {
		struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
		peer_req->flags |= EE_IS_TRIM;
		if (!blk_queue_discard(q))
			peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
		D_ASSERT(peer_device, peer_req->i.size > 0);
		D_ASSERT(peer_device, rw & REQ_DISCARD);
		D_ASSERT(peer_device, peer_req->pages == NULL);
	} else if (peer_req->pages == NULL) {
		D_ASSERT(device, peer_req->i.size == 0);
		D_ASSERT(device, dp_flags & DP_FLUSH);
	}

	if (dp_flags & DP_MAY_SET_IN_SYNC)
		peer_req->flags |= EE_MAY_SET_IN_SYNC;

	spin_lock(&connection->epoch_lock);
	peer_req->epoch = connection->current_epoch;
	atomic_inc(&peer_req->epoch->epoch_size);
	atomic_inc(&peer_req->epoch->active);
	spin_unlock(&connection->epoch_lock);

	rcu_read_lock();
	nc = rcu_dereference(peer_device->connection->net_conf);
	tp = nc->two_primaries;
	if (peer_device->connection->agreed_pro_version < 100) {
		switch (nc->wire_protocol) {
		case DRBD_PROT_C:
			dp_flags |= DP_SEND_WRITE_ACK;
			break;
		case DRBD_PROT_B:
			dp_flags |= DP_SEND_RECEIVE_ACK;
			break;
		}
	}
	rcu_read_unlock();

	if (dp_flags & DP_SEND_WRITE_ACK) {
		peer_req->flags |= EE_SEND_WRITE_ACK;
		inc_unacked(device);
		/* corresponding dec_unacked() in e_end_block()
		 * respective _drbd_clear_done_ee */
	}

	if (dp_flags & DP_SEND_RECEIVE_ACK) {
		/* I really don't like it that the receiver thread
		 * sends on the msock, but anyways */
		drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
	}

	if (tp) {
		/* two primaries implies protocol C */
		D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
		peer_req->flags |= EE_IN_INTERVAL_TREE;
		err = wait_for_and_update_peer_seq(peer_device, peer_seq);
		if (err)
			goto out_interrupted;
		spin_lock_irq(&device->resource->req_lock);
		err = handle_write_conflicts(device, peer_req);
		if (err) {
			spin_unlock_irq(&device->resource->req_lock);
			if (err == -ENOENT) {
				put_ldev(device);
				return 0;
			}
			goto out_interrupted;
		}
	} else {
		update_peer_seq(peer_device, peer_seq);
		spin_lock_irq(&device->resource->req_lock);
	}
	/* if we use the zeroout fallback code, we process synchronously
	 * and we wait for all pending requests, respectively wait for
	 * active_ee to become empty in drbd_submit_peer_request();
	 * better not add ourselves here. */
	if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
		list_add_tail(&peer_req->w.list, &device->active_ee);
	spin_unlock_irq(&device->resource->req_lock);

	if (device->state.conn == C_SYNC_TARGET)
		wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));

	if (device->state.pdsk < D_INCONSISTENT) {
		/* In case we have the only disk of the cluster, */
		drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
		peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
		drbd_al_begin_io(device, &peer_req->i);
		peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
	}

	err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
	if (!err)
		return 0;

	/* don't care for the reason here */
	drbd_err(device, "submit failed, triggering re-connect\n");
	spin_lock_irq(&device->resource->req_lock);
	list_del(&peer_req->w.list);
	drbd_remove_epoch_entry_interval(device, peer_req);
	spin_unlock_irq(&device->resource->req_lock);
	if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
		drbd_al_complete_io(device, &peer_req->i);
	}

out_interrupted:
	drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
	put_ldev(device);
	drbd_free_peer_req(device, peer_req);
	return err;
}
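
/* Summary of the ack mode decision in receive_Data(): peers speaking
 * protocol version 100 or newer set DP_SEND_WRITE_ACK or
 * DP_SEND_RECEIVE_ACK in the packet themselves; for older peers the
 * flags are derived from the configured wire protocol above:
 * DRBD_PROT_C -> P_WRITE_ACK once the write is stable on disk,
 * DRBD_PROT_B -> P_RECV_ACK as soon as the data is received,
 * DRBD_PROT_A -> no ack at all (neither flag set). */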

/* We may throttle resync, if the lower device seems to be busy,
 * and current sync rate is above c_min_rate.
 *
 * To decide whether or not the lower device is busy, we use a scheme similar
 * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
 * activity (more than 64 sectors) that we cannot account for with our own
 * resync activity, it obviously is "busy".
 *
 * The current sync rate used here uses only the most recent two step marks,
 * to have a short time average so we can react faster.
 */
bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
		bool throttle_if_app_is_waiting)
{
	struct lc_element *tmp;
	bool throttle = drbd_rs_c_min_rate_throttle(device);

	if (!throttle || throttle_if_app_is_waiting)
		return throttle;

	spin_lock_irq(&device->al_lock);
	tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
	if (tmp) {
		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
		if (test_bit(BME_PRIORITY, &bm_ext->flags))
			throttle = false;
		/* Do not slow down if app IO is already waiting for this extent,
		 * and our progress is necessary for application IO to complete. */
	}
	spin_unlock_irq(&device->al_lock);

	return throttle;
}

bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
{
	struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
	unsigned long db, dt, dbdt;
	unsigned int c_min_rate;
	int curr_events;

	rcu_read_lock();
	c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
	rcu_read_unlock();

	/* feature disabled? */
	if (c_min_rate == 0)
		return false;

	curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
		      (int)part_stat_read(&disk->part0, sectors[1]) -
			atomic_read(&device->rs_sect_ev);

	if (atomic_read(&device->ap_actlog_cnt)
	    || curr_events - device->rs_last_events > 64) {
		unsigned long rs_left;
		int i;

		device->rs_last_events = curr_events;

		/* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
		 * approx. */
		i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;

		if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
			rs_left = device->ov_left;
		else
			rs_left = drbd_bm_total_weight(device) - device->rs_failed;

		dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ;
		if (!dt)
			dt++;
		db = device->rs_mark_left[i] - rs_left;
		dbdt = Bit2KB(db/dt);

		if (dbdt > c_min_rate)
			return true;
	}
	return false;
}
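
/* Worked example for the rate check above: say the step mark taken
 * 6 seconds ago recorded 10000 out-of-sync bits and 7600 remain, with
 * 4 KiB of data per bitmap bit.  Then dt = 6, db = 2400 bits, and
 * dbdt = Bit2KB(2400 / 6) = 400 * 4 = 1600 KiB/s; with a configured
 * c_min_rate of, say, 250 KiB/s, 1600 > 250, so resync is already fast
 * enough and may be throttled while the backing device is busy. */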

static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
{
	struct drbd_peer_device *peer_device;
	struct drbd_device *device;
	sector_t sector;
	sector_t capacity;
	struct drbd_peer_request *peer_req;
	struct digest_info *di = NULL;
	int size, verb;
	unsigned int fault_type;
	struct p_block_req *p = pi->data;

	peer_device = conn_peer_device(connection, pi->vnr);
	if (!peer_device)
		return -EIO;
	device = peer_device->device;
	capacity = drbd_get_capacity(device->this_bdev);

	sector = be64_to_cpu(p->sector);
	size   = be32_to_cpu(p->blksize);

	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
		drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
				(unsigned long long)sector, size);
		return -EINVAL;
	}
	if (sector + (size>>9) > capacity) {
		drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
				(unsigned long long)sector, size);
		return -EINVAL;
	}

	if (!get_ldev_if_state(device, D_UP_TO_DATE)) {
		verb = 1;
		switch (pi->cmd) {
		case P_DATA_REQUEST:
			drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
			break;
		case P_RS_DATA_REQUEST:
		case P_CSUM_RS_REQUEST:
		case P_OV_REQUEST:
			drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY, p);
			break;
		case P_OV_REPLY:
			verb = 0;
			dec_rs_pending(device);
			drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
			break;
		default:
			BUG();
		}
		if (verb && __ratelimit(&drbd_ratelimit_state))
			drbd_err(device, "Can not satisfy peer's read request, "
			    "no local data.\n");

		/* drain possible payload */
		return drbd_drain_block(peer_device, pi->size);
	}

	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
	 * "criss-cross" setup, that might cause write-out on some other DRBD,
	 * which in turn might block on the other node at this very place. */
	peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
			true /* has real payload */, GFP_NOIO);
	if (!peer_req) {
		put_ldev(device);
		return -ENOMEM;
	}

	switch (pi->cmd) {
	case P_DATA_REQUEST:
		peer_req->w.cb = w_e_end_data_req;
		fault_type = DRBD_FAULT_DT_RD;
		/* application IO, don't drbd_rs_begin_io */
		peer_req->flags |= EE_APPLICATION;
		goto submit;

	case P_RS_DATA_REQUEST:
		peer_req->w.cb = w_e_end_rsdata_req;
		fault_type = DRBD_FAULT_RS_RD;
		/* used in the sector offset progress display */
		device->bm_resync_fo = BM_SECT_TO_BIT(sector);
		break;

	case P_OV_REPLY:
	case P_CSUM_RS_REQUEST:
		fault_type = DRBD_FAULT_RS_RD;
		di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
		if (!di)
			goto out_free_e;

		di->digest_size = pi->size;
		di->digest = (((char *)di)+sizeof(struct digest_info));

		peer_req->digest = di;
		peer_req->flags |= EE_HAS_DIGEST;

		if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
			goto out_free_e;

		if (pi->cmd == P_CSUM_RS_REQUEST) {
			D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
			peer_req->w.cb = w_e_end_csum_rs_req;
			/* used in the sector offset progress display */
			device->bm_resync_fo = BM_SECT_TO_BIT(sector);
			/* remember to report stats in drbd_resync_finished */
			device->use_csums = true;
		} else if (pi->cmd == P_OV_REPLY) {
			/* track progress, we may need to throttle */
			atomic_add(size >> 9, &device->rs_sect_in);
			peer_req->w.cb = w_e_end_ov_reply;
			dec_rs_pending(device);
			/* drbd_rs_begin_io done when we sent this request,
			 * but accounting still needs to be done. */
			goto submit_for_resync;
		}
		break;

	case P_OV_REQUEST:
		if (device->ov_start_sector == ~(sector_t)0 &&
		    peer_device->connection->agreed_pro_version >= 90) {
			unsigned long now = jiffies;
			int i;
			device->ov_start_sector = sector;
			device->ov_position = sector;
			device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
			device->rs_total = device->ov_left;
			for (i = 0; i < DRBD_SYNC_MARKS; i++) {
				device->rs_mark_left[i] = device->ov_left;
				device->rs_mark_time[i] = now;
			}
			drbd_info(device, "Online Verify start sector: %llu\n",
					(unsigned long long)sector);
		}
		peer_req->w.cb = w_e_end_ov_req;
		fault_type = DRBD_FAULT_RS_RD;
		break;

	default:
		BUG();
	}

	/* Throttle, drbd_rs_begin_io and submit should become asynchronous
	 * wrt the receiver, but it is not as straightforward as it may seem.
	 * Various places in the resync start and stop logic assume resync
	 * requests are processed in order, requeuing this on the worker thread
	 * introduces a bunch of new code for synchronization between threads.
	 *
	 * Unlimited throttling before drbd_rs_begin_io may stall the resync
	 * "forever", throttling after drbd_rs_begin_io will lock that extent
	 * for application writes for the same time. For now, just throttle
	 * here, where the rest of the code expects the receiver to sleep for
	 * a while, anyways.
	 */

	/* Throttle before drbd_rs_begin_io, as that locks out application IO;
	 * this defers syncer requests for some time, before letting at least
	 * one request through. The resync controller on the receiving side
	 * will adapt to the incoming rate accordingly.
	 *
	 * We cannot throttle here if remote is Primary/SyncTarget:
	 * we would also throttle its application reads.
	 * In that case, throttling is done on the SyncTarget only.
	 */

	/* Even though this may be a resync request, we do add to "read_ee";
	 * "sync_ee" is only used for resync WRITEs.
	 * Add to list early, so debugfs can find this request
	 * even if we have to sleep below. */
	spin_lock_irq(&device->resource->req_lock);
	list_add_tail(&peer_req->w.list, &device->read_ee);
	spin_unlock_irq(&device->resource->req_lock);

	update_receiver_timing_details(connection, drbd_rs_should_slow_down);
	if (device->state.peer != R_PRIMARY
	    && drbd_rs_should_slow_down(device, sector, false))
		schedule_timeout_uninterruptible(HZ/10);
	update_receiver_timing_details(connection, drbd_rs_begin_io);
	if (drbd_rs_begin_io(device, sector))
		goto out_free_e;

submit_for_resync:
	atomic_add(size >> 9, &device->rs_sect_ev);

submit:
	update_receiver_timing_details(connection, drbd_submit_peer_request);
	inc_unacked(device);
	if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
		return 0;

	/* don't care for the reason here */
	drbd_err(device, "submit failed, triggering re-connect\n");

out_free_e:
	spin_lock_irq(&device->resource->req_lock);
	list_del(&peer_req->w.list);
	spin_unlock_irq(&device->resource->req_lock);
	/* no drbd_rs_complete_io(), we are dropping the connection anyways */

	put_ldev(device);
	drbd_free_peer_req(device, peer_req);
	return -EIO;
}

/**
 * drbd_asb_recover_0p  -  Recover after split-brain with no remaining primaries
 */
static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
{
	struct drbd_device *device = peer_device->device;
	int self, peer, rv = -100;
	unsigned long ch_self, ch_peer;
	enum drbd_after_sb_p after_sb_0p;

	self = device->ldev->md.uuid[UI_BITMAP] & 1;
	peer = device->p_uuid[UI_BITMAP] & 1;

	ch_peer = device->p_uuid[UI_SIZE];
	ch_self = device->comm_bm_set;

	rcu_read_lock();
	after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
	rcu_read_unlock();
	switch (after_sb_0p) {
	case ASB_CONSENSUS:
	case ASB_DISCARD_SECONDARY:
	case ASB_CALL_HELPER:
	case ASB_VIOLENTLY:
		drbd_err(device, "Configuration error.\n");
		break;
	case ASB_DISCONNECT:
		break;
	case ASB_DISCARD_YOUNGER_PRI:
		if (self == 0 && peer == 1) {
			rv = -1;
			break;
		}
		if (self == 1 && peer == 0) {
			rv =  1;
			break;
		}
		/* Else fall through to one of the other strategies... */
	case ASB_DISCARD_OLDER_PRI:
		if (self == 0 && peer == 1) {
			rv = 1;
			break;
		}
		if (self == 1 && peer == 0) {
			rv = -1;
			break;
		}
		/* Else fall through to one of the other strategies... */
		drbd_warn(device, "Discard younger/older primary did not find a decision\n"
			  "Using discard-least-changes instead\n");
	case ASB_DISCARD_ZERO_CHG:
		if (ch_peer == 0 && ch_self == 0) {
			rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
				? -1 : 1;
			break;
		} else {
			if (ch_peer == 0) { rv =  1; break; }
			if (ch_self == 0) { rv = -1; break; }
		}
		if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
			break;
	case ASB_DISCARD_LEAST_CHG:
		if	(ch_self < ch_peer)
			rv = -1;
		else if (ch_self > ch_peer)
			rv =  1;
		else /* ( ch_self == ch_peer ) */
			/* Well, then use something else. */
			rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
				? -1 : 1;
		break;
	case ASB_DISCARD_LOCAL:
		rv = -1;
		break;
	case ASB_DISCARD_REMOTE:
		rv =  1;
	}

	return rv;
}
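
/* All three recover_*p() helpers share one return convention:
 * rv > 0 means "this node becomes sync source", rv < 0 "this node
 * becomes sync target", and -100 (the initial value) means no
 * automatic resolution was found.  ch_self/ch_peer above are the
 * counts of changed blocks that discard-least-changes compares. */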

/**
 * drbd_asb_recover_1p  -  Recover after split-brain with one remaining primary
 */
static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
{
	struct drbd_device *device = peer_device->device;
	int hg, rv = -100;
	enum drbd_after_sb_p after_sb_1p;

	rcu_read_lock();
	after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
	rcu_read_unlock();
	switch (after_sb_1p) {
	case ASB_DISCARD_YOUNGER_PRI:
	case ASB_DISCARD_OLDER_PRI:
	case ASB_DISCARD_LEAST_CHG:
	case ASB_DISCARD_LOCAL:
	case ASB_DISCARD_REMOTE:
	case ASB_DISCARD_ZERO_CHG:
		drbd_err(device, "Configuration error.\n");
		break;
	case ASB_DISCONNECT:
		break;
	case ASB_CONSENSUS:
		hg = drbd_asb_recover_0p(peer_device);
		if (hg == -1 && device->state.role == R_SECONDARY)
			rv = hg;
		if (hg == 1  && device->state.role == R_PRIMARY)
			rv = hg;
		break;
	case ASB_VIOLENTLY:
		rv = drbd_asb_recover_0p(peer_device);
		break;
	case ASB_DISCARD_SECONDARY:
		return device->state.role == R_PRIMARY ? 1 : -1;
	case ASB_CALL_HELPER:
		hg = drbd_asb_recover_0p(peer_device);
		if (hg == -1 && device->state.role == R_PRIMARY) {
			enum drbd_state_rv rv2;

			/* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
			 * we might be here in C_WF_REPORT_PARAMS which is transient.
			 * we do not need to wait for the after state change work either. */
			rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
			if (rv2 != SS_SUCCESS) {
				drbd_khelper(device, "pri-lost-after-sb");
			} else {
				drbd_warn(device, "Successfully gave up primary role.\n");
				rv = hg;
			}
		} else
			rv = hg;
	}

	return rv;
}

/**
 * drbd_asb_recover_2p  -  Recover after split-brain with two remaining primaries
 */
static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
{
	struct drbd_device *device = peer_device->device;
	int hg, rv = -100;
	enum drbd_after_sb_p after_sb_2p;

	rcu_read_lock();
	after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
	rcu_read_unlock();
	switch (after_sb_2p) {
	case ASB_DISCARD_YOUNGER_PRI:
	case ASB_DISCARD_OLDER_PRI:
	case ASB_DISCARD_LEAST_CHG:
	case ASB_DISCARD_LOCAL:
	case ASB_DISCARD_REMOTE:
	case ASB_CONSENSUS:
	case ASB_DISCARD_SECONDARY:
	case ASB_DISCARD_ZERO_CHG:
		drbd_err(device, "Configuration error.\n");
		break;
	case ASB_VIOLENTLY:
		rv = drbd_asb_recover_0p(peer_device);
		break;
	case ASB_DISCONNECT:
		break;
	case ASB_CALL_HELPER:
		hg = drbd_asb_recover_0p(peer_device);
		if (hg == -1) {
			enum drbd_state_rv rv2;

			/* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
			 * we might be here in C_WF_REPORT_PARAMS which is transient.
			 * we do not need to wait for the after state change work either. */
			rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
			if (rv2 != SS_SUCCESS) {
				drbd_khelper(device, "pri-lost-after-sb");
			} else {
				drbd_warn(device, "Successfully gave up primary role.\n");
				rv = hg;
			}
		} else
			rv = hg;
	}

	return rv;
}

static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
			   u64 bits, u64 flags)
{
	if (!uuid) {
		drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
		return;
	}
	drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
		 text,
		 (unsigned long long)uuid[UI_CURRENT],
		 (unsigned long long)uuid[UI_BITMAP],
		 (unsigned long long)uuid[UI_HISTORY_START],
		 (unsigned long long)uuid[UI_HISTORY_END],
		 (unsigned long long)bits,
		 (unsigned long long)flags);
}
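
/* A line produced by drbd_uuid_dump() looks like (made-up values):
 *
 *   self 8AE1342FD0C52B7A:0000000000000000:45F2B9C3D10E884C:45F2B9C3D10E884D bits:0 flags:0
 *
 * i.e. the current, bitmap, and two history UUIDs, followed by the
 * number of out-of-sync bits and the uuid flags. */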
2920
2921/*
2922 100 after split brain try auto recover
2923 2 C_SYNC_SOURCE set BitMap
2924 1 C_SYNC_SOURCE use BitMap
2925 0 no Sync
2926 -1 C_SYNC_TARGET use BitMap
2927 -2 C_SYNC_TARGET set BitMap
2928 -100 after split brain, disconnect
2929-1000 unrelated data
4a23f264
PR
2930-1091 requires proto 91
2931-1096 requires proto 96
b411b363 2932 */
44a4d551 2933static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local)
b411b363 2934{
44a4d551
LE
2935 struct drbd_peer_device *const peer_device = first_peer_device(device);
2936 struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
b411b363
PR
2937 u64 self, peer;
2938 int i, j;
2939
b30ab791
AG
2940 self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2941 peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
b411b363
PR
2942
2943 *rule_nr = 10;
2944 if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2945 return 0;
2946
2947 *rule_nr = 20;
2948 if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2949 peer != UUID_JUST_CREATED)
2950 return -2;
2951
2952 *rule_nr = 30;
2953 if (self != UUID_JUST_CREATED &&
2954 (peer == UUID_JUST_CREATED || peer == (u64)0))
2955 return 2;
2956
2957 if (self == peer) {
2958 int rct, dc; /* roles at crash time */
2959
b30ab791 2960 if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
b411b363 2961
44a4d551 2962 if (connection->agreed_pro_version < 91)
4a23f264 2963 return -1091;
b411b363 2964
b30ab791
AG
2965 if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2966 (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
d0180171 2967 drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
b30ab791
AG
2968 drbd_uuid_move_history(device);
2969 device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
2970 device->ldev->md.uuid[UI_BITMAP] = 0;
b411b363 2971
b30ab791
AG
2972 drbd_uuid_dump(device, "self", device->ldev->md.uuid,
2973 device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
b411b363
PR
2974 *rule_nr = 34;
2975 } else {
d0180171 2976 drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
b411b363
PR
2977 *rule_nr = 36;
2978 }
2979
2980 return 1;
2981 }
2982
b30ab791 2983 if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
b411b363 2984
44a4d551 2985 if (connection->agreed_pro_version < 91)
4a23f264 2986 return -1091;
b411b363 2987
b30ab791
AG
2988 if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2989 (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
d0180171 2990 drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
b411b363 2991
b30ab791
AG
2992 device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
2993 device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
2994 device->p_uuid[UI_BITMAP] = 0UL;
b411b363 2995
b30ab791 2996 drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
b411b363
PR
2997 *rule_nr = 35;
2998 } else {
d0180171 2999 drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
b411b363
PR
3000 *rule_nr = 37;
3001 }
3002
3003 return -1;
3004 }
3005
3006 /* Common power [off|failure] */
b30ab791
AG
3007 rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
3008 (device->p_uuid[UI_FLAGS] & 2);
b411b363
PR
3009 /* lowest bit is set when we were primary,
3010 * next bit (weight 2) is set when peer was primary */
3011 *rule_nr = 40;
3012
3013 switch (rct) {
3014 case 0: /* !self_pri && !peer_pri */ return 0;
3015 case 1: /* self_pri && !peer_pri */ return 1;
3016 case 2: /* !self_pri && peer_pri */ return -1;
3017 case 3: /* self_pri && peer_pri */
44a4d551 3018 dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
b411b363
PR
3019 return dc ? -1 : 1;
3020 }
3021 }
3022
3023 *rule_nr = 50;
b30ab791 3024 peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
b411b363
PR
3025 if (self == peer)
3026 return -1;
3027
3028 *rule_nr = 51;
b30ab791 3029 peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
b411b363 3030 if (self == peer) {
44a4d551 3031 if (connection->agreed_pro_version < 96 ?
b30ab791
AG
3032 (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
3033 (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
3034 peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
b411b363
PR
3035 /* The last P_SYNC_UUID did not get though. Undo the last start of
3036 resync as sync source modifications of the peer's UUIDs. */
3037
44a4d551 3038 if (connection->agreed_pro_version < 91)
4a23f264 3039 return -1091;
b411b363 3040
b30ab791
AG
3041 device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
3042 device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];
4a23f264 3043
d0180171 3044 drbd_info(device, "Lost last syncUUID packet, corrected:\n");
b30ab791 3045 drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
4a23f264 3046
b411b363
PR
3047 return -1;
3048 }
3049 }
3050
3051 *rule_nr = 60;
b30ab791 3052 self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
b411b363 3053 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
b30ab791 3054 peer = device->p_uuid[i] & ~((u64)1);
b411b363
PR
3055 if (self == peer)
3056 return -2;
3057 }
3058
3059 *rule_nr = 70;
b30ab791
AG
3060 self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
3061 peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
b411b363
PR
3062 if (self == peer)
3063 return 1;
3064
3065 *rule_nr = 71;
b30ab791 3066 self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
b411b363 3067 if (self == peer) {
44a4d551 3068 if (connection->agreed_pro_version < 96 ?
b30ab791
AG
3069 (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
3070 (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
3071 self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
b411b363
PR
3072 /* The last P_SYNC_UUID did not get through. Undo the UUID
3073 modifications we made when we last started a resync as sync source. */
3074
44a4d551 3075 if (connection->agreed_pro_version < 91)
4a23f264 3076 return -1091;
b411b363 3077
b30ab791
AG
3078 __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
3079 __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);
b411b363 3080
d0180171 3081 drbd_info(device, "Last syncUUID did not get through, corrected:\n");
b30ab791
AG
3082 drbd_uuid_dump(device, "self", device->ldev->md.uuid,
3083 device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
b411b363
PR
3084
3085 return 1;
3086 }
3087 }
3088
3089
3090 *rule_nr = 80;
b30ab791 3091 peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
b411b363 3092 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
b30ab791 3093 self = device->ldev->md.uuid[i] & ~((u64)1);
b411b363
PR
3094 if (self == peer)
3095 return 2;
3096 }
3097
3098 *rule_nr = 90;
b30ab791
AG
3099 self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
3100 peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
b411b363
PR
3101 if (self == peer && self != ((u64)0))
3102 return 100;
3103
3104 *rule_nr = 100;
3105 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
b30ab791 3106 self = device->ldev->md.uuid[i] & ~((u64)1);
b411b363 3107 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
b30ab791 3108 peer = device->p_uuid[j] & ~((u64)1);
b411b363
PR
3109 if (self == peer)
3110 return -100;
3111 }
3112 }
3113
3114 return -1000;
3115}
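/*
 * Editor's note: a minimal userspace sketch of the normalization the
 * rules above rely on.  The lowest bit of each UUID is a flag (it is
 * masked off with "& ~((u64)1)" before every comparison), so two UUIDs
 * match when they are equal modulo that bit.  The helper names below
 * are hypothetical; only the masking and the history scan mirror the
 * code above.
 */
#include <stdbool.h>
#include <stdint.h>

static inline bool uuid_equal(uint64_t a, uint64_t b)
{
	return (a & ~(uint64_t)1) == (b & ~(uint64_t)1);
}

/* rule 60/80 style scan: does "needle" occur anywhere in a history array? */
static bool uuid_in_history(uint64_t needle, const uint64_t *history, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (uuid_equal(needle, history[i]))
			return true;
	return false;
}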
3116
3117/* drbd_sync_handshake() returns the new conn state on success, or
3118 CONN_MASK (-1) on failure.
3119 */
69a22773
AG
3120static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
3121 enum drbd_role peer_role,
b411b363
PR
3122 enum drbd_disk_state peer_disk) __must_hold(local)
3123{
69a22773 3124 struct drbd_device *device = peer_device->device;
b411b363
PR
3125 enum drbd_conns rv = C_MASK;
3126 enum drbd_disk_state mydisk;
44ed167d 3127 struct net_conf *nc;
6dff2902 3128 int hg, rule_nr, rr_conflict, tentative;
b411b363 3129
b30ab791 3130 mydisk = device->state.disk;
b411b363 3131 if (mydisk == D_NEGOTIATING)
b30ab791 3132 mydisk = device->new_state_tmp.disk;
b411b363 3133
d0180171 3134 drbd_info(device, "drbd_sync_handshake:\n");
9f2247bb 3135
b30ab791
AG
3136 spin_lock_irq(&device->ldev->md.uuid_lock);
3137 drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
3138 drbd_uuid_dump(device, "peer", device->p_uuid,
3139 device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
b411b363 3140
b30ab791
AG
3141 hg = drbd_uuid_compare(device, &rule_nr);
3142 spin_unlock_irq(&device->ldev->md.uuid_lock);
b411b363 3143
d0180171 3144 drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
b411b363
PR
3145
3146 if (hg == -1000) {
d0180171 3147 drbd_alert(device, "Unrelated data, aborting!\n");
b411b363
PR
3148 return C_MASK;
3149 }
4a23f264 3150 if (hg < -1000) {
d0180171 3151 drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
b411b363
PR
3152 return C_MASK;
3153 }
3154
3155 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
3156 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
3157 int f = (hg == -100) || abs(hg) == 2;
3158 hg = mydisk > D_INCONSISTENT ? 1 : -1;
3159 if (f)
3160 hg = hg*2;
d0180171 3161 drbd_info(device, "Becoming sync %s due to disk states.\n",
b411b363
PR
3162 hg > 0 ? "source" : "target");
3163 }
3164
3a11a487 3165 if (abs(hg) == 100)
b30ab791 3166 drbd_khelper(device, "initial-split-brain");
3a11a487 3167
44ed167d 3168 rcu_read_lock();
69a22773 3169 nc = rcu_dereference(peer_device->connection->net_conf);
44ed167d
PR
3170
3171 if (hg == 100 || (hg == -100 && nc->always_asbp)) {
b30ab791 3172 int pcount = (device->state.role == R_PRIMARY)
b411b363
PR
3173 + (peer_role == R_PRIMARY);
3174 int forced = (hg == -100);
3175
3176 switch (pcount) {
3177 case 0:
69a22773 3178 hg = drbd_asb_recover_0p(peer_device);
b411b363
PR
3179 break;
3180 case 1:
69a22773 3181 hg = drbd_asb_recover_1p(peer_device);
b411b363
PR
3182 break;
3183 case 2:
69a22773 3184 hg = drbd_asb_recover_2p(peer_device);
b411b363
PR
3185 break;
3186 }
3187 if (abs(hg) < 100) {
d0180171 3188 drbd_warn(device, "Split-Brain detected, %d primaries, "
b411b363
PR
3189 "automatically solved. Sync from %s node\n",
3190 pcount, (hg < 0) ? "peer" : "this");
3191 if (forced) {
d0180171 3192 drbd_warn(device, "Doing a full sync, since"
b411b363
PR
3193 " UUIDs where ambiguous.\n");
3194 hg = hg*2;
3195 }
3196 }
3197 }
3198
3199 if (hg == -100) {
b30ab791 3200 if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
b411b363 3201 hg = -1;
b30ab791 3202 if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
b411b363
PR
3203 hg = 1;
3204
3205 if (abs(hg) < 100)
d0180171 3206 drbd_warn(device, "Split-Brain detected, manually solved. "
b411b363
PR
3207 "Sync from %s node\n",
3208 (hg < 0) ? "peer" : "this");
3209 }
44ed167d 3210 rr_conflict = nc->rr_conflict;
6dff2902 3211 tentative = nc->tentative;
44ed167d 3212 rcu_read_unlock();
b411b363
PR
3213
3214 if (hg == -100) {
580b9767
LE
3215 /* FIXME this log message is not correct if we end up here
3216 * after an attempted attach on a diskless node.
3217 * We just refuse to attach -- well, we drop the "connection"
3218 * to that disk, in a way... */
d0180171 3219 drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
b30ab791 3220 drbd_khelper(device, "split-brain");
b411b363
PR
3221 return C_MASK;
3222 }
3223
3224 if (hg > 0 && mydisk <= D_INCONSISTENT) {
d0180171 3225 drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
b411b363
PR
3226 return C_MASK;
3227 }
3228
3229 if (hg < 0 && /* by intention we do not use mydisk here. */
b30ab791 3230 device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
44ed167d 3231 switch (rr_conflict) {
b411b363 3232 case ASB_CALL_HELPER:
b30ab791 3233 drbd_khelper(device, "pri-lost");
b411b363
PR
3234 /* fall through */
3235 case ASB_DISCONNECT:
d0180171 3236 drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
b411b363
PR
3237 return C_MASK;
3238 case ASB_VIOLENTLY:
d0180171 3239 drbd_warn(device, "Becoming SyncTarget, violating the stable-data"
b411b363
PR
3240 "assumption\n");
3241 }
3242 }
3243
69a22773 3244 if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
cf14c2e9 3245 if (hg == 0)
d0180171 3246 drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
cf14c2e9 3247 else
d0180171 3248 drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.\n",
cf14c2e9
PR
3249 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
3250 abs(hg) >= 2 ? "full" : "bit-map based");
3251 return C_MASK;
3252 }
3253
b411b363 3254 if (abs(hg) >= 2) {
d0180171 3255 drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
b30ab791 3256 if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
20ceb2b2 3257 BM_LOCKED_SET_ALLOWED))
b411b363
PR
3258 return C_MASK;
3259 }
3260
3261 if (hg > 0) { /* become sync source. */
3262 rv = C_WF_BITMAP_S;
3263 } else if (hg < 0) { /* become sync target */
3264 rv = C_WF_BITMAP_T;
3265 } else {
3266 rv = C_CONNECTED;
b30ab791 3267 if (drbd_bm_total_weight(device)) {
d0180171 3268 drbd_info(device, "No resync, but %lu bits in bitmap!\n",
b30ab791 3269 drbd_bm_total_weight(device));
b411b363
PR
3270 }
3271 }
3272
3273 return rv;
3274}
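/*
 * Editor's note: the tail of drbd_sync_handshake() reduced to its core,
 * as a hypothetical stand-alone sketch.  hg > 0 means "become sync
 * source", hg < 0 "become sync target", hg == 0 "connected, no resync";
 * |hg| >= 2 additionally forces a full sync (the whole bitmap is set
 * first), because no usable bitmap-based information survived.
 */
enum handshake_result { HS_SYNC_SOURCE, HS_SYNC_TARGET, HS_CONNECTED };

static enum handshake_result hg_to_result(int hg, int *full_sync)
{
	*full_sync = (hg >= 2 || hg <= -2);
	if (hg > 0)
		return HS_SYNC_SOURCE;
	if (hg < 0)
		return HS_SYNC_TARGET;
	return HS_CONNECTED;
}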
3275
f179d76d 3276static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
b411b363
PR
3277{
3278 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
f179d76d
PR
3279 if (peer == ASB_DISCARD_REMOTE)
3280 return ASB_DISCARD_LOCAL;
b411b363
PR
3281
3282 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
f179d76d
PR
3283 if (peer == ASB_DISCARD_LOCAL)
3284 return ASB_DISCARD_REMOTE;
b411b363
PR
3285
3286 /* everything else is valid if they are equal on both sides. */
f179d76d 3287 return peer;
b411b363
PR
3288}
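/*
 * Editor's note: a hypothetical compatibility check built on
 * convert_after_sb(), showing why the mirroring matters: the peer sends
 * its policy from its own point of view, so its "discard local" is our
 * "discard remote" and vice versa; everything else must simply match.
 */
static int after_sb_policies_compatible(enum drbd_after_sb_p mine,
					enum drbd_after_sb_p peers)
{
	return convert_after_sb(peers) == mine;
}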
3289
bde89a9e 3290static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi)
b411b363 3291{
e658983a 3292 struct p_protocol *p = pi->data;
036b17ea
PR
3293 enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
3294 int p_proto, p_discard_my_data, p_two_primaries, cf;
3295 struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
3296 char integrity_alg[SHARED_SECRET_MAX] = "";
accdbcc5 3297 struct crypto_hash *peer_integrity_tfm = NULL;
7aca6c75 3298 void *int_dig_in = NULL, *int_dig_vv = NULL;
b411b363 3299
b411b363
PR
3300 p_proto = be32_to_cpu(p->protocol);
3301 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
3302 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
3303 p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
b411b363 3304 p_two_primaries = be32_to_cpu(p->two_primaries);
cf14c2e9 3305 cf = be32_to_cpu(p->conn_flags);
6139f60d 3306 p_discard_my_data = cf & CF_DISCARD_MY_DATA;
cf14c2e9 3307
bde89a9e 3308 if (connection->agreed_pro_version >= 87) {
86db0618 3309 int err;
cf14c2e9 3310
88104ca4 3311 if (pi->size > sizeof(integrity_alg))
86db0618 3312 return -EIO;
bde89a9e 3313 err = drbd_recv_all(connection, integrity_alg, pi->size);
86db0618
AG
3314 if (err)
3315 return err;
036b17ea 3316 integrity_alg[SHARED_SECRET_MAX - 1] = 0;
b411b363
PR
3317 }
3318
7d4c782c 3319 if (pi->cmd != P_PROTOCOL_UPDATE) {
bde89a9e 3320 clear_bit(CONN_DRY_RUN, &connection->flags);
b411b363 3321
fbc12f45 3322 if (cf & CF_DRY_RUN)
bde89a9e 3323 set_bit(CONN_DRY_RUN, &connection->flags);
b411b363 3324
fbc12f45 3325 rcu_read_lock();
bde89a9e 3326 nc = rcu_dereference(connection->net_conf);
b411b363 3327
fbc12f45 3328 if (p_proto != nc->wire_protocol) {
1ec861eb 3329 drbd_err(connection, "incompatible %s settings\n", "protocol");
fbc12f45
AG
3330 goto disconnect_rcu_unlock;
3331 }
b411b363 3332
fbc12f45 3333 if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
1ec861eb 3334 drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri");
fbc12f45
AG
3335 goto disconnect_rcu_unlock;
3336 }
b411b363 3337
fbc12f45 3338 if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
1ec861eb 3339 drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri");
fbc12f45
AG
3340 goto disconnect_rcu_unlock;
3341 }
b411b363 3342
fbc12f45 3343 if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
1ec861eb 3344 drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri");
fbc12f45
AG
3345 goto disconnect_rcu_unlock;
3346 }
b411b363 3347
fbc12f45 3348 if (p_discard_my_data && nc->discard_my_data) {
1ec861eb 3349 drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
fbc12f45
AG
3350 goto disconnect_rcu_unlock;
3351 }
b411b363 3352
fbc12f45 3353 if (p_two_primaries != nc->two_primaries) {
1ec861eb 3354 drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries");
fbc12f45
AG
3355 goto disconnect_rcu_unlock;
3356 }
b411b363 3357
fbc12f45 3358 if (strcmp(integrity_alg, nc->integrity_alg)) {
1ec861eb 3359 drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg");
fbc12f45
AG
3360 goto disconnect_rcu_unlock;
3361 }
b411b363 3362
fbc12f45 3363 rcu_read_unlock();
b411b363
PR
3364 }
3365
7d4c782c
AG
3366 if (integrity_alg[0]) {
3367 int hash_size;
3368
3369 /*
3370 * We can only change the peer data integrity algorithm
3371 * here. Changing our own data integrity algorithm
3372 * requires that we send a P_PROTOCOL_UPDATE packet at
3373 * the same time; otherwise, the peer has no way to
3374 * know at which packet boundary the algorithm should
3375 * change.
3376 */
b411b363 3377
7d4c782c
AG
3378 peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
3379 if (!peer_integrity_tfm) {
1ec861eb 3380 drbd_err(connection, "peer data-integrity-alg %s not supported\n",
7d4c782c
AG
3381 integrity_alg);
3382 goto disconnect;
3383 }
b411b363 3384
7d4c782c
AG
3385 hash_size = crypto_hash_digestsize(peer_integrity_tfm);
3386 int_dig_in = kmalloc(hash_size, GFP_KERNEL);
3387 int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
3388 if (!(int_dig_in && int_dig_vv)) {
1ec861eb 3389 drbd_err(connection, "Allocation of buffers for data integrity checking failed\n");
b411b363
PR
3390 goto disconnect;
3391 }
b411b363
PR
3392 }
3393
7d4c782c
AG
3394 new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
3395 if (!new_net_conf) {
1ec861eb 3396 drbd_err(connection, "Allocation of new net_conf failed\n");
7d4c782c
AG
3397 goto disconnect;
3398 }
3399
bde89a9e 3400 mutex_lock(&connection->data.mutex);
0500813f 3401 mutex_lock(&connection->resource->conf_update);
bde89a9e 3402 old_net_conf = connection->net_conf;
7d4c782c
AG
3403 *new_net_conf = *old_net_conf;
3404
3405 new_net_conf->wire_protocol = p_proto;
3406 new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
3407 new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
3408 new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
3409 new_net_conf->two_primaries = p_two_primaries;
3410
bde89a9e 3411 rcu_assign_pointer(connection->net_conf, new_net_conf);
0500813f 3412 mutex_unlock(&connection->resource->conf_update);
bde89a9e 3413 mutex_unlock(&connection->data.mutex);
7d4c782c 3414
bde89a9e
AG
3415 crypto_free_hash(connection->peer_integrity_tfm);
3416 kfree(connection->int_dig_in);
3417 kfree(connection->int_dig_vv);
3418 connection->peer_integrity_tfm = peer_integrity_tfm;
3419 connection->int_dig_in = int_dig_in;
3420 connection->int_dig_vv = int_dig_vv;
7d4c782c
AG
3421
3422 if (strcmp(old_net_conf->integrity_alg, integrity_alg))
1ec861eb 3423 drbd_info(connection, "peer data-integrity-alg: %s\n",
7d4c782c
AG
3424 integrity_alg[0] ? integrity_alg : "(none)");
3425
3426 synchronize_rcu();
3427 kfree(old_net_conf);
82bc0194 3428 return 0;
b411b363 3429
44ed167d
PR
3430disconnect_rcu_unlock:
3431 rcu_read_unlock();
b411b363 3432disconnect:
b792c35c 3433 crypto_free_hash(peer_integrity_tfm);
036b17ea
PR
3434 kfree(int_dig_in);
3435 kfree(int_dig_vv);
bde89a9e 3436 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3437 return -EIO;
b411b363
PR
3438}
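/*
 * Editor's note: receive_protocol() replaces net_conf with the usual RCU
 * publish pattern: allocate a copy, modify it, publish the new pointer,
 * wait a grace period, then free the old object.  Below is a minimal
 * userspace sketch of the same shape (names hypothetical); a C11 release
 * store stands in for rcu_assign_pointer(), and the synchronize_rcu()
 * grace period is only marked, not implemented.
 */
#include <stdatomic.h>
#include <stdlib.h>

struct conf { int wire_protocol; int two_primaries; };

static _Atomic(struct conf *) live_conf;	/* assumed initialized at startup */

static int update_conf(int proto, int two_pri)
{
	struct conf *old = atomic_load(&live_conf);
	struct conf *new = malloc(sizeof(*new));

	if (!new)
		return -1;
	*new = *old;			/* start from the current settings */
	new->wire_protocol = proto;
	new->two_primaries = two_pri;
	atomic_store_explicit(&live_conf, new, memory_order_release);
	/* real code: synchronize_rcu() here, so no reader still sees "old" */
	free(old);
	return 0;
}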
3439
3440/* helper function
3441 * input: alg name, feature name
3442 * return: NULL (alg name was "")
3443 * ERR_PTR(error) if something goes wrong
3444 * or the crypto hash ptr, if it worked out ok. */
8ce953aa 3445static struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
b411b363
PR
3446 const char *alg, const char *name)
3447{
3448 struct crypto_hash *tfm;
3449
3450 if (!alg[0])
3451 return NULL;
3452
3453 tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
3454 if (IS_ERR(tfm)) {
d0180171 3455 drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n",
b411b363
PR
3456 alg, name, PTR_ERR(tfm));
3457 return tfm;
3458 }
b411b363
PR
3459 return tfm;
3460}
3461
bde89a9e 3462static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
4a76b161 3463{
bde89a9e 3464 void *buffer = connection->data.rbuf;
4a76b161
AG
3465 int size = pi->size;
3466
3467 while (size) {
3468 int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
bde89a9e 3469 s = drbd_recv(connection, buffer, s);
4a76b161
AG
3470 if (s <= 0) {
3471 if (s < 0)
3472 return s;
3473 break;
3474 }
3475 size -= s;
3476 }
3477 if (size)
3478 return -EIO;
3479 return 0;
3480}
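/*
 * Editor's note: the same drain loop as ignore_remaining_packet(), as a
 * hypothetical POSIX sketch.  Reading and discarding the advertised
 * payload in bounded chunks keeps the byte stream in sync even when a
 * packet is unwanted or its volume is unknown locally.
 */
#include <sys/types.h>
#include <sys/socket.h>

static int drain_bytes(int fd, int size)
{
	char buf[4096];

	while (size > 0) {
		int chunk = size < (int)sizeof(buf) ? size : (int)sizeof(buf);
		ssize_t got = recv(fd, buf, chunk, 0);

		if (got <= 0)
			return -1;	/* error, or EOF in mid-packet */
		size -= (int)got;
	}
	return 0;
}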
3481
3482/*
3483 * config_unknown_volume - device configuration command for unknown volume
3484 *
3485 * When a device is added to an existing connection, the node on which the
3486 * device is added first will send configuration commands to its peer but the
3487 * peer will not know about the device yet. It will warn and ignore these
3488 * commands. Once the device is added on the second node, the second node will
3489 * send the same device configuration commands, but in the other direction.
3490 *
3491 * (We can also end up here if drbd is misconfigured.)
3492 */
bde89a9e 3493static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
4a76b161 3494{
1ec861eb 3495 drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
2fcb8f30 3496 cmdname(pi->cmd), pi->vnr);
bde89a9e 3497 return ignore_remaining_packet(connection, pi);
4a76b161
AG
3498}
3499
bde89a9e 3500static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
b411b363 3501{
9f4fe9ad 3502 struct drbd_peer_device *peer_device;
b30ab791 3503 struct drbd_device *device;
e658983a 3504 struct p_rs_param_95 *p;
b411b363
PR
3505 unsigned int header_size, data_size, exp_max_sz;
3506 struct crypto_hash *verify_tfm = NULL;
3507 struct crypto_hash *csums_tfm = NULL;
2ec91e0e 3508 struct net_conf *old_net_conf, *new_net_conf = NULL;
813472ce 3509 struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
bde89a9e 3510 const int apv = connection->agreed_pro_version;
813472ce 3511 struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
778f271d 3512 int fifo_size = 0;
82bc0194 3513 int err;
b411b363 3514
9f4fe9ad
AG
3515 peer_device = conn_peer_device(connection, pi->vnr);
3516 if (!peer_device)
bde89a9e 3517 return config_unknown_volume(connection, pi);
9f4fe9ad 3518 device = peer_device->device;
b411b363
PR
3519
3520 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
3521 : apv == 88 ? sizeof(struct p_rs_param)
3522 + SHARED_SECRET_MAX
8e26f9cc
PR
3523 : apv <= 94 ? sizeof(struct p_rs_param_89)
3524 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
b411b363 3525
e2857216 3526 if (pi->size > exp_max_sz) {
d0180171 3527 drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n",
e2857216 3528 pi->size, exp_max_sz);
82bc0194 3529 return -EIO;
b411b363
PR
3530 }
3531
3532 if (apv <= 88) {
e658983a 3533 header_size = sizeof(struct p_rs_param);
e2857216 3534 data_size = pi->size - header_size;
8e26f9cc 3535 } else if (apv <= 94) {
e658983a 3536 header_size = sizeof(struct p_rs_param_89);
e2857216 3537 data_size = pi->size - header_size;
0b0ba1ef 3538 D_ASSERT(device, data_size == 0);
8e26f9cc 3539 } else {
e658983a 3540 header_size = sizeof(struct p_rs_param_95);
e2857216 3541 data_size = pi->size - header_size;
0b0ba1ef 3542 D_ASSERT(device, data_size == 0);
b411b363
PR
3543 }
3544
3545 /* initialize verify_alg and csums_alg */
e658983a 3546 p = pi->data;
b411b363
PR
3547 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
3548
9f4fe9ad 3549 err = drbd_recv_all(peer_device->connection, p, header_size);
82bc0194
AG
3550 if (err)
3551 return err;
b411b363 3552
0500813f 3553 mutex_lock(&connection->resource->conf_update);
9f4fe9ad 3554 old_net_conf = peer_device->connection->net_conf;
b30ab791 3555 if (get_ldev(device)) {
813472ce
PR
3556 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3557 if (!new_disk_conf) {
b30ab791 3558 put_ldev(device);
0500813f 3559 mutex_unlock(&connection->resource->conf_update);
d0180171 3560 drbd_err(device, "Allocation of new disk_conf failed\n");
813472ce
PR
3561 return -ENOMEM;
3562 }
daeda1cc 3563
b30ab791 3564 old_disk_conf = device->ldev->disk_conf;
813472ce 3565 *new_disk_conf = *old_disk_conf;
b411b363 3566
6394b935 3567 new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
813472ce 3568 }
b411b363
PR
3569
3570 if (apv >= 88) {
3571 if (apv == 88) {
5de73827 3572 if (data_size > SHARED_SECRET_MAX || data_size == 0) {
d0180171 3573 drbd_err(device, "verify-alg of wrong size, "
5de73827
PR
3574 "peer wants %u, accepting only up to %u byte\n",
3575 data_size, SHARED_SECRET_MAX);
813472ce
PR
3576 err = -EIO;
3577 goto reconnect;
b411b363
PR
3578 }
3579
9f4fe9ad 3580 err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
813472ce
PR
3581 if (err)
3582 goto reconnect;
b411b363
PR
3583 /* we expect NUL terminated string */
3584 /* but just in case someone tries to be evil */
0b0ba1ef 3585 D_ASSERT(device, p->verify_alg[data_size-1] == 0);
b411b363
PR
3586 p->verify_alg[data_size-1] = 0;
3587
3588 } else /* apv >= 89 */ {
3589 /* we still expect NUL terminated strings */
3590 /* but just in case someone tries to be evil */
0b0ba1ef
AG
3591 D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0);
3592 D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0);
b411b363
PR
3593 p->verify_alg[SHARED_SECRET_MAX-1] = 0;
3594 p->csums_alg[SHARED_SECRET_MAX-1] = 0;
3595 }
3596
2ec91e0e 3597 if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
b30ab791 3598 if (device->state.conn == C_WF_REPORT_PARAMS) {
d0180171 3599 drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2ec91e0e 3600 old_net_conf->verify_alg, p->verify_alg);
b411b363
PR
3601 goto disconnect;
3602 }
b30ab791 3603 verify_tfm = drbd_crypto_alloc_digest_safe(device,
b411b363
PR
3604 p->verify_alg, "verify-alg");
3605 if (IS_ERR(verify_tfm)) {
3606 verify_tfm = NULL;
3607 goto disconnect;
3608 }
3609 }
3610
2ec91e0e 3611 if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
b30ab791 3612 if (device->state.conn == C_WF_REPORT_PARAMS) {
d0180171 3613 drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2ec91e0e 3614 old_net_conf->csums_alg, p->csums_alg);
b411b363
PR
3615 goto disconnect;
3616 }
b30ab791 3617 csums_tfm = drbd_crypto_alloc_digest_safe(device,
b411b363
PR
3618 p->csums_alg, "csums-alg");
3619 if (IS_ERR(csums_tfm)) {
3620 csums_tfm = NULL;
3621 goto disconnect;
3622 }
3623 }
3624
813472ce 3625 if (apv > 94 && new_disk_conf) {
daeda1cc
PR
3626 new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
3627 new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
3628 new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
3629 new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
778f271d 3630
daeda1cc 3631 fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
b30ab791 3632 if (fifo_size != device->rs_plan_s->size) {
813472ce
PR
3633 new_plan = fifo_alloc(fifo_size);
3634 if (!new_plan) {
d0180171 3635 drbd_err(device, "kmalloc of fifo_buffer failed\n");
b30ab791 3636 put_ldev(device);
778f271d
PR
3637 goto disconnect;
3638 }
3639 }
8e26f9cc 3640 }
b411b363 3641
91fd4dad 3642 if (verify_tfm || csums_tfm) {
2ec91e0e
PR
3643 new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
3644 if (!new_net_conf) {
d0180171 3645 drbd_err(device, "Allocation of new net_conf failed\n");
91fd4dad
PR
3646 goto disconnect;
3647 }
3648
2ec91e0e 3649 *new_net_conf = *old_net_conf;
91fd4dad
PR
3650
3651 if (verify_tfm) {
2ec91e0e
PR
3652 strcpy(new_net_conf->verify_alg, p->verify_alg);
3653 new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
9f4fe9ad
AG
3654 crypto_free_hash(peer_device->connection->verify_tfm);
3655 peer_device->connection->verify_tfm = verify_tfm;
d0180171 3656 drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
91fd4dad
PR
3657 }
3658 if (csums_tfm) {
2ec91e0e
PR
3659 strcpy(new_net_conf->csums_alg, p->csums_alg);
3660 new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
9f4fe9ad
AG
3661 crypto_free_hash(peer_device->connection->csums_tfm);
3662 peer_device->connection->csums_tfm = csums_tfm;
d0180171 3663 drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
91fd4dad 3664 }
bde89a9e 3665 rcu_assign_pointer(connection->net_conf, new_net_conf);
778f271d 3666 }
b411b363
PR
3667 }
3668
813472ce 3669 if (new_disk_conf) {
b30ab791
AG
3670 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
3671 put_ldev(device);
813472ce
PR
3672 }
3673
3674 if (new_plan) {
b30ab791
AG
3675 old_plan = device->rs_plan_s;
3676 rcu_assign_pointer(device->rs_plan_s, new_plan);
b411b363 3677 }
daeda1cc 3678
0500813f 3679 mutex_unlock(&connection->resource->conf_update);
daeda1cc
PR
3680 synchronize_rcu();
3681 if (new_net_conf)
3682 kfree(old_net_conf);
3683 kfree(old_disk_conf);
813472ce 3684 kfree(old_plan);
daeda1cc 3685
82bc0194 3686 return 0;
b411b363 3687
813472ce
PR
3688reconnect:
3689 if (new_disk_conf) {
b30ab791 3690 put_ldev(device);
813472ce
PR
3691 kfree(new_disk_conf);
3692 }
0500813f 3693 mutex_unlock(&connection->resource->conf_update);
813472ce
PR
3694 return -EIO;
3695
b411b363 3696disconnect:
813472ce
PR
3697 kfree(new_plan);
3698 if (new_disk_conf) {
b30ab791 3699 put_ldev(device);
813472ce
PR
3700 kfree(new_disk_conf);
3701 }
0500813f 3702 mutex_unlock(&connection->resource->conf_update);
b411b363
PR
3703 /* just for completeness: actually not needed,
3704 * as this is not reached if csums_tfm was ok. */
3705 crypto_free_hash(csums_tfm);
3706 /* but free the verify_tfm again, if csums_tfm did not work out */
3707 crypto_free_hash(verify_tfm);
9f4fe9ad 3708 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3709 return -EIO;
b411b363
PR
3710}
3711
b411b363 3712/* warn if the arguments differ by more than 12.5% */
b30ab791 3713static void warn_if_differ_considerably(struct drbd_device *device,
b411b363
PR
3714 const char *s, sector_t a, sector_t b)
3715{
3716 sector_t d;
3717 if (a == 0 || b == 0)
3718 return;
3719 d = (a > b) ? (a - b) : (b - a);
3720 if (d > (a>>3) || d > (b>>3))
d0180171 3721 drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
b411b363
PR
3722 (unsigned long long)a, (unsigned long long)b);
3723}
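/*
 * Editor's note: "more than 12.5%" is implemented as "difference greater
 * than 1/8 of either value" via the shifts above.  For a = 1000 and
 * b = 900, d = 100 is below both 1000>>3 = 125 and 900>>3 = 112, so no
 * warning; for a = 1000 and b = 800, d = 200 > 125 triggers it.  A
 * hypothetical stand-alone predicate with the same arithmetic:
 */
#include <stdint.h>

static inline int differ_considerably(uint64_t a, uint64_t b)
{
	uint64_t d = (a > b) ? (a - b) : (b - a);

	return a && b && (d > (a >> 3) || d > (b >> 3));
}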
3724
bde89a9e 3725static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
b411b363 3726{
9f4fe9ad 3727 struct drbd_peer_device *peer_device;
b30ab791 3728 struct drbd_device *device;
e658983a 3729 struct p_sizes *p = pi->data;
e96c9633 3730 enum determine_dev_size dd = DS_UNCHANGED;
6a8d68b1 3731 sector_t p_size, p_usize, p_csize, my_usize;
b411b363 3732 int ldsc = 0; /* local disk size changed */
e89b591c 3733 enum dds_flags ddsf;
b411b363 3734
9f4fe9ad
AG
3735 peer_device = conn_peer_device(connection, pi->vnr);
3736 if (!peer_device)
bde89a9e 3737 return config_unknown_volume(connection, pi);
9f4fe9ad 3738 device = peer_device->device;
4a76b161 3739
b411b363
PR
3740 p_size = be64_to_cpu(p->d_size);
3741 p_usize = be64_to_cpu(p->u_size);
6a8d68b1 3742 p_csize = be64_to_cpu(p->c_size);
b411b363 3743
b411b363
PR
3744 /* just store the peer's disk size for now.
3745 * we still need to figure out whether we accept that. */
b30ab791 3746 device->p_size = p_size;
b411b363 3747
b30ab791 3748 if (get_ldev(device)) {
daeda1cc 3749 rcu_read_lock();
b30ab791 3750 my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
daeda1cc
PR
3751 rcu_read_unlock();
3752
b30ab791
AG
3753 warn_if_differ_considerably(device, "lower level device sizes",
3754 p_size, drbd_get_max_capacity(device->ldev));
3755 warn_if_differ_considerably(device, "user requested size",
daeda1cc 3756 p_usize, my_usize);
b411b363
PR
3757
3758 /* if this is the first connect, or an otherwise expected
3759 * param exchange, choose the minimum */
b30ab791 3760 if (device->state.conn == C_WF_REPORT_PARAMS)
daeda1cc 3761 p_usize = min_not_zero(my_usize, p_usize);
b411b363
PR
3762
3763 /* Never shrink a device with usable data during connect.
3764 But allow online shrinking if we are connected. */
b30ab791
AG
3765 if (drbd_new_dev_size(device, device->ldev, p_usize, 0) <
3766 drbd_get_capacity(device->this_bdev) &&
3767 device->state.disk >= D_OUTDATED &&
3768 device->state.conn < C_CONNECTED) {
d0180171 3769 drbd_err(device, "The peer's disk size is too small!\n");
9f4fe9ad 3770 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
b30ab791 3771 put_ldev(device);
82bc0194 3772 return -EIO;
b411b363 3773 }
daeda1cc
PR
3774
3775 if (my_usize != p_usize) {
3776 struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
3777
3778 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3779 if (!new_disk_conf) {
d0180171 3780 drbd_err(device, "Allocation of new disk_conf failed\n");
b30ab791 3781 put_ldev(device);
daeda1cc
PR
3782 return -ENOMEM;
3783 }
3784
0500813f 3785 mutex_lock(&connection->resource->conf_update);
b30ab791 3786 old_disk_conf = device->ldev->disk_conf;
daeda1cc
PR
3787 *new_disk_conf = *old_disk_conf;
3788 new_disk_conf->disk_size = p_usize;
3789
b30ab791 3790 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
0500813f 3791 mutex_unlock(&connection->resource->conf_update);
daeda1cc
PR
3792 synchronize_rcu();
3793 kfree(old_disk_conf);
3794
d0180171 3795 drbd_info(device, "Peer sets u_size to %lu sectors\n",
daeda1cc 3796 (unsigned long)p_usize);
b411b363 3797 }
daeda1cc 3798
b30ab791 3799 put_ldev(device);
b411b363 3800 }
b411b363 3801
20c68fde 3802 device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
20c68fde
LE
3803 /* Keep the call to drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
3804 In case we cleared the QUEUE_FLAG_DISCARD from our queue in
3805 drbd_reconsider_max_bio_size(), we can be sure that after
3806 drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
3807
e89b591c 3808 ddsf = be16_to_cpu(p->dds_flags);
b30ab791 3809 if (get_ldev(device)) {
8fe39aac 3810 drbd_reconsider_max_bio_size(device, device->ldev);
b30ab791
AG
3811 dd = drbd_determine_dev_size(device, ddsf, NULL);
3812 put_ldev(device);
e96c9633 3813 if (dd == DS_ERROR)
82bc0194 3814 return -EIO;
b30ab791 3815 drbd_md_sync(device);
b411b363 3816 } else {
6a8d68b1
LE
3817 /*
3818 * I am diskless, need to accept the peer's *current* size.
3819 * I must NOT accept the peer's backing disk size,
3820 * it may have been larger than mine all along...
3821 *
3822 * At this point, the peer knows more about my disk, or at
3823 * least about what we last agreed upon, than I do.
3824 * So if his c_size is less than his d_size, the most likely
3825 * reason is that *my* d_size was smaller last time we checked.
3826 *
3827 * However, if he sends a zero current size,
3828 * take his (user-capped or) backing disk size anyways.
3829 */
8fe39aac 3830 drbd_reconsider_max_bio_size(device, NULL);
6a8d68b1 3831 drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
b411b363
PR
3832 }
3833
b30ab791
AG
3834 if (get_ldev(device)) {
3835 if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
3836 device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
b411b363
PR
3837 ldsc = 1;
3838 }
3839
b30ab791 3840 put_ldev(device);
b411b363
PR
3841 }
3842
b30ab791 3843 if (device->state.conn > C_WF_REPORT_PARAMS) {
b411b363 3844 if (be64_to_cpu(p->c_size) !=
b30ab791 3845 drbd_get_capacity(device->this_bdev) || ldsc) {
b411b363
PR
3846 /* we have different sizes, probably peer
3847 * needs to know my new size... */
69a22773 3848 drbd_send_sizes(peer_device, 0, ddsf);
b411b363 3849 }
b30ab791
AG
3850 if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
3851 (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
3852 if (device->state.pdsk >= D_INCONSISTENT &&
3853 device->state.disk >= D_INCONSISTENT) {
e89b591c 3854 if (ddsf & DDSF_NO_RESYNC)
d0180171 3855 drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
e89b591c 3856 else
b30ab791 3857 resync_after_online_grow(device);
e89b591c 3858 } else
b30ab791 3859 set_bit(RESYNC_AFTER_NEG, &device->flags);
b411b363
PR
3860 }
3861 }
3862
82bc0194 3863 return 0;
b411b363
PR
3864}
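/*
 * Editor's note: the diskless branch above picks the first non-zero size
 * with the GNU "?:" extension: the peer's current size, else the
 * user-capped size, else the peer's backing disk size.  Written out as a
 * hypothetical portable helper:
 */
#include <stdint.h>

static uint64_t pick_capacity(uint64_t c_size, uint64_t u_size, uint64_t d_size)
{
	if (c_size)		/* size currently agreed upon */
		return c_size;
	if (u_size)		/* user-configured cap */
		return u_size;
	return d_size;		/* peer's backing device size */
}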
3865
bde89a9e 3866static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
b411b363 3867{
9f4fe9ad 3868 struct drbd_peer_device *peer_device;
b30ab791 3869 struct drbd_device *device;
e658983a 3870 struct p_uuids *p = pi->data;
b411b363 3871 u64 *p_uuid;
62b0da3a 3872 int i, updated_uuids = 0;
b411b363 3873
9f4fe9ad
AG
3874 peer_device = conn_peer_device(connection, pi->vnr);
3875 if (!peer_device)
bde89a9e 3876 return config_unknown_volume(connection, pi);
9f4fe9ad 3877 device = peer_device->device;
4a76b161 3878
b411b363 3879 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
063eacf8 3880 if (!p_uuid) {
d0180171 3881 drbd_err(device, "kmalloc of p_uuid failed\n");
063eacf8
JW
3882 return -ENOMEM;
3883 }
b411b363
PR
3884
3885 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
3886 p_uuid[i] = be64_to_cpu(p->uuid[i]);
3887
b30ab791
AG
3888 kfree(device->p_uuid);
3889 device->p_uuid = p_uuid;
b411b363 3890
b30ab791
AG
3891 if (device->state.conn < C_CONNECTED &&
3892 device->state.disk < D_INCONSISTENT &&
3893 device->state.role == R_PRIMARY &&
3894 (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
d0180171 3895 drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
b30ab791 3896 (unsigned long long)device->ed_uuid);
9f4fe9ad 3897 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 3898 return -EIO;
b411b363
PR
3899 }
3900
b30ab791 3901 if (get_ldev(device)) {
b411b363 3902 int skip_initial_sync =
b30ab791 3903 device->state.conn == C_CONNECTED &&
9f4fe9ad 3904 peer_device->connection->agreed_pro_version >= 90 &&
b30ab791 3905 device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
b411b363
PR
3906 (p_uuid[UI_FLAGS] & 8);
3907 if (skip_initial_sync) {
d0180171 3908 drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
b30ab791 3909 drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
20ceb2b2
LE
3910 "clear_n_write from receive_uuids",
3911 BM_LOCKED_TEST_ALLOWED);
b30ab791
AG
3912 _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
3913 _drbd_uuid_set(device, UI_BITMAP, 0);
3914 _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
b411b363 3915 CS_VERBOSE, NULL);
b30ab791 3916 drbd_md_sync(device);
62b0da3a 3917 updated_uuids = 1;
b411b363 3918 }
b30ab791
AG
3919 put_ldev(device);
3920 } else if (device->state.disk < D_INCONSISTENT &&
3921 device->state.role == R_PRIMARY) {
18a50fa2
PR
3922 /* I am a diskless primary, the peer just created a new current UUID
3923 for me. */
b30ab791 3924 updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
b411b363
PR
3925 }
3926
3927 /* Before we test for the disk state, we should wait until a possibly
3928 ongoing cluster-wide state change has finished. That is important if
3929 we are primary and are detaching from our disk. We need to see the
3930 new disk state... */
b30ab791
AG
3931 mutex_lock(device->state_mutex);
3932 mutex_unlock(device->state_mutex);
3933 if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
3934 updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
62b0da3a
LE
3935
3936 if (updated_uuids)
b30ab791 3937 drbd_print_uuids(device, "receiver updated UUIDs to");
b411b363 3938
82bc0194 3939 return 0;
b411b363
PR
3940}
3941
3942/**
3943 * convert_state() - Converts the peer's view of the cluster state to our point of view
3944 * @ps: The state as seen by the peer.
3945 */
3946static union drbd_state convert_state(union drbd_state ps)
3947{
3948 union drbd_state ms;
3949
3950 static enum drbd_conns c_tab[] = {
369bea63 3951 [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
b411b363
PR
3952 [C_CONNECTED] = C_CONNECTED,
3953
3954 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3955 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3956 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3957 [C_VERIFY_S] = C_VERIFY_T,
3958 [C_MASK] = C_MASK,
3959 };
3960
3961 ms.i = ps.i;
3962
3963 ms.conn = c_tab[ps.conn];
3964 ms.peer = ps.role;
3965 ms.role = ps.peer;
3966 ms.pdsk = ps.disk;
3967 ms.disk = ps.pdsk;
3968 ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3969
3970 return ms;
3971}
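/*
 * Editor's note: a simplified stand-in for the swap convert_state()
 * performs (union drbd_state replaced by a plain struct so the sketch is
 * self-contained).  The peer's "my role/disk" becomes our "peer
 * role/disk" and vice versa; asymmetric connection states go through
 * c_tab, e.g. the peer's C_STARTING_SYNC_S is our C_STARTING_SYNC_T.
 */
struct view { int role, peer, disk, pdsk; };

static struct view mirror(struct view ps)
{
	struct view ms = ps;

	ms.role = ps.peer;	/* the peer's "you" is our "me" */
	ms.peer = ps.role;
	ms.disk = ps.pdsk;
	ms.pdsk = ps.disk;
	return ms;
}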
3972
bde89a9e 3973static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
b411b363 3974{
9f4fe9ad 3975 struct drbd_peer_device *peer_device;
b30ab791 3976 struct drbd_device *device;
e658983a 3977 struct p_req_state *p = pi->data;
b411b363 3978 union drbd_state mask, val;
bf885f8a 3979 enum drbd_state_rv rv;
b411b363 3980
9f4fe9ad
AG
3981 peer_device = conn_peer_device(connection, pi->vnr);
3982 if (!peer_device)
4a76b161 3983 return -EIO;
9f4fe9ad 3984 device = peer_device->device;
4a76b161 3985
b411b363
PR
3986 mask.i = be32_to_cpu(p->mask);
3987 val.i = be32_to_cpu(p->val);
3988
9f4fe9ad 3989 if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
b30ab791 3990 mutex_is_locked(device->state_mutex)) {
69a22773 3991 drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
82bc0194 3992 return 0;
b411b363
PR
3993 }
3994
3995 mask = convert_state(mask);
3996 val = convert_state(val);
3997
b30ab791 3998 rv = drbd_change_state(device, CS_VERBOSE, mask, val);
69a22773 3999 drbd_send_sr_reply(peer_device, rv);
b411b363 4000
b30ab791 4001 drbd_md_sync(device);
b411b363 4002
82bc0194 4003 return 0;
b411b363
PR
4004}
4005
bde89a9e 4006static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4007{
e658983a 4008 struct p_req_state *p = pi->data;
b411b363 4009 union drbd_state mask, val;
bf885f8a 4010 enum drbd_state_rv rv;
b411b363 4011
b411b363
PR
4012 mask.i = be32_to_cpu(p->mask);
4013 val.i = be32_to_cpu(p->val);
4014
bde89a9e
AG
4015 if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
4016 mutex_is_locked(&connection->cstate_mutex)) {
4017 conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
82bc0194 4018 return 0;
b411b363
PR
4019 }
4020
4021 mask = convert_state(mask);
4022 val = convert_state(val);
4023
bde89a9e
AG
4024 rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
4025 conn_send_sr_reply(connection, rv);
b411b363 4026
82bc0194 4027 return 0;
b411b363
PR
4028}
4029
bde89a9e 4030static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4031{
9f4fe9ad 4032 struct drbd_peer_device *peer_device;
b30ab791 4033 struct drbd_device *device;
e658983a 4034 struct p_state *p = pi->data;
4ac4aada 4035 union drbd_state os, ns, peer_state;
b411b363 4036 enum drbd_disk_state real_peer_disk;
65d922c3 4037 enum chg_state_flags cs_flags;
b411b363
PR
4038 int rv;
4039
9f4fe9ad
AG
4040 peer_device = conn_peer_device(connection, pi->vnr);
4041 if (!peer_device)
bde89a9e 4042 return config_unknown_volume(connection, pi);
9f4fe9ad 4043 device = peer_device->device;
4a76b161 4044
b411b363
PR
4045 peer_state.i = be32_to_cpu(p->state);
4046
4047 real_peer_disk = peer_state.disk;
4048 if (peer_state.disk == D_NEGOTIATING) {
b30ab791 4049 real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
d0180171 4050 drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
b411b363
PR
4051 }
4052
0500813f 4053 spin_lock_irq(&device->resource->req_lock);
b411b363 4054 retry:
b30ab791 4055 os = ns = drbd_read_state(device);
0500813f 4056 spin_unlock_irq(&device->resource->req_lock);
b411b363 4057
545752d5
LE
4058 /* If some other part of the code (asender thread, timeout)
4059 * already decided to close the connection again,
4060 * we must not "re-establish" it here. */
4061 if (os.conn <= C_TEAR_DOWN)
58ffa580 4062 return -ECONNRESET;
545752d5 4063
40424e4a
LE
4064 /* If this is the "end of sync" confirmation, usually the peer disk
4065 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
4066 * set) resync started in PausedSyncT, or if the timing of pause-/
4067 * unpause-sync events has been "just right", the peer disk may
4068 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
4069 */
4070 if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
4071 real_peer_disk == D_UP_TO_DATE &&
e9ef7bb6
LE
4072 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
4073 /* If we are (becoming) SyncSource, but peer is still in sync
4074 * preparation, ignore its uptodate-ness to avoid flapping, it
4075 * will change to inconsistent once the peer reaches active
4076 * syncing states.
4077 * It may have changed syncer-paused flags, however, so we
4078 * cannot ignore this completely. */
4079 if (peer_state.conn > C_CONNECTED &&
4080 peer_state.conn < C_SYNC_SOURCE)
4081 real_peer_disk = D_INCONSISTENT;
4082
4083 /* if peer_state changes to connected at the same time,
4084 * it explicitly notifies us that it finished resync.
4085 * Maybe we should finish it up, too? */
4086 else if (os.conn >= C_SYNC_SOURCE &&
4087 peer_state.conn == C_CONNECTED) {
b30ab791
AG
4088 if (drbd_bm_total_weight(device) <= device->rs_failed)
4089 drbd_resync_finished(device);
82bc0194 4090 return 0;
e9ef7bb6
LE
4091 }
4092 }
4093
02b91b55
LE
4094 /* explicit verify finished notification, stop sector reached. */
4095 if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
4096 peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
b30ab791
AG
4097 ov_out_of_sync_print(device);
4098 drbd_resync_finished(device);
58ffa580 4099 return 0;
02b91b55
LE
4100 }
4101
e9ef7bb6
LE
4102 /* peer says his disk is inconsistent, while we think it is uptodate,
4103 * and this happens while the peer still thinks we have a sync going on,
4104 * but we think we are already done with the sync.
4105 * We ignore this to avoid flapping pdsk.
4106 * This should not happen, if the peer is a recent version of drbd. */
4107 if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
4108 os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
4109 real_peer_disk = D_UP_TO_DATE;
4110
4ac4aada
LE
4111 if (ns.conn == C_WF_REPORT_PARAMS)
4112 ns.conn = C_CONNECTED;
b411b363 4113
67531718
PR
4114 if (peer_state.conn == C_AHEAD)
4115 ns.conn = C_BEHIND;
4116
b30ab791
AG
4117 if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
4118 get_ldev_if_state(device, D_NEGOTIATING)) {
b411b363
PR
4119 int cr; /* consider resync */
4120
4121 /* if we established a new connection */
4ac4aada 4122 cr = (os.conn < C_CONNECTED);
b411b363
PR
4123 /* if we had an established connection
4124 * and one of the nodes newly attaches a disk */
4ac4aada 4125 cr |= (os.conn == C_CONNECTED &&
b411b363 4126 (peer_state.disk == D_NEGOTIATING ||
4ac4aada 4127 os.disk == D_NEGOTIATING));
b411b363
PR
4128 /* if we have both been inconsistent, and the peer has been
4129 * forced to be UpToDate with --overwrite-data */
b30ab791 4130 cr |= test_bit(CONSIDER_RESYNC, &device->flags);
b411b363
PR
4131 /* if we had been plain connected, and the admin requested to
4132 * start a sync by "invalidate" or "invalidate-remote" */
4ac4aada 4133 cr |= (os.conn == C_CONNECTED &&
b411b363
PR
4134 (peer_state.conn >= C_STARTING_SYNC_S &&
4135 peer_state.conn <= C_WF_BITMAP_T));
4136
4137 if (cr)
69a22773 4138 ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);
b411b363 4139
b30ab791 4140 put_ldev(device);
4ac4aada
LE
4141 if (ns.conn == C_MASK) {
4142 ns.conn = C_CONNECTED;
b30ab791
AG
4143 if (device->state.disk == D_NEGOTIATING) {
4144 drbd_force_state(device, NS(disk, D_FAILED));
b411b363 4145 } else if (peer_state.disk == D_NEGOTIATING) {
d0180171 4146 drbd_err(device, "Disk attach process on the peer node was aborted.\n");
b411b363 4147 peer_state.disk = D_DISKLESS;
580b9767 4148 real_peer_disk = D_DISKLESS;
b411b363 4149 } else {
9f4fe9ad 4150 if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
82bc0194 4151 return -EIO;
0b0ba1ef 4152 D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
9f4fe9ad 4153 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 4154 return -EIO;
b411b363
PR
4155 }
4156 }
4157 }
4158
0500813f 4159 spin_lock_irq(&device->resource->req_lock);
b30ab791 4160 if (os.i != drbd_read_state(device).i)
b411b363 4161 goto retry;
b30ab791 4162 clear_bit(CONSIDER_RESYNC, &device->flags);
b411b363
PR
4163 ns.peer = peer_state.role;
4164 ns.pdsk = real_peer_disk;
4165 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
4ac4aada 4166 if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
b30ab791 4167 ns.disk = device->new_state_tmp.disk;
4ac4aada 4168 cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
b30ab791
AG
4169 if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
4170 test_bit(NEW_CUR_UUID, &device->flags)) {
8554df1c 4171 /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
481c6f50 4172 for temporary network outages! */
0500813f 4173 spin_unlock_irq(&device->resource->req_lock);
d0180171 4174 drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
9f4fe9ad 4175 tl_clear(peer_device->connection);
b30ab791
AG
4176 drbd_uuid_new_current(device);
4177 clear_bit(NEW_CUR_UUID, &device->flags);
9f4fe9ad 4178 conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
82bc0194 4179 return -EIO;
481c6f50 4180 }
b30ab791
AG
4181 rv = _drbd_set_state(device, ns, cs_flags, NULL);
4182 ns = drbd_read_state(device);
0500813f 4183 spin_unlock_irq(&device->resource->req_lock);
b411b363
PR
4184
4185 if (rv < SS_SUCCESS) {
9f4fe9ad 4186 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
82bc0194 4187 return -EIO;
b411b363
PR
4188 }
4189
4ac4aada
LE
4190 if (os.conn > C_WF_REPORT_PARAMS) {
4191 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
b411b363
PR
4192 peer_state.disk != D_NEGOTIATING ) {
4193 /* we want resync, peer has not yet decided to sync... */
4194 /* Nowadays only used when forcing a node into primary role and
4195 setting its disk to UpToDate with that */
69a22773
AG
4196 drbd_send_uuids(peer_device);
4197 drbd_send_current_state(peer_device);
b411b363
PR
4198 }
4199 }
4200
b30ab791 4201 clear_bit(DISCARD_MY_DATA, &device->flags);
b411b363 4202
b30ab791 4203 drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */
b411b363 4204
82bc0194 4205 return 0;
b411b363
PR
4206}
4207
bde89a9e 4208static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4209{
9f4fe9ad 4210 struct drbd_peer_device *peer_device;
b30ab791 4211 struct drbd_device *device;
e658983a 4212 struct p_rs_uuid *p = pi->data;
4a76b161 4213
9f4fe9ad
AG
4214 peer_device = conn_peer_device(connection, pi->vnr);
4215 if (!peer_device)
4a76b161 4216 return -EIO;
9f4fe9ad 4217 device = peer_device->device;
b411b363 4218
b30ab791
AG
4219 wait_event(device->misc_wait,
4220 device->state.conn == C_WF_SYNC_UUID ||
4221 device->state.conn == C_BEHIND ||
4222 device->state.conn < C_CONNECTED ||
4223 device->state.disk < D_NEGOTIATING);
b411b363 4224
0b0ba1ef 4225 /* D_ASSERT(device, device->state.conn == C_WF_SYNC_UUID ); */
b411b363 4226
b411b363
PR
4227 /* Here the _drbd_uuid_ functions are right, current should
4228 _not_ be rotated into the history */
b30ab791
AG
4229 if (get_ldev_if_state(device, D_NEGOTIATING)) {
4230 _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid));
4231 _drbd_uuid_set(device, UI_BITMAP, 0UL);
b411b363 4232
b30ab791
AG
4233 drbd_print_uuids(device, "updated sync uuid");
4234 drbd_start_resync(device, C_SYNC_TARGET);
b411b363 4235
b30ab791 4236 put_ldev(device);
b411b363 4237 } else
d0180171 4238 drbd_err(device, "Ignoring SyncUUID packet!\n");
b411b363 4239
82bc0194 4240 return 0;
b411b363
PR
4241}
4242
2c46407d
AG
4243/**
4244 * receive_bitmap_plain
4245 *
4246 * Return 0 when done, 1 when another iteration is needed, and a negative error
4247 * code upon failure.
4248 */
4249static int
69a22773 4250receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
e658983a 4251 unsigned long *p, struct bm_xfer_ctx *c)
b411b363 4252{
50d0b1ad 4253 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
69a22773 4254 drbd_header_size(peer_device->connection);
e658983a 4255 unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
50d0b1ad 4256 c->bm_words - c->word_offset);
e658983a 4257 unsigned int want = num_words * sizeof(*p);
2c46407d 4258 int err;
b411b363 4259
50d0b1ad 4260 if (want != size) {
69a22773 4261 drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
2c46407d 4262 return -EIO;
b411b363
PR
4263 }
4264 if (want == 0)
2c46407d 4265 return 0;
69a22773 4266 err = drbd_recv_all(peer_device->connection, p, want);
82bc0194 4267 if (err)
2c46407d 4268 return err;
b411b363 4269
69a22773 4270 drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p);
b411b363
PR
4271
4272 c->word_offset += num_words;
4273 c->bit_offset = c->word_offset * BITS_PER_LONG;
4274 if (c->bit_offset > c->bm_bits)
4275 c->bit_offset = c->bm_bits;
4276
2c46407d 4277 return 1;
b411b363
PR
4278}
4279
a02d1240
AG
4280static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
4281{
4282 return (enum drbd_bitmap_code)(p->encoding & 0x0f);
4283}
4284
4285static int dcbp_get_start(struct p_compressed_bm *p)
4286{
4287 return (p->encoding & 0x80) != 0;
4288}
4289
4290static int dcbp_get_pad_bits(struct p_compressed_bm *p)
4291{
4292 return (p->encoding >> 4) & 0x7;
4293}
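/*
 * Editor's note: the three accessors above unpack one "encoding" byte:
 * bits 0-3 carry the bitmap code, bits 4-6 the number of padding bits,
 * and bit 7 the initial toggle.  A hypothetical encoder for the same
 * layout makes the packing explicit:
 */
#include <stdint.h>

static uint8_t dcbp_make_encoding(int code, int pad_bits, int start)
{
	return (uint8_t)((code & 0x0f) |
			 ((pad_bits & 0x7) << 4) |
			 (start ? 0x80 : 0));
}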
4294
2c46407d
AG
4295/**
4296 * recv_bm_rle_bits
4297 *
4298 * Return 0 when done, 1 when another iteration is needed, and a negative error
4299 * code upon failure.
4300 */
4301static int
69a22773 4302recv_bm_rle_bits(struct drbd_peer_device *peer_device,
b411b363 4303 struct p_compressed_bm *p,
c6d25cfe
PR
4304 struct bm_xfer_ctx *c,
4305 unsigned int len)
b411b363
PR
4306{
4307 struct bitstream bs;
4308 u64 look_ahead;
4309 u64 rl;
4310 u64 tmp;
4311 unsigned long s = c->bit_offset;
4312 unsigned long e;
a02d1240 4313 int toggle = dcbp_get_start(p);
b411b363
PR
4314 int have;
4315 int bits;
4316
a02d1240 4317 bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
b411b363
PR
4318
4319 bits = bitstream_get_bits(&bs, &look_ahead, 64);
4320 if (bits < 0)
2c46407d 4321 return -EIO;
b411b363
PR
4322
4323 for (have = bits; have > 0; s += rl, toggle = !toggle) {
4324 bits = vli_decode_bits(&rl, look_ahead);
4325 if (bits <= 0)
2c46407d 4326 return -EIO;
b411b363
PR
4327
4328 if (toggle) {
4329 e = s + rl -1;
4330 if (e >= c->bm_bits) {
69a22773 4331 drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
2c46407d 4332 return -EIO;
b411b363 4333 }
69a22773 4334 _drbd_bm_set_bits(peer_device->device, s, e);
b411b363
PR
4335 }
4336
4337 if (have < bits) {
69a22773 4338 drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
b411b363
PR
4339 have, bits, look_ahead,
4340 (unsigned int)(bs.cur.b - p->code),
4341 (unsigned int)bs.buf_len);
2c46407d 4342 return -EIO;
b411b363 4343 }
d2da5b0c
LE
4344 /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
4345 if (likely(bits < 64))
4346 look_ahead >>= bits;
4347 else
4348 look_ahead = 0;
b411b363
PR
4349 have -= bits;
4350
4351 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
4352 if (bits < 0)
2c46407d 4353 return -EIO;
b411b363
PR
4354 look_ahead |= tmp << have;
4355 have += bits;
4356 }
4357
4358 c->bit_offset = s;
4359 bm_xfer_ctx_bit_to_word_offset(c);
4360
2c46407d 4361 return (s != c->bm_bits);
b411b363
PR
4362}
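/*
 * Editor's note: a simplified, hypothetical decoder for the same
 * run-length idea.  The stream is a sequence of alternating run lengths
 * of clear and set bits; "toggle" says which kind the first run is, and
 * only runs of set bits touch the bitmap.  Real DRBD packs the lengths
 * with a variable-length code (vli_decode_bits()); here they are plain
 * u64s so the control flow stands alone.
 */
#include <stdint.h>

static int rle_decode(const uint64_t *runs, int nruns, int toggle,
		      unsigned long bm_bits,
		      void (*set_range)(unsigned long s, unsigned long e))
{
	unsigned long s = 0;
	int i;

	for (i = 0; i < nruns; i++, toggle = !toggle) {
		uint64_t rl = runs[i];

		if (rl == 0 || s + rl - 1 >= bm_bits)
			return -1;		/* corrupt stream or overflow */
		if (toggle)			/* a run of set bits */
			set_range(s, s + rl - 1);
		s += rl;
	}
	return 0;
}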
4363
2c46407d
AG
4364/**
4365 * decode_bitmap_c
4366 *
4367 * Return 0 when done, 1 when another iteration is needed, and a negative error
4368 * code upon failure.
4369 */
4370static int
69a22773 4371decode_bitmap_c(struct drbd_peer_device *peer_device,
b411b363 4372 struct p_compressed_bm *p,
c6d25cfe
PR
4373 struct bm_xfer_ctx *c,
4374 unsigned int len)
b411b363 4375{
a02d1240 4376 if (dcbp_get_code(p) == RLE_VLI_Bits)
69a22773 4377 return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
b411b363
PR
4378
4379 /* other variants had been implemented for evaluation,
4380 * but have been dropped as this one turned out to be "best"
4381 * during all our tests. */
4382
69a22773
AG
4383 drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
4384 conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
2c46407d 4385 return -EIO;
b411b363
PR
4386}
4387
b30ab791 4388void INFO_bm_xfer_stats(struct drbd_device *device,
b411b363
PR
4389 const char *direction, struct bm_xfer_ctx *c)
4390{
4391 /* what would it take to transfer it "plaintext" */
a6b32bc3 4392 unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
50d0b1ad
AG
4393 unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
4394 unsigned int plain =
4395 header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
4396 c->bm_words * sizeof(unsigned long);
4397 unsigned int total = c->bytes[0] + c->bytes[1];
4398 unsigned int r;
b411b363
PR
4399
4400 /* total cannot be zero, but just in case: */
4401 if (total == 0)
4402 return;
4403
4404 /* don't report if not compressed */
4405 if (total >= plain)
4406 return;
4407
4408 /* total < plain. check for overflow, still */
4409 r = (total > UINT_MAX/1000) ? (total / (plain/1000))
4410 : (1000 * total / plain);
4411
4412 if (r > 1000)
4413 r = 1000;
4414
4415 r = 1000 - r;
d0180171 4416 drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
b411b363
PR
4417 "total %u; compression: %u.%u%%\n",
4418 direction,
4419 c->bytes[1], c->packets[1],
4420 c->bytes[0], c->packets[0],
4421 total, r/10, r % 10);
4422}
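/*
 * Editor's note: the ratio above is reported in tenths of a percent,
 * with an overflow guard: 1000 * total only fits in 32 bits while
 * total <= UINT_MAX / 1000, otherwise the division is reordered (plain
 * is > total > UINT_MAX/1000 there, so plain/1000 cannot be zero).  As
 * a stand-alone helper with the same arithmetic (hypothetical name):
 */
#include <limits.h>

static unsigned int compression_savings_per_mille(unsigned int total,
						  unsigned int plain)
{
	unsigned int r;

	if (total == 0 || total >= plain)
		return 0;			/* not compressed */
	r = (total > UINT_MAX / 1000) ? total / (plain / 1000)
				      : 1000 * total / plain;
	if (r > 1000)
		r = 1000;
	return 1000 - r;			/* 900 => "compression: 90.0%" */
}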
4423
4424/* Since we are processing the bitfield from lower addresses to higher,
4425 it does not matter whether we process it in 32 bit or 64 bit
4426 chunks, as long as it is little endian. (Understand it as a byte stream,
4427 beginning with the lowest byte...) If we used big endian,
4428 we would need to process it from the highest address to the lowest
4429 in order to be agnostic to the 32 vs 64 bit issue.
4430
4431 returns 0 on failure, 1 if we successfully received it. */
bde89a9e 4432static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
b411b363 4433{
9f4fe9ad 4434 struct drbd_peer_device *peer_device;
b30ab791 4435 struct drbd_device *device;
b411b363 4436 struct bm_xfer_ctx c;
2c46407d 4437 int err;
4a76b161 4438
9f4fe9ad
AG
4439 peer_device = conn_peer_device(connection, pi->vnr);
4440 if (!peer_device)
4a76b161 4441 return -EIO;
9f4fe9ad 4442 device = peer_device->device;
b411b363 4443
b30ab791 4444 drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
20ceb2b2
LE
4445 /* you are supposed to send additional out-of-sync information
4446 * if you actually set bits during this phase */
b411b363 4447
b411b363 4448 c = (struct bm_xfer_ctx) {
b30ab791
AG
4449 .bm_bits = drbd_bm_bits(device),
4450 .bm_words = drbd_bm_words(device),
b411b363
PR
4451 };
4452
2c46407d 4453 for(;;) {
e658983a 4454 if (pi->cmd == P_BITMAP)
69a22773 4455 err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
e658983a 4456 else if (pi->cmd == P_COMPRESSED_BITMAP) {
b411b363
PR
4457 /* MAYBE: sanity check that we speak proto >= 90,
4458 * and the feature is enabled! */
e658983a 4459 struct p_compressed_bm *p = pi->data;
b411b363 4460
bde89a9e 4461 if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
d0180171 4462 drbd_err(device, "ReportCBitmap packet too large\n");
82bc0194 4463 err = -EIO;
b411b363
PR
4464 goto out;
4465 }
e658983a 4466 if (pi->size <= sizeof(*p)) {
d0180171 4467 drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
82bc0194 4468 err = -EIO;
78fcbdae 4469 goto out;
b411b363 4470 }
9f4fe9ad 4471 err = drbd_recv_all(peer_device->connection, p, pi->size);
e658983a
AG
4472 if (err)
4473 goto out;
69a22773 4474 err = decode_bitmap_c(peer_device, p, &c, pi->size);
b411b363 4475 } else {
d0180171 4476 drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)\n", pi->cmd);
82bc0194 4477 err = -EIO;
b411b363
PR
4478 goto out;
4479 }
4480
e2857216 4481 c.packets[pi->cmd == P_BITMAP]++;
bde89a9e 4482 c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
b411b363 4483
2c46407d
AG
4484 if (err <= 0) {
4485 if (err < 0)
4486 goto out;
b411b363 4487 break;
2c46407d 4488 }
9f4fe9ad 4489 err = drbd_recv_header(peer_device->connection, pi);
82bc0194 4490 if (err)
b411b363 4491 goto out;
2c46407d 4492 }
b411b363 4493
b30ab791 4494 INFO_bm_xfer_stats(device, "receive", &c);
b411b363 4495
b30ab791 4496 if (device->state.conn == C_WF_BITMAP_T) {
de1f8e4a
AG
4497 enum drbd_state_rv rv;
4498
b30ab791 4499 err = drbd_send_bitmap(device);
82bc0194 4500 if (err)
b411b363
PR
4501 goto out;
4502 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
b30ab791 4503 rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
0b0ba1ef 4504 D_ASSERT(device, rv == SS_SUCCESS);
b30ab791 4505 } else if (device->state.conn != C_WF_BITMAP_S) {
b411b363
PR
4506 /* admin may have requested C_DISCONNECTING,
4507 * other threads may have noticed network errors */
d0180171 4508 drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
b30ab791 4509 drbd_conn_str(device->state.conn));
b411b363 4510 }
82bc0194 4511 err = 0;
b411b363 4512
b411b363 4513 out:
b30ab791
AG
4514 drbd_bm_unlock(device);
4515 if (!err && device->state.conn == C_WF_BITMAP_S)
4516 drbd_start_resync(device, C_SYNC_SOURCE);
82bc0194 4517 return err;
b411b363
PR
4518}
4519
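/*
 * Illustrative note, not upstream documentation: the loop in
 * receive_bitmap() above keeps pulling P_BITMAP / P_COMPRESSED_BITMAP
 * packets and feeding them to the decoder until it signals completion:
 *
 *	err >  0  chunk consumed, another packet is expected
 *	err == 0  last chunk decoded, the bitmap transfer is complete
 *	err <  0  protocol error, abort the transfer
 */
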
static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
{
	drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
		  pi->cmd, pi->size);

	return ignore_remaining_packet(connection, pi);
}

static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
{
	/* Make sure we've acked all the TCP data associated
	 * with the data requests being unplugged */
	drbd_tcp_quickack(connection->data.socket);

	return 0;
}

static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
{
	struct drbd_peer_device *peer_device;
	struct drbd_device *device;
	struct p_block_desc *p = pi->data;

	peer_device = conn_peer_device(connection, pi->vnr);
	if (!peer_device)
		return -EIO;
	device = peer_device->device;

	switch (device->state.conn) {
	case C_WF_SYNC_UUID:
	case C_WF_BITMAP_T:
	case C_BEHIND:
		break;
	default:
		drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
			 drbd_conn_str(device->state.conn));
	}

	drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));

	return 0;
}

struct data_cmd {
	int expect_payload;
	size_t pkt_size;
	int (*fn)(struct drbd_connection *, struct packet_info *);
};

static struct data_cmd drbd_cmd_handler[] = {
	[P_DATA]	    = { 1, sizeof(struct p_data), receive_Data },
	[P_DATA_REPLY]	    = { 1, sizeof(struct p_data), receive_DataReply },
	[P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply },
	[P_BARRIER]	    = { 0, sizeof(struct p_barrier), receive_Barrier },
	[P_BITMAP]	    = { 1, 0, receive_bitmap },
	[P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap },
	[P_UNPLUG_REMOTE]   = { 0, 0, receive_UnplugRemote },
	[P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
	[P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
	[P_SYNC_PARAM]	    = { 1, 0, receive_SyncParam },
	[P_SYNC_PARAM89]    = { 1, 0, receive_SyncParam },
	[P_PROTOCOL]	    = { 1, sizeof(struct p_protocol), receive_protocol },
	[P_UUIDS]	    = { 0, sizeof(struct p_uuids), receive_uuids },
	[P_SIZES]	    = { 0, sizeof(struct p_sizes), receive_sizes },
	[P_STATE]	    = { 0, sizeof(struct p_state), receive_state },
	[P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
	[P_SYNC_UUID]	    = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
	[P_OV_REQUEST]	    = { 0, sizeof(struct p_block_req), receive_DataRequest },
	[P_OV_REPLY]	    = { 1, sizeof(struct p_block_req), receive_DataRequest },
	[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
	[P_DELAY_PROBE]	    = { 0, sizeof(struct p_delay_probe93), receive_skip },
	[P_OUT_OF_SYNC]	    = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
	[P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
	[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
	[P_TRIM]	    = { 0, sizeof(struct p_trim), receive_Data },
};

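/*
 * Illustrative note, not upstream documentation: each entry above pairs a
 * packet type with the size of its fixed sub header (pkt_size) and a flag
 * telling whether a variable-length payload may follow (expect_payload).
 * drbdd() below validates every incoming packet against exactly these two
 * fields before dispatching to the handler.
 */
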
static void drbdd(struct drbd_connection *connection)
{
	struct packet_info pi;
	size_t shs; /* sub header size */
	int err;

	while (get_t_state(&connection->receiver) == RUNNING) {
		struct data_cmd *cmd;

		drbd_thread_current_set_cpu(&connection->receiver);
		update_receiver_timing_details(connection, drbd_recv_header);
		if (drbd_recv_header(connection, &pi))
			goto err_out;

		cmd = &drbd_cmd_handler[pi.cmd];
		if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
			drbd_err(connection, "Unexpected data packet %s (0x%04x)\n",
				 cmdname(pi.cmd), pi.cmd);
			goto err_out;
		}

		shs = cmd->pkt_size;
		if (pi.size > shs && !cmd->expect_payload) {
			drbd_err(connection, "No payload expected %s l:%d\n",
				 cmdname(pi.cmd), pi.size);
			goto err_out;
		}

		if (shs) {
			update_receiver_timing_details(connection, drbd_recv_all_warn);
			err = drbd_recv_all_warn(connection, pi.data, shs);
			if (err)
				goto err_out;
			pi.size -= shs;
		}

		update_receiver_timing_details(connection, cmd->fn);
		err = cmd->fn(connection, &pi);
		if (err) {
			drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
				 cmdname(pi.cmd), err, pi.size);
			goto err_out;
		}
	}
	return;

 err_out:
	conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
}

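/*
 * Illustrative sketch, not upstream documentation, of the framing drbdd()
 * above walks through for each packet; the actual header layout depends on
 * the agreed protocol version (see drbd_header_size()):
 *
 *	+---------------------+-----------------------+--------------------+
 *	| header              | sub header            | optional payload   |
 *	| (magic, cmd, size)  | (cmd->pkt_size bytes) | (pi.size remaining |
 *	|                     |                       |  after subtraction)|
 *	+---------------------+-----------------------+--------------------+
 */
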
static void conn_disconnect(struct drbd_connection *connection)
{
	struct drbd_peer_device *peer_device;
	enum drbd_conns oc;
	int vnr;

	if (connection->cstate == C_STANDALONE)
		return;

	/* We are about to start the cleanup after connection loss.
	 * Make sure drbd_make_request knows about that.
	 * Usually we should be in some network failure state already,
	 * but just in case we are not, we fix it up here.
	 */
	conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);

	/* asender does not clean up anything. it must not interfere, either */
	drbd_thread_stop(&connection->asender);
	drbd_free_sock(connection);

	rcu_read_lock();
	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
		struct drbd_device *device = peer_device->device;
		kref_get(&device->kref);
		rcu_read_unlock();
		drbd_disconnected(peer_device);
		kref_put(&device->kref, drbd_destroy_device);
		rcu_read_lock();
	}
	rcu_read_unlock();

	if (!list_empty(&connection->current_epoch->list))
		drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
	/* ok, no more ee's on the fly, it is safe to reset the epoch_size */
	atomic_set(&connection->current_epoch->epoch_size, 0);
	connection->send.seen_any_write_yet = false;

	drbd_info(connection, "Connection closed\n");

	if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
		conn_try_outdate_peer_async(connection);

	spin_lock_irq(&connection->resource->req_lock);
	oc = connection->cstate;
	if (oc >= C_UNCONNECTED)
		_conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);

	spin_unlock_irq(&connection->resource->req_lock);

	if (oc == C_DISCONNECTING)
		conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
}

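/*
 * Illustrative note, not upstream documentation: conn_disconnect() above
 * and connection_finish_peer_reqs() further down iterate the peer devices
 * with the same idiom: take a kref on the device while still under
 * rcu_read_lock(), drop the RCU read lock so the per-device work may
 * sleep, then re-take the lock before advancing the iterator.
 */
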
static int drbd_disconnected(struct drbd_peer_device *peer_device)
{
	struct drbd_device *device = peer_device->device;
	unsigned int i;

	/* wait for current activity to cease. */
	spin_lock_irq(&device->resource->req_lock);
	_drbd_wait_ee_list_empty(device, &device->active_ee);
	_drbd_wait_ee_list_empty(device, &device->sync_ee);
	_drbd_wait_ee_list_empty(device, &device->read_ee);
	spin_unlock_irq(&device->resource->req_lock);

	/* We do not have data structures that would allow us to
	 * get the rs_pending_cnt down to 0 again.
	 *  * On C_SYNC_TARGET we do not have any data structures describing
	 *    the pending RSDataRequest's we have sent.
	 *  * On C_SYNC_SOURCE there is no data structure that tracks
	 *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
	 *  And no, it is not the sum of the reference counts in the
	 *  resync_LRU. The resync_LRU tracks the whole operation including
	 *  the disk-IO, while the rs_pending_cnt only tracks the blocks
	 *  on the fly. */
	drbd_rs_cancel_all(device);
	device->rs_total = 0;
	device->rs_failed = 0;
	atomic_set(&device->rs_pending_cnt, 0);
	wake_up(&device->misc_wait);

	del_timer_sync(&device->resync_timer);
	resync_timer_fn((unsigned long)device);

	/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
	 * w_make_resync_request etc. which may still be on the worker queue
	 * to be "canceled" */
	drbd_flush_workqueue(&peer_device->connection->sender_work);

	drbd_finish_peer_reqs(device);

	/* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
	   might have issued a work again. The one before drbd_finish_peer_reqs() is
	   necessary to reclaim net_ee in drbd_finish_peer_reqs(). */
	drbd_flush_workqueue(&peer_device->connection->sender_work);

	/* need to do it again, drbd_finish_peer_reqs() may have populated it
	 * again via drbd_try_clear_on_disk_bm(). */
	drbd_rs_cancel_all(device);

	kfree(device->p_uuid);
	device->p_uuid = NULL;

	if (!drbd_suspended(device))
		tl_clear(peer_device->connection);

	drbd_md_sync(device);

	/* serialize with bitmap writeout triggered by the state change,
	 * if any. */
	wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));

	/* tcp_close and release of sendpage pages can be deferred. I don't
	 * want to use SO_LINGER, because apparently it can be deferred for
	 * more than 20 seconds (longest time I checked).
	 *
	 * Actually we don't care for exactly when the network stack does its
	 * put_page(), but release our reference on these pages right here.
	 */
	i = drbd_free_peer_reqs(device, &device->net_ee);
	if (i)
		drbd_info(device, "net_ee not empty, killed %u entries\n", i);
	i = atomic_read(&device->pp_in_use_by_net);
	if (i)
		drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
	i = atomic_read(&device->pp_in_use);
	if (i)
		drbd_info(device, "pp_in_use = %d, expected 0\n", i);

	D_ASSERT(device, list_empty(&device->read_ee));
	D_ASSERT(device, list_empty(&device->active_ee));
	D_ASSERT(device, list_empty(&device->sync_ee));
	D_ASSERT(device, list_empty(&device->done_ee));

	return 0;
}

/*
 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
 * we can agree on is stored in agreed_pro_version.
 *
 * feature flags and the reserved array should be enough room for future
 * enhancements of the handshake protocol, and possible plugins...
 *
 * for now, they are expected to be zero, but ignored.
 */
static int drbd_send_features(struct drbd_connection *connection)
{
	struct drbd_socket *sock;
	struct p_connection_features *p;

	sock = &connection->data;
	p = conn_prepare_command(connection, sock);
	if (!p)
		return -EIO;
	memset(p, 0, sizeof(*p));
	p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
	p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
	p->feature_flags = cpu_to_be32(PRO_FEATURES);
	return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
}

/*
 * return values:
 *   1 yes, we have a valid connection
 *   0 oops, did not work out, please try again
 *  -1 peer talks different language,
 *     no point in trying again, please go standalone.
 */
static int drbd_do_features(struct drbd_connection *connection)
{
	/* ASSERT current == connection->receiver ... */
	struct p_connection_features *p;
	const int expect = sizeof(struct p_connection_features);
	struct packet_info pi;
	int err;

	err = drbd_send_features(connection);
	if (err)
		return 0;

	err = drbd_recv_header(connection, &pi);
	if (err)
		return 0;

	if (pi.cmd != P_CONNECTION_FEATURES) {
		drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
			 cmdname(pi.cmd), pi.cmd);
		return -1;
	}

	if (pi.size != expect) {
		drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n",
			 expect, pi.size);
		return -1;
	}

	p = pi.data;
	err = drbd_recv_all_warn(connection, p, expect);
	if (err)
		return 0;

	p->protocol_min = be32_to_cpu(p->protocol_min);
	p->protocol_max = be32_to_cpu(p->protocol_max);
	if (p->protocol_max == 0)
		p->protocol_max = p->protocol_min;

	if (PRO_VERSION_MAX < p->protocol_min ||
	    PRO_VERSION_MIN > p->protocol_max)
		goto incompat;

	connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
	connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);

	drbd_info(connection, "Handshake successful: "
		  "Agreed network protocol version %d\n", connection->agreed_pro_version);

	drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n",
		  connection->agreed_features & FF_TRIM ? " " : " not ");

	return 1;

 incompat:
	drbd_err(connection, "incompatible DRBD dialects: "
		 "I support %d-%d, peer supports %d-%d\n",
		 PRO_VERSION_MIN, PRO_VERSION_MAX,
		 p->protocol_min, p->protocol_max);
	return -1;
}

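/*
 * Minimal sketch, not part of the driver: the version negotiation in
 * drbd_do_features() above reduces to an interval overlap test followed
 * by picking the highest version both sides support. The helper name is
 * made up for illustration and is not called anywhere.
 */
static inline int example_agree_pro_version(int peer_min, int peer_max)
{
	/* no overlap between [PRO_VERSION_MIN, PRO_VERSION_MAX] and
	 * [peer_min, peer_max] means the dialects are incompatible */
	if (PRO_VERSION_MAX < peer_min || PRO_VERSION_MIN > peer_max)
		return -1;
	/* otherwise both ends can speak min(our max, their max) */
	return min_t(int, PRO_VERSION_MAX, peer_max);
}
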
#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
static int drbd_do_auth(struct drbd_connection *connection)
{
	drbd_err(connection, "This kernel was built without CONFIG_CRYPTO_HMAC.\n");
	drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
	return -1;
}
#else
#define CHALLENGE_LEN 64

/* Return value:
	1 - auth succeeded,
	0 - failed, try again (network error),
	-1 - auth failed, don't try again.
*/

static int drbd_do_auth(struct drbd_connection *connection)
{
	struct drbd_socket *sock;
	char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
	struct scatterlist sg;
	char *response = NULL;
	char *right_response = NULL;
	char *peers_ch = NULL;
	unsigned int key_len;
	char secret[SHARED_SECRET_MAX]; /* 64 byte */
	unsigned int resp_size;
	struct hash_desc desc;
	struct packet_info pi;
	struct net_conf *nc;
	int err, rv;

	/* FIXME: Put the challenge/response into the preallocated socket buffer. */

	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	key_len = strlen(nc->shared_secret);
	memcpy(secret, nc->shared_secret, key_len);
	rcu_read_unlock();

	desc.tfm = connection->cram_hmac_tfm;
	desc.flags = 0;

	rv = crypto_hash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
	if (rv) {
		drbd_err(connection, "crypto_hash_setkey() failed with %d\n", rv);
		rv = -1;
		goto fail;
	}

	get_random_bytes(my_challenge, CHALLENGE_LEN);

	sock = &connection->data;
	if (!conn_prepare_command(connection, sock)) {
		rv = 0;
		goto fail;
	}
	rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
				my_challenge, CHALLENGE_LEN);
	if (!rv)
		goto fail;

	err = drbd_recv_header(connection, &pi);
	if (err) {
		rv = 0;
		goto fail;
	}

	if (pi.cmd != P_AUTH_CHALLENGE) {
		drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
			 cmdname(pi.cmd), pi.cmd);
		rv = 0;
		goto fail;
	}

	if (pi.size > CHALLENGE_LEN * 2) {
		drbd_err(connection, "AuthChallenge payload too big.\n");
		rv = -1;
		goto fail;
	}

	if (pi.size < CHALLENGE_LEN) {
		drbd_err(connection, "AuthChallenge payload too small.\n");
		rv = -1;
		goto fail;
	}

	peers_ch = kmalloc(pi.size, GFP_NOIO);
	if (peers_ch == NULL) {
		drbd_err(connection, "kmalloc of peers_ch failed\n");
		rv = -1;
		goto fail;
	}

	err = drbd_recv_all_warn(connection, peers_ch, pi.size);
	if (err) {
		rv = 0;
		goto fail;
	}

	if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
		drbd_err(connection, "Peer presented the same challenge!\n");
		rv = -1;
		goto fail;
	}

	resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm);
	response = kmalloc(resp_size, GFP_NOIO);
	if (response == NULL) {
		drbd_err(connection, "kmalloc of response failed\n");
		rv = -1;
		goto fail;
	}

	sg_init_table(&sg, 1);
	sg_set_buf(&sg, peers_ch, pi.size);

	rv = crypto_hash_digest(&desc, &sg, sg.length, response);
	if (rv) {
		drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
		rv = -1;
		goto fail;
	}

	if (!conn_prepare_command(connection, sock)) {
		rv = 0;
		goto fail;
	}
	rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
				response, resp_size);
	if (!rv)
		goto fail;

	err = drbd_recv_header(connection, &pi);
	if (err) {
		rv = 0;
		goto fail;
	}

	if (pi.cmd != P_AUTH_RESPONSE) {
		drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
			 cmdname(pi.cmd), pi.cmd);
		rv = 0;
		goto fail;
	}

	if (pi.size != resp_size) {
		drbd_err(connection, "AuthResponse payload has wrong size\n");
		rv = 0;
		goto fail;
	}

	err = drbd_recv_all_warn(connection, response, resp_size);
	if (err) {
		rv = 0;
		goto fail;
	}

	right_response = kmalloc(resp_size, GFP_NOIO);
	if (right_response == NULL) {
		drbd_err(connection, "kmalloc of right_response failed\n");
		rv = -1;
		goto fail;
	}

	sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);

	rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
	if (rv) {
		drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
		rv = -1;
		goto fail;
	}

	rv = !memcmp(response, right_response, resp_size);

	if (rv)
		drbd_info(connection, "Peer authenticated using %d bytes HMAC\n",
			  resp_size);
	else
		rv = -1;

 fail:
	kfree(peers_ch);
	kfree(response);
	kfree(right_response);

	return rv;
}
#endif

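/*
 * Illustrative summary, not upstream documentation, of the symmetric
 * challenge/response exchange implemented by drbd_do_auth() above; both
 * peers run the same code concurrently:
 *
 *	local -> peer : P_AUTH_CHALLENGE, 64 random bytes
 *	peer  -> local: P_AUTH_CHALLENGE, 64 random bytes
 *	local -> peer : P_AUTH_RESPONSE,  HMAC(shared_secret, peer challenge)
 *	peer  -> local: P_AUTH_RESPONSE,  HMAC(shared_secret, local challenge)
 *
 * Each side recomputes the HMAC over its own challenge and compares it
 * with what the peer sent back; any mismatch fails authentication (-1).
 */
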
int drbd_receiver(struct drbd_thread *thi)
{
	struct drbd_connection *connection = thi->connection;
	int h;

	drbd_info(connection, "receiver (re)started\n");

	do {
		h = conn_connect(connection);
		if (h == 0) {
			conn_disconnect(connection);
			schedule_timeout_interruptible(HZ);
		}
		if (h == -1) {
			drbd_warn(connection, "Discarding network configuration.\n");
			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
		}
	} while (h == 0);

	if (h > 0)
		drbdd(connection);

	conn_disconnect(connection);

	drbd_info(connection, "receiver terminated\n");
	return 0;
}

/* ********* acknowledge sender ******** */

static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
{
	struct p_req_state_reply *p = pi->data;
	int retcode = be32_to_cpu(p->retcode);

	if (retcode >= SS_SUCCESS) {
		set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
	} else {
		set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
		drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
			 drbd_set_st_err_str(retcode), retcode);
	}
	wake_up(&connection->ping_wait);

	return 0;
}

static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
{
	struct drbd_peer_device *peer_device;
	struct drbd_device *device;
	struct p_req_state_reply *p = pi->data;
	int retcode = be32_to_cpu(p->retcode);

	peer_device = conn_peer_device(connection, pi->vnr);
	if (!peer_device)
		return -EIO;
	device = peer_device->device;

	if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
		D_ASSERT(device, connection->agreed_pro_version < 100);
		return got_conn_RqSReply(connection, pi);
	}

	if (retcode >= SS_SUCCESS) {
		set_bit(CL_ST_CHG_SUCCESS, &device->flags);
	} else {
		set_bit(CL_ST_CHG_FAIL, &device->flags);
		drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
			 drbd_set_st_err_str(retcode), retcode);
	}
	wake_up(&device->state_wait);

	return 0;
}

b411b363 5145{
bde89a9e 5146 return drbd_send_ping_ack(connection);
b411b363
PR
5147
5148}
5149
bde89a9e 5150static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
b411b363
PR
5151{
5152 /* restore idle timeout */
bde89a9e
AG
5153 connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ;
5154 if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
5155 wake_up(&connection->ping_wait);
b411b363 5156
2735a594 5157 return 0;
b411b363
PR
5158}
5159
static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi)
{
	struct drbd_peer_device *peer_device;
	struct drbd_device *device;
	struct p_block_ack *p = pi->data;
	sector_t sector = be64_to_cpu(p->sector);
	int blksize = be32_to_cpu(p->blksize);

	peer_device = conn_peer_device(connection, pi->vnr);
	if (!peer_device)
		return -EIO;
	device = peer_device->device;

	D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);

	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));

	if (get_ldev(device)) {
		drbd_rs_complete_io(device, sector);
		drbd_set_in_sync(device, sector, blksize);
		/* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
		device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
		put_ldev(device);
	}
	dec_rs_pending(device);
	atomic_add(blksize >> 9, &device->rs_sect_in);

	return 0;
}

static int
validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
			      struct rb_root *root, const char *func,
			      enum drbd_req_event what, bool missing_ok)
{
	struct drbd_request *req;
	struct bio_and_error m;

	spin_lock_irq(&device->resource->req_lock);
	req = find_request(device, root, id, sector, missing_ok, func);
	if (unlikely(!req)) {
		spin_unlock_irq(&device->resource->req_lock);
		return -EIO;
	}
	__req_mod(req, what, &m);
	spin_unlock_irq(&device->resource->req_lock);

	if (m.bio)
		complete_master_bio(device, &m);
	return 0;
}

static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
{
	struct drbd_peer_device *peer_device;
	struct drbd_device *device;
	struct p_block_ack *p = pi->data;
	sector_t sector = be64_to_cpu(p->sector);
	int blksize = be32_to_cpu(p->blksize);
	enum drbd_req_event what;

	peer_device = conn_peer_device(connection, pi->vnr);
	if (!peer_device)
		return -EIO;
	device = peer_device->device;

	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));

	if (p->block_id == ID_SYNCER) {
		drbd_set_in_sync(device, sector, blksize);
		dec_rs_pending(device);
		return 0;
	}
	switch (pi->cmd) {
	case P_RS_WRITE_ACK:
		what = WRITE_ACKED_BY_PEER_AND_SIS;
		break;
	case P_WRITE_ACK:
		what = WRITE_ACKED_BY_PEER;
		break;
	case P_RECV_ACK:
		what = RECV_ACKED_BY_PEER;
		break;
	case P_SUPERSEDED:
		what = CONFLICT_RESOLVED;
		break;
	case P_RETRY_WRITE:
		what = POSTPONE_WRITE;
		break;
	default:
		BUG();
	}

	return validate_req_change_req_state(device, p->block_id, sector,
					     &device->write_requests, __func__,
					     what, false);
}

static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
{
	struct drbd_peer_device *peer_device;
	struct drbd_device *device;
	struct p_block_ack *p = pi->data;
	sector_t sector = be64_to_cpu(p->sector);
	int size = be32_to_cpu(p->blksize);
	int err;

	peer_device = conn_peer_device(connection, pi->vnr);
	if (!peer_device)
		return -EIO;
	device = peer_device->device;

	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));

	if (p->block_id == ID_SYNCER) {
		dec_rs_pending(device);
		drbd_rs_failed_io(device, sector, size);
		return 0;
	}

	err = validate_req_change_req_state(device, p->block_id, sector,
					    &device->write_requests, __func__,
					    NEG_ACKED, true);
	if (err) {
		/* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
		   The master bio might already be completed, therefore the
		   request is no longer in the collision hash. */
		/* In Protocol B we might already have got a P_RECV_ACK
		   but then get a P_NEG_ACK afterwards. */
		drbd_set_out_of_sync(device, sector, size);
	}
	return 0;
}

5293
bde89a9e 5294static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
b411b363 5295{
9f4fe9ad 5296 struct drbd_peer_device *peer_device;
b30ab791 5297 struct drbd_device *device;
e658983a 5298 struct p_block_ack *p = pi->data;
b411b363
PR
5299 sector_t sector = be64_to_cpu(p->sector);
5300
9f4fe9ad
AG
5301 peer_device = conn_peer_device(connection, pi->vnr);
5302 if (!peer_device)
2735a594 5303 return -EIO;
9f4fe9ad 5304 device = peer_device->device;
1952e916 5305
69a22773 5306 update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
7be8da07 5307
d0180171 5308 drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
b411b363
PR
5309 (unsigned long long)sector, be32_to_cpu(p->blksize));
5310
b30ab791
AG
5311 return validate_req_change_req_state(device, p->block_id, sector,
5312 &device->read_requests, __func__,
2735a594 5313 NEG_ACKED, false);
b411b363
PR
5314}
5315
static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
{
	struct drbd_peer_device *peer_device;
	struct drbd_device *device;
	sector_t sector;
	int size;
	struct p_block_ack *p = pi->data;

	peer_device = conn_peer_device(connection, pi->vnr);
	if (!peer_device)
		return -EIO;
	device = peer_device->device;

	sector = be64_to_cpu(p->sector);
	size = be32_to_cpu(p->blksize);

	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));

	dec_rs_pending(device);

	if (get_ldev_if_state(device, D_FAILED)) {
		drbd_rs_complete_io(device, sector);
		switch (pi->cmd) {
		case P_NEG_RS_DREPLY:
			drbd_rs_failed_io(device, sector, size);
			/* fall through */
		case P_RS_CANCEL:
			break;
		default:
			BUG();
		}
		put_ldev(device);
	}

	return 0;
}

static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
{
	struct p_barrier_ack *p = pi->data;
	struct drbd_peer_device *peer_device;
	int vnr;

	tl_release(connection, p->barrier, be32_to_cpu(p->set_size));

	rcu_read_lock();
	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
		struct drbd_device *device = peer_device->device;

		if (device->state.conn == C_AHEAD &&
		    atomic_read(&device->ap_in_flight) == 0 &&
		    !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) {
			device->start_resync_timer.expires = jiffies + HZ;
			add_timer(&device->start_resync_timer);
		}
	}
	rcu_read_unlock();

	return 0;
}

static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
{
	struct drbd_peer_device *peer_device;
	struct drbd_device *device;
	struct p_block_ack *p = pi->data;
	struct drbd_device_work *dw;
	sector_t sector;
	int size;

	peer_device = conn_peer_device(connection, pi->vnr);
	if (!peer_device)
		return -EIO;
	device = peer_device->device;

	sector = be64_to_cpu(p->sector);
	size = be32_to_cpu(p->blksize);

	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));

	if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
		drbd_ov_out_of_sync_found(device, sector, size);
	else
		ov_out_of_sync_print(device);

	if (!get_ldev(device))
		return 0;

	drbd_rs_complete_io(device, sector);
	dec_rs_pending(device);

	--device->ov_left;

	/* let's advance progress step marks only for every other megabyte */
	if ((device->ov_left & 0x200) == 0x200)
		drbd_advance_rs_marks(device, device->ov_left);

	if (device->ov_left == 0) {
		dw = kmalloc(sizeof(*dw), GFP_NOIO);
		if (dw) {
			dw->w.cb = w_ov_finished;
			dw->device = device;
			drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
		} else {
			drbd_err(device, "kmalloc(dw) failed.\n");
			ov_out_of_sync_print(device);
			drbd_resync_finished(device);
		}
	}
	put_ldev(device);
	return 0;
}

static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
{
	return 0;
}

static int connection_finish_peer_reqs(struct drbd_connection *connection)
{
	struct drbd_peer_device *peer_device;
	int vnr, not_empty = 0;

	do {
		clear_bit(SIGNAL_ASENDER, &connection->flags);
		flush_signals(current);

		rcu_read_lock();
		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
			struct drbd_device *device = peer_device->device;
			kref_get(&device->kref);
			rcu_read_unlock();
			if (drbd_finish_peer_reqs(device)) {
				kref_put(&device->kref, drbd_destroy_device);
				return 1;
			}
			kref_put(&device->kref, drbd_destroy_device);
			rcu_read_lock();
		}
		set_bit(SIGNAL_ASENDER, &connection->flags);

		spin_lock_irq(&connection->resource->req_lock);
		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
			struct drbd_device *device = peer_device->device;
			not_empty = !list_empty(&device->done_ee);
			if (not_empty)
				break;
		}
		spin_unlock_irq(&connection->resource->req_lock);
		rcu_read_unlock();
	} while (not_empty);

	return 0;
}

struct asender_cmd {
	size_t pkt_size;
	int (*fn)(struct drbd_connection *connection, struct packet_info *);
};

static struct asender_cmd asender_tbl[] = {
	[P_PING]	    = { 0, got_Ping },
	[P_PING_ACK]	    = { 0, got_PingAck },
	[P_RECV_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
	[P_WRITE_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
	[P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
	[P_SUPERSEDED]	    = { sizeof(struct p_block_ack), got_BlockAck },
	[P_NEG_ACK]	    = { sizeof(struct p_block_ack), got_NegAck },
	[P_NEG_DREPLY]	    = { sizeof(struct p_block_ack), got_NegDReply },
	[P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply },
	[P_OV_RESULT]	    = { sizeof(struct p_block_ack), got_OVResult },
	[P_BARRIER_ACK]	    = { sizeof(struct p_barrier_ack), got_BarrierAck },
	[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
	[P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
	[P_DELAY_PROBE]	    = { sizeof(struct p_delay_probe93), got_skip },
	[P_RS_CANCEL]	    = { sizeof(struct p_block_ack), got_NegRSDReply },
	[P_CONN_ST_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_conn_RqSReply },
	[P_RETRY_WRITE]	    = { sizeof(struct p_block_ack), got_BlockAck },
};

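/*
 * Illustrative note, not upstream documentation: drbd_asender() below
 * reads meta-socket packets in two steps driven by 'expect':
 *
 *	expect = header_size;                  receive the bare header
 *	expect = header_size + cmd->pkt_size;  receive the fixed payload
 *	cmd->fn(connection, &pi);              dispatch via asender_tbl[]
 *	expect = header_size;                  start over
 *
 * All meta packets have a fixed size, so pi.size must match cmd->pkt_size
 * exactly; anything else tears the connection down.
 */
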
int drbd_asender(struct drbd_thread *thi)
{
	struct drbd_connection *connection = thi->connection;
	struct asender_cmd *cmd = NULL;
	struct packet_info pi;
	int rv;
	void *buf    = connection->meta.rbuf;
	int received = 0;
	unsigned int header_size = drbd_header_size(connection);
	int expect   = header_size;
	bool ping_timeout_active = false;
	struct net_conf *nc;
	int ping_timeo, tcp_cork, ping_int;
	struct sched_param param = { .sched_priority = 2 };

	rv = sched_setscheduler(current, SCHED_RR, &param);
	if (rv < 0)
		drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv);

	while (get_t_state(thi) == RUNNING) {
		drbd_thread_current_set_cpu(thi);

		rcu_read_lock();
		nc = rcu_dereference(connection->net_conf);
		ping_timeo = nc->ping_timeo;
		tcp_cork = nc->tcp_cork;
		ping_int = nc->ping_int;
		rcu_read_unlock();

		if (test_and_clear_bit(SEND_PING, &connection->flags)) {
			if (drbd_send_ping(connection)) {
				drbd_err(connection, "drbd_send_ping has failed\n");
				goto reconnect;
			}
			connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10;
			ping_timeout_active = true;
		}

		/* TODO: conditionally cork; it may hurt latency if we cork without
		   much to send */
		if (tcp_cork)
			drbd_tcp_cork(connection->meta.socket);
		if (connection_finish_peer_reqs(connection)) {
			drbd_err(connection, "connection_finish_peer_reqs() failed\n");
			goto reconnect;
		}
		/* but unconditionally uncork unless disabled */
		if (tcp_cork)
			drbd_tcp_uncork(connection->meta.socket);

		/* short circuit, recv_msg would return EINTR anyways. */
		if (signal_pending(current))
			continue;

		rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
		clear_bit(SIGNAL_ASENDER, &connection->flags);

		flush_signals(current);

		/* Note:
		 * -EINTR	 (on meta) we got a signal
		 * -EAGAIN	 (on meta) rcvtimeo expired
		 * -ECONNRESET	 other side closed the connection
		 * -ERESTARTSYS  (on data) we got a signal
		 * rv <  0	 other than above: unexpected error!
		 * rv == expected: full header or command
		 * rv <  expected: "woken" by signal during receive
		 * rv == 0	 : "connection shut down by peer"
		 */
received_more:
		if (likely(rv > 0)) {
			received += rv;
			buf	 += rv;
		} else if (rv == 0) {
			if (test_bit(DISCONNECT_SENT, &connection->flags)) {
				long t;
				rcu_read_lock();
				t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
				rcu_read_unlock();

				t = wait_event_timeout(connection->ping_wait,
						       connection->cstate < C_WF_REPORT_PARAMS,
						       t);
				if (t)
					break;
			}
			drbd_err(connection, "meta connection shut down by peer.\n");
			goto reconnect;
		} else if (rv == -EAGAIN) {
			/* If the data socket received something meanwhile,
			 * that is good enough: peer is still alive. */
			if (time_after(connection->last_received,
				jiffies - connection->meta.socket->sk->sk_rcvtimeo))
				continue;
			if (ping_timeout_active) {
				drbd_err(connection, "PingAck did not arrive in time.\n");
				goto reconnect;
			}
			set_bit(SEND_PING, &connection->flags);
			continue;
		} else if (rv == -EINTR) {
			continue;
		} else {
			drbd_err(connection, "sock_recvmsg returned %d\n", rv);
			goto reconnect;
		}

		if (received == expect && cmd == NULL) {
			if (decode_header(connection, connection->meta.rbuf, &pi))
				goto reconnect;
			cmd = &asender_tbl[pi.cmd];
			if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) {
				drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
					 cmdname(pi.cmd), pi.cmd);
				goto disconnect;
			}
			expect = header_size + cmd->pkt_size;
			if (pi.size != expect - header_size) {
				drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
					 pi.cmd, pi.size);
				goto reconnect;
			}
		}
		if (received == expect) {
			bool err;

			err = cmd->fn(connection, &pi);
			if (err) {
				drbd_err(connection, "%pf failed\n", cmd->fn);
				goto reconnect;
			}

			connection->last_received = jiffies;

			if (cmd == &asender_tbl[P_PING_ACK]) {
				/* restore idle timeout */
				connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
				ping_timeout_active = false;
			}

			buf	 = connection->meta.rbuf;
			received = 0;
			expect	 = header_size;
			cmd	 = NULL;
		}
		if (test_bit(SEND_PING, &connection->flags))
			continue;
		rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT);
		if (rv > 0)
			goto received_more;
	}

	if (0) {
reconnect:
		conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
		conn_md_sync(connection);
	}
	if (0) {
disconnect:
		conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
	}
	clear_bit(SIGNAL_ASENDER, &connection->flags);

	drbd_info(connection, "asender terminated\n");

	return 0;
}