drivers/block/drbd/drbd_receiver.c

   1 /*
   2    drbd_receiver.c
   3
   4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   5
   6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
   9
  10    drbd is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 2, or (at your option)
  13    any later version.
  14
  15    drbd is distributed in the hope that it will be useful,
  16    but WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18    GNU General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with drbd; see the file COPYING.  If not, write to
  22    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  23  */
  24
  25
  26 #include <linux/module.h>
  27
  28 #include <asm/uaccess.h>
  29 #include <net/sock.h>
  30
  31 #include <linux/drbd.h>
  32 #include <linux/fs.h>
  33 #include <linux/file.h>
  34 #include <linux/in.h>
  35 #include <linux/mm.h>
  36 #include <linux/memcontrol.h>
  37 #include <linux/mm_inline.h>
  38 #include <linux/slab.h>
  39 #include <linux/smp_lock.h>
  40 #include <linux/pkt_sched.h>
  41 #define __KERNEL_SYSCALLS__
  42 #include <linux/unistd.h>
  43 #include <linux/vmalloc.h>
  44 #include <linux/random.h>
  45 #include <linux/string.h>
  46 #include <linux/scatterlist.h>
  47 #include "drbd_int.h"
  48 #include "drbd_req.h"
  49
  50 #include "drbd_vli.h"
  51
  /*
   * Bundles a drbd_work item with a pointer to a drbd_epoch.
   * NOTE(review): presumably the work callback flushes @epoch; the users of
   * this struct are not in the visible part of the file -- confirm.
   */
  52 struct flush_work {
  53         struct drbd_work w;
  54         struct drbd_epoch *epoch;
  55 };
  56
  /*
   * Result type of drbd_may_finish_epoch() (see its prototype in this file).
   * The values are spelled out; they match the implicit numbering 0, 1, 2.
   * NOTE(review): the names suggest "epoch still in use", "epoch freed" and
   * "epoch reused" -- confirm against drbd_may_finish_epoch().
   */
  enum finish_epoch {
          FE_STILL_LIVE = 0,
          FE_DESTROYED  = 1,
          FE_RECYCLED   = 2,
  };
  62
  /*
   * Forward declarations of file-local functions that are referenced before
   * their definitions.
   */
  static int drbd_do_handshake(struct drbd_conf *mdev);
  static int drbd_do_auth(struct drbd_conf *mdev);

  static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
                                                 struct drbd_epoch *epoch,
                                                 enum epoch_event ev);
  /* last argument: see drbd_process_done_ee(), the callbacks ignore it */
  static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int unused);
  68
  69 static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
  70 {
  71         struct drbd_epoch *prev;
  72         spin_lock(&mdev->epoch_lock);
  73         prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
  74         if (prev == epoch || prev == mdev->current_epoch)
  75                 prev = NULL;
  76         spin_unlock(&mdev->epoch_lock);
  77         return prev;
  78 }
  79
  /*
   * Allocation mask handed to alloc_page() by the page pool helper in this
   * file: highmem pages are acceptable (__GFP_HIGHMEM) and allocation failures
   * are not logged (__GFP_NOWARN).  No other flags are set; the comment at the
   * use site explains that the allocation must not cause write-out.
   */
  80 #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
  81
  82 /*
  83  * some helper functions to deal with singly linked page lists,
  84  * page->private being our "next" pointer.
  85  */
  86
  87 /* If at least n pages are linked at head, get n pages off.
  88  * Otherwise, don't modify head, and return NULL.
  89  * Locking is the responsibility of the caller.
  90  */
  91 static struct page *page_chain_del(struct page **head, int n)
  92 {
  93         struct page *page;
  94         struct page *tmp;
  95
  96         BUG_ON(!n);
  97         BUG_ON(!head);
  98
  99         page = *head;
 100
 101         if (!page)
 102                 return NULL;
 103
 104         while (page) {
 105                 tmp = page_chain_next(page);
 106                 if (--n == 0)
 107                         break; /* found sufficient pages */
 108                 if (tmp == NULL)
 109                         /* insufficient pages, don't use any of them. */
 110                         return NULL;
 111                 page = tmp;
 112         }
 113
 114         /* add end of list marker for the returned list */
 115         set_page_private(page, 0);
 116         /* actual return value, and adjustment of head */
 117         page = *head;
 118         *head = tmp;
 119         return page;
 120 }
 121
 122 /* may be used outside of locks to find the tail of a (usually short)
 123  * "private" page chain, before adding it back to a global chain head
 124  * with page_chain_add() under a spinlock. */
 125 static struct page *page_chain_tail(struct page *page, int *len)
 126 {
 127         struct page *tmp;
 128         int i = 1;
 129         while ((tmp = page_chain_next(page)))
 130                 ++i, page = tmp;
 131         if (len)
 132                 *len = i;
 133         return page;
 134 }
 135
 136 static int page_chain_free(struct page *page)
 137 {
 138         struct page *tmp;
 139         int i = 0;
 140         page_chain_for_each_safe(page, tmp) {
 141                 put_page(page);
 142                 ++i;
 143         }
 144         return i;
 145 }
 146
 147 static void page_chain_add(struct page **head,
 148                 struct page *chain_first, struct page *chain_last)
 149 {
 150 #if 1
 151         struct page *tmp;
 152         tmp = page_chain_tail(chain_first, NULL);
 153         BUG_ON(tmp != chain_last);
 154 #endif
 155
 156         /* add chain to head */
 157         set_page_private(chain_last, (unsigned long)*head);
 158         *head = chain_first;
 159 }
 160
 161 static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
 162 {
 163         struct page *page = NULL;
 164         struct page *tmp = NULL;
 165         int i = 0;
 166
 167         /* Yes, testing drbd_pp_vacant outside the lock is racy.
 168          * So what. It saves a spin_lock. */
 169         if (drbd_pp_vacant >= number) {
 170                 spin_lock(&drbd_pp_lock);
 171                 page = page_chain_del(&drbd_pp_pool, number);
 172                 if (page)
 173                         drbd_pp_vacant -= number;
 174                 spin_unlock(&drbd_pp_lock);
 175                 if (page)
 176                         return page;
 177         }
 178
 179         /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
 180          * "criss-cross" setup, that might cause write-out on some other DRBD,
 181          * which in turn might block on the other node at this very place.  */
 182         for (i = 0; i < number; i++) {
 183                 tmp = alloc_page(GFP_TRY);
 184                 if (!tmp)
 185                         break;
 186                 set_page_private(tmp, (unsigned long)page);
 187                 page = tmp;
 188         }
 189
 190         if (i == number)
 191                 return page;
 192
 193         /* Not enough pages immediately available this time.
 194          * No need to jump around here, drbd_pp_alloc will retry this
 195          * function "soon". */
 196         if (page) {
 197                 tmp = page_chain_tail(page, NULL);
 198                 spin_lock(&drbd_pp_lock);
 199                 page_chain_add(&drbd_pp_pool, page, tmp);
 200                 drbd_pp_vacant += i;
 201                 spin_unlock(&drbd_pp_lock);
 202         }
 203         return NULL;
 204 }
 205
 206 /* kick lower level device, if we have more than (arbitrary number)
 207  * reference counts on it, which typically are locally submitted io
 208  * requests.  don't use unacked_cnt, so we speed up proto A and B, too. */
 209 static void maybe_kick_lo(struct drbd_conf *mdev)
 210 {
 211         if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
 212                 drbd_kick_lo(mdev);
 213 }
 214
 215 static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
 216 {
 217         struct drbd_epoch_entry *e;
 218         struct list_head *le, *tle;
 219
 220         /* The EEs are always appended to the end of the list. Since
 221            they are sent in order over the wire, they have to finish
 222            in order. As soon as we see the first not finished we can
 223            stop to examine the list... */
 224
 225         list_for_each_safe(le, tle, &mdev->net_ee) {
 226                 e = list_entry(le, struct drbd_epoch_entry, w.list);
 227                 if (drbd_ee_has_active_page(e))
 228                         break;
 229                 list_move(le, to_be_freed);
 230         }
 231 }
 232
 233 static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
 234 {
 235         LIST_HEAD(reclaimed);
 236         struct drbd_epoch_entry *e, *t;
 237
 238         maybe_kick_lo(mdev);
 239         spin_lock_irq(&mdev->req_lock);
 240         reclaim_net_ee(mdev, &reclaimed);
 241         spin_unlock_irq(&mdev->req_lock);
 242
 243         list_for_each_entry_safe(e, t, &reclaimed, w.list)
 244                 drbd_free_ee(mdev, e);
 245 }
 246
 247 /**
 248  * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
 249  * @mdev:       DRBD device.
 250  * @number:     number of pages requested
 251  * @retry:      whether to retry, if not enough pages are available right now
 252  *
 253  * Tries to allocate number pages, first from our own page pool, then from
 254  * the kernel, unless this allocation would exceed the max_buffers setting.
 255  * Possibly retry until DRBD frees sufficient pages somewhere else.
 256  *
 257  * Returns a page chain linked via page->private.
 258  */
 259 static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
 260 {
 261         struct page *page = NULL;
 262         DEFINE_WAIT(wait);
 263
 264         /* Yes, we may run up to @number over max_buffers. If we
 265          * follow it strictly, the admin will get it wrong anyways. */
 266         if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
 267                 page = drbd_pp_first_pages_or_try_alloc(mdev, number);
 268
 269         while (page == NULL) {
 270                 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
 271
 272                 drbd_kick_lo_and_reclaim_net(mdev);
 273
 274                 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
 275                         page = drbd_pp_first_pages_or_try_alloc(mdev, number);
 276                         if (page)
 277                                 break;
 278                 }
 279
 280                 if (!retry)
 281                         break;
 282
 283                 if (signal_pending(current)) {
 284                         dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
 285                         break;
 286                 }
 287
 288                 schedule();
 289         }
 290         finish_wait(&drbd_pp_wait, &wait);
 291
 292         if (page)
 293                 atomic_add(number, &mdev->pp_in_use);
 294         return page;
 295 }
 296
 297 /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
 298  * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
 299  * Either links the page chain back to the global pool,
 300  * or returns all pages to the system. */
 301 static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
 302 {
 303         int i;
 304         if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
 305                 i = page_chain_free(page);
 306         else {
 307                 struct page *tmp;
 308                 tmp = page_chain_tail(page, &i);
 309                 spin_lock(&drbd_pp_lock);
 310                 page_chain_add(&drbd_pp_pool, page, tmp);
 311                 drbd_pp_vacant += i;
 312                 spin_unlock(&drbd_pp_lock);
 313         }
 314         atomic_sub(i, &mdev->pp_in_use);
 315         i = atomic_read(&mdev->pp_in_use);
 316         if (i < 0)
 317                 dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
 318         wake_up(&drbd_pp_wait);
 319 }
 320
 321 /*
 322 You need to hold the req_lock:
 323  _drbd_wait_ee_list_empty()
 324
 325 You must not have the req_lock:
 326  drbd_free_ee()
 327  drbd_alloc_ee()
 328  drbd_init_ee()
 329  drbd_release_ee()
 330  drbd_ee_fix_bhs()
 331  drbd_process_done_ee()
 332  drbd_clear_done_ee()
 333  drbd_wait_ee_list_empty()
 334 */
 335
 336 struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 337                                      u64 id,
 338                                      sector_t sector,
 339                                      unsigned int data_size,
 340                                      gfp_t gfp_mask) __must_hold(local)
 341 {
 342         struct drbd_epoch_entry *e;
 343         struct page *page;
 344         unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
 345
 346         if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
 347                 return NULL;
 348
 349         e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
 350         if (!e) {
 351                 if (!(gfp_mask & __GFP_NOWARN))
 352                         dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
 353                 return NULL;
 354         }
 355
 356         page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
 357         if (!page)
 358                 goto fail;
 359
 360         INIT_HLIST_NODE(&e->colision);
 361         e->epoch = NULL;
 362         e->mdev = mdev;
 363         e->pages = page;
 364         atomic_set(&e->pending_bios, 0);
 365         e->size = data_size;
 366         e->flags = 0;
 367         e->sector = sector;
 368         e->block_id = id;
 369
 370         return e;
 371
 372  fail:
 373         mempool_free(e, drbd_ee_mempool);
 374         return NULL;
 375 }
 376
 377 void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 378 {
 379         if (e->flags & EE_HAS_DIGEST)
 380                 kfree(e->digest);
 381         drbd_pp_free(mdev, e->pages);
 382         D_ASSERT(atomic_read(&e->pending_bios) == 0);
 383         D_ASSERT(hlist_unhashed(&e->colision));
 384         mempool_free(e, drbd_ee_mempool);
 385 }
 386
 387 int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
 388 {
 389         LIST_HEAD(work_list);
 390         struct drbd_epoch_entry *e, *t;
 391         int count = 0;
 392
 393         spin_lock_irq(&mdev->req_lock);
 394         list_splice_init(list, &work_list);
 395         spin_unlock_irq(&mdev->req_lock);
 396
 397         list_for_each_entry_safe(e, t, &work_list, w.list) {
 398                 drbd_free_ee(mdev, e);
 399                 count++;
 400         }
 401         return count;
 402 }
 403
 404
 405 /*
 406  * This function is called from _asender only_
 407  * but see also comments in _req_mod(,barrier_acked)
 408  * and receive_Barrier.
 409  *
 410  * Move entries from net_ee to done_ee, if ready.
 411  * Grab done_ee, call all callbacks, free the entries.
 412  * The callbacks typically send out ACKs.
 413  */
 414 static int drbd_process_done_ee(struct drbd_conf *mdev)
 415 {
 416         LIST_HEAD(work_list);
 417         LIST_HEAD(reclaimed);
 418         struct drbd_epoch_entry *e, *t;
 419         int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
 420
 421         spin_lock_irq(&mdev->req_lock);
 422         reclaim_net_ee(mdev, &reclaimed);
 423         list_splice_init(&mdev->done_ee, &work_list);
 424         spin_unlock_irq(&mdev->req_lock);
 425
 426         list_for_each_entry_safe(e, t, &reclaimed, w.list)
 427                 drbd_free_ee(mdev, e);
 428
 429         /* possible callbacks here:
 430          * e_end_block, and e_end_resync_block, e_send_discard_ack.
 431          * all ignore the last argument.
 432          */
 433         list_for_each_entry_safe(e, t, &work_list, w.list) {
 434                 /* list_del not necessary, next/prev members not touched */
 435                 ok = e->w.cb(mdev, &e->w, !ok) && ok;
 436                 drbd_free_ee(mdev, e);
 437         }
 438         wake_up(&mdev->ee_wait);
 439
 440         return ok;
 441 }
 442
 443 void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
 444 {
 445         DEFINE_WAIT(wait);
 446
 447         /* avoids spin_lock/unlock
 448          * and calling prepare_to_wait in the fast path */
 449         while (!list_empty(head)) {
 450                 prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
 451                 spin_unlock_irq(&mdev->req_lock);
 452                 drbd_kick_lo(mdev);
 453                 schedule();
 454                 finish_wait(&mdev->ee_wait, &wait);
 455                 spin_lock_irq(&mdev->req_lock);
 456         }
 457 }
 458
 459 void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
 460 {
 461         spin_lock_irq(&mdev->req_lock);
 462         _drbd_wait_ee_list_empty(mdev, head);
 463         spin_unlock_irq(&mdev->req_lock);
 464 }
 465
 466 /* see also kernel_accept; which is only present since 2.6.18.
 467  * also we want to log which part of it failed, exactly */
 468 static int drbd_accept(struct drbd_conf *mdev, const char **what,
 469                 struct socket *sock, struct socket **newsock)
 470 {
 471         struct sock *sk = sock->sk;
 472         int err = 0;
 473
 474         *what = "listen";
 475         err = sock->ops->listen(sock, 5);
 476         if (err < 0)
 477                 goto out;
 478
 479         *what = "sock_create_lite";
 480         err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
 481                                newsock);
 482         if (err < 0)
 483                 goto out;
 484
 485         *what = "accept";
 486         err = sock->ops->accept(sock, *newsock, 0);
 487         if (err < 0) {
 488                 sock_release(*newsock);
 489                 *newsock = NULL;
 490                 goto out;
 491         }
 492         (*newsock)->ops  = sock->ops;
 493
 494 out:
 495         return err;
 496 }
 497
 498 static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
 499                     void *buf, size_t size, int flags)
 500 {
 501         mm_segment_t oldfs;
 502         struct kvec iov = {
 503                 .iov_base = buf,
 504                 .iov_len = size,
 505         };
 506         struct msghdr msg = {
 507                 .msg_iovlen = 1,
 508                 .msg_iov = (struct iovec *)&iov,
 509                 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
 510         };
 511         int rv;
 512
 513         oldfs = get_fs();
 514         set_fs(KERNEL_DS);
 515         rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
 516         set_fs(oldfs);
 517
 518         return rv;
 519 }
 520
 521 static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
 522 {
 523         mm_segment_t oldfs;
 524         struct kvec iov = {
 525                 .iov_base = buf,
 526                 .iov_len = size,
 527         };
 528         struct msghdr msg = {
 529                 .msg_iovlen = 1,
 530                 .msg_iov = (struct iovec *)&iov,
 531                 .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
 532         };
 533         int rv;
 534
 535         oldfs = get_fs();
 536         set_fs(KERNEL_DS);
 537
 538         for (;;) {
 539                 rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
 540                 if (rv == size)
 541                         break;
 542
 543                 /* Note:
 544                  * ECONNRESET   other side closed the connection
 545                  * ERESTARTSYS  (on  sock) we got a signal
 546                  */
 547
 548                 if (rv < 0) {
 549                         if (rv == -ECONNRESET)
 550                                 dev_info(DEV, "sock was reset by peer\n");
 551                         else if (rv != -ERESTARTSYS)
 552                                 dev_err(DEV, "sock_recvmsg returned %d\n", rv);
 553                         break;
 554                 } else if (rv == 0) {
 555                         dev_info(DEV, "sock was shut down by peer\n");
 556                         break;
 557                 } else  {
 558                         /* signal came in, or peer/link went down,
 559                          * after we read a partial message
 560                          */
 561                         /* D_ASSERT(signal_pending(current)); */
 562                         break;
 563                 }
 564         };
 565
 566         set_fs(oldfs);
 567
 568         if (rv != size)
 569                 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
 570
 571         return rv;
 572 }
 573
 574 /* quoting tcp(7):
 575  *   On individual connections, the socket buffer size must be set prior to the
 576  *   listen(2) or connect(2) calls in order to have it take effect.
 577  * This is our wrapper to do so.
 578  */
 579 static void drbd_setbufsize(struct socket *sock, unsigned int snd,
 580                 unsigned int rcv)
 581 {
 582         /* open coded SO_SNDBUF, SO_RCVBUF */
 583         if (snd) {
 584                 sock->sk->sk_sndbuf = snd;
 585                 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 586         }
 587         if (rcv) {
 588                 sock->sk->sk_rcvbuf = rcv;
 589                 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 590         }
 591 }
 592
 /*
  * Try once to actively connect to the configured peer address.
  *
  * Creates a TCP socket, applies the configured timeouts and buffer sizes,
  * binds it to the configured local address (with port 0) and connects.
  *
  * Returns the connected socket, or NULL on failure (including the case that
  * no network configuration is available).  Timeouts, signals and "peer not
  * reachable" style errors are tolerated silently; any other error is logged,
  * and if it happened before the connect step the connection is forced to
  * C_DISCONNECTING.
  */
 static struct socket *drbd_try_connect(struct drbd_conf *mdev)
 {
         const char *what;
         struct socket *sock;
         struct sockaddr_in6 src_in6;
         int err;
         int my_addr_len;
         int disconnect_on_error = 1;

         if (!get_net_conf(mdev))
                 return NULL;

         what = "sock_create_kern";
         err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
                 SOCK_STREAM, IPPROTO_TCP, &sock);
         if (err < 0) {
                 sock = NULL;
                 goto out;
         }

         sock->sk->sk_rcvtimeo =
         sock->sk->sk_sndtimeo =  mdev->net_conf->try_connect_int*HZ;
         drbd_setbufsize(sock, mdev->net_conf->sndbuf_size,
                         mdev->net_conf->rcvbuf_size);

        /* explicitly bind to the configured IP as source IP
         *  for the outgoing connections.
         *  This is needed for multihomed hosts and to be
         *  able to use lo: interfaces for drbd.
         * Make sure to use 0 as port number, so linux selects
         *  a free one dynamically.
         */
         /* Never use more than fits into our local copy: the same clamped
          * length is used for the copy and for bind(), so bind() cannot be
          * told to read beyond src_in6. */
         my_addr_len = min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6));
         memcpy(&src_in6, mdev->net_conf->my_addr, my_addr_len);
         if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
                 src_in6.sin6_port = 0;
         else
                 ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */

         what = "bind before connect";
         err = sock->ops->bind(sock,
                               (struct sockaddr *) &src_in6,
                               my_addr_len);
         if (err < 0)
                 goto out;

         /* connect may fail, peer not yet available.
          * stay C_WF_CONNECTION, don't go Disconnecting! */
         disconnect_on_error = 0;
         what = "connect";
         err = sock->ops->connect(sock,
                                  (struct sockaddr *)mdev->net_conf->peer_addr,
                                  mdev->net_conf->peer_addr_len, 0);

 out:
         if (err < 0) {
                 if (sock) {
                         sock_release(sock);
                         sock = NULL;
                 }
                 switch (-err) {
                         /* timeout, busy, signal pending */
                 case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
                 case EINTR: case ERESTARTSYS:
                         /* peer not (yet) available, network problem */
                 case ECONNREFUSED: case ENETUNREACH:
                 case EHOSTDOWN:    case EHOSTUNREACH:
                         disconnect_on_error = 0;
                         break;
                 default:
                         dev_err(DEV, "%s failed, err = %d\n", what, err);
                 }
                 if (disconnect_on_error)
                         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
         }
         put_net_conf(mdev);
         return sock;
 }
 670
 671 static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
 672 {
 673         int timeo, err;
 674         struct socket *s_estab = NULL, *s_listen;
 675         const char *what;
 676
 677         if (!get_net_conf(mdev))
 678                 return NULL;
 679
 680         what = "sock_create_kern";
 681         err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
 682                 SOCK_STREAM, IPPROTO_TCP, &s_listen);
 683         if (err) {
 684                 s_listen = NULL;
 685                 goto out;
 686         }
 687
 688         timeo = mdev->net_conf->try_connect_int * HZ;
 689         timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
 690
 691         s_listen->sk->sk_reuse    = 1; /* SO_REUSEADDR */
 692         s_listen->sk->sk_rcvtimeo = timeo;
 693         s_listen->sk->sk_sndtimeo = timeo;
 694         drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
 695                         mdev->net_conf->rcvbuf_size);
 696
 697         what = "bind before listen";
 698         err = s_listen->ops->bind(s_listen,
 699                               (struct sockaddr *) mdev->net_conf->my_addr,
 700                               mdev->net_conf->my_addr_len);
 701         if (err < 0)
 702                 goto out;
 703
 704         err = drbd_accept(mdev, &what, s_listen, &s_estab);
 705
 706 out:
 707         if (s_listen)
 708                 sock_release(s_listen);
 709         if (err < 0) {
 710                 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
 711                         dev_err(DEV, "%s failed, err = %d\n", what, err);
 712                         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
 713                 }
 714         }
 715         put_net_conf(mdev);
 716
 717         return s_estab;
 718 }
 719
 720 static int drbd_send_fp(struct drbd_conf *mdev,
 721         struct socket *sock, enum drbd_packets cmd)
 722 {
 723         struct p_header80 *h = &mdev->data.sbuf.header.h80;
 724
 725         return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
 726 }
 727
 728 static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
 729 {
 730         struct p_header80 *h = &mdev->data.rbuf.header.h80;
 731         int rr;
 732
 733         rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
 734
 735         if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
 736                 return be16_to_cpu(h->command);
 737
 738         return 0xffff;
 739 }
 740
 741 /**
 742  * drbd_socket_okay() - Free the socket if its connection is not okay
 743  * @mdev:       DRBD device.
 744  * @sock:       pointer to the pointer to the socket.
 745  */
 746 static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
 747 {
 748         int rr;
 749         char tb[4];
 750
 751         if (!*sock)
 752                 return FALSE;
 753
 754         rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
 755
 756         if (rr > 0 || rr == -EAGAIN) {
 757                 return TRUE;
 758         } else {
 759                 sock_release(*sock);
 760                 *sock = NULL;
 761                 return FALSE;
 762         }
 763 }
 764
 765 /*
 766  * return values:
 767  *   1 yes, we have a valid connection
 768  *   0 oops, did not work out, please try again
 769  *  -1 peer talks different language,
 770  *     no point in trying again, please go standalone.
 771  *  -2 We do not have a network config...
 772  */
 773 static int drbd_connect(struct drbd_conf *mdev)
 774 {
 775         struct socket *s, *sock, *msock;
 776         int try, h, ok;
 777
 778         D_ASSERT(!mdev->data.socket);
 779
 780         if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
 781                 return -2;
 782
 783         clear_bit(DISCARD_CONCURRENT, &mdev->flags);
 784
 785         sock  = NULL;
 786         msock = NULL;
 787
 788         do {
 789                 for (try = 0;;) {
 790                         /* 3 tries, this should take less than a second! */
 791                         s = drbd_try_connect(mdev);
 792                         if (s || ++try >= 3)
 793                                 break;
 794                         /* give the other side time to call bind() & listen() */
 795                         __set_current_state(TASK_INTERRUPTIBLE);
 796                         schedule_timeout(HZ / 10);
 797                 }
 798
 799                 if (s) {
 800                         if (!sock) {
 801                                 drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
 802                                 sock = s;
 803                                 s = NULL;
 804                         } else if (!msock) {
 805                                 drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
 806                                 msock = s;
 807                                 s = NULL;
 808                         } else {
 809                                 dev_err(DEV, "Logic error in drbd_connect()\n");
 810                                 goto out_release_sockets;
 811                         }
 812                 }
 813
 814                 if (sock && msock) {
 815                         __set_current_state(TASK_INTERRUPTIBLE);
 816                         schedule_timeout(HZ / 10);
 817                         ok = drbd_socket_okay(mdev, &sock);
 818                         ok = drbd_socket_okay(mdev, &msock) && ok;
 819                         if (ok)
 820                                 break;
 821                 }
 822
 823 retry:
 824                 s = drbd_wait_for_connect(mdev);
 825                 if (s) {
 826                         try = drbd_recv_fp(mdev, s);
 827                         drbd_socket_okay(mdev, &sock);
 828                         drbd_socket_okay(mdev, &msock);
 829                         switch (try) {
 830                         case P_HAND_SHAKE_S:
 831                                 if (sock) {
 832                                         dev_warn(DEV, "initial packet S crossed\n");
 833                                         sock_release(sock);
 834                                 }
 835                                 sock = s;
 836                                 break;
 837                         case P_HAND_SHAKE_M:
 838                                 if (msock) {
 839                                         dev_warn(DEV, "initial packet M crossed\n");
 840                                         sock_release(msock);
 841                                 }
 842                                 msock = s;
 843                                 set_bit(DISCARD_CONCURRENT, &mdev->flags);
 844                                 break;
 845                         default:
 846                                 dev_warn(DEV, "Error receiving initial packet\n");
 847                                 sock_release(s);
 848                                 if (random32() & 1)
 849                                         goto retry;
 850                         }
 851                 }
 852
 853                 if (mdev->state.conn <= C_DISCONNECTING)
 854                         goto out_release_sockets;
 855                 if (signal_pending(current)) {
 856                         flush_signals(current);
 857                         smp_rmb();
 858                         if (get_t_state(&mdev->receiver) == Exiting)
 859                                 goto out_release_sockets;
 860                 }
 861
 862                 if (sock && msock) {
 863                         ok = drbd_socket_okay(mdev, &sock);
 864                         ok = drbd_socket_okay(mdev, &msock) && ok;
 865                         if (ok)
 866                                 break;
 867                 }
 868         } while (1);
 869
 870         msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
 871         sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
 872
 873         sock->sk->sk_allocation = GFP_NOIO;
 874         msock->sk->sk_allocation = GFP_NOIO;
 875
 876         sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
 877         msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
 878
 879         /* NOT YET ...
 880          * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
 881          * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
 882          * first set it to the P_HAND_SHAKE timeout,
 883          * which we set to 4x the configured ping_timeout. */
 884         sock->sk->sk_sndtimeo =
 885         sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
 886
 887         msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
 888         msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
 889
 890         /* we don't want delays.
 891          * we use TCP_CORK where apropriate, though */
 892         drbd_tcp_nodelay(sock);
 893         drbd_tcp_nodelay(msock);
 894
 895         mdev->data.socket = sock;
 896         mdev->meta.socket = msock;
 897         mdev->last_received = jiffies;
 898
 899         D_ASSERT(mdev->asender.task == NULL);
 900
 901         h = drbd_do_handshake(mdev);
 902         if (h <= 0)
 903                 return h;
 904
 905         if (mdev->cram_hmac_tfm) {
 906                 /* drbd_request_state(mdev, NS(conn, WFAuth)); */
 907                 switch (drbd_do_auth(mdev)) {
 908                 case -1:
 909                         dev_err(DEV, "Authentication of peer failed\n");
 910                         return -1;
 911                 case 0:
 912                         dev_err(DEV, "Authentication of peer failed, trying again.\n");
 913                         return 0;
 914                 }
 915         }
 916
 917         if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
 918                 return 0;
 919
 920         sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
 921         sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
 922
 923         atomic_set(&mdev->packet_seq, 0);
 924         mdev->peer_seq = 0;
 925
 926         drbd_thread_start(&mdev->asender);
 927
 928         if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
 929                 drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
 930                 put_ldev(mdev);
 931         }
 932
 933         if (!drbd_send_protocol(mdev))
 934                 return -1;
 935         drbd_send_sync_param(mdev, &mdev->sync_conf);
 936         drbd_send_sizes(mdev, 0, 0);
 937         drbd_send_uuids(mdev);
 938         drbd_send_state(mdev);
 939         clear_bit(USE_DEGR_WFC_T, &mdev->flags);
 940         clear_bit(RESIZE_PENDING, &mdev->flags);
 941
 942         return 1;
 943
 944 out_release_sockets:
 945         if (sock)
 946                 sock_release(sock);
 947         if (msock)
 948                 sock_release(msock);
 949         return -1;
 950 }
 951
     /**
      * drbd_recv_header() - Receive one packet header and decode command and length
      * @mdev:        DRBD device.
      * @cmd:         Out: command of the packet, converted to host byte order.
      * @packet_size: Out: length field of the header, converted to host byte order.
      *
      * Reads sizeof(union p_header) bytes via drbd_recv() into
      * mdev->data.rbuf.header and decodes either the h80 layout (16 bit length)
      * or the h95 layout (32 bit length), selected by the magic value.
      * On success mdev->last_received is set to the current jiffies.
      *
      * Returns TRUE on success, FALSE on a short read or an unknown magic;
      * in the FALSE case *cmd and *packet_size are not written.
      */
 952 static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
 953 {
 954         union p_header *h = &mdev->data.rbuf.header;
 955         int r;
 956
 957         r = drbd_recv(mdev, h, sizeof(*h));
 958         if (unlikely(r != sizeof(*h))) {
 959                 dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
 960                 return FALSE;
 961         }
 962
 963         if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
 964                 *cmd = be16_to_cpu(h->h80.command);
 965                 *packet_size = be16_to_cpu(h->h80.length);
 966         } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
 967                 *cmd = be16_to_cpu(h->h95.command);
 968                 *packet_size = be32_to_cpu(h->h95.length);
 969         } else {
                     /* The header fields are big endian on the wire (see the
                      * conversions above); convert command and length as well,
                      * so the log shows the same values on every host. */
 970                 dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
 971                     (long)be32_to_cpu(h->h80.magic),
 972                     be16_to_cpu(h->h80.command),
                         be16_to_cpu(h->h80.length));
 973                 return FALSE;
 974         }
 975         mdev->last_received = jiffies;
 976
 977         return TRUE;
 978 }
 979
     /**
      * drbd_flush_after_epoch() - Flush the backing device, then apply EV_BARRIER_DONE
      * @mdev:       DRBD device.
      * @epoch:      Epoch object the EV_BARRIER_DONE event is applied to.
      *
      * The flush is only issued if the write ordering method is at least
      * WO_bdev_flush and a reference on the local disk could be taken with
      * get_ldev().  If the flush returns non-zero, the failure is logged and
      * drbd_bump_write_ordering(mdev, WO_drain_io) is called, so the flush is
      * not tried again.
      *
      * Returns the result of drbd_may_finish_epoch() for EV_BARRIER_DONE.
      */
 980 static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
 981 {
 982         int rv;
 983
 984         if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
                     /* GFP_NOIO instead of GFP_KERNEL: this runs from
                      * receive_Barrier() and w_flush(), i.e. in the write-out
                      * path of the peer.  An allocation that may start I/O
                      * could cause write-out that waits for this very context,
                      * the distributed deadlock the other allocations in this
                      * file avoid by using GFP_NOIO. */
 985                 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_NOIO,
 986                                         NULL, BLKDEV_IFL_WAIT);
 987                 if (rv) {
 988                         dev_err(DEV, "local disk flush failed with status %d\n", rv);
 989                         /* would rather check on EOPNOTSUPP, but that is not reliable.
 990                          * don't try again for ANY return value != 0
 991                          * if (rv == -EOPNOTSUPP) */
 992                         drbd_bump_write_ordering(mdev, WO_drain_io);
 993                 }
 994                 put_ldev(mdev);
 995         }
 996
 997         return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
 998 }
 999
     /* w_flush() - work callback for the flush_work items queued by
      * drbd_may_finish_epoch().
      *
      * @mdev:   DRBD device.
      * @w:      the struct drbd_work embedded at the start of a struct flush_work;
      *          the flush_work is freed here.
      * @cancel: not used by this callback.
      *
      * Issues the flush for the epoch unless DE_BARRIER_IN_NEXT_EPOCH_ISSUED was
      * already set, then applies EV_PUT to the epoch (with EV_CLEANUP added
      * while the connection state is below C_CONNECTED).
      *
      * Always returns 1.
      */
1000 static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1001 {
             struct drbd_epoch *target = ((struct flush_work *)w)->epoch;
             int ev = EV_PUT;
1004
             /* the work item only transported the epoch pointer */
1005         kfree(w);
1006
             if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &target->flags))
                     drbd_flush_after_epoch(mdev, target);
1009
             /* connection state is sampled only now, after a possible flush */
             if (mdev->state.conn < C_CONNECTED)
                     ev |= EV_CLEANUP;
             drbd_may_finish_epoch(mdev, target, ev);
1012
1013         return 1;
1014 }
1015
1016 /**
1017  * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
1018  * @mdev:       DRBD device.
1019  * @epoch:      Epoch object.
1020  * @ev:         Epoch event.
      *
      * @ev is one of EV_PUT, EV_GOT_BARRIER_NR, EV_BARRIER_DONE or
      * EV_BECAME_LAST, optionally ORed with EV_CLEANUP.  With EV_CLEANUP set,
      * an epoch that otherwise qualifies is finished unconditionally and no
      * barrier ack is sent for it.
      *
      * Takes mdev->epoch_lock; the lock is dropped temporarily while the
      * barrier ack is sent.
      *
      * Finishing an epoch that is not mdev->current_epoch unlinks and frees
      * it, and the loop then re-evaluates the next epoch in the list with
      * EV_BECAME_LAST.  Finishing mdev->current_epoch only resets it for reuse.
      *
      * Returns FE_STILL_LIVE if @epoch was not finished, FE_DESTROYED if it
      * was freed (the caller must not touch it any more), FE_RECYCLED if it
      * was the current epoch and got reset.
1021  */
1022 static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1023                                                struct drbd_epoch *epoch,
1024                                                enum epoch_event ev)
1025 {
1026         int finish, epoch_size;
1027         struct drbd_epoch *next_epoch;
1028         int schedule_flush = 0;
1029         enum finish_epoch rv = FE_STILL_LIVE;
1030
1031         spin_lock(&mdev->epoch_lock);
             /* One pass per epoch.  A further pass only happens after an epoch
              * was destroyed; it then looks at that epoch's list successor. */
1032         do {
1033                 next_epoch = NULL;
1034                 finish = 0;
1035
1036                 epoch_size = atomic_read(&epoch->epoch_size);
1037
                     /* step 1: apply the event itself to the epoch's state */
1038                 switch (ev & ~EV_CLEANUP) {
1039                 case EV_PUT:
1040                         atomic_dec(&epoch->active);
1041                         break;
1042                 case EV_GOT_BARRIER_NR:
1043                         set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1044
1045                         /* Special case: If we just switched from WO_bio_barrier to
1046                            WO_bdev_flush we should not finish the current epoch */
1047                         if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
1048                             mdev->write_ordering != WO_bio_barrier &&
1049                             epoch == mdev->current_epoch)
1050                                 clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
1051                         break;
1052                 case EV_BARRIER_DONE:
1053                         set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
1054                         break;
1055                 case EV_BECAME_LAST:
1056                         /* nothing to do*/
1057                         break;
1058                 }
1059
                     /* step 2: decide whether the epoch can be finished now.
                      * The list.prev test holds for the epoch that directly
                      * follows current_epoch in the circular list;
                      * receive_Barrier() links every new epoch right behind the
                      * then-current one, so that is the oldest epoch (or the
                      * only one). */
1060                 if (epoch_size != 0 &&
1061                     atomic_read(&epoch->active) == 0 &&
1062                     test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
1063                     epoch->list.prev == &mdev->current_epoch->list &&
1064                     !test_bit(DE_IS_FINISHING, &epoch->flags)) {
1065                         /* Nearly all conditions are met to finish that epoch... */
1066                         if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
1067                             mdev->write_ordering == WO_none ||
1068                             (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
1069                             ev & EV_CLEANUP) {
1070                                 finish = 1;
1071                                 set_bit(DE_IS_FINISHING, &epoch->flags);
1072                         } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
1073                                  mdev->write_ordering == WO_bio_barrier) {
                                     /* extra reference for the flush work scheduled
                                      * below; w_flush() (or the allocation failure
                                      * path below) drops it again with EV_PUT */
1074                                 atomic_inc(&epoch->active);
1075                                 schedule_flush = 1;
1076                         }
1077                 }
                     /* step 3: finish it: ack, then destroy or recycle */
1078                 if (finish) {
1079                         if (!(ev & EV_CLEANUP)) {
                                     /* epoch_lock is not held across the send */
1080                                 spin_unlock(&mdev->epoch_lock);
1081                                 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
1082                                 spin_lock(&mdev->epoch_lock);
1083                         }
1084                         dec_unacked(mdev);
1085
1086                         if (mdev->current_epoch != epoch) {
                                     /* not the current epoch: unlink and free it,
                                      * then continue with its successor */
1087                                 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1088                                 list_del(&epoch->list);
1089                                 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1090                                 mdev->epochs--;
1091                                 kfree(epoch);
1092
                                     /* rv is only set once, so it describes the
                                      * epoch the caller passed in */
1093                                 if (rv == FE_STILL_LIVE)
1094                                         rv = FE_DESTROYED;
1095                         } else {
                                     /* the current epoch is reset in place */
1096                                 epoch->flags = 0;
1097                                 atomic_set(&epoch->epoch_size, 0);
1098                                 /* atomic_set(&epoch->active, 0); is already zero */
1099                                 if (rv == FE_STILL_LIVE)
1100                                         rv = FE_RECYCLED;
1101                         }
1102                 }
1103
1104                 if (!next_epoch)
1105                         break;
1106
1107                 epoch = next_epoch;
1108         } while (1);
1109
1110         spin_unlock(&mdev->epoch_lock);
1111
             /* schedule_flush is only set in a pass with finish == 0, which is
              * also the last pass (next_epoch stays NULL), so epoch still points
              * to the epoch that asked for the flush. */
1112         if (schedule_flush) {
1113                 struct flush_work *fw;
1114                 fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
1115                 if (fw) {
1116                         fw->w.cb = w_flush;
1117                         fw->epoch = epoch;
1118                         drbd_queue_work(&mdev->data.work, &fw->w);
1119                 } else {
                             /* No work item: no flush is issued.  Mark it as
                              * issued and done right here and drop the reference
                              * taken above. */
1120                         dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
1121                         set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1122                         /* That is not a recursion, only one level */
1123                         drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1124                         drbd_may_finish_epoch(mdev, epoch, EV_PUT);
1125                 }
1126         }
1127
1128         return rv;
1129 }
1130
1131 /**
1132  * drbd_bump_write_ordering() - Fall back to an other write ordering method
1133  * @mdev:       DRBD device.
1134  * @wo:         Write ordering method to try.
      *
      * The method actually selected is the smaller of @wo and the method
      * currently in use, stepped down further past every method that is
      * disabled in the disk configuration (no_disk_barrier, no_disk_flush,
      * no_disk_drain).  The result is stored in mdev->write_ordering and logged
      * if it differs from the previous method, or if it is WO_bio_barrier.
1135  */
1136 void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
1137 {
             static char *method_name[] = {
1140                 [WO_none] = "none",
1141                 [WO_drain_io] = "drain",
1142                 [WO_bdev_flush] = "flush",
1143                 [WO_bio_barrier] = "barrier",
1144         };
             const enum write_ordering_e previous = mdev->write_ordering;
1145
             /* never go above the method currently in use */
             if (previous < wo)
                     wo = previous;

             /* step down past every method the disk configuration rules out */
             switch (wo) {
             case WO_bio_barrier:
                     if (!mdev->ldev->dc.no_disk_barrier)
                             break;
                     wo = WO_bdev_flush;
                     /* fall through */
             case WO_bdev_flush:
                     if (!mdev->ldev->dc.no_disk_flush)
                             break;
                     wo = WO_drain_io;
                     /* fall through */
             case WO_drain_io:
                     if (mdev->ldev->dc.no_disk_drain)
                             wo = WO_none;
                     break;
             default:
                     break;
             }

1154         mdev->write_ordering = wo;
             if (wo != previous || wo == WO_bio_barrier)
                     dev_info(DEV, "Method to ensure write ordering: %s\n", method_name[wo]);
1157 }
1158
1159 /**
1160  * drbd_submit_ee() - Submit the page chain of an epoch entry as one or more bios
1161  * @mdev:       DRBD device.
1162  * @e:          epoch entry
1163  * @rw:         flag field, see bio->bi_rw
      * @fault_type: passed through to drbd_generic_make_request()
      *
      * All bios are built first, covering e->size bytes starting at e->sector
      * on mdev->ldev->backing_bdev, with e as bi_private and drbd_endio_sec as
      * completion callback.  Only after e->pending_bios has been set to the
      * number of bios are they handed to drbd_generic_make_request().
      *
      * Returns 0 once all bios were submitted, or -ENOMEM if bio_alloc()
      * failed; in that case nothing was submitted and the bios built so far
      * have been released with bio_put().
1164  */
1165 /* TODO allocate from our own bio_set. */
1166 int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1167                 const unsigned rw, const int fault_type)
1168 {
1169         struct bio *bios = NULL;
1170         struct bio *bio;
1171         struct page *page = e->pages;
1172         sector_t sector = e->sector;
1173         unsigned ds = e->size;
1174         unsigned n_bios = 0;
             /* number of pages still to be placed into a bio, rounded up */
1175         unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
1176
1177         /* In most cases, we will only need one bio.  But in case the lower
1178          * level restrictions happen to be different at this offset on this
1179          * side than those of the sending peer, we may need to submit the
1180          * request in more than one bio. */
1181 next_bio:
             /* sized for all pages that are still unplaced */
1182         bio = bio_alloc(GFP_NOIO, nr_pages);
1183         if (!bio) {
1184                 dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
1185                 goto fail;
1186         }
1187         /* > e->sector, unless this is the first bio */
1188         bio->bi_sector = sector;
1189         bio->bi_bdev = mdev->ldev->backing_bdev;
1190         /* we special case some flags in the multi-bio case, see below
1191          * (REQ_UNPLUG, REQ_HARDBARRIER) */
1192         bio->bi_rw = rw;
1193         bio->bi_private = e;
1194         bio->bi_end_io = drbd_endio_sec;
1195
             /* prepend: the bios list ends up in reverse order of creation */
1196         bio->bi_next = bios;
1197         bios = bio;
1198         ++n_bios;
1199
             /* page is not reset on "goto next_bio", so the walk resumes with
              * the page that did not fit into the previous bio */
1200         page_chain_for_each(page) {
1201                 unsigned len = min_t(unsigned, ds, PAGE_SIZE);
1202                 if (!bio_add_page(bio, page, len, 0)) {
1203                         /* a single page must always be possible! */
1204                         BUG_ON(bio->bi_vcnt == 0);
1205                         goto next_bio;
1206                 }
1207                 ds -= len;
1208                 sector += len >> 9;
1209                 --nr_pages;
1210         }
1211         D_ASSERT(page == NULL);
1212         D_ASSERT(ds == 0);
1213
             /* set the count before the first submit */
1214         atomic_set(&e->pending_bios, n_bios);
             /* submit from the head of the list, i.e. newest bio first */
1215         do {
1216                 bio = bios;
1217                 bios = bios->bi_next;
1218                 bio->bi_next = NULL;
1219
1220                 /* strip off REQ_UNPLUG unless it is the last bio */
1221                 if (bios)
1222                         bio->bi_rw &= ~REQ_UNPLUG;
1223
1224                 drbd_generic_make_request(mdev, fault_type, bio);
1225
1226                 /* strip off REQ_HARDBARRIER,
1227                  * unless it is the first or last bio */
1228                 if (bios && bios->bi_next)
1229                         bios->bi_rw &= ~REQ_HARDBARRIER;
1230         } while (bios);
1231         maybe_kick_lo(mdev);
1232         return 0;
1233
1234 fail:
             /* only reached before anything was submitted: drop what was built */
1235         while (bios) {
1236                 bio = bios;
1237                 bios = bios->bi_next;
1238                 bio_put(bio);
1239         }
1240         return -ENOMEM;
1241 }
1242
1243 /**
1244  * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set
1245  * @mdev:       DRBD device.
1246  * @w:          work object.
1247  * @cancel:     The connection will be closed anyways (unused in this callback)
      *
      * The epoch entry is resubmitted as a plain WRITE with e_end_block as its
      * completion callback.  If drbd_submit_ee() fails, this callback is
      * queued again on mdev->data.work to retry later.
      *
      * Always returns 1.
1248  */
1249 int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
1250 {
             struct drbd_epoch_entry *entry = (struct drbd_epoch_entry *)w;

             /* DE_CONTAINS_A_BARRIER and EE_IS_BARRIER stay set (as does
              * DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous epoch), so that
              * drbd_may_finish_epoch() can still finish that epoch.  This
              * matters if a long chain of epochs already exists by the time we
              * learn that REQ_HARDBARRIER is not supported. */

             /* With -ENOTSUPP on the barrier reported immediately, this never
              * triggers.  If it is reported late, we print the warning once
              * and carry on correctly with WO_bdev_flush for all future
              * requests. */
             if (previous_epoch(mdev, entry->epoch))
1263                 dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
1264
             /* The local reference taken by get_ldev in receive_Data is
              * still held. */

             /* completion callback must be in place before submitting */
             entry->w.cb = e_end_block;
             if (drbd_submit_ee(mdev, entry, WRITE, DRBD_FAULT_DT_WR) == 0)
                     return 1;

             /* drbd_submit_ee fails for one reason only: it could not
              * allocate sufficient bios.  Requeue and try again later. */
             entry->w.cb = w_e_reissue;
             drbd_queue_work(&mdev->data.work, &entry->w);
1276         return 1;
1277 }
1278
     /* receive_Barrier() - handle a barrier packet from the peer.
      *
      * @mdev:      DRBD device.
      * @cmd:       not used here.
      * @data_size: not used here.
      *
      * Stores the barrier number from the receive buffer in the current epoch,
      * applies EV_GOT_BARRIER_NR to it, and, unless the current epoch got
      * recycled, provides a fresh epoch object for the writes that follow.
      * For the WO_bdev_flush and WO_drain_io methods it first waits for
      * active_ee to drain and runs drbd_flush_after_epoch() if the epoch is
      * still live.
      *
      * Returns TRUE on every path.
      */
1279 static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1280 {
1281         int rv, issue_flush;
1282         struct p_barrier *p = &mdev->data.rbuf.barrier;
1283         struct drbd_epoch *epoch;
1284
             /* the matching dec_unacked() is in drbd_may_finish_epoch(),
              * when the epoch gets finished */
1285         inc_unacked(mdev);
1286
1287         if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
1288                 drbd_kick_lo(mdev);
1289
             /* stored as received, without byte order conversion */
1290         mdev->current_epoch->barrier_nr = p->barrier;
1291         rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
1292
1293         /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1294          * the activity log, which means it would not be resynced in case the
1295          * R_PRIMARY crashes now.
1296          * Therefore we must send the barrier_ack after the barrier request was
1297          * completed. */
1298         switch (mdev->write_ordering) {
1299         case WO_bio_barrier:
1300         case WO_none:
                     /* recycled: the current epoch object was reset and is
                      * reused, no new one is needed */
1301                 if (rv == FE_RECYCLED)
1302                         return TRUE;
1303                 break;
1304
1305         case WO_bdev_flush:
1306         case WO_drain_io:
1307                 if (rv == FE_STILL_LIVE) {
1308                         set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1309                         drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1310                         rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1311                 }
1312                 if (rv == FE_RECYCLED)
1313                         return TRUE;
1314
1315                 /* The asender will send all the ACKs and barrier ACKs out, since
1316                    all EEs moved from the active_ee to the done_ee. We need to
1317                    provide a new epoch object for the EEs that come in soon */
1318                 break;
1319         }
1320
1321         /* receiver context, in the writeout path of the other node.
1322          * avoid potential distributed deadlock */
1323         epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1324         if (!epoch) {
                     /* No memory for a new epoch: keep using the current one,
                      * after waiting for active_ee to drain, flushing (unless
                      * that was already issued) and waiting for done_ee. */
1325                 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1326                 issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1327                 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1328                 if (issue_flush) {
1329                         rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1330                         if (rv == FE_RECYCLED)
1331                                 return TRUE;
1332                 }
1333
1334                 drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
1335
1336                 return TRUE;
1337         }
1338
1339         epoch->flags = 0;
1340         atomic_set(&epoch->epoch_size, 0);
1341         atomic_set(&epoch->active, 0);
1342
1343         spin_lock(&mdev->epoch_lock);
1344         if (atomic_read(&mdev->current_epoch->epoch_size)) {
                     /* link the new epoch right behind the current one and
                      * make it the current epoch */
1345                 list_add(&epoch->list, &mdev->current_epoch->list);
1346                 mdev->current_epoch = epoch;
1347                 mdev->epochs++;
1348         } else {
1349                 /* The current_epoch got recycled while we allocated this one... */
1350                 kfree(epoch);
1351         }
1352         spin_unlock(&mdev->epoch_lock);
1353
1354         return TRUE;
1355 }
1356
1357 /* used from receive_RSDataReply (recv_resync_read)
1358  * and from receive_Data
      *
      * read_in_block() - receive one data block into a newly allocated epoch entry.
      *
      * @mdev:      DRBD device.
      * @id:        passed on to drbd_alloc_ee().
      * @sector:    start sector of the block.
      * @data_size: payload size in bytes, including the integrity digest if
      *             one is in use.
      *
      * If an integrity digest is in use (agreed_pro_version >= 87 and
      * integrity_r_tfm set), the digest is received first and checked against
      * the received data afterwards.
      *
      * Returns the epoch entry holding the data, or NULL on a short read, an
      * unaligned or too large size, a block reaching beyond the device
      * capacity, allocation failure or digest mismatch.  An epoch entry that
      * was already allocated is freed on those error paths.
      *
      * NOTE(review): data_size is signed and is not checked against the digest
      * size; a packet shorter than the digest would make it negative here.
      * Confirm that callers rule this out.
      */
1359 static struct drbd_epoch_entry *
1360 read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
1361 {
1362         const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
1363         struct drbd_epoch_entry *e;
1364         struct page *page;
1365         int dgs, ds, rr;
             /* dig_in: digest as received; dig_vv: digest computed locally */
1366         void *dig_in = mdev->int_dig_in;
1367         void *dig_vv = mdev->int_dig_vv;
1368         unsigned long *data;
1369
             /* digest size in bytes, 0 if no integrity checking is in use */
1370         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1371                 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1372
1373         if (dgs) {
1374                 rr = drbd_recv(mdev, dig_in, dgs);
1375                 if (rr != dgs) {
1376                         dev_warn(DEV, "short read receiving data digest: read %d expected %d\n",
1377                              rr, dgs);
1378                         return NULL;
1379                 }
1380         }
1381
             /* from here on data_size is the payload without the digest */
1382         data_size -= dgs;
1383
             /* payload must be a multiple of 512 bytes and within the limit */
1384         ERR_IF(data_size &  0x1ff) return NULL;
1385         ERR_IF(data_size >  DRBD_MAX_SEGMENT_SIZE) return NULL;
1386
1387         /* even though we trust out peer,
1388          * we sometimes have to double check. */
1389         if (sector + (data_size>>9) > capacity) {
1390                 dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n",
1391                         (unsigned long long)capacity,
1392                         (unsigned long long)sector, data_size);
1393                 return NULL;
1394         }
1395
1396         /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1397          * "criss-cross" setup, that might cause write-out on some other DRBD,
1398          * which in turn might block on the other node at this very place.  */
1399         e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
1400         if (!e)
1401                 return NULL;
1402
             /* receive the payload page by page into the entry's page chain */
1403         ds = data_size;
1404         page = e->pages;
1405         page_chain_for_each(page) {
1406                 unsigned len = min_t(int, ds, PAGE_SIZE);
1407                 data = kmap(page);
1408                 rr = drbd_recv(mdev, data, len);
1409                 if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) {
1410                         dev_err(DEV, "Fault injection: Corrupting data on receive\n");
                             /* invert all bits of the first word of this page */
1411                         data[0] = data[0] ^ (unsigned long)-1;
1412                 }
1413                 kunmap(page);
1414                 if (rr != len) {
1415                         drbd_free_ee(mdev, e);
1416                         dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1417                              rr, len);
1418                         return NULL;
1419                 }
1420                 ds -= rr;
1421         }
1422
             /* compare the received digest with one computed over the data */
1423         if (dgs) {
1424                 drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
1425                 if (memcmp(dig_in, dig_vv, dgs)) {
1426                         dev_err(DEV, "Digest integrity check FAILED.\n");
1427                         drbd_bcast_ee(mdev, "digest failed",
1428                                         dgs, dig_in, dig_vv, e);
1429                         drbd_free_ee(mdev, e);
1430                         return NULL;
1431                 }
1432         }
             /* account the received payload, in 512 byte sectors */
1433         mdev->recv_cnt += data_size>>9;
1434         return e;
1435 }
1436
1437 /* drbd_drain_block() just takes a data block
1438  * out of the socket input buffer, and discards it.
      *
      * @mdev:      DRBD device.
      * @data_size: number of payload bytes to receive and throw away.
      *
      * The data is received, at most PAGE_SIZE bytes at a time, into a single
      * scratch page that is given back before returning.
      *
      * Returns TRUE if data_size is zero, 1 if all bytes were drained, and 0
      * if no scratch page could be obtained or a read came up short.
1439  */
1440 static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1441 {
1442         struct page *page;
1443         int rr, rv = 1;
1444         void *data;
1445
1446         if (!data_size)
1447                 return TRUE;
1448
1449         page = drbd_pp_alloc(mdev, 1, 1);
             /* Do not hand a NULL page to kmap().  Nothing visible here
              * guarantees that drbd_pp_alloc() succeeds (presumably it can
              * give up, e.g. when interrupted), so treat a missing page
              * like a failed receive. */
             if (!page)
                     return 0;
1450
1451         data = kmap(page);
1452         while (data_size) {
                     /* the same page is overwritten in every round */
                     const int want = min_t(int, data_size, PAGE_SIZE);

1453                 rr = drbd_recv(mdev, data, want);
1454                 if (rr != want) {
1455                         rv = 0;
1456                         dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1457                              rr, want);
1458                         break;
1459                 }
1460                 data_size -= rr;
1461         }
1462         kunmap(page);
1463         drbd_pp_free(mdev, page);
1464         return rv;
1465 }
1466
     /* recv_dless_read() - receive the payload of a data reply directly into
      * the segments of the request's master bio.
      *
      * @mdev:      DRBD device.
      * @req:       request whose master_bio receives the data.
      * @sector:    sector announced by the peer; asserted to equal the bio's.
      * @data_size: payload size in bytes, including the integrity digest if
      *             one is in use.
      *
      * Returns 1 if all data was received (and the digest, if any, matched),
      * 0 on a short read or a digest mismatch.
      */
1467 static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1468                            sector_t sector, int data_size)
1469 {
             struct bio *bio = req->master_bio;
             struct bio_vec *bvec;
             void *dig_in = mdev->int_dig_in;
             void *dig_vv = mdev->int_dig_vv;
             int dgs = 0;
             int rr, i, expect;
1475
             /* a digest precedes the data only if both sides agreed on it */
             if (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm)
                     dgs = crypto_hash_digestsize(mdev->integrity_r_tfm);
1478
1479         if (dgs) {
1480                 rr = drbd_recv(mdev, dig_in, dgs);
1481                 if (rr != dgs) {
1482                         dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n",
1483                              rr, dgs);
1484                         return 0;
1485                 }
1486         }
1487
1488         data_size -= dgs;
1489
             /* Account the sectors up front.  Should receiving fail below, we
              * disconnect anyways, and the counters get reset. */
             mdev->recv_cnt += data_size >> 9;
1493
1495         D_ASSERT(sector == bio->bi_sector);
1496
1497         bio_for_each_segment(bvec, bio, i) {
                     char *mapped;

                     expect = min_t(int, data_size, bvec->bv_len);
                     mapped = kmap(bvec->bv_page);
                     rr = drbd_recv(mdev, mapped + bvec->bv_offset, expect);
1502                 kunmap(bvec->bv_page);
1503                 if (rr != expect) {
1504                         dev_warn(DEV, "short read receiving data reply: "
1505                              "read %d expected %d\n",
1506                              rr, expect);
1507                         return 0;
1508                 }
1509                 data_size -= rr;
1510         }
1511
             /* verify the data that now sits in the bio against the digest */
1512         if (dgs) {
1513                 drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
                     if (memcmp(dig_in, dig_vv, dgs) != 0) {
1515                         dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
1516                         return 0;
1517                 }
1518         }
1519
1520         D_ASSERT(data_size == 0);
1521         return 1;
1522 }
1523
1524 /* e_end_resync_block() is called via
1525  * drbd_process_done_ee() by asender only
      *
      * Completion callback for resync writes (set up in recv_resync_read()).
      * Without EE_WAS_ERROR the area is marked in sync and P_RS_WRITE_ACK is
      * sent; otherwise the failure is recorded with drbd_rs_failed_io() and
      * P_NEG_ACK is sent.  Either way the unacked count is decremented.
      *
      * Returns the result of drbd_send_ack(). */
1526 static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1527 {
             struct drbd_epoch_entry *entry = (struct drbd_epoch_entry *)w;
             const sector_t sector = entry->sector;
             enum drbd_packets ack;
             int ok;
1531
             D_ASSERT(hlist_unhashed(&entry->colision));
1533
             if (unlikely(entry->flags & EE_WAS_ERROR)) {
                     /* Record failure to sync */
                     drbd_rs_failed_io(mdev, sector, entry->size);
                     ack = P_NEG_ACK;
             } else {
                     drbd_set_in_sync(mdev, sector, entry->size);
                     ack = P_RS_WRITE_ACK;
             }

             ok = drbd_send_ack(mdev, ack, entry);
1543         dec_unacked(mdev);
1544
1545         return ok;
1546 }
1547
     /* recv_resync_read() - receive one resync data block and submit it as a
      * write to the local disk.
      *
      * @mdev:      DRBD device.
      * @sector:    start sector of the block.
      * @data_size: payload size in bytes as passed to read_in_block().
      *
      * Called with a local disk reference held (see receive_RSDataReply()).
      * On failure that reference is dropped here with put_ldev(); on success
      * it is dropped later, once the write has completed.
      *
      * Returns TRUE if the write was submitted, FALSE if receiving the block
      * or submitting it failed.
      */
1548 static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
1549 {
1550         struct drbd_epoch_entry *e;
1551
1552         e = read_in_block(mdev, ID_SYNCER, sector, data_size);
1553         if (!e)
1554                 goto fail;
1555
1556         dec_rs_pending(mdev);
1557
1558         inc_unacked(mdev);
1559         /* corresponding dec_unacked() in e_end_resync_block()
1560          * respective _drbd_clear_done_ee */
1561
1562         e->w.cb = e_end_resync_block;
1563
1564         spin_lock_irq(&mdev->req_lock);
1565         list_add(&e->w.list, &mdev->sync_ee);
1566         spin_unlock_irq(&mdev->req_lock);
1567
1568         atomic_add(data_size >> 9, &mdev->rs_sect_ev);
1569         if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
1570                 return TRUE;
1571
             /* Submission failed, so no completion will ever take the entry
              * off sync_ee.  Unlink it under the same lock it was added with
              * before freeing it; otherwise the list would keep a pointer
              * into freed memory. */
             spin_lock_irq(&mdev->req_lock);
             list_del(&e->w.list);
             spin_unlock_irq(&mdev->req_lock);

1572         drbd_free_ee(mdev, e);
1573 fail:
1574         put_ldev(mdev);
1575         return FALSE;
1576 }
1577
     /* receive_DataReply() - handle the reply to a read request that was sent
      * to the peer.
      *
      * @mdev:      DRBD device.
      * @cmd:       not used here.
      * @data_size: payload size in bytes, handed on to recv_dless_read().
      *
      * Looks up the request for the block_id/sector pair found in the receive
      * buffer, receives the payload into the request's master bio and, on
      * success, applies the data_received event to the request.
      *
      * Returns FALSE if no matching request exists, otherwise the result of
      * recv_dless_read().
      */
1578 static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1579 {
             struct p_data *p = &mdev->data.rbuf.data;
             const sector_t sector = be64_to_cpu(p->sector);
             struct drbd_request *req;
             int ok;
1584
             /* the lookup is done under req_lock */
1587         spin_lock_irq(&mdev->req_lock);
1588         req = _ar_id_to_req(mdev, p->block_id, sector);
1589         spin_unlock_irq(&mdev->req_lock);
1590         if (unlikely(!req)) {
1591                 dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
1592                 return FALSE;
1593         }
1594
             /* hlist_del(&req->colision) happens in _req_may_be_done, which
              * avoids special casing it there for the various failure cases.
              * There is still no race with drbd_fail_pending_reads. */
1598         ok = recv_dless_read(mdev, req, sector, data_size);
             if (!ok) {
                     /* Nothing to do here, drbd_disconnect handles it.
                      * I don't think we may complete this just yet
                      * in case we are "on-disconnect: freeze" */
                     return ok;
             }

             req_mod(req, data_received);
1606         return ok;
1607 }
1608
     /* receive_RSDataReply() - handle a resync data reply from the peer.
      *
      * @mdev:      DRBD device.
      * @cmd:       not used here.
      * @data_size: payload size in bytes.
      *
      * With a local disk reference available, the block is received and
      * written by recv_resync_read().  Without one, the payload is drained
      * from the socket and answered with P_NEG_ACK.  In both cases the sector
      * count is added to mdev->rs_sect_in.
      *
      * Returns the result of recv_resync_read() or drbd_drain_block().
      */
1609 static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1610 {
             struct p_data *p = &mdev->data.rbuf.data;
             const sector_t sector = be64_to_cpu(p->sector);
             int ok;
1614
1616         D_ASSERT(p->block_id == ID_SYNCER);
1617
             if (!get_ldev(mdev)) {
                     /* no usable local disk: throw the data away and tell
                      * the peer */
                     if (__ratelimit(&drbd_ratelimit_state))
                             dev_err(DEV, "Can not write resync data to local disk.\n");

                     ok = drbd_drain_block(mdev, data_size);

                     drbd_send_ack_dp(mdev, P_NEG_ACK, p);
             } else {
                     /* recv_resync_read submits the data to disk.  The
                      * matching put_ldev is done in there on error,
                      * or in drbd_endio_write_sec. */
                     ok = recv_resync_read(mdev, sector, data_size);
             }
1631
1632         atomic_add(data_size >> 9, &mdev->rs_sect_in);
1633
1634         return ok;
1635 }
1636
1637 /* e_end_block() is called via drbd_process_done_ee().
1638  * this means this function only runs in the asender thread
1639  */
1640 static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1641 {
1642         struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1643         sector_t sector = e->sector;
1644         struct drbd_epoch *epoch;
1645         int ok = 1, pcmd;
1646
1647         if (e->flags & EE_IS_BARRIER) {
1648                 epoch = previous_epoch(mdev, e->epoch);
1649                 if (epoch)
1650                         drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
1651         }
1652
1653         if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
1654                 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1655                         pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1656                                 mdev->state.conn <= C_PAUSED_SYNC_T &&
1657                                 e->flags & EE_MAY_SET_IN_SYNC) ?
1658                                 P_RS_WRITE_ACK : P_WRITE_ACK;
1659                         ok &= drbd_send_ack(mdev, pcmd, e);
1660                         if (pcmd == P_RS_WRITE_ACK)
1661                                 drbd_set_in_sync(mdev, sector, e->size);
1662                 } else {
1663                         ok  = drbd_send_ack(mdev, P_NEG_ACK, e);
1664                         /* we expect it to be marked out of sync anyways...
1665                          * maybe assert this?  */
1666                 }
1667                 dec_unacked(mdev);
1668         }
1669         /* we delete from the conflict detection hash _after_ we sent out the
1670          * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
1671         if (mdev->net_conf->two_primaries) {
1672                 spin_lock_irq(&mdev->req_lock);
1673                 D_ASSERT(!hlist_unhashed(&e->colision));
1674                 hlist_del_init(&e->colision);
1675                 spin_unlock_irq(&mdev->req_lock);
1676         } else {
1677                 D_ASSERT(hlist_unhashed(&e->colision));
1678         }
1679
1680         drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
1681
1682         return ok;
1683 }
1684
1685 static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1686 {
1687         struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1688         int ok = 1;
1689
1690         D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1691         ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
1692
1693         spin_lock_irq(&mdev->req_lock);
1694         D_ASSERT(!hlist_unhashed(&e->colision));
1695         hlist_del_init(&e->colision);
1696         spin_unlock_irq(&mdev->req_lock);
1697
1698         dec_unacked(mdev);
1699
1700         return ok;
1701 }
1702
1703 /* Called from receive_Data.
1704  * Synchronize packets on sock with packets on msock.
1705  *
1706  * This is here so even when a P_DATA packet traveling via sock overtook an Ack
1707  * packet traveling on msock, they are still processed in the order they have
1708  * been sent.
1709  *
1710  * Note: we don't care for Ack packets overtaking P_DATA packets.
1711  *
1712  * In case packet_seq is larger than mdev->peer_seq number, there are
1713  * outstanding packets on the msock. We wait for them to arrive.
1714  * In case we are the logically next packet, we update mdev->peer_seq
1715  * ourselves. Correctly handles 32bit wrap around.
1716  *
1717  * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
1718  * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
1719  * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
1720  * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
1721  *
1722  * returns 0 if we may process the packet,
1723  * -ERESTARTSYS if we were interrupted (by disconnect signal). */
1724 static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
1725 {
1726         DEFINE_WAIT(wait);
1727         unsigned int p_seq;
1728         long timeout;
1729         int ret = 0;
1730         spin_lock(&mdev->peer_seq_lock);
1731         for (;;) {
1732                 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
1733                 if (seq_le(packet_seq, mdev->peer_seq+1))
1734                         break;
1735                 if (signal_pending(current)) {
1736                         ret = -ERESTARTSYS;
1737                         break;
1738                 }
1739                 p_seq = mdev->peer_seq;
1740                 spin_unlock(&mdev->peer_seq_lock);
1741                 timeout = schedule_timeout(30*HZ);
1742                 spin_lock(&mdev->peer_seq_lock);
1743                 if (timeout == 0 && p_seq == mdev->peer_seq) {
1744                         ret = -ETIMEDOUT;
1745                         dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
1746                         break;
1747                 }
1748         }
1749         finish_wait(&mdev->seq_wait, &wait);
1750         if (mdev->peer_seq+1 == packet_seq)
1751                 mdev->peer_seq++;
1752         spin_unlock(&mdev->peer_seq_lock);
1753         return ret;
1754 }
1755
1756 /* mirrored write */
1757 static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1758 {
1759         sector_t sector;
1760         struct drbd_epoch_entry *e;
1761         struct p_data *p = &mdev->data.rbuf.data;
1762         int rw = WRITE;
1763         u32 dp_flags;
1764
1765         if (!get_ldev(mdev)) {
1766                 if (__ratelimit(&drbd_ratelimit_state))
1767                         dev_err(DEV, "Can not write mirrored data block "
1768                             "to local disk.\n");
1769                 spin_lock(&mdev->peer_seq_lock);
1770                 if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
1771                         mdev->peer_seq++;
1772                 spin_unlock(&mdev->peer_seq_lock);
1773
1774                 drbd_send_ack_dp(mdev, P_NEG_ACK, p);
1775                 atomic_inc(&mdev->current_epoch->epoch_size);
1776                 return drbd_drain_block(mdev, data_size);
1777         }
1778
1779         /* get_ldev(mdev) successful.
1780          * Corresponding put_ldev done either below (on various errors),
1781          * or in drbd_endio_write_sec, if we successfully submit the data at
1782          * the end of this function. */
1783
1784         sector = be64_to_cpu(p->sector);
1785         e = read_in_block(mdev, p->block_id, sector, data_size);
1786         if (!e) {
1787                 put_ldev(mdev);
1788                 return FALSE;
1789         }
1790
1791         e->w.cb = e_end_block;
1792
1793         spin_lock(&mdev->epoch_lock);
1794         e->epoch = mdev->current_epoch;
1795         atomic_inc(&e->epoch->epoch_size);
1796         atomic_inc(&e->epoch->active);
1797
1798         if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
1799                 struct drbd_epoch *epoch;
1800                 /* Issue a barrier if we start a new epoch, and the previous epoch
1801                    was not a epoch containing a single request which already was
1802                    a Barrier. */
1803                 epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
1804                 if (epoch == e->epoch) {
1805                         set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1806                         rw |= REQ_HARDBARRIER;
1807                         e->flags |= EE_IS_BARRIER;
1808                 } else {
1809                         if (atomic_read(&epoch->epoch_size) > 1 ||
1810                             !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
1811                                 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1812                                 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1813                                 rw |= REQ_HARDBARRIER;
1814                                 e->flags |= EE_IS_BARRIER;
1815                         }
1816                 }
1817         }
1818         spin_unlock(&mdev->epoch_lock);
1819
1820         dp_flags = be32_to_cpu(p->dp_flags);
1821         if (dp_flags & DP_HARDBARRIER) {
1822                 dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
1823                 /* rw |= REQ_HARDBARRIER; */
1824         }
1825         if (dp_flags & DP_RW_SYNC)
1826                 rw |= REQ_SYNC | REQ_UNPLUG;
1827         if (dp_flags & DP_MAY_SET_IN_SYNC)
1828                 e->flags |= EE_MAY_SET_IN_SYNC;
1829
1830         /* I'm the receiver, I do hold a net_cnt reference. */
1831         if (!mdev->net_conf->two_primaries) {
1832                 spin_lock_irq(&mdev->req_lock);
1833         } else {
1834                 /* don't get the req_lock yet,
1835                  * we may sleep in drbd_wait_peer_seq */
1836                 const int size = e->size;
1837                 const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags);
1838                 DEFINE_WAIT(wait);
1839                 struct drbd_request *i;
1840                 struct hlist_node *n;
1841                 struct hlist_head *slot;
1842                 int first;
1843
1844                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1845                 BUG_ON(mdev->ee_hash == NULL);
1846                 BUG_ON(mdev->tl_hash == NULL);
1847
1848                 /* conflict detection and handling:
1849                  * 1. wait on the sequence number,
1850                  *    in case this data packet overtook ACK packets.
1851                  * 2. check our hash tables for conflicting requests.
1852                  *    we only need to walk the tl_hash, since an ee can not
1853                  *    have a conflict with an other ee: on the submitting
1854                  *    node, the corresponding req had already been conflicting,
1855                  *    and a conflicting req is never sent.
1856                  *
1857                  * Note: for two_primaries, we are protocol C,
1858                  * so there cannot be any request that is DONE
1859                  * but still on the transfer log.
1860                  *
1861                  * unconditionally add to the ee_hash.
1862                  *
1863                  * if no conflicting request is found:
1864                  *    submit.
1865                  *
1866                  * if any conflicting request is found
1867                  * that has not yet been acked,
1868                  * AND I have the "discard concurrent writes" flag:
1869                  *       queue (via done_ee) the P_DISCARD_ACK; OUT.
1870                  *
1871                  * if any conflicting request is found:
1872                  *       block the receiver, waiting on misc_wait
1873                  *       until no more conflicting requests are there,
1874                  *       or we get interrupted (disconnect).
1875                  *
1876                  *       we do not just write after local io completion of those
1877                  *       requests, but only after req is done completely, i.e.
1878                  *       we wait for the P_DISCARD_ACK to arrive!
1879                  *
1880                  *       then proceed normally, i.e. submit.
1881                  */
1882                 if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
1883                         goto out_interrupted;
1884
1885                 spin_lock_irq(&mdev->req_lock);
1886
1887                 hlist_add_head(&e->colision, ee_hash_slot(mdev, sector));
1888
1889 #define OVERLAPS overlaps(i->sector, i->size, sector, size)
1890                 slot = tl_hash_slot(mdev, sector);
1891                 first = 1;
1892                 for (;;) {
1893                         int have_unacked = 0;
1894                         int have_conflict = 0;
1895                         prepare_to_wait(&mdev->misc_wait, &wait,
1896                                 TASK_INTERRUPTIBLE);
1897                         hlist_for_each_entry(i, n, slot, colision) {
1898                                 if (OVERLAPS) {
1899                                         /* only ALERT on first iteration,
1900                                          * we may be woken up early... */
1901                                         if (first)
1902                                                 dev_alert(DEV, "%s[%u] Concurrent local write detected!"
1903                                                       " new: %llus +%u; pending: %llus +%u\n",
1904                                                       current->comm, current->pid,
1905                                                       (unsigned long long)sector, size,
1906                                                       (unsigned long long)i->sector, i->size);
1907                                         if (i->rq_state & RQ_NET_PENDING)
1908                                                 ++have_unacked;
1909                                         ++have_conflict;
1910                                 }
1911                         }
1912 #undef OVERLAPS
1913                         if (!have_conflict)
1914                                 break;
1915
1916                         /* Discard Ack only for the _first_ iteration */
1917                         if (first && discard && have_unacked) {
1918                                 dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
1919                                      (unsigned long long)sector);
1920                                 inc_unacked(mdev);
1921                                 e->w.cb = e_send_discard_ack;
1922                                 list_add_tail(&e->w.list, &mdev->done_ee);
1923
1924                                 spin_unlock_irq(&mdev->req_lock);
1925
1926                                 /* we could probably send that P_DISCARD_ACK ourselves,
1927                                  * but I don't like the receiver using the msock */
1928
1929                                 put_ldev(mdev);
1930                                 wake_asender(mdev);
1931                                 finish_wait(&mdev->misc_wait, &wait);
1932                                 return TRUE;
1933                         }
1934
1935                         if (signal_pending(current)) {
1936                                 hlist_del_init(&e->colision);
1937
1938                                 spin_unlock_irq(&mdev->req_lock);
1939
1940                                 finish_wait(&mdev->misc_wait, &wait);
1941                                 goto out_interrupted;
1942                         }
1943
1944                         spin_unlock_irq(&mdev->req_lock);
1945                         if (first) {
1946                                 first = 0;
1947                                 dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
1948                                      "sec=%llus\n", (unsigned long long)sector);
1949                         } else if (discard) {
1950                                 /* we had none on the first iteration.
1951                                  * there must be none now. */
1952                                 D_ASSERT(have_unacked == 0);
1953                         }
1954                         schedule();
1955                         spin_lock_irq(&mdev->req_lock);
1956                 }
1957                 finish_wait(&mdev->misc_wait, &wait);
1958         }
1959
1960         list_add(&e->w.list, &mdev->active_ee);
1961         spin_unlock_irq(&mdev->req_lock);
1962
1963         switch (mdev->net_conf->wire_protocol) {
1964         case DRBD_PROT_C:
1965                 inc_unacked(mdev);
1966                 /* corresponding dec_unacked() in e_end_block()
1967                  * respective _drbd_clear_done_ee */
1968                 break;
1969         case DRBD_PROT_B:
1970                 /* I really don't like it that the receiver thread
1971                  * sends on the msock, but anyways */
1972                 drbd_send_ack(mdev, P_RECV_ACK, e);
1973                 break;
1974         case DRBD_PROT_A:
1975                 /* nothing to do */
1976                 break;
1977         }
1978
1979         if (mdev->state.pdsk == D_DISKLESS) {
1980                 /* In case we have the only disk of the cluster, */
1981                 drbd_set_out_of_sync(mdev, e->sector, e->size);
1982                 e->flags |= EE_CALL_AL_COMPLETE_IO;
1983                 drbd_al_begin_io(mdev, e->sector);
1984         }
1985
1986         if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
1987                 return TRUE;
1988
1989 out_interrupted:
1990         /* yes, the epoch_size now is imbalanced.
1991          * but we drop the connection anyways, so we don't have a chance to
1992          * receive a barrier... atomic_inc(&mdev->epoch_size); */
1993         put_ldev(mdev);
1994         drbd_free_ee(mdev, e);
1995         return FALSE;
1996 }
1997
1998 /* We may throttle resync, if the lower device seems to be busy,
1999  * and current sync rate is above c_min_rate.
2000  *
2001  * To decide whether or not the lower device is busy, we use a scheme similar
2002  * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
2003  * (more than 64 sectors) of activity we cannot account for with our own resync
2004  * activity, it obviously is "busy".
2005  *
2006  * The current sync rate used here uses only the most recent two step marks,
2007  * to have a short time average so we can react faster.
2008  */
2009 int drbd_rs_should_slow_down(struct drbd_conf *mdev)
2010 {
2011         struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
2012         unsigned long db, dt, dbdt;
2013         int curr_events;
2014         int throttle = 0;
2015
2016         /* feature disabled? */
2017         if (mdev->sync_conf.c_min_rate == 0)
2018                 return 0;
2019
2020         curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
2021                       (int)part_stat_read(&disk->part0, sectors[1]) -
2022                         atomic_read(&mdev->rs_sect_ev);
2023         if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
2024                 unsigned long rs_left;
2025                 int i;
2026
2027                 mdev->rs_last_events = curr_events;
2028
2029                 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2030                  * approx. */
2031                 i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
2032                 rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
2033
2034                 dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
2035                 if (!dt)
2036                         dt++;
2037                 db = mdev->rs_mark_left[i] - rs_left;
2038                 dbdt = Bit2KB(db/dt);
2039
2040                 if (dbdt > mdev->sync_conf.c_min_rate)
2041                         throttle = 1;
2042         }
2043         return throttle;
2044 }
2045
2046
2047 static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
2048 {
2049         sector_t sector;
2050         const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
2051         struct drbd_epoch_entry *e;
2052         struct digest_info *di = NULL;
2053         int size;
2054         unsigned int fault_type;
2055         struct p_block_req *p = &mdev->data.rbuf.block_req;
2056
2057         sector = be64_to_cpu(p->sector);
2058         size   = be32_to_cpu(p->blksize);
2059
2060         if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
2061                 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2062                                 (unsigned long long)sector, size);
2063                 return FALSE;
2064         }
2065         if (sector + (size>>9) > capacity) {
2066                 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2067                                 (unsigned long long)sector, size);
2068                 return FALSE;
2069         }
2070
2071         if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
2072                 if (__ratelimit(&drbd_ratelimit_state))
2073                         dev_err(DEV, "Can not satisfy peer's read request, "
2074                             "no local data.\n");
2075                 drbd_send_ack_rp(mdev, cmd == P_DATA_REQUEST ? P_NEG_DREPLY :
2076                                  P_NEG_RS_DREPLY , p);
2077                 return TRUE;
2078         }
2079
2080         /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2081          * "criss-cross" setup, that might cause write-out on some other DRBD,
2082          * which in turn might block on the other node at this very place.  */
2083         e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
2084         if (!e) {
2085                 put_ldev(mdev);
2086                 return FALSE;
2087         }
2088
2089         switch (cmd) {
2090         case P_DATA_REQUEST:
2091                 e->w.cb = w_e_end_data_req;
2092                 fault_type = DRBD_FAULT_DT_RD;
2093                 /* application IO, don't drbd_rs_begin_io */
2094                 goto submit;
2095
2096         case P_RS_DATA_REQUEST:
2097                 e->w.cb = w_e_end_rsdata_req;
2098                 fault_type = DRBD_FAULT_RS_RD;
2099                 break;
2100
2101         case P_OV_REPLY:
2102         case P_CSUM_RS_REQUEST:
2103                 fault_type = DRBD_FAULT_RS_RD;
2104                 di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
2105                 if (!di)
2106                         goto out_free_e;
2107
2108                 di->digest_size = digest_size;
2109                 di->digest = (((char *)di)+sizeof(struct digest_info));
2110
2111                 e->digest = di;
2112                 e->flags |= EE_HAS_DIGEST;
2113
2114                 if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
2115                         goto out_free_e;
2116
2117                 if (cmd == P_CSUM_RS_REQUEST) {
2118                         D_ASSERT(mdev->agreed_pro_version >= 89);
2119                         e->w.cb = w_e_end_csum_rs_req;
2120                 } else if (cmd == P_OV_REPLY) {
2121                         e->w.cb = w_e_end_ov_reply;
2122                         dec_rs_pending(mdev);
2123                         /* drbd_rs_begin_io done when we sent this request,
2124                          * but accounting still needs to be done. */
2125                         goto submit_for_resync;
2126                 }
2127                 break;
2128
2129         case P_OV_REQUEST:
2130                 if (mdev->state.conn >= C_CONNECTED &&
2131                     mdev->state.conn != C_VERIFY_T)
2132                         dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
2133                                 drbd_conn_str(mdev->state.conn));
2134                 if (mdev->ov_start_sector == ~(sector_t)0 &&
2135                     mdev->agreed_pro_version >= 90) {
2136                         mdev->ov_start_sector = sector;
2137                         mdev->ov_position = sector;
2138                         mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector);
2139                         dev_info(DEV, "Online Verify start sector: %llu\n",
2140                                         (unsigned long long)sector);
2141                 }
2142                 e->w.cb = w_e_end_ov_req;
2143                 fault_type = DRBD_FAULT_RS_RD;
2144                 break;
2145
2146         default:
2147                 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
2148                     cmdname(cmd));
2149                 fault_type = DRBD_FAULT_MAX;
2150                 goto out_free_e;
2151         }
2152
2153         /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2154          * wrt the receiver, but it is not as straightforward as it may seem.
2155          * Various places in the resync start and stop logic assume resync
2156          * requests are processed in order, requeuing this on the worker thread
2157          * introduces a bunch of new code for synchronization between threads.
2158          *
2159          * Unlimited throttling before drbd_rs_begin_io may stall the resync
2160          * "forever", throttling after drbd_rs_begin_io will lock that extent
2161          * for application writes for the same time.  For now, just throttle
2162          * here, where the rest of the code expects the receiver to sleep for
2163          * a while, anyways.
2164          */
2165
2166         /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2167          * this defers syncer requests for some time, before letting at least
2168          * on request through.  The resync controller on the receiving side
2169          * will adapt to the incoming rate accordingly.
2170          *
2171          * We cannot throttle here if remote is Primary/SyncTarget:
2172          * we would also throttle its application reads.
2173          * In that case, throttling is done on the SyncTarget only.
2174          */
2175         if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
2176                 msleep(100);
2177         if (drbd_rs_begin_io(mdev, e->sector))
2178                 goto out_free_e;
2179
2180 submit_for_resync:
2181         atomic_add(size >> 9, &mdev->rs_sect_ev);
2182
2183 submit:
2184         inc_unacked(mdev);
2185         spin_lock_irq(&mdev->req_lock);
2186         list_add_tail(&e->w.list, &mdev->read_ee);
2187         spin_unlock_irq(&mdev->req_lock);
2188
2189         if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
2190                 return TRUE;
2191
2192 out_free_e:
2193         put_ldev(mdev);
2194         drbd_free_ee(mdev, e);
2195         return FALSE;
2196 }
2197
2198 static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2199 {
2200         int self, peer, rv = -100;
2201         unsigned long ch_self, ch_peer;
2202
2203         self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2204         peer = mdev->p_uuid[UI_BITMAP] & 1;
2205
2206         ch_peer = mdev->p_uuid[UI_SIZE];
2207         ch_self = mdev->comm_bm_set;
2208
2209         switch (mdev->net_conf->after_sb_0p) {
2210         case ASB_CONSENSUS:
2211         case ASB_DISCARD_SECONDARY:
2212         case ASB_CALL_HELPER:
2213                 dev_err(DEV, "Configuration error.\n");
2214                 break;
2215         case ASB_DISCONNECT:
2216                 break;
2217         case ASB_DISCARD_YOUNGER_PRI:
2218                 if (self == 0 && peer == 1) {
2219                         rv = -1;
2220                         break;
2221                 }
2222                 if (self == 1 && peer == 0) {
2223                         rv =  1;
2224                         break;
2225                 }
2226                 /* Else fall through to one of the other strategies... */
2227         case ASB_DISCARD_OLDER_PRI:
2228                 if (self == 0 && peer == 1) {
2229                         rv = 1;
2230                         break;
2231                 }
2232                 if (self == 1 && peer == 0) {
2233                         rv = -1;
2234                         break;
2235                 }
2236                 /* Else fall through to one of the other strategies... */
2237                 dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
2238                      "Using discard-least-changes instead\n");
2239         case ASB_DISCARD_ZERO_CHG:
2240                 if (ch_peer == 0 && ch_self == 0) {
2241                         rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2242                                 ? -1 : 1;
2243                         break;
2244                 } else {
2245                         if (ch_peer == 0) { rv =  1; break; }
2246                         if (ch_self == 0) { rv = -1; break; }
2247                 }
2248                 if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
2249                         break;
2250         case ASB_DISCARD_LEAST_CHG:
2251                 if      (ch_self < ch_peer)
2252                         rv = -1;
2253                 else if (ch_self > ch_peer)
2254                         rv =  1;
2255                 else /* ( ch_self == ch_peer ) */
2256                      /* Well, then use something else. */
2257                         rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2258                                 ? -1 : 1;
2259                 break;
2260         case ASB_DISCARD_LOCAL:
2261                 rv = -1;
2262                 break;
2263         case ASB_DISCARD_REMOTE:
2264                 rv =  1;
2265         }
2266
2267         return rv;
2268 }
2269
2270 static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2271 {
2272         int self, peer, hg, rv = -100;
2273
2274         self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2275         peer = mdev->p_uuid[UI_BITMAP] & 1;
2276
2277         switch (mdev->net_conf->after_sb_1p) {
2278         case ASB_DISCARD_YOUNGER_PRI:
2279         case ASB_DISCARD_OLDER_PRI:
2280         case ASB_DISCARD_LEAST_CHG:
2281         case ASB_DISCARD_LOCAL:
2282         case ASB_DISCARD_REMOTE:
2283                 dev_err(DEV, "Configuration error.\n");
2284                 break;
2285         case ASB_DISCONNECT:
2286                 break;
2287         case ASB_CONSENSUS:
2288                 hg = drbd_asb_recover_0p(mdev);
2289                 if (hg == -1 && mdev->state.role == R_SECONDARY)
2290                         rv = hg;
2291                 if (hg == 1  && mdev->state.role == R_PRIMARY)
2292                         rv = hg;
2293                 break;
2294         case ASB_VIOLENTLY:
2295                 rv = drbd_asb_recover_0p(mdev);
2296                 break;
2297         case ASB_DISCARD_SECONDARY:
2298                 return mdev->state.role == R_PRIMARY ? 1 : -1;
2299         case ASB_CALL_HELPER:
2300                 hg = drbd_asb_recover_0p(mdev);
2301                 if (hg == -1 && mdev->state.role == R_PRIMARY) {
2302                         self = drbd_set_role(mdev, R_SECONDARY, 0);
2303                          /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2304                           * we might be here in C_WF_REPORT_PARAMS which is transient.
2305                           * we do not need to wait for the after state change work either. */
2306                         self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2307                         if (self != SS_SUCCESS) {
2308                                 drbd_khelper(mdev, "pri-lost-after-sb");
2309                         } else {
2310                                 dev_warn(DEV, "Successfully gave up primary role.\n");
2311                                 rv = hg;
2312                         }
2313                 } else
2314                         rv = hg;
2315         }
2316
2317         return rv;
2318 }
2319
2320 static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2321 {
2322         int self, peer, hg, rv = -100;
2323
2324         self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2325         peer = mdev->p_uuid[UI_BITMAP] & 1;
2326
2327         switch (mdev->net_conf->after_sb_2p) {
2328         case ASB_DISCARD_YOUNGER_PRI:
2329         case ASB_DISCARD_OLDER_PRI:
2330         case ASB_DISCARD_LEAST_CHG:
2331         case ASB_DISCARD_LOCAL:
2332         case ASB_DISCARD_REMOTE:
2333         case ASB_CONSENSUS:
2334         case ASB_DISCARD_SECONDARY:
2335                 dev_err(DEV, "Configuration error.\n");
2336                 break;
2337         case ASB_VIOLENTLY:
2338                 rv = drbd_asb_recover_0p(mdev);
2339                 break;
2340         case ASB_DISCONNECT:
2341                 break;
2342         case ASB_CALL_HELPER:
2343                 hg = drbd_asb_recover_0p(mdev);
2344                 if (hg == -1) {
2345                          /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2346                           * we might be here in C_WF_REPORT_PARAMS which is transient.
2347                           * we do not need to wait for the after state change work either. */
2348                         self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2349                         if (self != SS_SUCCESS) {
2350                                 drbd_khelper(mdev, "pri-lost-after-sb");
2351                         } else {
2352                                 dev_warn(DEV, "Successfully gave up primary role.\n");
2353                                 rv = hg;
2354                         }
2355                 } else
2356                         rv = hg;
2357         }
2358
2359         return rv;
2360 }
2361
2362 static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2363                            u64 bits, u64 flags)
2364 {
2365         if (!uuid) {
2366                 dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
2367                 return;
2368         }
2369         dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2370              text,
2371              (unsigned long long)uuid[UI_CURRENT],
2372              (unsigned long long)uuid[UI_BITMAP],
2373              (unsigned long long)uuid[UI_HISTORY_START],
2374              (unsigned long long)uuid[UI_HISTORY_END],
2375              (unsigned long long)bits,
2376              (unsigned long long)flags);
2377 }
2378
2379 /*
2380   100   after split brain try auto recover
2381     2   C_SYNC_SOURCE set BitMap
2382     1   C_SYNC_SOURCE use BitMap
2383     0   no Sync
2384    -1   C_SYNC_TARGET use BitMap
2385    -2   C_SYNC_TARGET set BitMap
2386  -100   after split brain, disconnect
2387 -1000   unrelated data
2388  */
2389 static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2390 {
2391         u64 self, peer;
2392         int i, j;
2393
2394         self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2395         peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2396
2397         *rule_nr = 10;
2398         if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2399                 return 0;
2400
2401         *rule_nr = 20;
2402         if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2403              peer != UUID_JUST_CREATED)
2404                 return -2;
2405
2406         *rule_nr = 30;
2407         if (self != UUID_JUST_CREATED &&
2408             (peer == UUID_JUST_CREATED || peer == (u64)0))
2409                 return 2;
2410
2411         if (self == peer) {
2412                 int rct, dc; /* roles at crash time */
2413
2414                 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2415
2416                         if (mdev->agreed_pro_version < 91)
2417                                 return -1001;
2418
2419                         if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2420                             (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2421                                 dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
2422                                 drbd_uuid_set_bm(mdev, 0UL);
2423
2424                                 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2425                                                mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2426                                 *rule_nr = 34;
2427                         } else {
2428                                 dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
2429                                 *rule_nr = 36;
2430                         }
2431
2432                         return 1;
2433                 }
2434
2435                 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2436
2437                         if (mdev->agreed_pro_version < 91)
2438                                 return -1001;
2439
2440                         if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2441                             (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2442                                 dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2443
2444                                 mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
2445                                 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
2446                                 mdev->p_uuid[UI_BITMAP] = 0UL;
2447
2448                                 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2449                                 *rule_nr = 35;
2450                         } else {
2451                                 dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
2452                                 *rule_nr = 37;
2453                         }
2454
2455                         return -1;
2456                 }
2457
2458                 /* Common power [off|failure] */
2459                 rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
2460                         (mdev->p_uuid[UI_FLAGS] & 2);
2461                 /* lowest bit is set when we were primary,
2462                  * next bit (weight 2) is set when peer was primary */
2463                 *rule_nr = 40;
2464
2465                 switch (rct) {
2466                 case 0: /* !self_pri && !peer_pri */ return 0;
2467                 case 1: /*  self_pri && !peer_pri */ return 1;
2468                 case 2: /* !self_pri &&  peer_pri */ return -1;
2469                 case 3: /*  self_pri &&  peer_pri */
2470                         dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
2471                         return dc ? -1 : 1;
2472                 }
2473         }
2474
2475         *rule_nr = 50;
2476         peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2477         if (self == peer)
2478                 return -1;
2479
2480         *rule_nr = 51;
2481         peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2482         if (self == peer) {
2483                 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2484                 peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1);
2485                 if (self == peer) {
2486                         /* The last P_SYNC_UUID did not get though. Undo the last start of
2487                            resync as sync source modifications of the peer's UUIDs. */
2488
2489                         if (mdev->agreed_pro_version < 91)
2490                                 return -1001;
2491
2492                         mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2493                         mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
2494                         return -1;
2495                 }
2496         }
2497
2498         *rule_nr = 60;
2499         self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2500         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2501                 peer = mdev->p_uuid[i] & ~((u64)1);
2502                 if (self == peer)
2503                         return -2;
2504         }
2505
2506         *rule_nr = 70;
2507         self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2508         peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2509         if (self == peer)
2510                 return 1;
2511
2512         *rule_nr = 71;
2513         self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2514         if (self == peer) {
2515                 self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1);
2516                 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2517                 if (self == peer) {
2518                         /* The last P_SYNC_UUID did not get though. Undo the last start of
2519                            resync as sync source modifications of our UUIDs. */
2520
2521                         if (mdev->agreed_pro_version < 91)
2522                                 return -1001;
2523
2524                         _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2525                         _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
2526
2527                         dev_info(DEV, "Undid last start of resync:\n");
2528
2529                         drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2530                                        mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2531
2532                         return 1;
2533                 }
2534         }
2535
2536
2537         *rule_nr = 80;
2538         peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2539         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2540                 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2541                 if (self == peer)
2542                         return 2;
2543         }
2544
2545         *rule_nr = 90;
2546         self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2547         peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2548         if (self == peer && self != ((u64)0))
2549                 return 100;
2550
2551         *rule_nr = 100;
2552         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2553                 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2554                 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
2555                         peer = mdev->p_uuid[j] & ~((u64)1);
2556                         if (self == peer)
2557                                 return -100;
2558                 }
2559         }
2560
2561         return -1000;
2562 }
2563
2564 /* drbd_sync_handshake() returns the new conn state on success, or
2565    CONN_MASK (-1) on failure.
2566  */
2567 static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
2568                                            enum drbd_disk_state peer_disk) __must_hold(local)
2569 {
2570         int hg, rule_nr;
2571         enum drbd_conns rv = C_MASK;
2572         enum drbd_disk_state mydisk;
2573
2574         mydisk = mdev->state.disk;
2575         if (mydisk == D_NEGOTIATING)
2576                 mydisk = mdev->new_state_tmp.disk;
2577
2578         dev_info(DEV, "drbd_sync_handshake:\n");
2579         drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
2580         drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
2581                        mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2582
2583         hg = drbd_uuid_compare(mdev, &rule_nr);
2584
2585         dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
2586
2587         if (hg == -1000) {
2588                 dev_alert(DEV, "Unrelated data, aborting!\n");
2589                 return C_MASK;
2590         }
2591         if (hg == -1001) {
2592                 dev_alert(DEV, "To resolve this both sides have to support at least protocol\n");
2593                 return C_MASK;
2594         }
2595
2596         if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
2597             (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
2598                 int f = (hg == -100) || abs(hg) == 2;
2599                 hg = mydisk > D_INCONSISTENT ? 1 : -1;
2600                 if (f)
2601                         hg = hg*2;
2602                 dev_info(DEV, "Becoming sync %s due to disk states.\n",
2603                      hg > 0 ? "source" : "target");
2604         }
2605
2606         if (abs(hg) == 100)
2607                 drbd_khelper(mdev, "initial-split-brain");
2608
2609         if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
2610                 int pcount = (mdev->state.role == R_PRIMARY)
2611                            + (peer_role == R_PRIMARY);
2612                 int forced = (hg == -100);
2613
2614                 switch (pcount) {
2615                 case 0:
2616                         hg = drbd_asb_recover_0p(mdev);
2617                         break;
2618                 case 1:
2619                         hg = drbd_asb_recover_1p(mdev);
2620                         break;
2621                 case 2:
2622                         hg = drbd_asb_recover_2p(mdev);
2623                         break;
2624                 }
2625                 if (abs(hg) < 100) {
2626                         dev_warn(DEV, "Split-Brain detected, %d primaries, "
2627                              "automatically solved. Sync from %s node\n",
2628                              pcount, (hg < 0) ? "peer" : "this");
2629                         if (forced) {
2630                                 dev_warn(DEV, "Doing a full sync, since"
2631                                      " UUIDs where ambiguous.\n");
2632                                 hg = hg*2;
2633                         }
2634                 }
2635         }
2636
2637         if (hg == -100) {
2638                 if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
2639                         hg = -1;
2640                 if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
2641                         hg = 1;
2642
2643                 if (abs(hg) < 100)
2644                         dev_warn(DEV, "Split-Brain detected, manually solved. "
2645                              "Sync from %s node\n",
2646                              (hg < 0) ? "peer" : "this");
2647         }
2648
2649         if (hg == -100) {
2650                 /* FIXME this log message is not correct if we end up here
2651                  * after an attempted attach on a diskless node.
2652                  * We just refuse to attach -- well, we drop the "connection"
2653                  * to that disk, in a way... */
2654                 dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
2655                 drbd_khelper(mdev, "split-brain");
2656                 return C_MASK;
2657         }
2658
2659         if (hg > 0 && mydisk <= D_INCONSISTENT) {
2660                 dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
2661                 return C_MASK;
2662         }
2663
2664         if (hg < 0 && /* by intention we do not use mydisk here. */
2665             mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
2666                 switch (mdev->net_conf->rr_conflict) {
2667                 case ASB_CALL_HELPER:
2668                         drbd_khelper(mdev, "pri-lost");
2669                         /* fall through */
2670                 case ASB_DISCONNECT:
2671                         dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
2672                         return C_MASK;
2673                 case ASB_VIOLENTLY:
2674                         dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
2675                              "assumption\n");
2676                 }
2677         }
2678
2679         if (mdev->net_conf->dry_run || test_bit(CONN_DRY_RUN, &mdev->flags)) {
2680                 if (hg == 0)
2681                         dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
2682                 else
2683                         dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.",
2684                                  drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
2685                                  abs(hg) >= 2 ? "full" : "bit-map based");
2686                 return C_MASK;
2687         }
2688
2689         if (abs(hg) >= 2) {
2690                 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
2691                 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake"))
2692                         return C_MASK;
2693         }
2694
2695         if (hg > 0) { /* become sync source. */
2696                 rv = C_WF_BITMAP_S;
2697         } else if (hg < 0) { /* become sync target */
2698                 rv = C_WF_BITMAP_T;
2699         } else {
2700                 rv = C_CONNECTED;
2701                 if (drbd_bm_total_weight(mdev)) {
2702                         dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
2703                              drbd_bm_total_weight(mdev));
2704                 }
2705         }
2706
2707         return rv;
2708 }
2709
2710 /* returns 1 if invalid */
2711 static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
2712 {
2713         /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
2714         if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
2715             (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
2716                 return 0;
2717
2718         /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
2719         if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
2720             self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
2721                 return 1;
2722
2723         /* everything else is valid if they are equal on both sides. */
2724         if (peer == self)
2725                 return 0;
2726
2727         /* everything es is invalid. */
2728         return 1;
2729 }
2730
2731 static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
2732 {
2733         struct p_protocol *p = &mdev->data.rbuf.protocol;
2734         int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
2735         int p_want_lose, p_two_primaries, cf;
2736         char p_integrity_alg[SHARED_SECRET_MAX] = "";
2737
2738         p_proto         = be32_to_cpu(p->protocol);
2739         p_after_sb_0p   = be32_to_cpu(p->after_sb_0p);
2740         p_after_sb_1p   = be32_to_cpu(p->after_sb_1p);
2741         p_after_sb_2p   = be32_to_cpu(p->after_sb_2p);
2742         p_two_primaries = be32_to_cpu(p->two_primaries);
2743         cf              = be32_to_cpu(p->conn_flags);
2744         p_want_lose = cf & CF_WANT_LOSE;
2745
2746         clear_bit(CONN_DRY_RUN, &mdev->flags);
2747
2748         if (cf & CF_DRY_RUN)
2749                 set_bit(CONN_DRY_RUN, &mdev->flags);
2750
2751         if (p_proto != mdev->net_conf->wire_protocol) {
2752                 dev_err(DEV, "incompatible communication protocols\n");
2753                 goto disconnect;
2754         }
2755
2756         if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
2757                 dev_err(DEV, "incompatible after-sb-0pri settings\n");
2758                 goto disconnect;
2759         }
2760
2761         if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
2762                 dev_err(DEV, "incompatible after-sb-1pri settings\n");
2763                 goto disconnect;
2764         }
2765
2766         if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
2767                 dev_err(DEV, "incompatible after-sb-2pri settings\n");
2768                 goto disconnect;
2769         }
2770
2771         if (p_want_lose && mdev->net_conf->want_lose) {
2772                 dev_err(DEV, "both sides have the 'want_lose' flag set\n");
2773                 goto disconnect;
2774         }
2775
2776         if (p_two_primaries != mdev->net_conf->two_primaries) {
2777                 dev_err(DEV, "incompatible setting of the two-primaries options\n");
2778                 goto disconnect;
2779         }
2780
2781         if (mdev->agreed_pro_version >= 87) {
2782                 unsigned char *my_alg = mdev->net_conf->integrity_alg;
2783
2784                 if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
2785                         return FALSE;
2786
2787                 p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
2788                 if (strcmp(p_integrity_alg, my_alg)) {
2789                         dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
2790                         goto disconnect;
2791                 }
2792                 dev_info(DEV, "data-integrity-alg: %s\n",
2793                      my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
2794         }
2795
2796         return TRUE;
2797
2798 disconnect:
2799         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2800         return FALSE;
2801 }
2802
2803 /* helper function
2804  * input: alg name, feature name
2805  * return: NULL (alg name was "")
2806  *         ERR_PTR(error) if something goes wrong
2807  *         or the crypto hash ptr, if it worked out ok. */
2808 struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
2809                 const char *alg, const char *name)
2810 {
2811         struct crypto_hash *tfm;
2812
2813         if (!alg[0])
2814                 return NULL;
2815
2816         tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
2817         if (IS_ERR(tfm)) {
2818                 dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
2819                         alg, name, PTR_ERR(tfm));
2820                 return tfm;
2821         }
2822         if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
2823                 crypto_free_hash(tfm);
2824                 dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
2825                 return ERR_PTR(-EINVAL);
2826         }
2827         return tfm;
2828 }
2829
2830 static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
2831 {
2832         int ok = TRUE;
2833         struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
2834         unsigned int header_size, data_size, exp_max_sz;
2835         struct crypto_hash *verify_tfm = NULL;
2836         struct crypto_hash *csums_tfm = NULL;
2837         const int apv = mdev->agreed_pro_version;
2838         int *rs_plan_s = NULL;
2839         int fifo_size = 0;
2840
2841         exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
2842                     : apv == 88 ? sizeof(struct p_rs_param)
2843                                         + SHARED_SECRET_MAX
2844                     : apv <= 94 ? sizeof(struct p_rs_param_89)
2845                     : /* apv >= 95 */ sizeof(struct p_rs_param_95);
2846
2847         if (packet_size > exp_max_sz) {
2848                 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
2849                     packet_size, exp_max_sz);
2850                 return FALSE;
2851         }
2852
2853         if (apv <= 88) {
2854                 header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
2855                 data_size   = packet_size  - header_size;
2856         } else if (apv <= 94) {
2857                 header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
2858                 data_size   = packet_size  - header_size;
2859                 D_ASSERT(data_size == 0);
2860         } else {
2861                 header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
2862                 data_size   = packet_size  - header_size;
2863                 D_ASSERT(data_size == 0);
2864         }
2865
2866         /* initialize verify_alg and csums_alg */
2867         memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2868
2869         if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
2870                 return FALSE;
2871
2872         mdev->sync_conf.rate      = be32_to_cpu(p->rate);
2873
2874         if (apv >= 88) {
2875                 if (apv == 88) {
2876                         if (data_size > SHARED_SECRET_MAX) {
2877                                 dev_err(DEV, "verify-alg too long, "
2878                                     "peer wants %u, accepting only %u byte\n",
2879                                                 data_size, SHARED_SECRET_MAX);
2880                                 return FALSE;
2881                         }
2882
2883                         if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
2884                                 return FALSE;
2885
2886                         /* we expect NUL terminated string */
2887                         /* but just in case someone tries to be evil */
2888                         D_ASSERT(p->verify_alg[data_size-1] == 0);
2889                         p->verify_alg[data_size-1] = 0;
2890
2891                 } else /* apv >= 89 */ {
2892                         /* we still expect NUL terminated strings */
2893                         /* but just in case someone tries to be evil */
2894                         D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
2895                         D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
2896                         p->verify_alg[SHARED_SECRET_MAX-1] = 0;
2897                         p->csums_alg[SHARED_SECRET_MAX-1] = 0;
2898                 }
2899
2900                 if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
2901                         if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2902                                 dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2903                                     mdev->sync_conf.verify_alg, p->verify_alg);
2904                                 goto disconnect;
2905                         }
2906                         verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
2907                                         p->verify_alg, "verify-alg");
2908                         if (IS_ERR(verify_tfm)) {
2909                                 verify_tfm = NULL;
2910                                 goto disconnect;
2911                         }
2912                 }
2913
2914                 if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
2915                         if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2916                                 dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2917                                     mdev->sync_conf.csums_alg, p->csums_alg);
2918                                 goto disconnect;
2919                         }
2920                         csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
2921                                         p->csums_alg, "csums-alg");
2922                         if (IS_ERR(csums_tfm)) {
2923                                 csums_tfm = NULL;
2924                                 goto disconnect;
2925                         }
2926                 }
2927
2928                 if (apv > 94) {
2929                         mdev->sync_conf.rate      = be32_to_cpu(p->rate);
2930                         mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
2931                         mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
2932                         mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
2933                         mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
2934
2935                         fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
2936                         if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
2937                                 rs_plan_s   = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
2938                                 if (!rs_plan_s) {
2939                                         dev_err(DEV, "kmalloc of fifo_buffer failed");
2940                                         goto disconnect;
2941                                 }
2942                         }
2943                 }
2944
2945                 spin_lock(&mdev->peer_seq_lock);
2946                 /* lock against drbd_nl_syncer_conf() */
2947                 if (verify_tfm) {
2948                         strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
2949                         mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
2950                         crypto_free_hash(mdev->verify_tfm);
2951                         mdev->verify_tfm = verify_tfm;
2952                         dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
2953                 }
2954                 if (csums_tfm) {
2955                         strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
2956                         mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
2957                         crypto_free_hash(mdev->csums_tfm);
2958                         mdev->csums_tfm = csums_tfm;
2959                         dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
2960                 }
2961                 if (fifo_size != mdev->rs_plan_s.size) {
2962                         kfree(mdev->rs_plan_s.values);
2963                         mdev->rs_plan_s.values = rs_plan_s;
2964                         mdev->rs_plan_s.size   = fifo_size;
2965                         mdev->rs_planed = 0;
2966                 }
2967                 spin_unlock(&mdev->peer_seq_lock);
2968         }
2969
2970         return ok;
2971 disconnect:
2972         /* just for completeness: actually not needed,
2973          * as this is not reached if csums_tfm was ok. */
2974         crypto_free_hash(csums_tfm);
2975         /* but free the verify_tfm again, if csums_tfm did not work out */
2976         crypto_free_hash(verify_tfm);
2977         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2978         return FALSE;
2979 }
2980
/* Placeholder: both arguments are ignored and nothing is done. */
static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
{
        /* There is currently no working implementation of distributed
         * TCQ, so there is nothing to set up here. */
}
2986
2987 /* warn if the arguments differ by more than 12.5% */
2988 static void warn_if_differ_considerably(struct drbd_conf *mdev,
2989         const char *s, sector_t a, sector_t b)
2990 {
2991         sector_t d;
2992         if (a == 0 || b == 0)
2993                 return;
2994         d = (a > b) ? (a - b) : (b - a);
2995         if (d > (a>>3) || d > (b>>3))
2996                 dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
2997                      (unsigned long long)a, (unsigned long long)b);
2998 }
2999
3000 static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3001 {
3002         struct p_sizes *p = &mdev->data.rbuf.sizes;
3003         enum determine_dev_size dd = unchanged;
3004         unsigned int max_seg_s;
3005         sector_t p_size, p_usize, my_usize;
3006         int ldsc = 0; /* local disk size changed */
3007         enum dds_flags ddsf;
3008
3009         p_size = be64_to_cpu(p->d_size);
3010         p_usize = be64_to_cpu(p->u_size);
3011
3012         if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
3013                 dev_err(DEV, "some backing storage is needed\n");
3014                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3015                 return FALSE;
3016         }
3017
3018         /* just store the peer's disk size for now.
3019          * we still need to figure out whether we accept that. */
3020         mdev->p_size = p_size;
3021
3022 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
3023         if (get_ldev(mdev)) {
3024                 warn_if_differ_considerably(mdev, "lower level device sizes",
3025                            p_size, drbd_get_max_capacity(mdev->ldev));
3026                 warn_if_differ_considerably(mdev, "user requested size",
3027                                             p_usize, mdev->ldev->dc.disk_size);
3028
3029                 /* if this is the first connect, or an otherwise expected
3030                  * param exchange, choose the minimum */
3031                 if (mdev->state.conn == C_WF_REPORT_PARAMS)
3032                         p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
3033                                              p_usize);
3034
3035                 my_usize = mdev->ldev->dc.disk_size;
3036
3037                 if (mdev->ldev->dc.disk_size != p_usize) {
3038                         mdev->ldev->dc.disk_size = p_usize;
3039                         dev_info(DEV, "Peer sets u_size to %lu sectors\n",
3040                              (unsigned long)mdev->ldev->dc.disk_size);
3041                 }
3042
3043                 /* Never shrink a device with usable data during connect.
3044                    But allow online shrinking if we are connected. */
3045                 if (drbd_new_dev_size(mdev, mdev->ldev, 0) <
3046                    drbd_get_capacity(mdev->this_bdev) &&
3047                    mdev->state.disk >= D_OUTDATED &&
3048                    mdev->state.conn < C_CONNECTED) {
3049                         dev_err(DEV, "The peer's disk size is too small!\n");
3050                         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3051                         mdev->ldev->dc.disk_size = my_usize;
3052                         put_ldev(mdev);
3053                         return FALSE;
3054                 }
3055                 put_ldev(mdev);
3056         }
3057 #undef min_not_zero
3058
3059         ddsf = be16_to_cpu(p->dds_flags);
3060         if (get_ldev(mdev)) {
3061                 dd = drbd_determin_dev_size(mdev, ddsf);
3062                 put_ldev(mdev);
3063                 if (dd == dev_size_error)
3064                         return FALSE;
3065                 drbd_md_sync(mdev);
3066         } else {
3067                 /* I am diskless, need to accept the peer's size. */
3068                 drbd_set_my_capacity(mdev, p_size);
3069         }
3070
3071         if (get_ldev(mdev)) {
3072                 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
3073                         mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
3074                         ldsc = 1;
3075                 }
3076
3077                 if (mdev->agreed_pro_version < 94)
3078                         max_seg_s = be32_to_cpu(p->max_segment_size);
3079                 else /* drbd 8.3.8 onwards */
3080                         max_seg_s = DRBD_MAX_SEGMENT_SIZE;
3081
3082                 if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
3083                         drbd_setup_queue_param(mdev, max_seg_s);
3084
3085                 drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
3086                 put_ldev(mdev);
3087         }
3088
3089         if (mdev->state.conn > C_WF_REPORT_PARAMS) {
3090                 if (be64_to_cpu(p->c_size) !=
3091                     drbd_get_capacity(mdev->this_bdev) || ldsc) {
3092                         /* we have different sizes, probably peer
3093                          * needs to know my new size... */
3094                         drbd_send_sizes(mdev, 0, ddsf);
3095                 }
3096                 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
3097                     (dd == grew && mdev->state.conn == C_CONNECTED)) {
3098                         if (mdev->state.pdsk >= D_INCONSISTENT &&
3099                             mdev->state.disk >= D_INCONSISTENT) {
3100                                 if (ddsf & DDSF_NO_RESYNC)
3101                                         dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
3102                                 else
3103                                         resync_after_online_grow(mdev);
3104                         } else
3105                                 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
3106                 }
3107         }
3108
3109         return TRUE;
3110 }
3111
3112 static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3113 {
3114         struct p_uuids *p = &mdev->data.rbuf.uuids;
3115         u64 *p_uuid;
3116         int i;
3117
3118         p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
3119
3120         for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
3121                 p_uuid[i] = be64_to_cpu(p->uuid[i]);
3122
3123         kfree(mdev->p_uuid);
3124         mdev->p_uuid = p_uuid;
3125
3126         if (mdev->state.conn < C_CONNECTED &&
3127             mdev->state.disk < D_INCONSISTENT &&
3128             mdev->state.role == R_PRIMARY &&
3129             (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
3130                 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
3131                     (unsigned long long)mdev->ed_uuid);
3132                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3133                 return FALSE;
3134         }
3135
3136         if (get_ldev(mdev)) {
3137                 int skip_initial_sync =
3138                         mdev->state.conn == C_CONNECTED &&
3139                         mdev->agreed_pro_version >= 90 &&
3140                         mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
3141                         (p_uuid[UI_FLAGS] & 8);
3142                 if (skip_initial_sync) {
3143                         dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
3144                         drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
3145                                         "clear_n_write from receive_uuids");
3146                         _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
3147                         _drbd_uuid_set(mdev, UI_BITMAP, 0);
3148                         _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3149                                         CS_VERBOSE, NULL);
3150                         drbd_md_sync(mdev);
3151                 }
3152                 put_ldev(mdev);
3153         } else if (mdev->state.disk < D_INCONSISTENT &&
3154                    mdev->state.role == R_PRIMARY) {
3155                 /* I am a diskless primary, the peer just created a new current UUID
3156                    for me. */
3157                 drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3158         }
3159
3160         /* Before we test for the disk state, we should wait until an eventually
3161            ongoing cluster wide state change is finished. That is important if
3162            we are primary and are detaching from our disk. We need to see the
3163            new disk state... */
3164         wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
3165         if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
3166                 drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3167
3168         return TRUE;
3169 }
3170
3171 /**
3172  * convert_state() - Converts the peer's view of the cluster state to our point of view
3173  * @ps:         The state as seen by the peer.
3174  */
3175 static union drbd_state convert_state(union drbd_state ps)
3176 {
3177         union drbd_state ms;
3178
3179         static enum drbd_conns c_tab[] = {
3180                 [C_CONNECTED] = C_CONNECTED,
3181
3182                 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3183                 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3184                 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3185                 [C_VERIFY_S]       = C_VERIFY_T,
3186                 [C_MASK]   = C_MASK,
3187         };
3188
3189         ms.i = ps.i;
3190
3191         ms.conn = c_tab[ps.conn];
3192         ms.peer = ps.role;
3193         ms.role = ps.peer;
3194         ms.pdsk = ps.disk;
3195         ms.disk = ps.pdsk;
3196         ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3197
3198         return ms;
3199 }
3200
3201 static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3202 {
3203         struct p_req_state *p = &mdev->data.rbuf.req_state;
3204         union drbd_state mask, val;
3205         int rv;
3206
3207         mask.i = be32_to_cpu(p->mask);
3208         val.i = be32_to_cpu(p->val);
3209
3210         if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
3211             test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
3212                 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
3213                 return TRUE;
3214         }
3215
3216         mask = convert_state(mask);
3217         val = convert_state(val);
3218
3219         rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
3220
3221         drbd_send_sr_reply(mdev, rv);
3222         drbd_md_sync(mdev);
3223
3224         return TRUE;
3225 }
3226
/*
 * receive_state() - handle the peer's state packet
 *
 * Decodes the state word sent by the peer, optionally lets
 * drbd_sync_handshake() choose the next connection state, and applies the
 * resulting state with _drbd_set_state() while holding req_lock.
 *
 * Returns TRUE on success.  Returns FALSE when the handshake yields C_MASK
 * and neither side is in D_NEGOTIATING, when IO cannot be thawed with an
 * only Consistent peer, or when the state change itself fails.  Except for
 * the CONN_DRY_RUN case, those paths also force a new connection state.
 */
3227 static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3228 {
3229         struct p_state *p = &mdev->data.rbuf.state;
3230         enum drbd_conns nconn, oconn;
3231         union drbd_state ns, peer_state;
3232         enum drbd_disk_state real_peer_disk;
3233         enum chg_state_flags cs_flags;
3234         int rv;
3235
3236         peer_state.i = be32_to_cpu(p->state);
3237
             /* If the peer reports D_NEGOTIATING, substitute D_INCONSISTENT or
              * D_CONSISTENT depending on flag value 4 in its UUID flags.
              * NOTE(review): mdev->p_uuid is dereferenced here without the NULL
              * check that guards the handshake below - confirm it is always
              * set before a state packet with D_NEGOTIATING can arrive. */
3238         real_peer_disk = peer_state.disk;
3239         if (peer_state.disk == D_NEGOTIATING) {
3240                 real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
3241                 dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
3242         }
3243
             /* The retry label is always entered with req_lock held (from the
              * lock right above, or from the goto further down); only the
              * connection state is sampled before the lock is dropped again. */
3244         spin_lock_irq(&mdev->req_lock);
3245  retry:
3246         oconn = nconn = mdev->state.conn;
3247         spin_unlock_irq(&mdev->req_lock);
3248
3249         if (nconn == C_WF_REPORT_PARAMS)
3250                 nconn = C_CONNECTED;
3251
3252         if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3253             get_ldev_if_state(mdev, D_NEGOTIATING)) {
3254                 int cr; /* consider resync */
3255
3256                 /* if we established a new connection */
3257                 cr  = (oconn < C_CONNECTED);
3258                 /* if we had an established connection
3259                  * and one of the nodes newly attaches a disk */
3260                 cr |= (oconn == C_CONNECTED &&
3261                        (peer_state.disk == D_NEGOTIATING ||
3262                         mdev->state.disk == D_NEGOTIATING));
3263                 /* if we have both been inconsistent, and the peer has been
3264                  * forced to be UpToDate with --overwrite-data */
3265                 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3266                 /* if we had been plain connected, and the admin requested to
3267                  * start a sync by "invalidate" or "invalidate-remote" */
3268                 cr |= (oconn == C_CONNECTED &&
3269                                 (peer_state.conn >= C_STARTING_SYNC_S &&
3270                                  peer_state.conn <= C_WF_BITMAP_T));
3271
3272                 if (cr)
3273                         nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
3274
3275                 put_ldev(mdev);
                     /* C_MASK is treated as "the handshake failed" here. */
3276                 if (nconn == C_MASK) {
3277                         nconn = C_CONNECTED;
3278                         if (mdev->state.disk == D_NEGOTIATING) {
3279                                 drbd_force_state(mdev, NS(disk, D_DISKLESS));
3280                         } else if (peer_state.disk == D_NEGOTIATING) {
3281                                 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3282                                 peer_state.disk = D_DISKLESS;
3283                                 real_peer_disk = D_DISKLESS;
3284                         } else {
3285                                 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
3286                                         return FALSE;
3287                                 D_ASSERT(oconn == C_WF_REPORT_PARAMS);
3288                                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3289                                 return FALSE;
3290                         }
3291                 }
3292         }
3293
             /* Start over if the connection state changed while the lock was
              * not held; the goto jumps back with the lock taken. */
3294         spin_lock_irq(&mdev->req_lock);
3295         if (mdev->state.conn != oconn)
3296                 goto retry;
3297         clear_bit(CONSIDER_RESYNC, &mdev->flags);
3298         ns.i = mdev->state.i;
3299         ns.conn = nconn;
3300         ns.peer = peer_state.role;
3301         ns.pdsk = real_peer_disk;
3302         ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
3303         if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
3304                 ns.disk = mdev->new_state_tmp.disk;
             /* CS_HARD is added unless this is the transition from a not yet
              * connected state to a connected one. */
3305         cs_flags = CS_VERBOSE + (oconn < C_CONNECTED && nconn >= C_CONNECTED ? 0 : CS_HARD);
3306         if (ns.pdsk == D_CONSISTENT && ns.susp && nconn == C_CONNECTED && oconn < C_CONNECTED &&
3307             test_bit(NEW_CUR_UUID, &mdev->flags)) {
3308                 /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
3309                    for temporal network outages! */
3310                 spin_unlock_irq(&mdev->req_lock);
3311                 dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
3312                 tl_clear(mdev);
3313                 drbd_uuid_new_current(mdev);
3314                 clear_bit(NEW_CUR_UUID, &mdev->flags);
3315                 drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
3316                 return FALSE;
3317         }
3318         rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
             /* re-read the state that is actually in effect now */
3319         ns = mdev->state;
3320         spin_unlock_irq(&mdev->req_lock);
3321
3322         if (rv < SS_SUCCESS) {
3323                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3324                 return FALSE;
3325         }
3326
3327         if (oconn > C_WF_REPORT_PARAMS) {
3328                 if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
3329                     peer_state.disk != D_NEGOTIATING ) {
3330                         /* we want resync, peer has not yet decided to sync... */
3331                         /* Nowadays only used when forcing a node into primary role and
3332                            setting its disk to UpToDate with that */
3333                         drbd_send_uuids(mdev);
3334                         drbd_send_state(mdev);
3335                 }
3336         }
3337
3338         mdev->net_conf->want_lose = 0;
3339
3340         drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3341
3342         return TRUE;
3343 }
3344
3345 static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3346 {
3347         struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
3348
3349         wait_event(mdev->misc_wait,
3350                    mdev->state.conn == C_WF_SYNC_UUID ||
3351                    mdev->state.conn < C_CONNECTED ||
3352                    mdev->state.disk < D_NEGOTIATING);
3353
3354         /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
3355
3356         /* Here the _drbd_uuid_ functions are right, current should
3357            _not_ be rotated into the history */
3358         if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
3359                 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
3360                 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
3361
3362                 drbd_start_resync(mdev, C_SYNC_TARGET);
3363
3364                 put_ldev(mdev);
3365         } else
3366                 dev_err(DEV, "Ignoring SyncUUID packet!\n");
3367
3368         return TRUE;
3369 }
3370
/* Result of processing one bitmap packet. */
enum receive_bitmap_ret {
        OK,     /* packet processed, continue with the next one */
        DONE,   /* bitmap transfer finished */
        FAILED, /* error, stop receiving */
};
3372
3373 static enum receive_bitmap_ret
3374 receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
3375                      unsigned long *buffer, struct bm_xfer_ctx *c)
3376 {
3377         unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
3378         unsigned want = num_words * sizeof(long);
3379
3380         if (want != data_size) {
3381                 dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
3382                 return FAILED;
3383         }
3384         if (want == 0)
3385                 return DONE;
3386         if (drbd_recv(mdev, buffer, want) != want)
3387                 return FAILED;
3388
3389         drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
3390
3391         c->word_offset += num_words;
3392         c->bit_offset = c->word_offset * BITS_PER_LONG;
3393         if (c->bit_offset > c->bm_bits)
3394                 c->bit_offset = c->bm_bits;
3395
3396         return OK;
3397 }
3398
3399 static enum receive_bitmap_ret
3400 recv_bm_rle_bits(struct drbd_conf *mdev,
3401                 struct p_compressed_bm *p,
3402                 struct bm_xfer_ctx *c)
3403 {
3404         struct bitstream bs;
3405         u64 look_ahead;
3406         u64 rl;
3407         u64 tmp;
3408         unsigned long s = c->bit_offset;
3409         unsigned long e;
3410         int len = p->head.length - (sizeof(*p) - sizeof(p->head));
3411         int toggle = DCBP_get_start(p);
3412         int have;
3413         int bits;
3414
3415         bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
3416
3417         bits = bitstream_get_bits(&bs, &look_ahead, 64);
3418         if (bits < 0)
3419                 return FAILED;
3420
3421         for (have = bits; have > 0; s += rl, toggle = !toggle) {
3422                 bits = vli_decode_bits(&rl, look_ahead);
3423                 if (bits <= 0)
3424                         return FAILED;
3425
3426                 if (toggle) {
3427                         e = s + rl -1;
3428                         if (e >= c->bm_bits) {
3429                                 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
3430                                 return FAILED;
3431                         }
3432                         _drbd_bm_set_bits(mdev, s, e);
3433                 }
3434
3435                 if (have < bits) {
3436                         dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
3437                                 have, bits, look_ahead,
3438                                 (unsigned int)(bs.cur.b - p->code),
3439                                 (unsigned int)bs.buf_len);
3440                         return FAILED;
3441                 }
3442                 look_ahead >>= bits;
3443                 have -= bits;
3444
3445                 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
3446                 if (bits < 0)
3447                         return FAILED;
3448                 look_ahead |= tmp << have;
3449                 have += bits;
3450         }
3451
3452         c->bit_offset = s;
3453         bm_xfer_ctx_bit_to_word_offset(c);
3454
3455         return (s == c->bm_bits) ? DONE : OK;
3456 }
3457
3458 static enum receive_bitmap_ret
3459 decode_bitmap_c(struct drbd_conf *mdev,
3460                 struct p_compressed_bm *p,
3461                 struct bm_xfer_ctx *c)
3462 {
3463         if (DCBP_get_code(p) == RLE_VLI_Bits)
3464                 return recv_bm_rle_bits(mdev, p, c);
3465
3466         /* other variants had been implemented for evaluation,
3467          * but have been dropped as this one turned out to be "best"
3468          * during all our tests. */
3469
3470         dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
3471         drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3472         return FAILED;
3473 }
3474
3475 void INFO_bm_xfer_stats(struct drbd_conf *mdev,
3476                 const char *direction, struct bm_xfer_ctx *c)
3477 {
3478         /* what would it take to transfer it "plaintext" */
3479         unsigned plain = sizeof(struct p_header80) *
3480                 ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
3481                 + c->bm_words * sizeof(long);
3482         unsigned total = c->bytes[0] + c->bytes[1];
3483         unsigned r;
3484
3485         /* total can not be zero. but just in case: */
3486         if (total == 0)
3487                 return;
3488
3489         /* don't report if not compressed */
3490         if (total >= plain)
3491                 return;
3492
3493         /* total < plain. check for overflow, still */
3494         r = (total > UINT_MAX/1000) ? (total / (plain/1000))
3495                                     : (1000 * total / plain);
3496
3497         if (r > 1000)
3498                 r = 1000;
3499
3500         r = 1000 - r;
3501         dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
3502              "total %u; compression: %u.%u%%\n",
3503                         direction,
3504                         c->bytes[1], c->packets[1],
3505                         c->bytes[0], c->packets[0],
3506                         total, r/10, r % 10);
3507 }
3508
3509 /* Since we are processing the bitfield from lower addresses to higher,
3510    it does not matter if the process it in 32 bit chunks or 64 bit
3511    chunks as long as it is little endian. (Understand it as byte stream,
3512    beginning with the lowest byte...) If we would use big endian
3513    we would need to process it from the highest address to the lowest,
3514    in order to be agnostic to the 32 vs 64 bits issue.
3515
3516    returns 0 on failure, 1 if we successfully received it. */
3517 static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3518 {
3519         struct bm_xfer_ctx c;
3520         void *buffer;
3521         enum receive_bitmap_ret ret;
3522         int ok = FALSE;
3523         struct p_header80 *h = &mdev->data.rbuf.header.h80;
3524
3525         wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
3526
3527         drbd_bm_lock(mdev, "receive bitmap");
3528
3529         /* maybe we should use some per thread scratch page,
3530          * and allocate that during initial device creation? */
3531         buffer   = (unsigned long *) __get_free_page(GFP_NOIO);
3532         if (!buffer) {
3533                 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
3534                 goto out;
3535         }
3536
3537         c = (struct bm_xfer_ctx) {
3538                 .bm_bits = drbd_bm_bits(mdev),
3539                 .bm_words = drbd_bm_words(mdev),
3540         };
3541
3542         do {
3543                 if (cmd == P_BITMAP) {
3544                         ret = receive_bitmap_plain(mdev, data_size, buffer, &c);
3545                 } else if (cmd == P_COMPRESSED_BITMAP) {
3546                         /* MAYBE: sanity check that we speak proto >= 90,
3547                          * and the feature is enabled! */
3548                         struct p_compressed_bm *p;
3549
3550                         if (data_size > BM_PACKET_PAYLOAD_BYTES) {
3551                                 dev_err(DEV, "ReportCBitmap packet too large\n");
3552                                 goto out;
3553                         }
3554                         /* use the page buff */
3555                         p = buffer;
3556                         memcpy(p, h, sizeof(*h));
3557                         if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
3558                                 goto out;
3559                         if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
3560                                 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
3561                                 return FAILED;
3562                         }
3563                         ret = decode_bitmap_c(mdev, p, &c);
3564                 } else {
3565                         dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
3566                         goto out;
3567                 }
3568
3569                 c.packets[cmd == P_BITMAP]++;
3570                 c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
3571
3572                 if (ret != OK)
3573                         break;
3574
3575                 if (!drbd_recv_header(mdev, &cmd, &data_size))
3576                         goto out;
3577         } while (ret == OK);
3578         if (ret == FAILED)
3579                 goto out;
3580
3581         INFO_bm_xfer_stats(mdev, "receive", &c);
3582
3583         if (mdev->state.conn == C_WF_BITMAP_T) {
3584                 ok = !drbd_send_bitmap(mdev);
3585                 if (!ok)
3586                         goto out;
3587                 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
3588                 ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
3589                 D_ASSERT(ok == SS_SUCCESS);
3590         } else if (mdev->state.conn != C_WF_BITMAP_S) {
3591                 /* admin may have requested C_DISCONNECTING,
3592                  * other threads may have noticed network errors */
3593                 dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
3594                     drbd_conn_str(mdev->state.conn));
3595         }
3596
3597         ok = TRUE;
3598  out:
3599         drbd_bm_unlock(mdev);
3600         if (ok && mdev->state.conn == C_WF_BITMAP_S)
3601                 drbd_start_resync(mdev, C_SYNC_SOURCE);
3602         free_page((unsigned long) buffer);
3603         return ok;
3604 }
3605
3606 static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3607 {
3608         /* TODO zero copy sink :) */
3609         static char sink[128];
3610         int size, want, r;
3611
3612         dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
3613                  cmd, data_size);
3614
3615         size = data_size;
3616         while (size > 0) {
3617                 want = min_t(int, size, sizeof(sink));
3618                 r = drbd_recv(mdev, sink, want);
3619                 ERR_IF(r <= 0) break;
3620                 size -= r;
3621         }
3622         return size == 0;
3623 }
3624
/* receive_UnplugRemote() - handle the peer's unplug request.
 * Kicks the lower level device if the local disk state is at least
 * D_INCONSISTENT, then requests a quick TCP ack on the data socket.
 * cmd and data_size are not used.  Always returns TRUE. */
3625 static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3626 {
3627         if (mdev->state.disk >= D_INCONSISTENT)
3628                 drbd_kick_lo(mdev);
3629
3630         /* Make sure we've acked all the TCP data associated
3631          * with the data requests being unplugged */
3632         drbd_tcp_quickack(mdev->data.socket);
3633
3634         return TRUE;
3635 }
3636
/* Signature of a packet handler: called with the device, the packet type
 * and the number of bytes that remain to be received for the packet. */
3637 typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
3638
     /* One entry of the packet dispatch table, indexed by packet type. */
3639 struct data_cmd {
3640         int expect_payload;     /* non-zero: data beyond pkt_size is allowed */
3641         size_t pkt_size;        /* fixed packet size, header included */
3642         drbd_cmd_handler_f function;    /* handler; NULL means "not handled here" */
3643 };
3644
3645 static struct data_cmd drbd_cmd_handler[] = {
3646         [P_DATA]            = { 1, sizeof(struct p_data), receive_Data },
3647         [P_DATA_REPLY]      = { 1, sizeof(struct p_data), receive_DataReply },
3648         [P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply } ,
3649         [P_BARRIER]         = { 0, sizeof(struct p_barrier), receive_Barrier } ,
3650         [P_BITMAP]          = { 1, sizeof(struct p_header80), receive_bitmap } ,
3651         [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
3652         [P_UNPLUG_REMOTE]   = { 0, sizeof(struct p_header80), receive_UnplugRemote },
3653         [P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
3654         [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3655         [P_SYNC_PARAM]      = { 1, sizeof(struct p_header80), receive_SyncParam },
3656         [P_SYNC_PARAM89]    = { 1, sizeof(struct p_header80), receive_SyncParam },
3657         [P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
3658         [P_UUIDS]           = { 0, sizeof(struct p_uuids), receive_uuids },
3659         [P_SIZES]           = { 0, sizeof(struct p_sizes), receive_sizes },
3660         [P_STATE]           = { 0, sizeof(struct p_state), receive_state },
3661         [P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
3662         [P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
3663         [P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
3664         [P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
3665         [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
3666         [P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
3667         /* anything missing from this table is in
3668          * the asender_tbl, see get_asender_cmd */
3669         [P_MAX_CMD]         = { 0, 0, NULL },
3670 };
3671
3672 /* All handler functions that expect a sub-header get that sub-header in
3673    mdev->data.rbuf.header.head.payload.
3674
3675    Usually the callback can find the usual p_header in
3676    mdev->data.rbuf.header.head, but it must not rely on that, since there
3677    is also p_header95! */
3678
3679 static void drbdd(struct drbd_conf *mdev)
3680 {
3681         union p_header *header = &mdev->data.rbuf.header;
3682         unsigned int packet_size;
3683         enum drbd_packets cmd;
3684         size_t shs; /* sub header size */
3685         int rv;
3686
3687         while (get_t_state(&mdev->receiver) == Running) {
3688                 drbd_thread_current_set_cpu(mdev);
3689                 if (!drbd_recv_header(mdev, &cmd, &packet_size))
3690                         goto err_out;
3691
3692                 if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
3693                         dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
3694                         goto err_out;
3695                 }
3696
3697                 shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
3698                 rv = drbd_recv(mdev, &header->h80.payload, shs);
3699                 if (unlikely(rv != shs)) {
3700                         dev_err(DEV, "short read while reading sub header: rv=%d\n", rv);
3701                         goto err_out;
3702                 }
3703
3704                 if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
3705                         dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
3706                         goto err_out;
3707                 }
3708
3709                 rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
3710
3711                 if (unlikely(!rv)) {
3712                         dev_err(DEV, "error receiving %s, l: %d!\n",
3713                             cmdname(cmd), packet_size);
3714                         goto err_out;
3715                 }
3716         }
3717
3718         if (0) {
3719         err_out:
3720                 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3721         }
3722 }
3723
3724 void drbd_flush_workqueue(struct drbd_conf *mdev)
3725 {
3726         struct drbd_wq_barrier barr;
3727
3728         barr.w.cb = w_prev_work_done;
3729         init_completion(&barr.done);
3730         drbd_queue_work(&mdev->data.work, &barr.w);
3731         wait_for_completion(&barr.done);
3732 }
3733
3734 void drbd_free_tl_hash(struct drbd_conf *mdev)
3735 {
3736         struct hlist_head *h;
3737
3738         spin_lock_irq(&mdev->req_lock);
3739
3740         if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
3741                 spin_unlock_irq(&mdev->req_lock);
3742                 return;
3743         }
3744         /* paranoia code */
3745         for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
3746                 if (h->first)
3747                         dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
3748                                 (int)(h - mdev->ee_hash), h->first);
3749         kfree(mdev->ee_hash);
3750         mdev->ee_hash = NULL;
3751         mdev->ee_hash_s = 0;
3752
3753         /* paranoia code */
3754         for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
3755                 if (h->first)
3756                         dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
3757                                 (int)(h - mdev->tl_hash), h->first);
3758         kfree(mdev->tl_hash);
3759         mdev->tl_hash = NULL;
3760         mdev->tl_hash_s = 0;
3761         spin_unlock_irq(&mdev->req_lock);
3762 }
3763
3764 static void drbd_disconnect(struct drbd_conf *mdev)
3765 {
3766         enum drbd_fencing_p fp;
3767         union drbd_state os, ns;
3768         int rv = SS_UNKNOWN_ERROR;
3769         unsigned int i;
3770
3771         if (mdev->state.conn == C_STANDALONE)
3772                 return;
3773         if (mdev->state.conn >= C_WF_CONNECTION)
3774                 dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
3775                                 drbd_conn_str(mdev->state.conn));
3776
3777         /* asender does not clean up anything. it must not interfere, either */
3778         drbd_thread_stop(&mdev->asender);
3779         drbd_free_sock(mdev);
3780
3781         /* wait for current activity to cease. */
3782         spin_lock_irq(&mdev->req_lock);
3783         _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
3784         _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
3785         _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
3786         spin_unlock_irq(&mdev->req_lock);
3787
3788         /* We do not have data structures that would allow us to
3789          * get the rs_pending_cnt down to 0 again.
3790          *  * On C_SYNC_TARGET we do not have any data structures describing
3791          *    the pending RSDataRequest's we have sent.
3792          *  * On C_SYNC_SOURCE there is no data structure that tracks
3793          *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
3794          *  And no, it is not the sum of the reference counts in the
3795          *  resync_LRU. The resync_LRU tracks the whole operation including
3796          *  the disk-IO, while the rs_pending_cnt only tracks the blocks
3797          *  on the fly. */
3798         drbd_rs_cancel_all(mdev);
3799         mdev->rs_total = 0;
3800         mdev->rs_failed = 0;
3801         atomic_set(&mdev->rs_pending_cnt, 0);
3802         wake_up(&mdev->misc_wait);
3803
3804         /* make sure syncer is stopped and w_resume_next_sg queued */
3805         del_timer_sync(&mdev->resync_timer);
3806         set_bit(STOP_SYNC_TIMER, &mdev->flags);
3807         resync_timer_fn((unsigned long)mdev);
3808
3809         /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
3810          * w_make_resync_request etc. which may still be on the worker queue
3811          * to be "canceled" */
3812         drbd_flush_workqueue(mdev);
3813
3814         /* This also does reclaim_net_ee().  If we do this too early, we might
3815          * miss some resync ee and pages.*/
3816         drbd_process_done_ee(mdev);
3817
3818         kfree(mdev->p_uuid);
3819         mdev->p_uuid = NULL;
3820
3821         if (!mdev->state.susp)
3822                 tl_clear(mdev);
3823
3824         dev_info(DEV, "Connection closed\n");
3825
3826         drbd_md_sync(mdev);
3827
3828         fp = FP_DONT_CARE;
3829         if (get_ldev(mdev)) {
3830                 fp = mdev->ldev->dc.fencing;
3831                 put_ldev(mdev);
3832         }
3833
3834         if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
3835                 drbd_try_outdate_peer_async(mdev);
3836
3837         spin_lock_irq(&mdev->req_lock);
3838         os = mdev->state;
3839         if (os.conn >= C_UNCONNECTED) {
3840                 /* Do not restart in case we are C_DISCONNECTING */
3841                 ns = os;
3842                 ns.conn = C_UNCONNECTED;
3843                 rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
3844         }
3845         spin_unlock_irq(&mdev->req_lock);
3846
3847         if (os.conn == C_DISCONNECTING) {
3848                 wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
3849
3850                 if (!mdev->state.susp) {
3851                         /* we must not free the tl_hash
3852                          * while application io is still on the fly */
3853                         wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
3854                         drbd_free_tl_hash(mdev);
3855                 }
3856
3857                 crypto_free_hash(mdev->cram_hmac_tfm);
3858                 mdev->cram_hmac_tfm = NULL;
3859
3860                 kfree(mdev->net_conf);
3861                 mdev->net_conf = NULL;
3862                 drbd_request_state(mdev, NS(conn, C_STANDALONE));
3863         }
3864
3865         /* tcp_close and release of sendpage pages can be deferred.  I don't
3866          * want to use SO_LINGER, because apparently it can be deferred for
3867          * more than 20 seconds (longest time I checked).
3868          *
3869          * Actually we don't care for exactly when the network stack does its
3870          * put_page(), but release our reference on these pages right here.
3871          */
3872         i = drbd_release_ee(mdev, &mdev->net_ee);
3873         if (i)
3874                 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
3875         i = atomic_read(&mdev->pp_in_use);
3876         if (i)
3877                 dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
3878
3879         D_ASSERT(list_empty(&mdev->read_ee));
3880         D_ASSERT(list_empty(&mdev->active_ee));
3881         D_ASSERT(list_empty(&mdev->sync_ee));
3882         D_ASSERT(list_empty(&mdev->done_ee));
3883
3884         /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
3885         atomic_set(&mdev->current_epoch->epoch_size, 0);
3886         D_ASSERT(list_empty(&mdev->current_epoch->list));
3887 }
3888
3889 /*
3890  * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
3891  * we can agree on is stored in agreed_pro_version.
3892  *
3893  * feature flags and the reserved array should be enough room for future
3894  * enhancements of the handshake protocol, and possible plugins...
3895  *
3896  * for now, they are expected to be zero, but ignored.
3897  */
3898 static int drbd_send_handshake(struct drbd_conf *mdev)
3899 {
3900         /* ASSERT current == mdev->receiver ... */
3901         struct p_handshake *p = &mdev->data.sbuf.handshake;
3902         int ok;
3903
3904         if (mutex_lock_interruptible(&mdev->data.mutex)) {
3905                 dev_err(DEV, "interrupted during initial handshake\n");
3906                 return 0; /* interrupted. not ok. */
3907         }
3908
3909         if (mdev->data.socket == NULL) {
3910                 mutex_unlock(&mdev->data.mutex);
3911                 return 0;
3912         }
3913
3914         memset(p, 0, sizeof(*p));
3915         p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
3916         p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
3917         ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
3918                              (struct p_header80 *)p, sizeof(*p), 0 );
3919         mutex_unlock(&mdev->data.mutex);
3920         return ok;
3921 }
3922
3923 /*
3924  * return values:
3925  *   1 yes, we have a valid connection
3926  *   0 oops, did not work out, please try again
3927  *  -1 peer talks different language,
3928  *     no point in trying again, please go standalone.
3929  */
3930 static int drbd_do_handshake(struct drbd_conf *mdev)
3931 {
3932         /* ASSERT current == mdev->receiver ... */
3933         struct p_handshake *p = &mdev->data.rbuf.handshake;
3934         const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
3935         unsigned int length;
3936         enum drbd_packets cmd;
3937         int rv;
3938
3939         rv = drbd_send_handshake(mdev);
3940         if (!rv)
3941                 return 0;
3942
3943         rv = drbd_recv_header(mdev, &cmd, &length);
3944         if (!rv)
3945                 return 0;
3946
3947         if (cmd != P_HAND_SHAKE) {
3948                 dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
3949                      cmdname(cmd), cmd);
3950                 return -1;
3951         }
3952
3953         if (length != expect) {
3954                 dev_err(DEV, "expected HandShake length: %u, received: %u\n",
3955                      expect, length);
3956                 return -1;
3957         }
3958
3959         rv = drbd_recv(mdev, &p->head.payload, expect);
3960
3961         if (rv != expect) {
3962                 dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv);
3963                 return 0;
3964         }
3965
3966         p->protocol_min = be32_to_cpu(p->protocol_min);
3967         p->protocol_max = be32_to_cpu(p->protocol_max);
3968         if (p->protocol_max == 0)
3969                 p->protocol_max = p->protocol_min;
3970
3971         if (PRO_VERSION_MAX < p->protocol_min ||
3972             PRO_VERSION_MIN > p->protocol_max)
3973                 goto incompat;
3974
3975         mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
3976
3977         dev_info(DEV, "Handshake successful: "
3978              "Agreed network protocol version %d\n", mdev->agreed_pro_version);
3979
3980         return 1;
3981
3982  incompat:
3983         dev_err(DEV, "incompatible DRBD dialects: "
3984             "I support %d-%d, peer supports %d-%d\n",
3985             PRO_VERSION_MIN, PRO_VERSION_MAX,
3986             p->protocol_min, p->protocol_max);
3987         return -1;
3988 }
3989
3990 #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
3991 static int drbd_do_auth(struct drbd_conf *mdev)
3992 {
3993         dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
3994         dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
3995         return -1;
3996 }
3997 #else
3998 #define CHALLENGE_LEN 64
3999
4000 /* Return value:
4001         1 - auth succeeded,
4002         0 - failed, try again (network error),
4003         -1 - auth failed, don't try again.
4004 */
4005
4006 static int drbd_do_auth(struct drbd_conf *mdev)
4007 {
4008         char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
4009         struct scatterlist sg;
4010         char *response = NULL;
4011         char *right_response = NULL;
4012         char *peers_ch = NULL;
4013         unsigned int key_len = strlen(mdev->net_conf->shared_secret);
4014         unsigned int resp_size;
4015         struct hash_desc desc;
4016         enum drbd_packets cmd;
4017         unsigned int length;
4018         int rv;
4019
4020         desc.tfm = mdev->cram_hmac_tfm;
4021         desc.flags = 0;
4022
4023         rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
4024                                 (u8 *)mdev->net_conf->shared_secret, key_len);
4025         if (rv) {
4026                 dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
4027                 rv = -1;
4028                 goto fail;
4029         }
4030
4031         get_random_bytes(my_challenge, CHALLENGE_LEN);
4032
4033         rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
4034         if (!rv)
4035                 goto fail;
4036
4037         rv = drbd_recv_header(mdev, &cmd, &length);
4038         if (!rv)
4039                 goto fail;
4040
4041         if (cmd != P_AUTH_CHALLENGE) {
4042                 dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
4043                     cmdname(cmd), cmd);
4044                 rv = 0;
4045                 goto fail;
4046         }
4047
4048         if (length > CHALLENGE_LEN * 2) {
4049                 dev_err(DEV, "expected AuthChallenge payload too big.\n");
4050                 rv = -1;
4051                 goto fail;
4052         }
4053
4054         peers_ch = kmalloc(length, GFP_NOIO);
4055         if (peers_ch == NULL) {
4056                 dev_err(DEV, "kmalloc of peers_ch failed\n");
4057                 rv = -1;
4058                 goto fail;
4059         }
4060
4061         rv = drbd_recv(mdev, peers_ch, length);
4062
4063         if (rv != length) {
4064                 dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
4065                 rv = 0;
4066                 goto fail;
4067         }
4068
4069         resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
4070         response = kmalloc(resp_size, GFP_NOIO);
4071         if (response == NULL) {
4072                 dev_err(DEV, "kmalloc of response failed\n");
4073                 rv = -1;
4074                 goto fail;
4075         }
4076
4077         sg_init_table(&sg, 1);
4078         sg_set_buf(&sg, peers_ch, length);
4079
4080         rv = crypto_hash_digest(&desc, &sg, sg.length, response);
4081         if (rv) {
4082                 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
4083                 rv = -1;
4084                 goto fail;
4085         }
4086
4087         rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
4088         if (!rv)
4089                 goto fail;
4090
4091         rv = drbd_recv_header(mdev, &cmd, &length);
4092         if (!rv)
4093                 goto fail;
4094
4095         if (cmd != P_AUTH_RESPONSE) {
4096                 dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
4097                         cmdname(cmd), cmd);
4098                 rv = 0;
4099                 goto fail;
4100         }
4101
4102         if (length != resp_size) {
4103                 dev_err(DEV, "expected AuthResponse payload of wrong size\n");
4104                 rv = 0;
4105                 goto fail;
4106         }
4107
4108         rv = drbd_recv(mdev, response , resp_size);
4109
4110         if (rv != resp_size) {
4111                 dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv);
4112                 rv = 0;
4113                 goto fail;
4114         }
4115
4116         right_response = kmalloc(resp_size, GFP_NOIO);
4117         if (right_response == NULL) {
4118                 dev_err(DEV, "kmalloc of right_response failed\n");
4119                 rv = -1;
4120                 goto fail;
4121         }
4122
4123         sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
4124
4125         rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
4126         if (rv) {
4127                 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
4128                 rv = -1;
4129                 goto fail;
4130         }
4131
4132         rv = !memcmp(response, right_response, resp_size);
4133
4134         if (rv)
4135                 dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
4136                      resp_size, mdev->net_conf->cram_hmac_alg);
4137         else
4138                 rv = -1;
4139
4140  fail:
4141         kfree(peers_ch);
4142         kfree(response);
4143         kfree(right_response);
4144
4145         return rv;
4146 }
4147 #endif
4148
4149 int drbdd_init(struct drbd_thread *thi)
4150 {
4151         struct drbd_conf *mdev = thi->mdev;
4152         unsigned int minor = mdev_to_minor(mdev);
4153         int h;
4154
4155         sprintf(current->comm, "drbd%d_receiver", minor);
4156
4157         dev_info(DEV, "receiver (re)started\n");
4158
4159         do {
4160                 h = drbd_connect(mdev);
4161                 if (h == 0) {
4162                         drbd_disconnect(mdev);
4163                         __set_current_state(TASK_INTERRUPTIBLE);
4164                         schedule_timeout(HZ);
4165                 }
4166                 if (h == -1) {
4167                         dev_warn(DEV, "Discarding network configuration.\n");
4168                         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4169                 }
4170         } while (h == 0);
4171
4172         if (h > 0) {
4173                 if (get_net_conf(mdev)) {
4174                         drbdd(mdev);
4175                         put_net_conf(mdev);
4176                 }
4177         }
4178
4179         drbd_disconnect(mdev);
4180
4181         dev_info(DEV, "receiver terminated\n");
4182         return 0;
4183 }
4184
4185 /* ********* acknowledge sender ******** */
4186
4187 static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
4188 {
4189         struct p_req_state_reply *p = (struct p_req_state_reply *)h;
4190
4191         int retcode = be32_to_cpu(p->retcode);
4192
4193         if (retcode >= SS_SUCCESS) {
4194                 set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
4195         } else {
4196                 set_bit(CL_ST_CHG_FAIL, &mdev->flags);
4197                 dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
4198                     drbd_set_st_err_str(retcode), retcode);
4199         }
4200         wake_up(&mdev->state_wait);
4201
4202         return TRUE;
4203 }
4204
/* Handles P_PING: answers with a ping ack; the send result is the result. */
static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
{
	int acked = drbd_send_ping_ack(mdev);

	return acked;
}
4210
4211 static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
4212 {
4213         /* restore idle timeout */
4214         mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
4215         if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags))
4216                 wake_up(&mdev->misc_wait);
4217
4218         return TRUE;
4219 }
4220
4221 static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
4222 {
4223         struct p_block_ack *p = (struct p_block_ack *)h;
4224         sector_t sector = be64_to_cpu(p->sector);
4225         int blksize = be32_to_cpu(p->blksize);
4226
4227         D_ASSERT(mdev->agreed_pro_version >= 89);
4228
4229         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4230
4231         drbd_rs_complete_io(mdev, sector);
4232         drbd_set_in_sync(mdev, sector, blksize);
4233         /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4234         mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4235         dec_rs_pending(mdev);
4236         atomic_add(blksize >> 9, &mdev->rs_sect_in);
4237
4238         return TRUE;
4239 }
4240
4241 /* when we receive the ACK for a write request,
4242  * verify that we actually know about it */
4243 static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
4244         u64 id, sector_t sector)
4245 {
4246         struct hlist_head *slot = tl_hash_slot(mdev, sector);
4247         struct hlist_node *n;
4248         struct drbd_request *req;
4249
4250         hlist_for_each_entry(req, n, slot, colision) {
4251                 if ((unsigned long)req == (unsigned long)id) {
4252                         if (req->sector != sector) {
4253                                 dev_err(DEV, "_ack_id_to_req: found req %p but it has "
4254                                     "wrong sector (%llus versus %llus)\n", req,
4255                                     (unsigned long long)req->sector,
4256                                     (unsigned long long)sector);
4257                                 break;
4258                         }
4259                         return req;
4260                 }
4261         }
4262         dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n",
4263                 (void *)(unsigned long)id, (unsigned long long)sector);
4264         return NULL;
4265 }
4266
4267 typedef struct drbd_request *(req_validator_fn)
4268         (struct drbd_conf *mdev, u64 id, sector_t sector);
4269
4270 static int validate_req_change_req_state(struct drbd_conf *mdev,
4271         u64 id, sector_t sector, req_validator_fn validator,
4272         const char *func, enum drbd_req_event what)
4273 {
4274         struct drbd_request *req;
4275         struct bio_and_error m;
4276
4277         spin_lock_irq(&mdev->req_lock);
4278         req = validator(mdev, id, sector);
4279         if (unlikely(!req)) {
4280                 spin_unlock_irq(&mdev->req_lock);
4281                 dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func);
4282                 return FALSE;
4283         }
4284         __req_mod(req, what, &m);
4285         spin_unlock_irq(&mdev->req_lock);
4286
4287         if (m.bio)
4288                 complete_master_bio(mdev, &m);
4289         return TRUE;
4290 }
4291
4292 static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
4293 {
4294         struct p_block_ack *p = (struct p_block_ack *)h;
4295         sector_t sector = be64_to_cpu(p->sector);
4296         int blksize = be32_to_cpu(p->blksize);
4297         enum drbd_req_event what;
4298
4299         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4300
4301         if (is_syncer_block_id(p->block_id)) {
4302                 drbd_set_in_sync(mdev, sector, blksize);
4303                 dec_rs_pending(mdev);
4304                 return TRUE;
4305         }
4306         switch (be16_to_cpu(h->command)) {
4307         case P_RS_WRITE_ACK:
4308                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4309                 what = write_acked_by_peer_and_sis;
4310                 break;
4311         case P_WRITE_ACK:
4312                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4313                 what = write_acked_by_peer;
4314                 break;
4315         case P_RECV_ACK:
4316                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
4317                 what = recv_acked_by_peer;
4318                 break;
4319         case P_DISCARD_ACK:
4320                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4321                 what = conflict_discarded_by_peer;
4322                 break;
4323         default:
4324                 D_ASSERT(0);
4325                 return FALSE;
4326         }
4327
4328         return validate_req_change_req_state(mdev, p->block_id, sector,
4329                 _ack_id_to_req, __func__ , what);
4330 }
4331
4332 static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
4333 {
4334         struct p_block_ack *p = (struct p_block_ack *)h;
4335         sector_t sector = be64_to_cpu(p->sector);
4336
4337         if (__ratelimit(&drbd_ratelimit_state))
4338                 dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n");
4339
4340         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4341
4342         if (is_syncer_block_id(p->block_id)) {
4343                 int size = be32_to_cpu(p->blksize);
4344                 dec_rs_pending(mdev);
4345                 drbd_rs_failed_io(mdev, sector, size);
4346                 return TRUE;
4347         }
4348         return validate_req_change_req_state(mdev, p->block_id, sector,
4349                 _ack_id_to_req, __func__ , neg_acked);
4350 }
4351
4352 static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
4353 {
4354         struct p_block_ack *p = (struct p_block_ack *)h;
4355         sector_t sector = be64_to_cpu(p->sector);
4356
4357         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4358         dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
4359             (unsigned long long)sector, be32_to_cpu(p->blksize));
4360
4361         return validate_req_change_req_state(mdev, p->block_id, sector,
4362                 _ar_id_to_req, __func__ , neg_acked);
4363 }
4364
4365 static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
4366 {
4367         sector_t sector;
4368         int size;
4369         struct p_block_ack *p = (struct p_block_ack *)h;
4370
4371         sector = be64_to_cpu(p->sector);
4372         size = be32_to_cpu(p->blksize);
4373
4374         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4375
4376         dec_rs_pending(mdev);
4377
4378         if (get_ldev_if_state(mdev, D_FAILED)) {
4379                 drbd_rs_complete_io(mdev, sector);
4380                 drbd_rs_failed_io(mdev, sector, size);
4381                 put_ldev(mdev);
4382         }
4383
4384         return TRUE;
4385 }
4386
4387 static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
4388 {
4389         struct p_barrier_ack *p = (struct p_barrier_ack *)h;
4390
4391         tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
4392
4393         return TRUE;
4394 }
4395
4396 static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
4397 {
4398         struct p_block_ack *p = (struct p_block_ack *)h;
4399         struct drbd_work *w;
4400         sector_t sector;
4401         int size;
4402
4403         sector = be64_to_cpu(p->sector);
4404         size = be32_to_cpu(p->blksize);
4405
4406         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4407
4408         if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
4409                 drbd_ov_oos_found(mdev, sector, size);
4410         else
4411                 ov_oos_print(mdev);
4412
4413         drbd_rs_complete_io(mdev, sector);
4414         dec_rs_pending(mdev);
4415
4416         if (--mdev->ov_left == 0) {
4417                 w = kmalloc(sizeof(*w), GFP_NOIO);
4418                 if (w) {
4419                         w->cb = w_ov_finished;
4420                         drbd_queue_work_front(&mdev->data.work, w);
4421                 } else {
4422                         dev_err(DEV, "kmalloc(w) failed.");
4423                         ov_oos_print(mdev);
4424                         drbd_resync_finished(mdev);
4425                 }
4426         }
4427         return TRUE;
4428 }
4429
4430 static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
4431 {
4432         return TRUE;
4433 }
4434
4435 struct asender_cmd {
4436         size_t pkt_size;
4437         int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
4438 };
4439
4440 static struct asender_cmd *get_asender_cmd(int cmd)
4441 {
4442         static struct asender_cmd asender_tbl[] = {
4443                 /* anything missing from this table is in
4444                  * the drbd_cmd_handler (drbd_default_handler) table,
4445                  * see the beginning of drbdd() */
4446         [P_PING]            = { sizeof(struct p_header80), got_Ping },
4447         [P_PING_ACK]        = { sizeof(struct p_header80), got_PingAck },
4448         [P_RECV_ACK]        = { sizeof(struct p_block_ack), got_BlockAck },
4449         [P_WRITE_ACK]       = { sizeof(struct p_block_ack), got_BlockAck },
4450         [P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
4451         [P_DISCARD_ACK]     = { sizeof(struct p_block_ack), got_BlockAck },
4452         [P_NEG_ACK]         = { sizeof(struct p_block_ack), got_NegAck },
4453         [P_NEG_DREPLY]      = { sizeof(struct p_block_ack), got_NegDReply },
4454         [P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply},
4455         [P_OV_RESULT]       = { sizeof(struct p_block_ack), got_OVResult },
4456         [P_BARRIER_ACK]     = { sizeof(struct p_barrier_ack), got_BarrierAck },
4457         [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4458         [P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
4459         [P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
4460         [P_MAX_CMD]         = { 0, NULL },
4461         };
4462         if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
4463                 return NULL;
4464         return &asender_tbl[cmd];
4465 }
4466
4467 int drbd_asender(struct drbd_thread *thi)
4468 {
4469         struct drbd_conf *mdev = thi->mdev;
4470         struct p_header80 *h = &mdev->meta.rbuf.header.h80;
4471         struct asender_cmd *cmd = NULL;
4472
4473         int rv, len;
4474         void *buf    = h;
4475         int received = 0;
4476         int expect   = sizeof(struct p_header80);
4477         int empty;
4478
4479         sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
4480
4481         current->policy = SCHED_RR;  /* Make this a realtime task! */
4482         current->rt_priority = 2;    /* more important than all other tasks */
4483
4484         while (get_t_state(thi) == Running) {
4485                 drbd_thread_current_set_cpu(mdev);
4486                 if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
4487                         ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
4488                         mdev->meta.socket->sk->sk_rcvtimeo =
4489                                 mdev->net_conf->ping_timeo*HZ/10;
4490                 }
4491
4492                 /* conditionally cork;
4493                  * it may hurt latency if we cork without much to send */
4494                 if (!mdev->net_conf->no_cork &&
4495                         3 < atomic_read(&mdev->unacked_cnt))
4496                         drbd_tcp_cork(mdev->meta.socket);
4497                 while (1) {
4498                         clear_bit(SIGNAL_ASENDER, &mdev->flags);
4499                         flush_signals(current);
4500                         if (!drbd_process_done_ee(mdev)) {
4501                                 dev_err(DEV, "process_done_ee() = NOT_OK\n");
4502                                 goto reconnect;
4503                         }
4504                         /* to avoid race with newly queued ACKs */
4505                         set_bit(SIGNAL_ASENDER, &mdev->flags);
4506                         spin_lock_irq(&mdev->req_lock);
4507                         empty = list_empty(&mdev->done_ee);
4508                         spin_unlock_irq(&mdev->req_lock);
4509                         /* new ack may have been queued right here,
4510                          * but then there is also a signal pending,
4511                          * and we start over... */
4512                         if (empty)
4513                                 break;
4514                 }
4515                 /* but unconditionally uncork unless disabled */
4516                 if (!mdev->net_conf->no_cork)
4517                         drbd_tcp_uncork(mdev->meta.socket);
4518
4519                 /* short circuit, recv_msg would return EINTR anyways. */
4520                 if (signal_pending(current))
4521                         continue;
4522
4523                 rv = drbd_recv_short(mdev, mdev->meta.socket,
4524                                      buf, expect-received, 0);
4525                 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4526
4527                 flush_signals(current);
4528
4529                 /* Note:
4530                  * -EINTR        (on meta) we got a signal
4531                  * -EAGAIN       (on meta) rcvtimeo expired
4532                  * -ECONNRESET   other side closed the connection
4533                  * -ERESTARTSYS  (on data) we got a signal
4534                  * rv <  0       other than above: unexpected error!
4535                  * rv == expected: full header or command
4536                  * rv <  expected: "woken" by signal during receive
4537                  * rv == 0       : "connection shut down by peer"
4538                  */
4539                 if (likely(rv > 0)) {
4540                         received += rv;
4541                         buf      += rv;
4542                 } else if (rv == 0) {
4543                         dev_err(DEV, "meta connection shut down by peer.\n");
4544                         goto reconnect;
4545                 } else if (rv == -EAGAIN) {
4546                         if (mdev->meta.socket->sk->sk_rcvtimeo ==
4547                             mdev->net_conf->ping_timeo*HZ/10) {
4548                                 dev_err(DEV, "PingAck did not arrive in time.\n");
4549                                 goto reconnect;
4550                         }
4551                         set_bit(SEND_PING, &mdev->flags);
4552                         continue;
4553                 } else if (rv == -EINTR) {
4554                         continue;
4555                 } else {
4556                         dev_err(DEV, "sock_recvmsg returned %d\n", rv);
4557                         goto reconnect;
4558                 }
4559
4560                 if (received == expect && cmd == NULL) {
4561                         if (unlikely(h->magic != BE_DRBD_MAGIC)) {
4562                                 dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
4563                                     (long)be32_to_cpu(h->magic),
4564                                     h->command, h->length);
4565                                 goto reconnect;
4566                         }
4567                         cmd = get_asender_cmd(be16_to_cpu(h->command));
4568                         len = be16_to_cpu(h->length);
4569                         if (unlikely(cmd == NULL)) {
4570                                 dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
4571                                     (long)be32_to_cpu(h->magic),
4572                                     h->command, h->length);
4573                                 goto disconnect;
4574                         }
4575                         expect = cmd->pkt_size;
4576                         ERR_IF(len != expect-sizeof(struct p_header80))
4577                                 goto reconnect;
4578                 }
4579                 if (received == expect) {
4580                         D_ASSERT(cmd != NULL);
4581                         if (!cmd->process(mdev, h))
4582                                 goto reconnect;
4583
4584                         buf      = h;
4585                         received = 0;
4586                         expect   = sizeof(struct p_header80);
4587                         cmd      = NULL;
4588                 }
4589         }
4590
4591         if (0) {
4592 reconnect:
4593                 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
4594         }
4595         if (0) {
4596 disconnect:
4597                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4598         }
4599         clear_bit(SIGNAL_ASENDER, &mdev->flags);
4600
4601         D_ASSERT(mdev->state.conn < C_CONNECTED);
4602         dev_info(DEV, "asender terminated\n");
4603
4604         return 0;
4605 }