drivers/block/drbd/drbd_worker.c

   1 /*
   2    drbd_worker.c
   3
   4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   5
   6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
   9
  10    drbd is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 2, or (at your option)
  13    any later version.
  14
  15    drbd is distributed in the hope that it will be useful,
  16    but WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18    GNU General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with drbd; see the file COPYING.  If not, write to
  22    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  23
  24  */
  25
  26 #include <linux/module.h>
  27 #include <linux/drbd.h>
  28 #include <linux/sched.h>
  29 #include <linux/wait.h>
  30 #include <linux/mm.h>
  31 #include <linux/memcontrol.h>
  32 #include <linux/mm_inline.h>
  33 #include <linux/slab.h>
  34 #include <linux/random.h>
  35 #include <linux/string.h>
  36 #include <linux/scatterlist.h>
  37
  38 #include "drbd_int.h"
  39 #include "drbd_protocol.h"
  40 #include "drbd_req.h"
  41
  42 static int w_make_ov_request(struct drbd_work *w, int cancel); /* forward declaration: w_resync_timer() calls it before its definition further down */
  43
  44
  45 /* endio handlers:
  46  *   drbd_md_io_complete (defined here)
  47  *   drbd_request_endio (defined here)
  48  *   drbd_peer_request_endio (defined here)
  49  *   bm_async_io_complete (defined in drbd_bitmap.c)
  50  *
  51  * For all these callbacks, note the following:
  52  * The callbacks will be called in irq context by the IDE drivers,
  53  * and in Softirqs/Tasklets/BH context by the SCSI drivers.
  54  * Try to get the locking right :)
  55  *
  56  */
  57
  58
  59 /* About the global_state_lock:
  60    Each state transition on a device holds a read lock. In case we have
  61    to evaluate the resync-after dependencies, we grab a write lock, because
  62    we need stable states on all devices for that.  */
  63 rwlock_t global_state_lock;
  64
  65 /* used for synchronous meta data and bitmap IO
  66  * submitted by drbd_md_sync_page_io()
  67  */
  68 void drbd_md_io_complete(struct bio *bio, int error)
  69 {
  70         struct drbd_md_io *md_io;
  71         struct drbd_device *device;
  72
  73         md_io = (struct drbd_md_io *)bio->bi_private;
  74         device = container_of(md_io, struct drbd_device, md_io);
  75
  76         md_io->error = error;
  77
  78         /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
  79          * to timeout on the lower level device, and eventually detach from it.
  80          * If this io completion runs after that timeout expired, this
  81          * drbd_md_put_buffer() may allow us to finally try and re-attach.
  82          * During normal operation, this only puts that extra reference
  83          * down to 1 again.
  84          * Make sure we first drop the reference, and only then signal
  85          * completion, or we may (in drbd_al_read_log()) cycle so fast into the
  86          * next drbd_md_sync_page_io(), that we trigger the
  87          * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
  88          */
  89         drbd_md_put_buffer(device);
  90         md_io->done = 1;
  91         wake_up(&device->misc_wait);
  92         bio_put(bio);
  93         if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
  94                 put_ldev(device);
  95 }
  96
  97 /* reads on behalf of the partner,
  98  * "submitted" by the receiver
  99  */
 100 static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
 101 {
 102         unsigned long flags = 0;
 103         struct drbd_device *device = peer_req->w.device;
 104
 105         spin_lock_irqsave(&device->resource->req_lock, flags);
 106         device->read_cnt += peer_req->i.size >> 9;
 107         list_del(&peer_req->w.list);
 108         if (list_empty(&device->read_ee))
 109                 wake_up(&device->ee_wait);
 110         if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
 111                 __drbd_chk_io_error(device, DRBD_READ_ERROR);
 112         spin_unlock_irqrestore(&device->resource->req_lock, flags);
 113
 114         drbd_queue_work(&first_peer_device(device)->connection->sender_work, &peer_req->w);
 115         put_ldev(device);
 116 }
 117
 118 /* writes on behalf of the partner, or resync writes,
 119  * "submitted" by the receiver, final stage.  */
 120 static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
 121 {
 122         unsigned long flags = 0;
 123         struct drbd_device *device = peer_req->w.device;
 124         struct drbd_interval i;
 125         int do_wake;
 126         u64 block_id;
 127         int do_al_complete_io;
 128
 129         /* after we moved peer_req to done_ee,
 130          * we may no longer access it,
 131          * it may be freed/reused already!
 132          * (as soon as we release the req_lock) */
 133         i = peer_req->i;
 134         do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
 135         block_id = peer_req->block_id;
 136
 137         spin_lock_irqsave(&device->resource->req_lock, flags);
 138         device->writ_cnt += peer_req->i.size >> 9;
 139         list_move_tail(&peer_req->w.list, &device->done_ee);
 140
 141         /*
 142          * Do not remove from the write_requests tree here: we did not send the
 143          * Ack yet and did not wake possibly waiting conflicting requests.
 144          * Removed from the tree from "drbd_process_done_ee" within the
 145          * appropriate w.cb (e_end_block/e_end_resync_block) or from
 146          * _drbd_clear_done_ee.
 147          */
 148
 149         do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);
 150
 151         if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
 152                 __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
 153         spin_unlock_irqrestore(&device->resource->req_lock, flags);
 154
 155         if (block_id == ID_SYNCER)
 156                 drbd_rs_complete_io(device, i.sector);
 157
 158         if (do_wake)
 159                 wake_up(&device->ee_wait);
 160
 161         if (do_al_complete_io)
 162                 drbd_al_complete_io(device, &i);
 163
 164         wake_asender(first_peer_device(device)->connection);
 165         put_ldev(device);
 166 }
 167
 168 /* writes on behalf of the partner, or resync writes,
 169  * "submitted" by the receiver.
 170  */
 171 void drbd_peer_request_endio(struct bio *bio, int error)
 172 {
 173         struct drbd_peer_request *peer_req = bio->bi_private;
 174         struct drbd_device *device = peer_req->w.device;
 175         int uptodate = bio_flagged(bio, BIO_UPTODATE);
 176         int is_write = bio_data_dir(bio) == WRITE;
 177
 178         if (error && __ratelimit(&drbd_ratelimit_state))
 179                 drbd_warn(device, "%s: error=%d s=%llus\n",
 180                                 is_write ? "write" : "read", error,
 181                                 (unsigned long long)peer_req->i.sector);
 182         if (!error && !uptodate) {
 183                 if (__ratelimit(&drbd_ratelimit_state))
 184                         drbd_warn(device, "%s: setting error to -EIO s=%llus\n",
 185                                         is_write ? "write" : "read",
 186                                         (unsigned long long)peer_req->i.sector);
 187                 /* strange behavior of some lower level drivers...
 188                  * fail the request by clearing the uptodate flag,
 189                  * but do not return any error?! */
 190                 error = -EIO;
 191         }
 192
 193         if (error)
 194                 set_bit(__EE_WAS_ERROR, &peer_req->flags);
 195
 196         bio_put(bio); /* no need for the bio anymore */
 197         if (atomic_dec_and_test(&peer_req->pending_bios)) {
 198                 if (is_write)
 199                         drbd_endio_write_sec_final(peer_req);
 200                 else
 201                         drbd_endio_read_sec_final(peer_req);
 202         }
 203 }
 204
 205 /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
 206  */
 207 void drbd_request_endio(struct bio *bio, int error)
 208 {
 209         unsigned long flags;
 210         struct drbd_request *req = bio->bi_private;
 211         struct drbd_device *device = req->w.device;
 212         struct bio_and_error m;
 213         enum drbd_req_event what;
 214         int uptodate = bio_flagged(bio, BIO_UPTODATE);
 215
 216         if (!error && !uptodate) {
 217                 drbd_warn(device, "p %s: setting error to -EIO\n",
 218                          bio_data_dir(bio) == WRITE ? "write" : "read");
 219                 /* strange behavior of some lower level drivers...
 220                  * fail the request by clearing the uptodate flag,
 221                  * but do not return any error?! */
 222                 error = -EIO;
 223         }
 224
 225
 226         /* If this request was aborted locally before,
 227          * but now was completed "successfully",
 228          * chances are that this caused arbitrary data corruption.
 229          *
 230          * "aborting" requests, or force-detaching the disk, is intended for
 231          * completely blocked/hung local backing devices which do no longer
 232          * complete requests at all, not even do error completions.  In this
 233          * situation, usually a hard-reset and failover is the only way out.
 234          *
 235          * By "aborting", basically faking a local error-completion,
 236          * we allow for a more graceful swichover by cleanly migrating services.
 237          * Still the affected node has to be rebooted "soon".
 238          *
 239          * By completing these requests, we allow the upper layers to re-use
 240          * the associated data pages.
 241          *
 242          * If later the local backing device "recovers", and now DMAs some data
 243          * from disk into the original request pages, in the best case it will
 244          * just put random data into unused pages; but typically it will corrupt
 245          * meanwhile completely unrelated data, causing all sorts of damage.
 246          *
 247          * Which means delayed successful completion,
 248          * especially for READ requests,
 249          * is a reason to panic().
 250          *
 251          * We assume that a delayed *error* completion is OK,
 252          * though we still will complain noisily about it.
 253          */
 254         if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
 255                 if (__ratelimit(&drbd_ratelimit_state))
 256                         drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
 257
 258                 if (!error)
 259                         panic("possible random memory corruption caused by delayed completion of aborted local request\n");
 260         }
 261
 262         /* to avoid recursion in __req_mod */
 263         if (unlikely(error)) {
 264                 what = (bio_data_dir(bio) == WRITE)
 265                         ? WRITE_COMPLETED_WITH_ERROR
 266                         : (bio_rw(bio) == READ)
 267                           ? READ_COMPLETED_WITH_ERROR
 268                           : READ_AHEAD_COMPLETED_WITH_ERROR;
 269         } else
 270                 what = COMPLETED_OK;
 271
 272         bio_put(req->private_bio);
 273         req->private_bio = ERR_PTR(error);
 274
 275         /* not req_mod(), we need irqsave here! */
 276         spin_lock_irqsave(&device->resource->req_lock, flags);
 277         __req_mod(req, what, &m);
 278         spin_unlock_irqrestore(&device->resource->req_lock, flags);
 279         put_ldev(device);
 280
 281         if (m.bio)
 282                 complete_master_bio(device, &m);
 283 }
 284
 285 void drbd_csum_ee(struct crypto_hash *tfm, struct drbd_peer_request *peer_req, void *digest)
 286 {
 287         struct hash_desc desc;
 288         struct scatterlist sg;
 289         struct page *page = peer_req->pages;
 290         struct page *tmp;
 291         unsigned len;
 292
 293         desc.tfm = tfm;
 294         desc.flags = 0;
 295
 296         sg_init_table(&sg, 1);
 297         crypto_hash_init(&desc);
 298
 299         while ((tmp = page_chain_next(page))) {
 300                 /* all but the last page will be fully used */
 301                 sg_set_page(&sg, page, PAGE_SIZE, 0);
 302                 crypto_hash_update(&desc, &sg, sg.length);
 303                 page = tmp;
 304         }
 305         /* and now the last, possibly only partially used page */
 306         len = peer_req->i.size & (PAGE_SIZE - 1);
 307         sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
 308         crypto_hash_update(&desc, &sg, sg.length);
 309         crypto_hash_final(&desc, digest);
 310 }
 311
 312 void drbd_csum_bio(struct crypto_hash *tfm, struct bio *bio, void *digest)
 313 {
 314         struct hash_desc desc;
 315         struct scatterlist sg;
 316         struct bio_vec bvec;
 317         struct bvec_iter iter;
 318
 319         desc.tfm = tfm;
 320         desc.flags = 0;
 321
 322         sg_init_table(&sg, 1);
 323         crypto_hash_init(&desc);
 324
 325         bio_for_each_segment(bvec, bio, iter) {
 326                 sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
 327                 crypto_hash_update(&desc, &sg, sg.length);
 328         }
 329         crypto_hash_final(&desc, digest);
 330 }
 331
 332 /* MAYBE merge common code with w_e_end_ov_req */
 333 static int w_e_send_csum(struct drbd_work *w, int cancel)
 334 {
 335         struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
 336         struct drbd_device *device = w->device;
 337         int digest_size;
 338         void *digest;
 339         int err = 0;
 340
 341         if (unlikely(cancel))
 342                 goto out;
 343
 344         if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
 345                 goto out;
 346
 347         digest_size = crypto_hash_digestsize(first_peer_device(device)->connection->csums_tfm);
 348         digest = kmalloc(digest_size, GFP_NOIO);
 349         if (digest) {
 350                 sector_t sector = peer_req->i.sector;
 351                 unsigned int size = peer_req->i.size;
 352                 drbd_csum_ee(first_peer_device(device)->connection->csums_tfm, peer_req, digest);
 353                 /* Free peer_req and pages before send.
 354                  * In case we block on congestion, we could otherwise run into
 355                  * some distributed deadlock, if the other side blocks on
 356                  * congestion as well, because our receiver blocks in
 357                  * drbd_alloc_pages due to pp_in_use > max_buffers. */
 358                 drbd_free_peer_req(device, peer_req);
 359                 peer_req = NULL;
 360                 inc_rs_pending(device);
 361                 err = drbd_send_drequest_csum(first_peer_device(device), sector, size,
 362                                               digest, digest_size,
 363                                               P_CSUM_RS_REQUEST);
 364                 kfree(digest);
 365         } else {
 366                 drbd_err(device, "kmalloc() of digest failed.\n");
 367                 err = -ENOMEM;
 368         }
 369
 370 out:
 371         if (peer_req)
 372                 drbd_free_peer_req(device, peer_req);
 373
 374         if (unlikely(err))
 375                 drbd_err(device, "drbd_send_drequest(..., csum) failed\n");
 376         return err;
 377 }
 378
 379 #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
 380
 381 static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size)
 382 {
 383         struct drbd_device *device = peer_device->device;
 384         struct drbd_peer_request *peer_req;
 385
 386         if (!get_ldev(device))
 387                 return -EIO;
 388
 389         if (drbd_rs_should_slow_down(device, sector))
 390                 goto defer;
 391
 392         /* GFP_TRY, because if there is no memory available right now, this may
 393          * be rescheduled for later. It is "only" background resync, after all. */
 394         peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
 395                                        size, GFP_TRY);
 396         if (!peer_req)
 397                 goto defer;
 398
 399         peer_req->w.cb = w_e_send_csum;
 400         spin_lock_irq(&device->resource->req_lock);
 401         list_add(&peer_req->w.list, &device->read_ee);
 402         spin_unlock_irq(&device->resource->req_lock);
 403
 404         atomic_add(size >> 9, &device->rs_sect_ev);
 405         if (drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
 406                 return 0;
 407
 408         /* If it failed because of ENOMEM, retry should help.  If it failed
 409          * because bio_add_page failed (probably broken lower level driver),
 410          * retry may or may not help.
 411          * If it does not, you may need to force disconnect. */
 412         spin_lock_irq(&device->resource->req_lock);
 413         list_del(&peer_req->w.list);
 414         spin_unlock_irq(&device->resource->req_lock);
 415
 416         drbd_free_peer_req(device, peer_req);
 417 defer:
 418         put_ldev(device);
 419         return -EAGAIN;
 420 }
 421
 422 int w_resync_timer(struct drbd_work *w, int cancel)
 423 {
 424         struct drbd_device *device = w->device;
 425         switch (device->state.conn) {
 426         case C_VERIFY_S:
 427                 w_make_ov_request(w, cancel);
 428                 break;
 429         case C_SYNC_TARGET:
 430                 w_make_resync_request(w, cancel);
 431                 break;
 432         }
 433
 434         return 0;
 435 }
 436
 437 void resync_timer_fn(unsigned long data)
 438 {
 439         struct drbd_device *device = (struct drbd_device *) data;
 440
 441         if (list_empty(&device->resync_work.list))
 442                 drbd_queue_work(&first_peer_device(device)->connection->sender_work, &device->resync_work);
 443 }
 444
 445 static void fifo_set(struct fifo_buffer *fb, int value)
 446 {
 447         int i;
 448
 449         for (i = 0; i < fb->size; i++)
 450                 fb->values[i] = value;
 451 }
 452
 453 static int fifo_push(struct fifo_buffer *fb, int value)
 454 {
 455         int ov;
 456
 457         ov = fb->values[fb->head_index];
 458         fb->values[fb->head_index++] = value;
 459
 460         if (fb->head_index >= fb->size)
 461                 fb->head_index = 0;
 462
 463         return ov;
 464 }
 465
 466 static void fifo_add_val(struct fifo_buffer *fb, int value)
 467 {
 468         int i;
 469
 470         for (i = 0; i < fb->size; i++)
 471                 fb->values[i] += value;
 472 }
 473
 474 struct fifo_buffer *fifo_alloc(int fifo_size)
 475 {
 476         struct fifo_buffer *fb;
 477
 478         fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
 479         if (!fb)
 480                 return NULL;
 481
 482         fb->head_index = 0;
 483         fb->size = fifo_size;
 484         fb->total = 0;
 485
 486         return fb;
 487 }
 488
 489 static int drbd_rs_controller(struct drbd_device *device)
 490 {
 491         struct disk_conf *dc;
 492         unsigned int sect_in;  /* Number of sectors that came in since the last turn */
 493         unsigned int want;     /* The number of sectors we want in the proxy */
 494         int req_sect; /* Number of sectors to request in this turn */
 495         int correction; /* Number of sectors more we need in the proxy*/
 496         int cps; /* correction per invocation of drbd_rs_controller() */
 497         int steps; /* Number of time steps to plan ahead */
 498         int curr_corr;
 499         int max_sect;
 500         struct fifo_buffer *plan;
 501
 502         sect_in = atomic_xchg(&device->rs_sect_in, 0); /* Number of sectors that came in */
 503         device->rs_in_flight -= sect_in;
 504
 505         dc = rcu_dereference(device->ldev->disk_conf);
 506         plan = rcu_dereference(device->rs_plan_s);
 507
 508         steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
 509
 510         if (device->rs_in_flight + sect_in == 0) { /* At start of resync */
 511                 want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
 512         } else { /* normal path */
 513                 want = dc->c_fill_target ? dc->c_fill_target :
 514                         sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
 515         }
 516
 517         correction = want - device->rs_in_flight - plan->total;
 518
 519         /* Plan ahead */
 520         cps = correction / steps;
 521         fifo_add_val(plan, cps);
 522         plan->total += cps * steps;
 523
 524         /* What we do in this step */
 525         curr_corr = fifo_push(plan, 0);
 526         plan->total -= curr_corr;
 527
 528         req_sect = sect_in + curr_corr;
 529         if (req_sect < 0)
 530                 req_sect = 0;
 531
 532         max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
 533         if (req_sect > max_sect)
 534                 req_sect = max_sect;
 535
 536         /*
 537         drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
 538                  sect_in, device->rs_in_flight, want, correction,
 539                  steps, cps, device->rs_planed, curr_corr, req_sect);
 540         */
 541
 542         return req_sect;
 543 }
 544
 545 static int drbd_rs_number_requests(struct drbd_device *device)
 546 {
 547         int number;
 548
 549         rcu_read_lock();
 550         if (rcu_dereference(device->rs_plan_s)->size) {
 551                 number = drbd_rs_controller(device) >> (BM_BLOCK_SHIFT - 9);
 552                 device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
 553         } else {
 554                 device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
 555                 number = SLEEP_TIME * device->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
 556         }
 557         rcu_read_unlock();
 558
 559         /* ignore the amount of pending requests, the resync controller should
 560          * throttle down to incoming reply rate soon enough anyways. */
 561         return number;
 562 }
 563
 564 int w_make_resync_request(struct drbd_work *w, int cancel)
 565 {
 566         struct drbd_device *device = w->device;
 567         unsigned long bit;
 568         sector_t sector;
 569         const sector_t capacity = drbd_get_capacity(device->this_bdev);
 570         int max_bio_size;
 571         int number, rollback_i, size;
 572         int align, queued, sndbuf;
 573         int i = 0;
 574
 575         if (unlikely(cancel))
 576                 return 0;
 577
 578         if (device->rs_total == 0) {
 579                 /* empty resync? */
 580                 drbd_resync_finished(device);
 581                 return 0;
 582         }
 583
 584         if (!get_ldev(device)) {
 585                 /* Since we only need to access device->rsync a
 586                    get_ldev_if_state(device,D_FAILED) would be sufficient, but
 587                    to continue resync with a broken disk makes no sense at
 588                    all */
 589                 drbd_err(device, "Disk broke down during resync!\n");
 590                 return 0;
 591         }
 592
 593         max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
 594         number = drbd_rs_number_requests(device);
 595         if (number == 0)
 596                 goto requeue;
 597
 598         for (i = 0; i < number; i++) {
 599                 /* Stop generating RS requests, when half of the send buffer is filled */
 600                 mutex_lock(&first_peer_device(device)->connection->data.mutex);
 601                 if (first_peer_device(device)->connection->data.socket) {
 602                         queued = first_peer_device(device)->connection->data.socket->sk->sk_wmem_queued;
 603                         sndbuf = first_peer_device(device)->connection->data.socket->sk->sk_sndbuf;
 604                 } else {
 605                         queued = 1;
 606                         sndbuf = 0;
 607                 }
 608                 mutex_unlock(&first_peer_device(device)->connection->data.mutex);
 609                 if (queued > sndbuf / 2)
 610                         goto requeue;
 611
 612 next_sector:
 613                 size = BM_BLOCK_SIZE;
 614                 bit  = drbd_bm_find_next(device, device->bm_resync_fo);
 615
 616                 if (bit == DRBD_END_OF_BITMAP) {
 617                         device->bm_resync_fo = drbd_bm_bits(device);
 618                         put_ldev(device);
 619                         return 0;
 620                 }
 621
 622                 sector = BM_BIT_TO_SECT(bit);
 623
 624                 if (drbd_rs_should_slow_down(device, sector) ||
 625                     drbd_try_rs_begin_io(device, sector)) {
 626                         device->bm_resync_fo = bit;
 627                         goto requeue;
 628                 }
 629                 device->bm_resync_fo = bit + 1;
 630
 631                 if (unlikely(drbd_bm_test_bit(device, bit) == 0)) {
 632                         drbd_rs_complete_io(device, sector);
 633                         goto next_sector;
 634                 }
 635
 636 #if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
 637                 /* try to find some adjacent bits.
 638                  * we stop if we have already the maximum req size.
 639                  *
 640                  * Additionally always align bigger requests, in order to
 641                  * be prepared for all stripe sizes of software RAIDs.
 642                  */
 643                 align = 1;
 644                 rollback_i = i;
 645                 for (;;) {
 646                         if (size + BM_BLOCK_SIZE > max_bio_size)
 647                                 break;
 648
 649                         /* Be always aligned */
 650                         if (sector & ((1<<(align+3))-1))
 651                                 break;
 652
 653                         /* do not cross extent boundaries */
 654                         if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
 655                                 break;
 656                         /* now, is it actually dirty, after all?
 657                          * caution, drbd_bm_test_bit is tri-state for some
 658                          * obscure reason; ( b == 0 ) would get the out-of-band
 659                          * only accidentally right because of the "oddly sized"
 660                          * adjustment below */
 661                         if (drbd_bm_test_bit(device, bit+1) != 1)
 662                                 break;
 663                         bit++;
 664                         size += BM_BLOCK_SIZE;
 665                         if ((BM_BLOCK_SIZE << align) <= size)
 666                                 align++;
 667                         i++;
 668                 }
 669                 /* if we merged some,
 670                  * reset the offset to start the next drbd_bm_find_next from */
 671                 if (size > BM_BLOCK_SIZE)
 672                         device->bm_resync_fo = bit + 1;
 673 #endif
 674
 675                 /* adjust very last sectors, in case we are oddly sized */
 676                 if (sector + (size>>9) > capacity)
 677                         size = (capacity-sector)<<9;
 678                 if (first_peer_device(device)->connection->agreed_pro_version >= 89 &&
 679                     first_peer_device(device)->connection->csums_tfm) {
 680                         switch (read_for_csum(first_peer_device(device), sector, size)) {
 681                         case -EIO: /* Disk failure */
 682                                 put_ldev(device);
 683                                 return -EIO;
 684                         case -EAGAIN: /* allocation failed, or ldev busy */
 685                                 drbd_rs_complete_io(device, sector);
 686                                 device->bm_resync_fo = BM_SECT_TO_BIT(sector);
 687                                 i = rollback_i;
 688                                 goto requeue;
 689                         case 0:
 690                                 /* everything ok */
 691                                 break;
 692                         default:
 693                                 BUG();
 694                         }
 695                 } else {
 696                         int err;
 697
 698                         inc_rs_pending(device);
 699                         err = drbd_send_drequest(first_peer_device(device), P_RS_DATA_REQUEST,
 700                                                  sector, size, ID_SYNCER);
 701                         if (err) {
 702                                 drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
 703                                 dec_rs_pending(device);
 704                                 put_ldev(device);
 705                                 return err;
 706                         }
 707                 }
 708         }
 709
 710         if (device->bm_resync_fo >= drbd_bm_bits(device)) {
 711                 /* last syncer _request_ was sent,
 712                  * but the P_RS_DATA_REPLY not yet received.  sync will end (and
 713                  * next sync group will resume), as soon as we receive the last
 714                  * resync data block, and the last bit is cleared.
 715                  * until then resync "work" is "inactive" ...
 716                  */
 717                 put_ldev(device);
 718                 return 0;
 719         }
 720
 721  requeue:
 722         device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
 723         mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
 724         put_ldev(device);
 725         return 0;
 726 }
 727
 728 static int w_make_ov_request(struct drbd_work *w, int cancel)
 729 {
 730         struct drbd_device *device = w->device;
 731         int number, i, size;
 732         sector_t sector;
 733         const sector_t capacity = drbd_get_capacity(device->this_bdev);
 734         bool stop_sector_reached = false;
 735
 736         if (unlikely(cancel))
 737                 return 1;
 738
 739         number = drbd_rs_number_requests(device);
 740
 741         sector = device->ov_position;
 742         for (i = 0; i < number; i++) {
 743                 if (sector >= capacity)
 744                         return 1;
 745
 746                 /* We check for "finished" only in the reply path:
 747                  * w_e_end_ov_reply().
 748                  * We need to send at least one request out. */
 749                 stop_sector_reached = i > 0
 750                         && verify_can_do_stop_sector(device)
 751                         && sector >= device->ov_stop_sector;
 752                 if (stop_sector_reached)
 753                         break;
 754
 755                 size = BM_BLOCK_SIZE;
 756
 757                 if (drbd_rs_should_slow_down(device, sector) ||
 758                     drbd_try_rs_begin_io(device, sector)) {
 759                         device->ov_position = sector;
 760                         goto requeue;
 761                 }
 762
 763                 if (sector + (size>>9) > capacity)
 764                         size = (capacity-sector)<<9;
 765
 766                 inc_rs_pending(device);
 767                 if (drbd_send_ov_request(first_peer_device(device), sector, size)) {
 768                         dec_rs_pending(device);
 769                         return 0;
 770                 }
 771                 sector += BM_SECT_PER_BIT;
 772         }
 773         device->ov_position = sector;
 774
 775  requeue:
 776         device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
 777         if (i == 0 || !stop_sector_reached)
 778                 mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
 779         return 1;
 780 }
 781
/*
 * w_ov_finished() - work callback that finishes an online verify run
 *
 * Frees the work item itself, calls ov_out_of_sync_print() and then
 * drbd_resync_finished() for the device.  @cancel is not looked at.
 * Always returns 0.
 */
int w_ov_finished(struct drbd_work *w, int cancel)
{
        /* fetch the device before the work item is freed */
        struct drbd_device *device = w->device;
        kfree(w);
        ov_out_of_sync_print(device);
        drbd_resync_finished(device);

        return 0;
}
 791
/*
 * w_resync_finished() - work callback that (re)tries drbd_resync_finished()
 *
 * Frees the work item itself and calls drbd_resync_finished() for the
 * device.  @cancel is not looked at.  Always returns 0.
 */
static int w_resync_finished(struct drbd_work *w, int cancel)
{
        /* fetch the device before the work item is freed */
        struct drbd_device *device = w->device;
        kfree(w);

        drbd_resync_finished(device);

        return 0;
}
 801
 802 static void ping_peer(struct drbd_device *device)
 803 {
 804         struct drbd_connection *connection = first_peer_device(device)->connection;
 805
 806         clear_bit(GOT_PING_ACK, &connection->flags);
 807         request_ping(connection);
 808         wait_event(connection->ping_wait,
 809                    test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED);
 810 }
 811
 812 int drbd_resync_finished(struct drbd_device *device)
 813 {
 814         unsigned long db, dt, dbdt;
 815         unsigned long n_oos;
 816         union drbd_state os, ns;
 817         struct drbd_work *w;
 818         char *khelper_cmd = NULL;
 819         int verify_done = 0;
 820
 821         /* Remove all elements from the resync LRU. Since future actions
 822          * might set bits in the (main) bitmap, then the entries in the
 823          * resync LRU would be wrong. */
 824         if (drbd_rs_del_all(device)) {
 825                 /* In case this is not possible now, most probably because
 826                  * there are P_RS_DATA_REPLY Packets lingering on the worker's
 827                  * queue (or even the read operations for those packets
 828                  * is not finished by now).   Retry in 100ms. */
 829
 830                 schedule_timeout_interruptible(HZ / 10);
 831                 w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
 832                 if (w) {
 833                         w->cb = w_resync_finished;
 834                         w->device = device;
 835                         drbd_queue_work(&first_peer_device(device)->connection->sender_work, w);
 836                         return 1;
 837                 }
 838                 drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
 839         }
 840
 841         dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
 842         if (dt <= 0)
 843                 dt = 1;
 844
 845         db = device->rs_total;
 846         /* adjust for verify start and stop sectors, respective reached position */
 847         if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
 848                 db -= device->ov_left;
 849
 850         dbdt = Bit2KB(db/dt);
 851         device->rs_paused /= HZ;
 852
 853         if (!get_ldev(device))
 854                 goto out;
 855
 856         ping_peer(device);
 857
 858         spin_lock_irq(&device->resource->req_lock);
 859         os = drbd_read_state(device);
 860
 861         verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
 862
 863         /* This protects us against multiple calls (that can happen in the presence
 864            of application IO), and against connectivity loss just before we arrive here. */
 865         if (os.conn <= C_CONNECTED)
 866                 goto out_unlock;
 867
 868         ns = os;
 869         ns.conn = C_CONNECTED;
 870
 871         drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
 872              verify_done ? "Online verify" : "Resync",
 873              dt + device->rs_paused, device->rs_paused, dbdt);
 874
 875         n_oos = drbd_bm_total_weight(device);
 876
 877         if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
 878                 if (n_oos) {
 879                         drbd_alert(device, "Online verify found %lu %dk block out of sync!\n",
 880                               n_oos, Bit2KB(1));
 881                         khelper_cmd = "out-of-sync";
 882                 }
 883         } else {
 884                 D_ASSERT(device, (n_oos - device->rs_failed) == 0);
 885
 886                 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
 887                         khelper_cmd = "after-resync-target";
 888
 889                 if (first_peer_device(device)->connection->csums_tfm && device->rs_total) {
 890                         const unsigned long s = device->rs_same_csum;
 891                         const unsigned long t = device->rs_total;
 892                         const int ratio =
 893                                 (t == 0)     ? 0 :
 894                         (t < 100000) ? ((s*100)/t) : (s/(t/100));
 895                         drbd_info(device, "%u %% had equal checksums, eliminated: %luK; "
 896                              "transferred %luK total %luK\n",
 897                              ratio,
 898                              Bit2KB(device->rs_same_csum),
 899                              Bit2KB(device->rs_total - device->rs_same_csum),
 900                              Bit2KB(device->rs_total));
 901                 }
 902         }
 903
 904         if (device->rs_failed) {
 905                 drbd_info(device, "            %lu failed blocks\n", device->rs_failed);
 906
 907                 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
 908                         ns.disk = D_INCONSISTENT;
 909                         ns.pdsk = D_UP_TO_DATE;
 910                 } else {
 911                         ns.disk = D_UP_TO_DATE;
 912                         ns.pdsk = D_INCONSISTENT;
 913                 }
 914         } else {
 915                 ns.disk = D_UP_TO_DATE;
 916                 ns.pdsk = D_UP_TO_DATE;
 917
 918                 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
 919                         if (device->p_uuid) {
 920                                 int i;
 921                                 for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
 922                                         _drbd_uuid_set(device, i, device->p_uuid[i]);
 923                                 drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]);
 924                                 _drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]);
 925                         } else {
 926                                 drbd_err(device, "device->p_uuid is NULL! BUG\n");
 927                         }
 928                 }
 929
 930                 if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
 931                         /* for verify runs, we don't update uuids here,
 932                          * so there would be nothing to report. */
 933                         drbd_uuid_set_bm(device, 0UL);
 934                         drbd_print_uuids(device, "updated UUIDs");
 935                         if (device->p_uuid) {
 936                                 /* Now the two UUID sets are equal, update what we
 937                                  * know of the peer. */
 938                                 int i;
 939                                 for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
 940                                         device->p_uuid[i] = device->ldev->md.uuid[i];
 941                         }
 942                 }
 943         }
 944
 945         _drbd_set_state(device, ns, CS_VERBOSE, NULL);
 946 out_unlock:
 947         spin_unlock_irq(&device->resource->req_lock);
 948         put_ldev(device);
 949 out:
 950         device->rs_total  = 0;
 951         device->rs_failed = 0;
 952         device->rs_paused = 0;
 953
 954         /* reset start sector, if we reached end of device */
 955         if (verify_done && device->ov_left == 0)
 956                 device->ov_start_sector = 0;
 957
 958         drbd_md_sync(device);
 959
 960         if (khelper_cmd)
 961                 drbd_khelper(device, khelper_cmd);
 962
 963         return 1;
 964 }
 965
 966 /* helper */
 967 static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
 968 {
 969         if (drbd_peer_req_has_active_page(peer_req)) {
 970                 /* This might happen if sendpage() has not finished */
 971                 int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
 972                 atomic_add(i, &device->pp_in_use_by_net);
 973                 atomic_sub(i, &device->pp_in_use);
 974                 spin_lock_irq(&device->resource->req_lock);
 975                 list_add_tail(&peer_req->w.list, &device->net_ee);
 976                 spin_unlock_irq(&device->resource->req_lock);
 977                 wake_up(&drbd_pp_wait);
 978         } else
 979                 drbd_free_peer_req(device, peer_req);
 980 }
 981
 982 /**
 983  * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
 984  * @device:     DRBD device.
 985  * @w:          work object.
 986  * @cancel:     The connection will be closed anyways
 987  */
 988 int w_e_end_data_req(struct drbd_work *w, int cancel)
 989 {
 990         struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
 991         struct drbd_device *device = w->device;
 992         int err;
 993
 994         if (unlikely(cancel)) {
 995                 drbd_free_peer_req(device, peer_req);
 996                 dec_unacked(device);
 997                 return 0;
 998         }
 999
1000         if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1001                 err = drbd_send_block(first_peer_device(device), P_DATA_REPLY, peer_req);
1002         } else {
1003                 if (__ratelimit(&drbd_ratelimit_state))
1004                         drbd_err(device, "Sending NegDReply. sector=%llus.\n",
1005                             (unsigned long long)peer_req->i.sector);
1006
1007                 err = drbd_send_ack(first_peer_device(device), P_NEG_DREPLY, peer_req);
1008         }
1009
1010         dec_unacked(device);
1011
1012         move_to_net_ee_or_free(device, peer_req);
1013
1014         if (unlikely(err))
1015                 drbd_err(device, "drbd_send_block() failed\n");
1016         return err;
1017 }
1018
1019 /**
1020  * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
1021  * @device:     DRBD device.
1022  * @w:          work object.
1023  * @cancel:     The connection will be closed anyways
1024  */
1025 int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
1026 {
1027         struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1028         struct drbd_device *device = w->device;
1029         int err;
1030
1031         if (unlikely(cancel)) {
1032                 drbd_free_peer_req(device, peer_req);
1033                 dec_unacked(device);
1034                 return 0;
1035         }
1036
1037         if (get_ldev_if_state(device, D_FAILED)) {
1038                 drbd_rs_complete_io(device, peer_req->i.sector);
1039                 put_ldev(device);
1040         }
1041
1042         if (device->state.conn == C_AHEAD) {
1043                 err = drbd_send_ack(first_peer_device(device), P_RS_CANCEL, peer_req);
1044         } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1045                 if (likely(device->state.pdsk >= D_INCONSISTENT)) {
1046                         inc_rs_pending(device);
1047                         err = drbd_send_block(first_peer_device(device), P_RS_DATA_REPLY, peer_req);
1048                 } else {
1049                         if (__ratelimit(&drbd_ratelimit_state))
1050                                 drbd_err(device, "Not sending RSDataReply, "
1051                                     "partner DISKLESS!\n");
1052                         err = 0;
1053                 }
1054         } else {
1055                 if (__ratelimit(&drbd_ratelimit_state))
1056                         drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
1057                             (unsigned long long)peer_req->i.sector);
1058
1059                 err = drbd_send_ack(first_peer_device(device), P_NEG_RS_DREPLY, peer_req);
1060
1061                 /* update resync data with failure */
1062                 drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
1063         }
1064
1065         dec_unacked(device);
1066
1067         move_to_net_ee_or_free(device, peer_req);
1068
1069         if (unlikely(err))
1070                 drbd_err(device, "drbd_send_block() failed\n");
1071         return err;
1072 }
1073
1074 int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
1075 {
1076         struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1077         struct drbd_device *device = w->device;
1078         struct digest_info *di;
1079         int digest_size;
1080         void *digest = NULL;
1081         int err, eq = 0;
1082
1083         if (unlikely(cancel)) {
1084                 drbd_free_peer_req(device, peer_req);
1085                 dec_unacked(device);
1086                 return 0;
1087         }
1088
1089         if (get_ldev(device)) {
1090                 drbd_rs_complete_io(device, peer_req->i.sector);
1091                 put_ldev(device);
1092         }
1093
1094         di = peer_req->digest;
1095
1096         if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1097                 /* quick hack to try to avoid a race against reconfiguration.
1098                  * a real fix would be much more involved,
1099                  * introducing more locking mechanisms */
1100                 if (first_peer_device(device)->connection->csums_tfm) {
1101                         digest_size = crypto_hash_digestsize(first_peer_device(device)->connection->csums_tfm);
1102                         D_ASSERT(device, digest_size == di->digest_size);
1103                         digest = kmalloc(digest_size, GFP_NOIO);
1104                 }
1105                 if (digest) {
1106                         drbd_csum_ee(first_peer_device(device)->connection->csums_tfm, peer_req, digest);
1107                         eq = !memcmp(digest, di->digest, digest_size);
1108                         kfree(digest);
1109                 }
1110
1111                 if (eq) {
1112                         drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
1113                         /* rs_same_csums unit is BM_BLOCK_SIZE */
1114                         device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
1115                         err = drbd_send_ack(first_peer_device(device), P_RS_IS_IN_SYNC, peer_req);
1116                 } else {
1117                         inc_rs_pending(device);
1118                         peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
1119                         peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
1120                         kfree(di);
1121                         err = drbd_send_block(first_peer_device(device), P_RS_DATA_REPLY, peer_req);
1122                 }
1123         } else {
1124                 err = drbd_send_ack(first_peer_device(device), P_NEG_RS_DREPLY, peer_req);
1125                 if (__ratelimit(&drbd_ratelimit_state))
1126                         drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
1127         }
1128
1129         dec_unacked(device);
1130         move_to_net_ee_or_free(device, peer_req);
1131
1132         if (unlikely(err))
1133                 drbd_err(device, "drbd_send_block/ack() failed\n");
1134         return err;
1135 }
1136
1137 int w_e_end_ov_req(struct drbd_work *w, int cancel)
1138 {
1139         struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1140         struct drbd_device *device = w->device;
1141         sector_t sector = peer_req->i.sector;
1142         unsigned int size = peer_req->i.size;
1143         int digest_size;
1144         void *digest;
1145         int err = 0;
1146
1147         if (unlikely(cancel))
1148                 goto out;
1149
1150         digest_size = crypto_hash_digestsize(first_peer_device(device)->connection->verify_tfm);
1151         digest = kmalloc(digest_size, GFP_NOIO);
1152         if (!digest) {
1153                 err = 1;        /* terminate the connection in case the allocation failed */
1154                 goto out;
1155         }
1156
1157         if (likely(!(peer_req->flags & EE_WAS_ERROR)))
1158                 drbd_csum_ee(first_peer_device(device)->connection->verify_tfm, peer_req, digest);
1159         else
1160                 memset(digest, 0, digest_size);
1161
1162         /* Free e and pages before send.
1163          * In case we block on congestion, we could otherwise run into
1164          * some distributed deadlock, if the other side blocks on
1165          * congestion as well, because our receiver blocks in
1166          * drbd_alloc_pages due to pp_in_use > max_buffers. */
1167         drbd_free_peer_req(device, peer_req);
1168         peer_req = NULL;
1169         inc_rs_pending(device);
1170         err = drbd_send_drequest_csum(first_peer_device(device), sector, size, digest, digest_size, P_OV_REPLY);
1171         if (err)
1172                 dec_rs_pending(device);
1173         kfree(digest);
1174
1175 out:
1176         if (peer_req)
1177                 drbd_free_peer_req(device, peer_req);
1178         dec_unacked(device);
1179         return err;
1180 }
1181
1182 void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size)
1183 {
1184         if (device->ov_last_oos_start + device->ov_last_oos_size == sector) {
1185                 device->ov_last_oos_size += size>>9;
1186         } else {
1187                 device->ov_last_oos_start = sector;
1188                 device->ov_last_oos_size = size>>9;
1189         }
1190         drbd_set_out_of_sync(device, sector, size);
1191 }
1192
1193 int w_e_end_ov_reply(struct drbd_work *w, int cancel)
1194 {
1195         struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1196         struct drbd_device *device = w->device;
1197         struct digest_info *di;
1198         void *digest;
1199         sector_t sector = peer_req->i.sector;
1200         unsigned int size = peer_req->i.size;
1201         int digest_size;
1202         int err, eq = 0;
1203         bool stop_sector_reached = false;
1204
1205         if (unlikely(cancel)) {
1206                 drbd_free_peer_req(device, peer_req);
1207                 dec_unacked(device);
1208                 return 0;
1209         }
1210
1211         /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
1212          * the resync lru has been cleaned up already */
1213         if (get_ldev(device)) {
1214                 drbd_rs_complete_io(device, peer_req->i.sector);
1215                 put_ldev(device);
1216         }
1217
1218         di = peer_req->digest;
1219
1220         if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1221                 digest_size = crypto_hash_digestsize(first_peer_device(device)->connection->verify_tfm);
1222                 digest = kmalloc(digest_size, GFP_NOIO);
1223                 if (digest) {
1224                         drbd_csum_ee(first_peer_device(device)->connection->verify_tfm, peer_req, digest);
1225
1226                         D_ASSERT(device, digest_size == di->digest_size);
1227                         eq = !memcmp(digest, di->digest, digest_size);
1228                         kfree(digest);
1229                 }
1230         }
1231
1232         /* Free peer_req and pages before send.
1233          * In case we block on congestion, we could otherwise run into
1234          * some distributed deadlock, if the other side blocks on
1235          * congestion as well, because our receiver blocks in
1236          * drbd_alloc_pages due to pp_in_use > max_buffers. */
1237         drbd_free_peer_req(device, peer_req);
1238         if (!eq)
1239                 drbd_ov_out_of_sync_found(device, sector, size);
1240         else
1241                 ov_out_of_sync_print(device);
1242
1243         err = drbd_send_ack_ex(first_peer_device(device), P_OV_RESULT, sector, size,
1244                                eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
1245
1246         dec_unacked(device);
1247
1248         --device->ov_left;
1249
1250         /* let's advance progress step marks only for every other megabyte */
1251         if ((device->ov_left & 0x200) == 0x200)
1252                 drbd_advance_rs_marks(device, device->ov_left);
1253
1254         stop_sector_reached = verify_can_do_stop_sector(device) &&
1255                 (sector + (size>>9)) >= device->ov_stop_sector;
1256
1257         if (device->ov_left == 0 || stop_sector_reached) {
1258                 ov_out_of_sync_print(device);
1259                 drbd_resync_finished(device);
1260         }
1261
1262         return err;
1263 }
1264
1265 /* FIXME
1266  * We need to track the number of pending barrier acks,
1267  * and to be able to wait for them.
1268  * See also comment in drbd_adm_attach before drbd_suspend_io.
1269  */
1270 static int drbd_send_barrier(struct drbd_connection *connection)
1271 {
1272         struct p_barrier *p;
1273         struct drbd_socket *sock;
1274
1275         sock = &connection->data;
1276         p = conn_prepare_command(connection, sock);
1277         if (!p)
1278                 return -EIO;
1279         p->barrier = connection->send.current_epoch_nr;
1280         p->pad = 0;
1281         connection->send.current_epoch_writes = 0;
1282
1283         return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
1284 }
1285
1286 int w_send_write_hint(struct drbd_work *w, int cancel)
1287 {
1288         struct drbd_device *device = w->device;
1289         struct drbd_socket *sock;
1290
1291         if (cancel)
1292                 return 0;
1293         sock = &first_peer_device(device)->connection->data;
1294         if (!drbd_prepare_command(first_peer_device(device), sock))
1295                 return -EIO;
1296         return drbd_send_command(first_peer_device(device), sock, P_UNPLUG_REMOTE, 0, NULL, 0);
1297 }
1298
1299 static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch)
1300 {
1301         if (!connection->send.seen_any_write_yet) {
1302                 connection->send.seen_any_write_yet = true;
1303                 connection->send.current_epoch_nr = epoch;
1304                 connection->send.current_epoch_writes = 0;
1305         }
1306 }
1307
1308 static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch)
1309 {
1310         /* re-init if first write on this connection */
1311         if (!connection->send.seen_any_write_yet)
1312                 return;
1313         if (connection->send.current_epoch_nr != epoch) {
1314                 if (connection->send.current_epoch_writes)
1315                         drbd_send_barrier(connection);
1316                 connection->send.current_epoch_nr = epoch;
1317         }
1318 }
1319
1320 int w_send_out_of_sync(struct drbd_work *w, int cancel)
1321 {
1322         struct drbd_request *req = container_of(w, struct drbd_request, w);
1323         struct drbd_device *device = w->device;
1324         struct drbd_connection *connection = first_peer_device(device)->connection;
1325         int err;
1326
1327         if (unlikely(cancel)) {
1328                 req_mod(req, SEND_CANCELED);
1329                 return 0;
1330         }
1331
1332         /* this time, no connection->send.current_epoch_writes++;
1333          * If it was sent, it was the closing barrier for the last
1334          * replicated epoch, before we went into AHEAD mode.
1335          * No more barriers will be sent, until we leave AHEAD mode again. */
1336         maybe_send_barrier(connection, req->epoch);
1337
1338         err = drbd_send_out_of_sync(first_peer_device(device), req);
1339         req_mod(req, OOS_HANDED_TO_NETWORK);
1340
1341         return err;
1342 }
1343
1344 /**
1345  * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
1346  * @device:     DRBD device.
1347  * @w:          work object.
1348  * @cancel:     The connection will be closed anyways
1349  */
1350 int w_send_dblock(struct drbd_work *w, int cancel)
1351 {
1352         struct drbd_request *req = container_of(w, struct drbd_request, w);
1353         struct drbd_device *device = w->device;
1354         struct drbd_connection *connection = first_peer_device(device)->connection;
1355         int err;
1356
1357         if (unlikely(cancel)) {
1358                 req_mod(req, SEND_CANCELED);
1359                 return 0;
1360         }
1361
1362         re_init_if_first_write(connection, req->epoch);
1363         maybe_send_barrier(connection, req->epoch);
1364         connection->send.current_epoch_writes++;
1365
1366         err = drbd_send_dblock(first_peer_device(device), req);
1367         req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
1368
1369         return err;
1370 }
1371
1372 /**
1373  * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
1374  * @device:     DRBD device.
1375  * @w:          work object.
1376  * @cancel:     The connection will be closed anyways
1377  */
1378 int w_send_read_req(struct drbd_work *w, int cancel)
1379 {
1380         struct drbd_request *req = container_of(w, struct drbd_request, w);
1381         struct drbd_device *device = w->device;
1382         struct drbd_connection *connection = first_peer_device(device)->connection;
1383         int err;
1384
1385         if (unlikely(cancel)) {
1386                 req_mod(req, SEND_CANCELED);
1387                 return 0;
1388         }
1389
1390         /* Even read requests may close a write epoch,
1391          * if there was any yet. */
1392         maybe_send_barrier(connection, req->epoch);
1393
1394         err = drbd_send_drequest(first_peer_device(device), P_DATA_REQUEST, req->i.sector, req->i.size,
1395                                  (unsigned long)req);
1396
1397         req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
1398
1399         return err;
1400 }
1401
1402 int w_restart_disk_io(struct drbd_work *w, int cancel)
1403 {
1404         struct drbd_request *req = container_of(w, struct drbd_request, w);
1405         struct drbd_device *device = w->device;
1406
1407         if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
1408                 drbd_al_begin_io(device, &req->i, false);
1409
1410         drbd_req_make_private_bio(req, req->master_bio);
1411         req->private_bio->bi_bdev = device->ldev->backing_bdev;
1412         generic_make_request(req->private_bio);
1413
1414         return 0;
1415 }
1416
1417 static int _drbd_may_sync_now(struct drbd_device *device)
1418 {
1419         struct drbd_device *odev = device;
1420         int resync_after;
1421
1422         while (1) {
1423                 if (!odev->ldev || odev->state.disk == D_DISKLESS)
1424                         return 1;
1425                 rcu_read_lock();
1426                 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
1427                 rcu_read_unlock();
1428                 if (resync_after == -1)
1429                         return 1;
1430                 odev = minor_to_device(resync_after);
1431                 if (!odev)
1432                         return 1;
1433                 if ((odev->state.conn >= C_SYNC_SOURCE &&
1434                      odev->state.conn <= C_PAUSED_SYNC_T) ||
1435                     odev->state.aftr_isp || odev->state.peer_isp ||
1436                     odev->state.user_isp)
1437                         return 0;
1438         }
1439 }
1440
1441 /**
1442  * _drbd_pause_after() - Pause resync on all devices that may not resync now
1443  * @device:     DRBD device.
1444  *
1445  * Called from process context only (admin command and after_state_ch).
1446  */
1447 static int _drbd_pause_after(struct drbd_device *device)
1448 {
1449         struct drbd_device *odev;
1450         int i, rv = 0;
1451
1452         rcu_read_lock();
1453         idr_for_each_entry(&drbd_devices, odev, i) {
1454                 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1455                         continue;
1456                 if (!_drbd_may_sync_now(odev))
1457                         rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
1458                                != SS_NOTHING_TO_DO);
1459         }
1460         rcu_read_unlock();
1461
1462         return rv;
1463 }
1464
1465 /**
1466  * _drbd_resume_next() - Resume resync on all devices that may resync now
1467  * @device:     DRBD device.
1468  *
1469  * Called from process context only (admin command and worker).
1470  */
1471 static int _drbd_resume_next(struct drbd_device *device)
1472 {
1473         struct drbd_device *odev;
1474         int i, rv = 0;
1475
1476         rcu_read_lock();
1477         idr_for_each_entry(&drbd_devices, odev, i) {
1478                 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1479                         continue;
1480                 if (odev->state.aftr_isp) {
1481                         if (_drbd_may_sync_now(odev))
1482                                 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
1483                                                         CS_HARD, NULL)
1484                                        != SS_NOTHING_TO_DO) ;
1485                 }
1486         }
1487         rcu_read_unlock();
1488         return rv;
1489 }
1490
/*
 * resume_next_sg() - run _drbd_resume_next() with global_state_lock held
 * for writing (interrupts disabled).
 */
void resume_next_sg(struct drbd_device *device)
{
        write_lock_irq(&global_state_lock);
        _drbd_resume_next(device);
        write_unlock_irq(&global_state_lock);
}
1497
/*
 * suspend_other_sg() - run _drbd_pause_after() with global_state_lock held
 * for writing (interrupts disabled).
 */
void suspend_other_sg(struct drbd_device *device)
{
        write_lock_irq(&global_state_lock);
        _drbd_pause_after(device);
        write_unlock_irq(&global_state_lock);
}
1504
1505 /* caller must hold global_state_lock */
1506 enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
1507 {
1508         struct drbd_device *odev;
1509         int resync_after;
1510
1511         if (o_minor == -1)
1512                 return NO_ERROR;
1513         if (o_minor < -1 || o_minor > MINORMASK)
1514                 return ERR_RESYNC_AFTER;
1515
1516         /* check for loops */
1517         odev = minor_to_device(o_minor);
1518         while (1) {
1519                 if (odev == device)
1520                         return ERR_RESYNC_AFTER_CYCLE;
1521
1522                 /* You are free to depend on diskless, non-existing,
1523                  * or not yet/no longer existing minors.
1524                  * We only reject dependency loops.
1525                  * We cannot follow the dependency chain beyond a detached or
1526                  * missing minor.
1527                  */
1528                 if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
1529                         return NO_ERROR;
1530
1531                 rcu_read_lock();
1532                 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
1533                 rcu_read_unlock();
1534                 /* dependency chain ends here, no cycles. */
1535                 if (resync_after == -1)
1536                         return NO_ERROR;
1537
1538                 /* follow the dependency chain */
1539                 odev = minor_to_device(resync_after);
1540         }
1541 }
1542
/*
 * Re-evaluate pause/resume decisions after a resync-after setting changed.
 *
 * Caller must hold global_state_lock.
 *
 * Runs _drbd_pause_after() followed by _drbd_resume_next() repeatedly until
 * a full round reports no change from either of them.
 */
void drbd_resync_after_changed(struct drbd_device *device)
{
	for (;;) {
		int paused = _drbd_pause_after(device);
		int resumed = _drbd_resume_next(device);

		if (!paused && !resumed)
			break;
	}
}
1553
1554 void drbd_rs_controller_reset(struct drbd_device *device)
1555 {
1556         struct fifo_buffer *plan;
1557
1558         atomic_set(&device->rs_sect_in, 0);
1559         atomic_set(&device->rs_sect_ev, 0);
1560         device->rs_in_flight = 0;
1561
1562         /* Updating the RCU protected object in place is necessary since
1563            this function gets called from atomic context.
1564            It is valid since all other updates also lead to an completely
1565            empty fifo */
1566         rcu_read_lock();
1567         plan = rcu_dereference(device->rs_plan_s);
1568         plan->total = 0;
1569         fifo_set(plan, 0);
1570         rcu_read_unlock();
1571 }
1572
1573 void start_resync_timer_fn(unsigned long data)
1574 {
1575         struct drbd_device *device = (struct drbd_device *) data;
1576
1577         drbd_queue_work(&first_peer_device(device)->connection->sender_work, &device->start_resync_work);
1578 }
1579
1580 int w_start_resync(struct drbd_work *w, int cancel)
1581 {
1582         struct drbd_device *device = w->device;
1583
1584         if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
1585                 drbd_warn(device, "w_start_resync later...\n");
1586                 device->start_resync_timer.expires = jiffies + HZ/10;
1587                 add_timer(&device->start_resync_timer);
1588                 return 0;
1589         }
1590
1591         drbd_start_resync(device, C_SYNC_SOURCE);
1592         clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
1593         return 0;
1594 }
1595
1596 /**
1597  * drbd_start_resync() - Start the resync process
1598  * @device:     DRBD device.
1599  * @side:       Either C_SYNC_SOURCE or C_SYNC_TARGET
1600  *
1601  * This function might bring you directly into one of the
1602  * C_PAUSED_SYNC_* states.
1603  */
1604 void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1605 {
1606         union drbd_state ns;
1607         int r;
1608
1609         if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) {
1610                 drbd_err(device, "Resync already running!\n");
1611                 return;
1612         }
1613
1614         if (!test_bit(B_RS_H_DONE, &device->flags)) {
1615                 if (side == C_SYNC_TARGET) {
1616                         /* Since application IO was locked out during C_WF_BITMAP_T and
1617                            C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
1618                            we check that we might make the data inconsistent. */
1619                         r = drbd_khelper(device, "before-resync-target");
1620                         r = (r >> 8) & 0xff;
1621                         if (r > 0) {
1622                                 drbd_info(device, "before-resync-target handler returned %d, "
1623                                          "dropping connection.\n", r);
1624                                 conn_request_state(first_peer_device(device)->connection, NS(conn, C_DISCONNECTING), CS_HARD);
1625                                 return;
1626                         }
1627                 } else /* C_SYNC_SOURCE */ {
1628                         r = drbd_khelper(device, "before-resync-source");
1629                         r = (r >> 8) & 0xff;
1630                         if (r > 0) {
1631                                 if (r == 3) {
1632                                         drbd_info(device, "before-resync-source handler returned %d, "
1633                                                  "ignoring. Old userland tools?", r);
1634                                 } else {
1635                                         drbd_info(device, "before-resync-source handler returned %d, "
1636                                                  "dropping connection.\n", r);
1637                                         conn_request_state(first_peer_device(device)->connection,
1638                                                            NS(conn, C_DISCONNECTING), CS_HARD);
1639                                         return;
1640                                 }
1641                         }
1642                 }
1643         }
1644
1645         if (current == first_peer_device(device)->connection->worker.task) {
1646                 /* The worker should not sleep waiting for state_mutex,
1647                    that can take long */
1648                 if (!mutex_trylock(device->state_mutex)) {
1649                         set_bit(B_RS_H_DONE, &device->flags);
1650                         device->start_resync_timer.expires = jiffies + HZ/5;
1651                         add_timer(&device->start_resync_timer);
1652                         return;
1653                 }
1654         } else {
1655                 mutex_lock(device->state_mutex);
1656         }
1657         clear_bit(B_RS_H_DONE, &device->flags);
1658
1659         write_lock_irq(&global_state_lock);
1660         /* Did some connection breakage or IO error race with us? */
1661         if (device->state.conn < C_CONNECTED
1662         || !get_ldev_if_state(device, D_NEGOTIATING)) {
1663                 write_unlock_irq(&global_state_lock);
1664                 mutex_unlock(device->state_mutex);
1665                 return;
1666         }
1667
1668         ns = drbd_read_state(device);
1669
1670         ns.aftr_isp = !_drbd_may_sync_now(device);
1671
1672         ns.conn = side;
1673
1674         if (side == C_SYNC_TARGET)
1675                 ns.disk = D_INCONSISTENT;
1676         else /* side == C_SYNC_SOURCE */
1677                 ns.pdsk = D_INCONSISTENT;
1678
1679         r = __drbd_set_state(device, ns, CS_VERBOSE, NULL);
1680         ns = drbd_read_state(device);
1681
1682         if (ns.conn < C_CONNECTED)
1683                 r = SS_UNKNOWN_ERROR;
1684
1685         if (r == SS_SUCCESS) {
1686                 unsigned long tw = drbd_bm_total_weight(device);
1687                 unsigned long now = jiffies;
1688                 int i;
1689
1690                 device->rs_failed    = 0;
1691                 device->rs_paused    = 0;
1692                 device->rs_same_csum = 0;
1693                 device->rs_last_events = 0;
1694                 device->rs_last_sect_ev = 0;
1695                 device->rs_total     = tw;
1696                 device->rs_start     = now;
1697                 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1698                         device->rs_mark_left[i] = tw;
1699                         device->rs_mark_time[i] = now;
1700                 }
1701                 _drbd_pause_after(device);
1702         }
1703         write_unlock_irq(&global_state_lock);
1704
1705         if (r == SS_SUCCESS) {
1706                 /* reset rs_last_bcast when a resync or verify is started,
1707                  * to deal with potential jiffies wrap. */
1708                 device->rs_last_bcast = jiffies - HZ;
1709
1710                 drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
1711                      drbd_conn_str(ns.conn),
1712                      (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
1713                      (unsigned long) device->rs_total);
1714                 if (side == C_SYNC_TARGET)
1715                         device->bm_resync_fo = 0;
1716
1717                 /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
1718                  * with w_send_oos, or the sync target will get confused as to
1719                  * how much bits to resync.  We cannot do that always, because for an
1720                  * empty resync and protocol < 95, we need to do it here, as we call
1721                  * drbd_resync_finished from here in that case.
1722                  * We drbd_gen_and_send_sync_uuid here for protocol < 96,
1723                  * and from after_state_ch otherwise. */
1724                 if (side == C_SYNC_SOURCE &&
1725                     first_peer_device(device)->connection->agreed_pro_version < 96)
1726                         drbd_gen_and_send_sync_uuid(first_peer_device(device));
1727
1728                 if (first_peer_device(device)->connection->agreed_pro_version < 95 &&
1729                     device->rs_total == 0) {
1730                         /* This still has a race (about when exactly the peers
1731                          * detect connection loss) that can lead to a full sync
1732                          * on next handshake. In 8.3.9 we fixed this with explicit
1733                          * resync-finished notifications, but the fix
1734                          * introduces a protocol change.  Sleeping for some
1735                          * time longer than the ping interval + timeout on the
1736                          * SyncSource, to give the SyncTarget the chance to
1737                          * detect connection loss, then waiting for a ping
1738                          * response (implicit in drbd_resync_finished) reduces
1739                          * the race considerably, but does not solve it. */
1740                         if (side == C_SYNC_SOURCE) {
1741                                 struct net_conf *nc;
1742                                 int timeo;
1743
1744                                 rcu_read_lock();
1745                                 nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
1746                                 timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
1747                                 rcu_read_unlock();
1748                                 schedule_timeout_interruptible(timeo);
1749                         }
1750                         drbd_resync_finished(device);
1751                 }
1752
1753                 drbd_rs_controller_reset(device);
1754                 /* ns.conn may already be != device->state.conn,
1755                  * we may have been paused in between, or become paused until
1756                  * the timer triggers.
1757                  * No matter, that is handled in resync_timer_fn() */
1758                 if (ns.conn == C_SYNC_TARGET)
1759                         mod_timer(&device->resync_timer, jiffies);
1760
1761                 drbd_md_sync(device);
1762         }
1763         put_ldev(device);
1764         mutex_unlock(device->state_mutex);
1765 }
1766
1767 /* If the resource already closed the current epoch, but we did not
1768  * (because we have not yet seen new requests), we should send the
1769  * corresponding barrier now.  Must be checked within the same spinlock
1770  * that is used to check for new requests. */
1771 static bool need_to_send_barrier(struct drbd_connection *connection)
1772 {
1773         if (!connection->send.seen_any_write_yet)
1774                 return false;
1775
1776         /* Skip barriers that do not contain any writes.
1777          * This may happen during AHEAD mode. */
1778         if (!connection->send.current_epoch_writes)
1779                 return false;
1780
1781         /* ->req_lock is held when requests are queued on
1782          * connection->sender_work, and put into ->transfer_log.
1783          * It is also held when ->current_tle_nr is increased.
1784          * So either there are already new requests queued,
1785          * and corresponding barriers will be send there.
1786          * Or nothing new is queued yet, so the difference will be 1.
1787          */
1788         if (atomic_read(&connection->current_tle_nr) !=
1789             connection->send.current_epoch_nr + 1)
1790                 return false;
1791
1792         return true;
1793 }
1794
1795 static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
1796 {
1797         spin_lock_irq(&queue->q_lock);
1798         list_splice_init(&queue->q, work_list);
1799         spin_unlock_irq(&queue->q_lock);
1800         return !list_empty(work_list);
1801 }
1802
1803 static bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list)
1804 {
1805         spin_lock_irq(&queue->q_lock);
1806         if (!list_empty(&queue->q))
1807                 list_move(queue->q.next, work_list);
1808         spin_unlock_irq(&queue->q_lock);
1809         return !list_empty(work_list);
1810 }
1811
1812 static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list)
1813 {
1814         DEFINE_WAIT(wait);
1815         struct net_conf *nc;
1816         int uncork, cork;
1817
1818         dequeue_work_item(&connection->sender_work, work_list);
1819         if (!list_empty(work_list))
1820                 return;
1821
1822         /* Still nothing to do?
1823          * Maybe we still need to close the current epoch,
1824          * even if no new requests are queued yet.
1825          *
1826          * Also, poke TCP, just in case.
1827          * Then wait for new work (or signal). */
1828         rcu_read_lock();
1829         nc = rcu_dereference(connection->net_conf);
1830         uncork = nc ? nc->tcp_cork : 0;
1831         rcu_read_unlock();
1832         if (uncork) {
1833                 mutex_lock(&connection->data.mutex);
1834                 if (connection->data.socket)
1835                         drbd_tcp_uncork(connection->data.socket);
1836                 mutex_unlock(&connection->data.mutex);
1837         }
1838
1839         for (;;) {
1840                 int send_barrier;
1841                 prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
1842                 spin_lock_irq(&connection->resource->req_lock);
1843                 spin_lock(&connection->sender_work.q_lock);     /* FIXME get rid of this one? */
1844                 /* dequeue single item only,
1845                  * we still use drbd_queue_work_front() in some places */
1846                 if (!list_empty(&connection->sender_work.q))
1847                         list_move(connection->sender_work.q.next, work_list);
1848                 spin_unlock(&connection->sender_work.q_lock);   /* FIXME get rid of this one? */
1849                 if (!list_empty(work_list) || signal_pending(current)) {
1850                         spin_unlock_irq(&connection->resource->req_lock);
1851                         break;
1852                 }
1853                 send_barrier = need_to_send_barrier(connection);
1854                 spin_unlock_irq(&connection->resource->req_lock);
1855                 if (send_barrier) {
1856                         drbd_send_barrier(connection);
1857                         connection->send.current_epoch_nr++;
1858                 }
1859                 schedule();
1860                 /* may be woken up for other things but new work, too,
1861                  * e.g. if the current epoch got closed.
1862                  * In which case we send the barrier above. */
1863         }
1864         finish_wait(&connection->sender_work.q_wait, &wait);
1865
1866         /* someone may have changed the config while we have been waiting above. */
1867         rcu_read_lock();
1868         nc = rcu_dereference(connection->net_conf);
1869         cork = nc ? nc->tcp_cork : 0;
1870         rcu_read_unlock();
1871         mutex_lock(&connection->data.mutex);
1872         if (connection->data.socket) {
1873                 if (cork)
1874                         drbd_tcp_cork(connection->data.socket);
1875                 else if (!uncork)
1876                         drbd_tcp_uncork(connection->data.socket);
1877         }
1878         mutex_unlock(&connection->data.mutex);
1879 }
1880
1881 int drbd_worker(struct drbd_thread *thi)
1882 {
1883         struct drbd_connection *connection = thi->connection;
1884         struct drbd_work *w = NULL;
1885         struct drbd_peer_device *peer_device;
1886         LIST_HEAD(work_list);
1887         int vnr;
1888
1889         while (get_t_state(thi) == RUNNING) {
1890                 drbd_thread_current_set_cpu(thi);
1891
1892                 /* as long as we use drbd_queue_work_front(),
1893                  * we may only dequeue single work items here, not batches. */
1894                 if (list_empty(&work_list))
1895                         wait_for_work(connection, &work_list);
1896
1897                 if (signal_pending(current)) {
1898                         flush_signals(current);
1899                         if (get_t_state(thi) == RUNNING) {
1900                                 drbd_warn(connection, "Worker got an unexpected signal\n");
1901                                 continue;
1902                         }
1903                         break;
1904                 }
1905
1906                 if (get_t_state(thi) != RUNNING)
1907                         break;
1908
1909                 while (!list_empty(&work_list)) {
1910                         w = list_first_entry(&work_list, struct drbd_work, list);
1911                         list_del_init(&w->list);
1912                         if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
1913                                 continue;
1914                         if (connection->cstate >= C_WF_REPORT_PARAMS)
1915                                 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
1916                 }
1917         }
1918
1919         do {
1920                 while (!list_empty(&work_list)) {
1921                         w = list_first_entry(&work_list, struct drbd_work, list);
1922                         list_del_init(&w->list);
1923                         w->cb(w, 1);
1924                 }
1925                 dequeue_work_batch(&connection->sender_work, &work_list);
1926         } while (!list_empty(&work_list));
1927
1928         rcu_read_lock();
1929         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1930                 struct drbd_device *device = peer_device->device;
1931                 D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE);
1932                 kref_get(&device->kref);
1933                 rcu_read_unlock();
1934                 drbd_device_cleanup(device);
1935                 kref_put(&device->kref, drbd_destroy_device);
1936                 rcu_read_lock();
1937         }
1938         rcu_read_unlock();
1939
1940         return 0;
1941 }