drbd: flush drbd work queue before invalidate/invalidate remote
[deliverable/linux.git] / drivers / block / drbd / drbd_main.c
CommitLineData
b411b363
PR
1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
b411b363 29#include <linux/module.h>
b411b363
PR
30#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
2a48fc0a 35#include <linux/mutex.h>
b411b363
PR
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
b411b363
PR
55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
59struct after_state_chg_work {
60 struct drbd_work w;
61 union drbd_state os;
62 union drbd_state ns;
63 enum chg_state_flags flags;
64 struct completion *done;
65};
66
2a48fc0a 67static DEFINE_MUTEX(drbd_main_mutex);
b411b363
PR
68int drbdd_init(struct drbd_thread *);
69int drbd_worker(struct drbd_thread *);
70int drbd_asender(struct drbd_thread *);
71
72int drbd_init(void);
73static int drbd_open(struct block_device *bdev, fmode_t mode);
74static int drbd_release(struct gendisk *gd, fmode_t mode);
75static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77 union drbd_state ns, enum chg_state_flags flags);
78static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79static void md_sync_timer_fn(unsigned long data);
80static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
e9e6f3ec 81static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
b411b363 82
b411b363
PR
83MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
84 "Lars Ellenberg <lars@linbit.com>");
85MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
86MODULE_VERSION(REL_VERSION);
87MODULE_LICENSE("GPL");
2b8a90b5
PR
88MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
89 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
b411b363
PR
90MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
91
92#include <linux/moduleparam.h>
93/* allow_open_on_secondary */
94MODULE_PARM_DESC(allow_oos, "DONT USE!");
95/* thanks to these macros, if compiled into the kernel (not-module),
96 * this becomes the boot parameter drbd.minor_count */
97module_param(minor_count, uint, 0444);
98module_param(disable_sendpage, bool, 0644);
99module_param(allow_oos, bool, 0);
100module_param(cn_idx, uint, 0444);
101module_param(proc_details, int, 0644);
102
103#ifdef CONFIG_DRBD_FAULT_INJECTION
104int enable_faults;
105int fault_rate;
106static int fault_count;
107int fault_devs;
108/* bitmap of enabled faults */
109module_param(enable_faults, int, 0664);
110/* fault rate % value - applies to all enabled faults */
111module_param(fault_rate, int, 0664);
112/* count of faults inserted */
113module_param(fault_count, int, 0664);
114/* bitmap of devices to insert faults on */
115module_param(fault_devs, int, 0644);
116#endif
117
118/* module parameter, defined */
2b8a90b5 119unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
90ab5ee9
RR
120bool disable_sendpage;
121bool allow_oos;
b411b363
PR
122unsigned int cn_idx = CN_IDX_DRBD;
123int proc_details; /* Detail level in proc drbd*/
124
125/* Module parameter for setting the user mode helper program
126 * to run. Default is /sbin/drbdadm */
127char usermode_helper[80] = "/sbin/drbdadm";
128
129module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
130
131/* in 2.6.x, our device mapping and config info contains our virtual gendisks
132 * as member "struct gendisk *vdisk;"
133 */
134struct drbd_conf **minor_table;
135
136struct kmem_cache *drbd_request_cache;
137struct kmem_cache *drbd_ee_cache; /* epoch entries */
138struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
139struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
140mempool_t *drbd_request_mempool;
141mempool_t *drbd_ee_mempool;
4281808f 142mempool_t *drbd_md_io_page_pool;
9476f39d 143struct bio_set *drbd_md_io_bio_set;
b411b363
PR
144
145/* I do not use a standard mempool, because:
146 1) I want to hand out the pre-allocated objects first.
147 2) I want to be able to interrupt sleeping allocation with a signal.
148 Note: This is a single linked list, the next pointer is the private
149 member of struct page.
150 */
151struct page *drbd_pp_pool;
152spinlock_t drbd_pp_lock;
153int drbd_pp_vacant;
154wait_queue_head_t drbd_pp_wait;
155
156DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
157
7d4e9d09 158static const struct block_device_operations drbd_ops = {
b411b363
PR
159 .owner = THIS_MODULE,
160 .open = drbd_open,
161 .release = drbd_release,
162};
163
9476f39d
LE
164static void bio_destructor_drbd(struct bio *bio)
165{
166 bio_free(bio, drbd_md_io_bio_set);
167}
168
169struct bio *bio_alloc_drbd(gfp_t gfp_mask)
170{
171 struct bio *bio;
172
173 if (!drbd_md_io_bio_set)
174 return bio_alloc(gfp_mask, 1);
175
176 bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
177 if (!bio)
178 return NULL;
179 bio->bi_destructor = bio_destructor_drbd;
180 return bio;
181}
182
b411b363
PR
183#ifdef __CHECKER__
184/* When checking with sparse, and this is an inline function, sparse will
185 give tons of false positives. When this is a real functions sparse works.
186 */
187int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
188{
189 int io_allowed;
190
191 atomic_inc(&mdev->local_cnt);
192 io_allowed = (mdev->state.disk >= mins);
193 if (!io_allowed) {
194 if (atomic_dec_and_test(&mdev->local_cnt))
195 wake_up(&mdev->misc_wait);
196 }
197 return io_allowed;
198}
199
200#endif
201
202/**
203 * DOC: The transfer log
204 *
205 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
206 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
207 * of the list. There is always at least one &struct drbd_tl_epoch object.
208 *
209 * Each &struct drbd_tl_epoch has a circular double linked list of requests
210 * attached.
211 */
212static int tl_init(struct drbd_conf *mdev)
213{
214 struct drbd_tl_epoch *b;
215
216 /* during device minor initialization, we may well use GFP_KERNEL */
217 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
218 if (!b)
219 return 0;
220 INIT_LIST_HEAD(&b->requests);
221 INIT_LIST_HEAD(&b->w.list);
222 b->next = NULL;
223 b->br_number = 4711;
7e602c0a 224 b->n_writes = 0;
b411b363
PR
225 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
226
227 mdev->oldest_tle = b;
228 mdev->newest_tle = b;
229 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
6d7e32f5 230 INIT_LIST_HEAD(&mdev->barrier_acked_requests);
b411b363
PR
231
232 mdev->tl_hash = NULL;
233 mdev->tl_hash_s = 0;
234
235 return 1;
236}
237
238static void tl_cleanup(struct drbd_conf *mdev)
239{
240 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
241 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
242 kfree(mdev->oldest_tle);
243 mdev->oldest_tle = NULL;
244 kfree(mdev->unused_spare_tle);
245 mdev->unused_spare_tle = NULL;
246 kfree(mdev->tl_hash);
247 mdev->tl_hash = NULL;
248 mdev->tl_hash_s = 0;
249}
250
251/**
252 * _tl_add_barrier() - Adds a barrier to the transfer log
253 * @mdev: DRBD device.
254 * @new: Barrier to be added before the current head of the TL.
255 *
256 * The caller must hold the req_lock.
257 */
258void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
259{
260 struct drbd_tl_epoch *newest_before;
261
262 INIT_LIST_HEAD(&new->requests);
263 INIT_LIST_HEAD(&new->w.list);
264 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
265 new->next = NULL;
7e602c0a 266 new->n_writes = 0;
b411b363
PR
267
268 newest_before = mdev->newest_tle;
c088b2d9 269 new->br_number = newest_before->br_number+1;
b411b363
PR
270 if (mdev->newest_tle != new) {
271 mdev->newest_tle->next = new;
272 mdev->newest_tle = new;
273 }
274}
275
276/**
277 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
278 * @mdev: DRBD device.
279 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
280 * @set_size: Expected number of requests before that barrier.
281 *
282 * In case the passed barrier_nr or set_size does not match the oldest
283 * &struct drbd_tl_epoch objects this function will cause a termination
284 * of the connection.
285 */
286void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
287 unsigned int set_size)
288{
289 struct drbd_tl_epoch *b, *nob; /* next old barrier */
290 struct list_head *le, *tle;
291 struct drbd_request *r;
292
293 spin_lock_irq(&mdev->req_lock);
294
295 b = mdev->oldest_tle;
296
297 /* first some paranoia code */
298 if (b == NULL) {
299 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
300 barrier_nr);
301 goto bail;
302 }
303 if (b->br_number != barrier_nr) {
304 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
305 barrier_nr, b->br_number);
306 goto bail;
307 }
7e602c0a
PR
308 if (b->n_writes != set_size) {
309 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
310 barrier_nr, set_size, b->n_writes);
b411b363
PR
311 goto bail;
312 }
313
314 /* Clean up list of requests processed during current epoch */
315 list_for_each_safe(le, tle, &b->requests) {
316 r = list_entry(le, struct drbd_request, tl_requests);
317 _req_mod(r, barrier_acked);
318 }
319 /* There could be requests on the list waiting for completion
320 of the write to the local disk. To avoid corruptions of
321 slab's data structures we have to remove the lists head.
322
323 Also there could have been a barrier ack out of sequence, overtaking
324 the write acks - which would be a bug and violating write ordering.
325 To not deadlock in case we lose connection while such requests are
326 still pending, we need some way to find them for the
327 _req_mode(connection_lost_while_pending).
328
329 These have been list_move'd to the out_of_sequence_requests list in
330 _req_mod(, barrier_acked) above.
331 */
6d7e32f5 332 list_splice_init(&b->requests, &mdev->barrier_acked_requests);
b411b363
PR
333
334 nob = b->next;
335 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
336 _tl_add_barrier(mdev, b);
337 if (nob)
338 mdev->oldest_tle = nob;
339 /* if nob == NULL b was the only barrier, and becomes the new
340 barrier. Therefore mdev->oldest_tle points already to b */
341 } else {
342 D_ASSERT(nob != NULL);
343 mdev->oldest_tle = nob;
344 kfree(b);
345 }
346
347 spin_unlock_irq(&mdev->req_lock);
348 dec_ap_pending(mdev);
349
350 return;
351
352bail:
353 spin_unlock_irq(&mdev->req_lock);
354 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
355}
356
617049aa 357
b411b363 358/**
11b58e73 359 * _tl_restart() - Walks the transfer log, and applies an action to all requests
b411b363 360 * @mdev: DRBD device.
11b58e73 361 * @what: The action/event to perform with all request objects
b411b363 362 *
11b58e73 363 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
fd2491f4 364 * restart_frozen_disk_io.
b411b363 365 */
11b58e73 366static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
b411b363 367{
11b58e73 368 struct drbd_tl_epoch *b, *tmp, **pn;
b9b98716 369 struct list_head *le, *tle, carry_reads;
11b58e73
PR
370 struct drbd_request *req;
371 int rv, n_writes, n_reads;
b411b363
PR
372
373 b = mdev->oldest_tle;
11b58e73 374 pn = &mdev->oldest_tle;
b411b363 375 while (b) {
11b58e73
PR
376 n_writes = 0;
377 n_reads = 0;
b9b98716 378 INIT_LIST_HEAD(&carry_reads);
b411b363 379 list_for_each_safe(le, tle, &b->requests) {
11b58e73
PR
380 req = list_entry(le, struct drbd_request, tl_requests);
381 rv = _req_mod(req, what);
382
383 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
384 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
b411b363
PR
385 }
386 tmp = b->next;
387
b9b98716 388 if (n_writes) {
11b58e73
PR
389 if (what == resend) {
390 b->n_writes = n_writes;
391 if (b->w.cb == NULL) {
392 b->w.cb = w_send_barrier;
393 inc_ap_pending(mdev);
394 set_bit(CREATE_BARRIER, &mdev->flags);
395 }
396
397 drbd_queue_work(&mdev->data.work, &b->w);
398 }
399 pn = &b->next;
400 } else {
b9b98716
PR
401 if (n_reads)
402 list_add(&carry_reads, &b->requests);
11b58e73
PR
403 /* there could still be requests on that ring list,
404 * in case local io is still pending */
405 list_del(&b->requests);
406
407 /* dec_ap_pending corresponding to queue_barrier.
408 * the newest barrier may not have been queued yet,
409 * in which case w.cb is still NULL. */
410 if (b->w.cb != NULL)
411 dec_ap_pending(mdev);
412
413 if (b == mdev->newest_tle) {
414 /* recycle, but reinit! */
415 D_ASSERT(tmp == NULL);
416 INIT_LIST_HEAD(&b->requests);
b9b98716 417 list_splice(&carry_reads, &b->requests);
11b58e73
PR
418 INIT_LIST_HEAD(&b->w.list);
419 b->w.cb = NULL;
420 b->br_number = net_random();
421 b->n_writes = 0;
422
423 *pn = b;
424 break;
425 }
426 *pn = tmp;
427 kfree(b);
b411b363 428 }
b411b363 429 b = tmp;
b9b98716 430 list_splice(&carry_reads, &b->requests);
b411b363 431 }
6d7e32f5
PR
432
433 /* Actions operating on the disk state, also want to work on
434 requests that got barrier acked. */
435 switch (what) {
6d7e32f5
PR
436 case fail_frozen_disk_io:
437 case restart_frozen_disk_io:
438 list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
439 req = list_entry(le, struct drbd_request, tl_requests);
440 _req_mod(req, what);
441 }
442
443 case connection_lost_while_pending:
444 case resend:
445 break;
446 default:
447 dev_err(DEV, "what = %d in _tl_restart()\n", what);
448 }
11b58e73
PR
449}
450
b411b363
PR
451
452/**
453 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
454 * @mdev: DRBD device.
455 *
456 * This is called after the connection to the peer was lost. The storage covered
457 * by the requests on the transfer gets marked as our of sync. Called from the
458 * receiver thread and the worker thread.
459 */
460void tl_clear(struct drbd_conf *mdev)
461{
b411b363
PR
462 struct list_head *le, *tle;
463 struct drbd_request *r;
b411b363
PR
464
465 spin_lock_irq(&mdev->req_lock);
466
11b58e73 467 _tl_restart(mdev, connection_lost_while_pending);
b411b363
PR
468
469 /* we expect this list to be empty. */
470 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
471
472 /* but just in case, clean it up anyways! */
473 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
474 r = list_entry(le, struct drbd_request, tl_requests);
475 /* It would be nice to complete outside of spinlock.
476 * But this is easier for now. */
477 _req_mod(r, connection_lost_while_pending);
478 }
479
480 /* ensure bit indicating barrier is required is clear */
481 clear_bit(CREATE_BARRIER, &mdev->flags);
482
288f422e
PR
483 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
484
b411b363
PR
485 spin_unlock_irq(&mdev->req_lock);
486}
487
11b58e73
PR
488void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
489{
490 spin_lock_irq(&mdev->req_lock);
491 _tl_restart(mdev, what);
b411b363
PR
492 spin_unlock_irq(&mdev->req_lock);
493}
494
fd2491f4
PR
495/**
496 * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
497 * @mdev: DRBD device.
498 */
499void tl_abort_disk_io(struct drbd_conf *mdev)
500{
501 struct drbd_tl_epoch *b;
502 struct list_head *le, *tle;
503 struct drbd_request *req;
504
505 spin_lock_irq(&mdev->req_lock);
506 b = mdev->oldest_tle;
507 while (b) {
508 list_for_each_safe(le, tle, &b->requests) {
509 req = list_entry(le, struct drbd_request, tl_requests);
510 if (!(req->rq_state & RQ_LOCAL_PENDING))
511 continue;
512 _req_mod(req, abort_disk_io);
513 }
514 b = b->next;
515 }
516
517 list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
518 req = list_entry(le, struct drbd_request, tl_requests);
519 if (!(req->rq_state & RQ_LOCAL_PENDING))
520 continue;
521 _req_mod(req, abort_disk_io);
522 }
523
524 spin_unlock_irq(&mdev->req_lock);
525}
526
b411b363 527/**
81e84650 528 * cl_wide_st_chg() - true if the state change is a cluster wide one
b411b363
PR
529 * @mdev: DRBD device.
530 * @os: old (current) state.
531 * @ns: new (wanted) state.
532 */
533static int cl_wide_st_chg(struct drbd_conf *mdev,
534 union drbd_state os, union drbd_state ns)
535{
536 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
537 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
538 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
539 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
02ee8f95 540 (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
b411b363
PR
541 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
542 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
543}
544
bf885f8a
AG
545enum drbd_state_rv
546drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
547 union drbd_state mask, union drbd_state val)
b411b363
PR
548{
549 unsigned long flags;
550 union drbd_state os, ns;
bf885f8a 551 enum drbd_state_rv rv;
b411b363
PR
552
553 spin_lock_irqsave(&mdev->req_lock, flags);
554 os = mdev->state;
555 ns.i = (os.i & ~mask.i) | val.i;
556 rv = _drbd_set_state(mdev, ns, f, NULL);
557 ns = mdev->state;
558 spin_unlock_irqrestore(&mdev->req_lock, flags);
559
560 return rv;
561}
562
563/**
564 * drbd_force_state() - Impose a change which happens outside our control on our state
565 * @mdev: DRBD device.
566 * @mask: mask of state bits to change.
567 * @val: value of new state bits.
568 */
569void drbd_force_state(struct drbd_conf *mdev,
570 union drbd_state mask, union drbd_state val)
571{
572 drbd_change_state(mdev, CS_HARD, mask, val);
573}
574
bf885f8a
AG
575static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
576static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
577 union drbd_state,
578 union drbd_state);
77e8fdfc
PR
579enum sanitize_state_warnings {
580 NO_WARNING,
581 ABORTED_ONLINE_VERIFY,
582 ABORTED_RESYNC,
583 CONNECTION_LOST_NEGOTIATING,
584 IMPLICITLY_UPGRADED_DISK,
585 IMPLICITLY_UPGRADED_PDSK,
586};
b411b363 587static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
77e8fdfc 588 union drbd_state ns, enum sanitize_state_warnings *warn);
b411b363
PR
589int drbd_send_state_req(struct drbd_conf *,
590 union drbd_state, union drbd_state);
591
c8b32563
AG
592static enum drbd_state_rv
593_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
594 union drbd_state val)
b411b363
PR
595{
596 union drbd_state os, ns;
597 unsigned long flags;
bf885f8a 598 enum drbd_state_rv rv;
b411b363
PR
599
600 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
601 return SS_CW_SUCCESS;
602
603 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
604 return SS_CW_FAILED_BY_PEER;
605
606 rv = 0;
607 spin_lock_irqsave(&mdev->req_lock, flags);
608 os = mdev->state;
609 ns.i = (os.i & ~mask.i) | val.i;
610 ns = sanitize_state(mdev, os, ns, NULL);
611
612 if (!cl_wide_st_chg(mdev, os, ns))
613 rv = SS_CW_NO_NEED;
614 if (!rv) {
615 rv = is_valid_state(mdev, ns);
616 if (rv == SS_SUCCESS) {
617 rv = is_valid_state_transition(mdev, ns, os);
618 if (rv == SS_SUCCESS)
bf885f8a 619 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
b411b363
PR
620 }
621 }
622 spin_unlock_irqrestore(&mdev->req_lock, flags);
623
624 return rv;
625}
626
627/**
628 * drbd_req_state() - Perform an eventually cluster wide state change
629 * @mdev: DRBD device.
630 * @mask: mask of state bits to change.
631 * @val: value of new state bits.
632 * @f: flags
633 *
634 * Should not be called directly, use drbd_request_state() or
635 * _drbd_request_state().
636 */
bf885f8a
AG
637static enum drbd_state_rv
638drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
639 union drbd_state val, enum chg_state_flags f)
b411b363
PR
640{
641 struct completion done;
642 unsigned long flags;
643 union drbd_state os, ns;
bf885f8a 644 enum drbd_state_rv rv;
b411b363
PR
645
646 init_completion(&done);
647
648 if (f & CS_SERIALIZE)
649 mutex_lock(&mdev->state_mutex);
650
651 spin_lock_irqsave(&mdev->req_lock, flags);
652 os = mdev->state;
653 ns.i = (os.i & ~mask.i) | val.i;
654 ns = sanitize_state(mdev, os, ns, NULL);
655
656 if (cl_wide_st_chg(mdev, os, ns)) {
657 rv = is_valid_state(mdev, ns);
658 if (rv == SS_SUCCESS)
659 rv = is_valid_state_transition(mdev, ns, os);
660 spin_unlock_irqrestore(&mdev->req_lock, flags);
661
662 if (rv < SS_SUCCESS) {
663 if (f & CS_VERBOSE)
664 print_st_err(mdev, os, ns, rv);
665 goto abort;
666 }
667
668 drbd_state_lock(mdev);
669 if (!drbd_send_state_req(mdev, mask, val)) {
670 drbd_state_unlock(mdev);
671 rv = SS_CW_FAILED_BY_PEER;
672 if (f & CS_VERBOSE)
673 print_st_err(mdev, os, ns, rv);
674 goto abort;
675 }
676
677 wait_event(mdev->state_wait,
678 (rv = _req_st_cond(mdev, mask, val)));
679
680 if (rv < SS_SUCCESS) {
681 drbd_state_unlock(mdev);
682 if (f & CS_VERBOSE)
683 print_st_err(mdev, os, ns, rv);
684 goto abort;
685 }
686 spin_lock_irqsave(&mdev->req_lock, flags);
687 os = mdev->state;
688 ns.i = (os.i & ~mask.i) | val.i;
689 rv = _drbd_set_state(mdev, ns, f, &done);
690 drbd_state_unlock(mdev);
691 } else {
692 rv = _drbd_set_state(mdev, ns, f, &done);
693 }
694
695 spin_unlock_irqrestore(&mdev->req_lock, flags);
696
697 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
698 D_ASSERT(current != mdev->worker.task);
699 wait_for_completion(&done);
700 }
701
702abort:
703 if (f & CS_SERIALIZE)
704 mutex_unlock(&mdev->state_mutex);
705
706 return rv;
707}
708
709/**
710 * _drbd_request_state() - Request a state change (with flags)
711 * @mdev: DRBD device.
712 * @mask: mask of state bits to change.
713 * @val: value of new state bits.
714 * @f: flags
715 *
716 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
717 * flag, or when logging of failed state change requests is not desired.
718 */
bf885f8a
AG
719enum drbd_state_rv
720_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
721 union drbd_state val, enum chg_state_flags f)
b411b363 722{
bf885f8a 723 enum drbd_state_rv rv;
b411b363
PR
724
725 wait_event(mdev->state_wait,
726 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
727
728 return rv;
729}
730
731static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
732{
733 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
734 name,
735 drbd_conn_str(ns.conn),
736 drbd_role_str(ns.role),
737 drbd_role_str(ns.peer),
738 drbd_disk_str(ns.disk),
739 drbd_disk_str(ns.pdsk),
fb22c402 740 is_susp(ns) ? 's' : 'r',
b411b363
PR
741 ns.aftr_isp ? 'a' : '-',
742 ns.peer_isp ? 'p' : '-',
743 ns.user_isp ? 'u' : '-'
744 );
745}
746
bf885f8a
AG
747void print_st_err(struct drbd_conf *mdev, union drbd_state os,
748 union drbd_state ns, enum drbd_state_rv err)
b411b363
PR
749{
750 if (err == SS_IN_TRANSIENT_STATE)
751 return;
752 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
753 print_st(mdev, " state", os);
754 print_st(mdev, "wanted", ns);
755}
756
757
b411b363
PR
758/**
759 * is_valid_state() - Returns an SS_ error code if ns is not valid
760 * @mdev: DRBD device.
761 * @ns: State to consider.
762 */
bf885f8a
AG
763static enum drbd_state_rv
764is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
b411b363
PR
765{
766 /* See drbd_state_sw_errors in drbd_strings.c */
767
768 enum drbd_fencing_p fp;
bf885f8a 769 enum drbd_state_rv rv = SS_SUCCESS;
b411b363
PR
770
771 fp = FP_DONT_CARE;
772 if (get_ldev(mdev)) {
773 fp = mdev->ldev->dc.fencing;
774 put_ldev(mdev);
775 }
776
777 if (get_net_conf(mdev)) {
778 if (!mdev->net_conf->two_primaries &&
779 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
780 rv = SS_TWO_PRIMARIES;
781 put_net_conf(mdev);
782 }
783
784 if (rv <= 0)
785 /* already found a reason to abort */;
786 else if (ns.role == R_SECONDARY && mdev->open_cnt)
787 rv = SS_DEVICE_IN_USE;
788
789 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
790 rv = SS_NO_UP_TO_DATE_DISK;
791
792 else if (fp >= FP_RESOURCE &&
793 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
794 rv = SS_PRIMARY_NOP;
795
796 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
797 rv = SS_NO_UP_TO_DATE_DISK;
798
799 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
800 rv = SS_NO_LOCAL_DISK;
801
802 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
803 rv = SS_NO_REMOTE_DISK;
804
8d4ce82b
LE
805 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
806 rv = SS_NO_UP_TO_DATE_DISK;
807
b411b363
PR
808 else if ((ns.conn == C_CONNECTED ||
809 ns.conn == C_WF_BITMAP_S ||
810 ns.conn == C_SYNC_SOURCE ||
811 ns.conn == C_PAUSED_SYNC_S) &&
812 ns.disk == D_OUTDATED)
813 rv = SS_CONNECTED_OUTDATES;
814
815 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
816 (mdev->sync_conf.verify_alg[0] == 0))
817 rv = SS_NO_VERIFY_ALG;
818
819 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
820 mdev->agreed_pro_version < 88)
821 rv = SS_NOT_SUPPORTED;
822
fa7d9396
PR
823 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
824 rv = SS_CONNECTED_OUTDATES;
825
b411b363
PR
826 return rv;
827}
828
829/**
830 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
831 * @mdev: DRBD device.
832 * @ns: new state.
833 * @os: old state.
834 */
bf885f8a
AG
835static enum drbd_state_rv
836is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
837 union drbd_state os)
b411b363 838{
bf885f8a 839 enum drbd_state_rv rv = SS_SUCCESS;
b411b363
PR
840
841 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
842 os.conn > C_CONNECTED)
843 rv = SS_RESYNC_RUNNING;
844
845 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
846 rv = SS_ALREADY_STANDALONE;
847
848 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
849 rv = SS_IS_DISKLESS;
850
851 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
852 rv = SS_NO_NET_CONFIG;
853
854 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
855 rv = SS_LOWER_THAN_OUTDATED;
856
857 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
858 rv = SS_IN_TRANSIENT_STATE;
859
860 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
861 rv = SS_IN_TRANSIENT_STATE;
862
197296ff
PR
863 /* While establishing a connection only allow cstate to change.
864 Delay/refuse role changes, detach attach etc... */
865 if (test_bit(STATE_SENT, &mdev->flags) &&
866 !(os.conn == C_WF_REPORT_PARAMS ||
867 (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
868 rv = SS_IN_TRANSIENT_STATE;
869
b411b363
PR
870 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
871 rv = SS_NEED_CONNECTION;
872
873 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
874 ns.conn != os.conn && os.conn > C_CONNECTED)
875 rv = SS_RESYNC_RUNNING;
876
877 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
878 os.conn < C_CONNECTED)
879 rv = SS_NEED_CONNECTION;
880
1fc80cf3
PR
881 if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
882 && os.conn < C_WF_REPORT_PARAMS)
883 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
884
b411b363
PR
885 return rv;
886}
887
77e8fdfc
PR
888static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
889{
890 static const char *msg_table[] = {
891 [NO_WARNING] = "",
892 [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
893 [ABORTED_RESYNC] = "Resync aborted.",
894 [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
895 [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
896 [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
897 };
898
899 if (warn != NO_WARNING)
900 dev_warn(DEV, "%s\n", msg_table[warn]);
901}
902
b411b363
PR
903/**
904 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
905 * @mdev: DRBD device.
906 * @os: old state.
907 * @ns: new state.
908 * @warn_sync_abort:
909 *
910 * When we loose connection, we have to set the state of the peers disk (pdsk)
911 * to D_UNKNOWN. This rule and many more along those lines are in this function.
912 */
913static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
77e8fdfc 914 union drbd_state ns, enum sanitize_state_warnings *warn)
b411b363
PR
915{
916 enum drbd_fencing_p fp;
ab17b68f 917 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
b411b363 918
77e8fdfc
PR
919 if (warn)
920 *warn = NO_WARNING;
921
b411b363
PR
922 fp = FP_DONT_CARE;
923 if (get_ldev(mdev)) {
924 fp = mdev->ldev->dc.fencing;
925 put_ldev(mdev);
926 }
927
928 /* Disallow Network errors to configure a device's network part */
929 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
930 os.conn <= C_DISCONNECTING)
931 ns.conn = os.conn;
932
f2906e18
LE
933 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
934 * If you try to go into some Sync* state, that shall fail (elsewhere). */
b411b363 935 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
545752d5 936 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
b411b363
PR
937 ns.conn = os.conn;
938
82f59cc6
LE
939 /* we cannot fail (again) if we already detached */
940 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
941 ns.disk = D_DISKLESS;
942
b411b363
PR
943 /* After C_DISCONNECTING only C_STANDALONE may follow */
944 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
945 ns.conn = os.conn;
946
947 if (ns.conn < C_CONNECTED) {
948 ns.peer_isp = 0;
949 ns.peer = R_UNKNOWN;
950 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
951 ns.pdsk = D_UNKNOWN;
952 }
953
954 /* Clear the aftr_isp when becoming unconfigured */
955 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
956 ns.aftr_isp = 0;
957
b411b363
PR
958 /* Abort resync if a disk fails/detaches */
959 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
960 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
77e8fdfc
PR
961 if (warn)
962 *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
963 ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
b411b363
PR
964 ns.conn = C_CONNECTED;
965 }
966
b411b363
PR
967 /* Connection breaks down before we finished "Negotiating" */
968 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
969 get_ldev_if_state(mdev, D_NEGOTIATING)) {
970 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
971 ns.disk = mdev->new_state_tmp.disk;
972 ns.pdsk = mdev->new_state_tmp.pdsk;
973 } else {
77e8fdfc
PR
974 if (warn)
975 *warn = CONNECTION_LOST_NEGOTIATING;
b411b363
PR
976 ns.disk = D_DISKLESS;
977 ns.pdsk = D_UNKNOWN;
978 }
979 put_ldev(mdev);
980 }
981
ab17b68f
PR
982 /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
983 if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
984 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
985 ns.disk = D_UP_TO_DATE;
986 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
987 ns.pdsk = D_UP_TO_DATE;
988 }
989
990 /* Implications of the connection stat on the disk states */
991 disk_min = D_DISKLESS;
992 disk_max = D_UP_TO_DATE;
993 pdsk_min = D_INCONSISTENT;
994 pdsk_max = D_UNKNOWN;
995 switch ((enum drbd_conns)ns.conn) {
996 case C_WF_BITMAP_T:
997 case C_PAUSED_SYNC_T:
998 case C_STARTING_SYNC_T:
999 case C_WF_SYNC_UUID:
1000 case C_BEHIND:
1001 disk_min = D_INCONSISTENT;
1002 disk_max = D_OUTDATED;
1003 pdsk_min = D_UP_TO_DATE;
1004 pdsk_max = D_UP_TO_DATE;
1005 break;
1006 case C_VERIFY_S:
1007 case C_VERIFY_T:
1008 disk_min = D_UP_TO_DATE;
1009 disk_max = D_UP_TO_DATE;
1010 pdsk_min = D_UP_TO_DATE;
1011 pdsk_max = D_UP_TO_DATE;
1012 break;
1013 case C_CONNECTED:
1014 disk_min = D_DISKLESS;
1015 disk_max = D_UP_TO_DATE;
1016 pdsk_min = D_DISKLESS;
1017 pdsk_max = D_UP_TO_DATE;
1018 break;
1019 case C_WF_BITMAP_S:
1020 case C_PAUSED_SYNC_S:
1021 case C_STARTING_SYNC_S:
1022 case C_AHEAD:
1023 disk_min = D_UP_TO_DATE;
1024 disk_max = D_UP_TO_DATE;
1025 pdsk_min = D_INCONSISTENT;
1026 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
1027 break;
1028 case C_SYNC_TARGET:
1029 disk_min = D_INCONSISTENT;
1030 disk_max = D_INCONSISTENT;
1031 pdsk_min = D_UP_TO_DATE;
1032 pdsk_max = D_UP_TO_DATE;
1033 break;
1034 case C_SYNC_SOURCE:
1035 disk_min = D_UP_TO_DATE;
1036 disk_max = D_UP_TO_DATE;
1037 pdsk_min = D_INCONSISTENT;
1038 pdsk_max = D_INCONSISTENT;
1039 break;
1040 case C_STANDALONE:
1041 case C_DISCONNECTING:
1042 case C_UNCONNECTED:
1043 case C_TIMEOUT:
1044 case C_BROKEN_PIPE:
1045 case C_NETWORK_FAILURE:
1046 case C_PROTOCOL_ERROR:
1047 case C_TEAR_DOWN:
1048 case C_WF_CONNECTION:
1049 case C_WF_REPORT_PARAMS:
1050 case C_MASK:
1051 break;
1052 }
1053 if (ns.disk > disk_max)
1054 ns.disk = disk_max;
1055
1056 if (ns.disk < disk_min) {
77e8fdfc
PR
1057 if (warn)
1058 *warn = IMPLICITLY_UPGRADED_DISK;
ab17b68f
PR
1059 ns.disk = disk_min;
1060 }
1061 if (ns.pdsk > pdsk_max)
1062 ns.pdsk = pdsk_max;
1063
1064 if (ns.pdsk < pdsk_min) {
77e8fdfc
PR
1065 if (warn)
1066 *warn = IMPLICITLY_UPGRADED_PDSK;
ab17b68f
PR
1067 ns.pdsk = pdsk_min;
1068 }
1069
b411b363 1070 if (fp == FP_STONITH &&
0a492166
PR
1071 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
1072 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
fb22c402 1073 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
265be2d0
PR
1074
1075 if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
1076 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
1077 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
fb22c402 1078 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
b411b363
PR
1079
1080 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
1081 if (ns.conn == C_SYNC_SOURCE)
1082 ns.conn = C_PAUSED_SYNC_S;
1083 if (ns.conn == C_SYNC_TARGET)
1084 ns.conn = C_PAUSED_SYNC_T;
1085 } else {
1086 if (ns.conn == C_PAUSED_SYNC_S)
1087 ns.conn = C_SYNC_SOURCE;
1088 if (ns.conn == C_PAUSED_SYNC_T)
1089 ns.conn = C_SYNC_TARGET;
1090 }
1091
1092 return ns;
1093}
1094
1095/* helper for __drbd_set_state */
1096static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
1097{
30b743a2
LE
1098 if (mdev->agreed_pro_version < 90)
1099 mdev->ov_start_sector = 0;
1100 mdev->rs_total = drbd_bm_bits(mdev);
1101 mdev->ov_position = 0;
b411b363
PR
1102 if (cs == C_VERIFY_T) {
1103 /* starting online verify from an arbitrary position
1104 * does not fit well into the existing protocol.
1105 * on C_VERIFY_T, we initialize ov_left and friends
1106 * implicitly in receive_DataRequest once the
1107 * first P_OV_REQUEST is received */
1108 mdev->ov_start_sector = ~(sector_t)0;
1109 } else {
1110 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
30b743a2 1111 if (bit >= mdev->rs_total) {
b411b363
PR
1112 mdev->ov_start_sector =
1113 BM_BIT_TO_SECT(mdev->rs_total - 1);
30b743a2
LE
1114 mdev->rs_total = 1;
1115 } else
1116 mdev->rs_total -= bit;
b411b363
PR
1117 mdev->ov_position = mdev->ov_start_sector;
1118 }
30b743a2 1119 mdev->ov_left = mdev->rs_total;
b411b363
PR
1120}
1121
0778286a
PR
1122static void drbd_resume_al(struct drbd_conf *mdev)
1123{
1124 if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1125 dev_info(DEV, "Resumed AL updates\n");
1126}
1127
b411b363
PR
1128/**
1129 * __drbd_set_state() - Set a new DRBD state
1130 * @mdev: DRBD device.
1131 * @ns: new state.
1132 * @flags: Flags
1133 * @done: Optional completion, that will get completed after the after_state_ch() finished
1134 *
1135 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1136 */
bf885f8a
AG
1137enum drbd_state_rv
1138__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1139 enum chg_state_flags flags, struct completion *done)
b411b363
PR
1140{
1141 union drbd_state os;
bf885f8a 1142 enum drbd_state_rv rv = SS_SUCCESS;
77e8fdfc 1143 enum sanitize_state_warnings ssw;
b411b363
PR
1144 struct after_state_chg_work *ascw;
1145
1146 os = mdev->state;
1147
77e8fdfc 1148 ns = sanitize_state(mdev, os, ns, &ssw);
b411b363
PR
1149
1150 if (ns.i == os.i)
1151 return SS_NOTHING_TO_DO;
1152
1153 if (!(flags & CS_HARD)) {
1154 /* pre-state-change checks ; only look at ns */
1155 /* See drbd_state_sw_errors in drbd_strings.c */
1156
1157 rv = is_valid_state(mdev, ns);
1158 if (rv < SS_SUCCESS) {
1159 /* If the old state was illegal as well, then let
1160 this happen...*/
1161
1616a254 1162 if (is_valid_state(mdev, os) == rv)
b411b363 1163 rv = is_valid_state_transition(mdev, ns, os);
b411b363
PR
1164 } else
1165 rv = is_valid_state_transition(mdev, ns, os);
1166 }
1167
1168 if (rv < SS_SUCCESS) {
1169 if (flags & CS_VERBOSE)
1170 print_st_err(mdev, os, ns, rv);
1171 return rv;
1172 }
1173
77e8fdfc 1174 print_sanitize_warnings(mdev, ssw);
b411b363
PR
1175
1176 {
662d91a2
AG
1177 char *pbp, pb[300];
1178 pbp = pb;
1179 *pbp = 0;
1180 if (ns.role != os.role)
1181 pbp += sprintf(pbp, "role( %s -> %s ) ",
1182 drbd_role_str(os.role),
1183 drbd_role_str(ns.role));
1184 if (ns.peer != os.peer)
1185 pbp += sprintf(pbp, "peer( %s -> %s ) ",
1186 drbd_role_str(os.peer),
1187 drbd_role_str(ns.peer));
1188 if (ns.conn != os.conn)
1189 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1190 drbd_conn_str(os.conn),
1191 drbd_conn_str(ns.conn));
1192 if (ns.disk != os.disk)
1193 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1194 drbd_disk_str(os.disk),
1195 drbd_disk_str(ns.disk));
1196 if (ns.pdsk != os.pdsk)
1197 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1198 drbd_disk_str(os.pdsk),
1199 drbd_disk_str(ns.pdsk));
1200 if (is_susp(ns) != is_susp(os))
1201 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1202 is_susp(os),
1203 is_susp(ns));
1204 if (ns.aftr_isp != os.aftr_isp)
1205 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1206 os.aftr_isp,
1207 ns.aftr_isp);
1208 if (ns.peer_isp != os.peer_isp)
1209 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1210 os.peer_isp,
1211 ns.peer_isp);
1212 if (ns.user_isp != os.user_isp)
1213 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1214 os.user_isp,
1215 ns.user_isp);
1216 dev_info(DEV, "%s\n", pb);
b411b363
PR
1217 }
1218
1219 /* solve the race between becoming unconfigured,
1220 * worker doing the cleanup, and
1221 * admin reconfiguring us:
1222 * on (re)configure, first set CONFIG_PENDING,
1223 * then wait for a potentially exiting worker,
1224 * start the worker, and schedule one no_op.
1225 * then proceed with configuration.
1226 */
1227 if (ns.disk == D_DISKLESS &&
1228 ns.conn == C_STANDALONE &&
1229 ns.role == R_SECONDARY &&
1230 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1231 set_bit(DEVICE_DYING, &mdev->flags);
1232
82f59cc6
LE
1233 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1234 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1235 * drbd_ldev_destroy() won't happen before our corresponding
1236 * after_state_ch works run, where we put_ldev again. */
1237 if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1238 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1239 atomic_inc(&mdev->local_cnt);
1240
1241 mdev->state = ns;
62b0da3a
LE
1242
1243 if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1244 drbd_print_uuids(mdev, "attached to UUIDs");
1245
b411b363
PR
1246 wake_up(&mdev->misc_wait);
1247 wake_up(&mdev->state_wait);
1248
b411b363
PR
1249 /* aborted verify run. log the last position */
1250 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1251 ns.conn < C_CONNECTED) {
1252 mdev->ov_start_sector =
30b743a2 1253 BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
b411b363
PR
1254 dev_info(DEV, "Online Verify reached sector %llu\n",
1255 (unsigned long long)mdev->ov_start_sector);
1256 }
1257
1258 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1259 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1260 dev_info(DEV, "Syncer continues.\n");
1d7734a0
LE
1261 mdev->rs_paused += (long)jiffies
1262 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
63106d3c
PR
1263 if (ns.conn == C_SYNC_TARGET)
1264 mod_timer(&mdev->resync_timer, jiffies);
b411b363
PR
1265 }
1266
1267 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1268 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1269 dev_info(DEV, "Resync suspended\n");
1d7734a0 1270 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
b411b363
PR
1271 }
1272
1273 if (os.conn == C_CONNECTED &&
1274 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1d7734a0
LE
1275 unsigned long now = jiffies;
1276 int i;
1277
30b743a2 1278 set_ov_position(mdev, ns.conn);
1d7734a0 1279 mdev->rs_start = now;
0f0601f4
LE
1280 mdev->rs_last_events = 0;
1281 mdev->rs_last_sect_ev = 0;
b411b363
PR
1282 mdev->ov_last_oos_size = 0;
1283 mdev->ov_last_oos_start = 0;
1284
1d7734a0 1285 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
30b743a2 1286 mdev->rs_mark_left[i] = mdev->ov_left;
1d7734a0
LE
1287 mdev->rs_mark_time[i] = now;
1288 }
1289
2649f080
LE
1290 drbd_rs_controller_reset(mdev);
1291
b411b363
PR
1292 if (ns.conn == C_VERIFY_S) {
1293 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1294 (unsigned long long)mdev->ov_position);
1295 mod_timer(&mdev->resync_timer, jiffies);
1296 }
1297 }
1298
1299 if (get_ldev(mdev)) {
1300 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1301 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1302 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1303
1304 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1305 mdf |= MDF_CRASHED_PRIMARY;
1306 if (mdev->state.role == R_PRIMARY ||
1307 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1308 mdf |= MDF_PRIMARY_IND;
1309 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1310 mdf |= MDF_CONNECTED_IND;
1311 if (mdev->state.disk > D_INCONSISTENT)
1312 mdf |= MDF_CONSISTENT;
1313 if (mdev->state.disk > D_OUTDATED)
1314 mdf |= MDF_WAS_UP_TO_DATE;
1315 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1316 mdf |= MDF_PEER_OUT_DATED;
1317 if (mdf != mdev->ldev->md.flags) {
1318 mdev->ldev->md.flags = mdf;
1319 drbd_md_mark_dirty(mdev);
1320 }
1321 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1322 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1323 put_ldev(mdev);
1324 }
1325
1326 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1327 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1328 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1329 set_bit(CONSIDER_RESYNC, &mdev->flags);
1330
1331 /* Receiver should clean up itself */
1332 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1333 drbd_thread_stop_nowait(&mdev->receiver);
1334
1335 /* Now the receiver finished cleaning up itself, it should die */
1336 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1337 drbd_thread_stop_nowait(&mdev->receiver);
1338
1339 /* Upon network failure, we need to restart the receiver. */
1e86ac48 1340 if (os.conn > C_WF_CONNECTION &&
b411b363
PR
1341 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1342 drbd_thread_restart_nowait(&mdev->receiver);
1343
0778286a
PR
1344 /* Resume AL writing if we get a connection */
1345 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1346 drbd_resume_al(mdev);
1347
ba280c09
LE
1348 /* remember last connect and attach times so request_timer_fn() won't
1349 * kill newly established sessions while we are still trying to thaw
1350 * previously frozen IO */
1351 if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
1352 mdev->last_reconnect_jif = jiffies;
1353 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1354 ns.disk > D_NEGOTIATING)
1355 mdev->last_reattach_jif = jiffies;
1356
b411b363
PR
1357 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1358 if (ascw) {
1359 ascw->os = os;
1360 ascw->ns = ns;
1361 ascw->flags = flags;
1362 ascw->w.cb = w_after_state_ch;
1363 ascw->done = done;
1364 drbd_queue_work(&mdev->data.work, &ascw->w);
1365 } else {
1366 dev_warn(DEV, "Could not kmalloc an ascw\n");
1367 }
1368
1369 return rv;
1370}
1371
1372static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1373{
1374 struct after_state_chg_work *ascw =
1375 container_of(w, struct after_state_chg_work, w);
1376 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1377 if (ascw->flags & CS_WAIT_COMPLETE) {
1378 D_ASSERT(ascw->done != NULL);
1379 complete(ascw->done);
1380 }
1381 kfree(ascw);
1382
1383 return 1;
1384}
1385
1386static void abw_start_sync(struct drbd_conf *mdev, int rv)
1387{
1388 if (rv) {
1389 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1390 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1391 return;
1392 }
1393
1394 switch (mdev->state.conn) {
1395 case C_STARTING_SYNC_T:
1396 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1397 break;
1398 case C_STARTING_SYNC_S:
1399 drbd_start_resync(mdev, C_SYNC_SOURCE);
1400 break;
1401 }
1402}
1403
20ceb2b2
LE
1404int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1405 int (*io_fn)(struct drbd_conf *),
1406 char *why, enum bm_flag flags)
19f843aa
LE
1407{
1408 int rv;
1409
1410 D_ASSERT(current == mdev->worker.task);
1411
1412 /* open coded non-blocking drbd_suspend_io(mdev); */
1413 set_bit(SUSPEND_IO, &mdev->flags);
19f843aa 1414
20ceb2b2 1415 drbd_bm_lock(mdev, why, flags);
19f843aa
LE
1416 rv = io_fn(mdev);
1417 drbd_bm_unlock(mdev);
1418
1419 drbd_resume_io(mdev);
1420
1421 return rv;
1422}
1423
b411b363
PR
1424/**
1425 * after_state_ch() - Perform after state change actions that may sleep
1426 * @mdev: DRBD device.
1427 * @os: old state.
1428 * @ns: new state.
1429 * @flags: Flags
1430 */
1431static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1432 union drbd_state ns, enum chg_state_flags flags)
1433{
1434 enum drbd_fencing_p fp;
67098930 1435 enum drbd_req_event what = nothing;
fb22c402 1436 union drbd_state nsm = (union drbd_state){ .i = -1 };
b411b363
PR
1437
1438 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1439 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1440 if (mdev->p_uuid)
1441 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1442 }
1443
1444 fp = FP_DONT_CARE;
1445 if (get_ldev(mdev)) {
1446 fp = mdev->ldev->dc.fencing;
1447 put_ldev(mdev);
1448 }
1449
1450 /* Inform userspace about the change... */
1451 drbd_bcast_state(mdev, ns);
1452
1453 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1454 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1455 drbd_khelper(mdev, "pri-on-incon-degr");
1456
1457 /* Here we have the actions that are performed after a
1458 state change. This function might sleep */
1459
dfa8bedb
PR
1460 if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
1461 mod_timer(&mdev->request_timer, jiffies + HZ);
1462
fb22c402
PR
1463 nsm.i = -1;
1464 if (ns.susp_nod) {
3f98688a
PR
1465 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1466 what = resend;
265be2d0 1467
79f16f5d
PR
1468 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1469 ns.disk > D_NEGOTIATING)
3f98688a 1470 what = restart_frozen_disk_io;
fb22c402 1471
3f98688a
PR
1472 if (what != nothing)
1473 nsm.susp_nod = 0;
265be2d0
PR
1474 }
1475
fb22c402 1476 if (ns.susp_fen) {
43a5182c
PR
1477 /* case1: The outdate peer handler is successful: */
1478 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
b411b363 1479 tl_clear(mdev);
43a5182c
PR
1480 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1481 drbd_uuid_new_current(mdev);
1482 clear_bit(NEW_CUR_UUID, &mdev->flags);
43a5182c 1483 }
b411b363 1484 spin_lock_irq(&mdev->req_lock);
fb22c402 1485 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
b411b363
PR
1486 spin_unlock_irq(&mdev->req_lock);
1487 }
43a5182c
PR
1488 /* case2: The connection was established again: */
1489 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1490 clear_bit(NEW_CUR_UUID, &mdev->flags);
67098930 1491 what = resend;
fb22c402 1492 nsm.susp_fen = 0;
43a5182c 1493 }
b411b363 1494 }
67098930
PR
1495
1496 if (what != nothing) {
1497 spin_lock_irq(&mdev->req_lock);
1498 _tl_restart(mdev, what);
fb22c402
PR
1499 nsm.i &= mdev->state.i;
1500 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
67098930 1501 spin_unlock_irq(&mdev->req_lock);
b411b363 1502 }
67098930 1503
5a22db89
LE
1504 /* Became sync source. With protocol >= 96, we still need to send out
1505 * the sync uuid now. Need to do that before any drbd_send_state, or
1506 * the other side may go "paused sync" before receiving the sync uuids,
1507 * which is unexpected. */
1508 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1509 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1510 mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1511 drbd_gen_and_send_sync_uuid(mdev);
1512 put_ldev(mdev);
1513 }
1514
b411b363
PR
1515 /* Do not change the order of the if above and the two below... */
1516 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
0029d624
LE
1517 /* we probably will start a resync soon.
1518 * make sure those things are properly reset. */
1519 mdev->rs_total = 0;
1520 mdev->rs_failed = 0;
1521 atomic_set(&mdev->rs_pending_cnt, 0);
1522 drbd_rs_cancel_all(mdev);
1523
b411b363 1524 drbd_send_uuids(mdev);
f479ea06 1525 drbd_send_state(mdev, ns);
b411b363 1526 }
54b956ab
LE
1527 /* No point in queuing send_bitmap if we don't have a connection
1528 * anymore, so check also the _current_ state, not only the new state
1529 * at the time this work was queued. */
1530 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1531 mdev->state.conn == C_WF_BITMAP_S)
1532 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
20ceb2b2
LE
1533 "send_bitmap (WFBitMapS)",
1534 BM_LOCKED_TEST_ALLOWED);
b411b363
PR
1535
1536 /* Lost contact to peer's copy of the data */
1537 if ((os.pdsk >= D_INCONSISTENT &&
1538 os.pdsk != D_UNKNOWN &&
1539 os.pdsk != D_OUTDATED)
1540 && (ns.pdsk < D_INCONSISTENT ||
1541 ns.pdsk == D_UNKNOWN ||
1542 ns.pdsk == D_OUTDATED)) {
b411b363
PR
1543 if (get_ldev(mdev)) {
1544 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
2c8d1967 1545 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
fb22c402 1546 if (is_susp(mdev->state)) {
43a5182c
PR
1547 set_bit(NEW_CUR_UUID, &mdev->flags);
1548 } else {
1549 drbd_uuid_new_current(mdev);
1550 drbd_send_uuids(mdev);
1551 }
2c8d1967 1552 }
b411b363
PR
1553 put_ldev(mdev);
1554 }
1555 }
1556
1557 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
bca482e9
PR
1558 if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
1559 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
2c8d1967 1560 drbd_uuid_new_current(mdev);
18a50fa2
PR
1561 drbd_send_uuids(mdev);
1562 }
b411b363
PR
1563 /* D_DISKLESS Peer becomes secondary */
1564 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
20ceb2b2
LE
1565 /* We may still be Primary ourselves.
1566 * No harm done if the bitmap still changes,
1567 * redirtied pages will follow later. */
1568 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1569 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
19f843aa
LE
1570 put_ldev(mdev);
1571 }
1572
06d33e96
LE
1573 /* Write out all changed bits on demote.
1574 * Though, no need to da that just yet
1575 * if there is a resync going on still */
1576 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1577 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
20ceb2b2
LE
1578 /* No changes to the bitmap expected this time, so assert that,
1579 * even though no harm was done if it did change. */
1580 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1581 "demote", BM_LOCKED_TEST_ALLOWED);
b411b363
PR
1582 put_ldev(mdev);
1583 }
1584
1585 /* Last part of the attaching process ... */
1586 if (ns.conn >= C_CONNECTED &&
1587 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
e89b591c 1588 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
b411b363 1589 drbd_send_uuids(mdev);
f479ea06 1590 drbd_send_state(mdev, ns);
b411b363
PR
1591 }
1592
1593 /* We want to pause/continue resync, tell peer. */
1594 if (ns.conn >= C_CONNECTED &&
1595 ((os.aftr_isp != ns.aftr_isp) ||
1596 (os.user_isp != ns.user_isp)))
f479ea06 1597 drbd_send_state(mdev, ns);
b411b363
PR
1598
1599 /* In case one of the isp bits got set, suspend other devices. */
1600 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1601 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1602 suspend_other_sg(mdev);
1603
1604 /* Make sure the peer gets informed about eventual state
1605 changes (ISP bits) while we were in WFReportParams. */
1606 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
f479ea06 1607 drbd_send_state(mdev, ns);
b411b363 1608
67531718 1609 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
f479ea06 1610 drbd_send_state(mdev, ns);
67531718 1611
b411b363
PR
1612 /* We are in the progress to start a full sync... */
1613 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1614 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
20ceb2b2
LE
1615 /* no other bitmap changes expected during this phase */
1616 drbd_queue_bitmap_io(mdev,
1617 &drbd_bmio_set_n_write, &abw_start_sync,
1618 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
b411b363
PR
1619
1620 /* We are invalidating our self... */
1621 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1622 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
20ceb2b2
LE
1623 /* other bitmap operation expected during this phase */
1624 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1625 "set_n_write from invalidate", BM_LOCKED_MASK);
b411b363 1626
82f59cc6
LE
1627 /* first half of local IO error, failure to attach,
1628 * or administrative detach */
1629 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
7caacb69
PR
1630 enum drbd_io_error_p eh = EP_PASS_ON;
1631 int was_io_error = 0;
82f59cc6 1632 /* corresponding get_ldev was in __drbd_set_state, to serialize
7caacb69
PR
1633 * our cleanup here with the transition to D_DISKLESS.
1634 * But is is still not save to dreference ldev here, since
1635 * we might come from an failed Attach before ldev was set. */
1636 if (mdev->ldev) {
1637 eh = mdev->ldev->dc.on_io_error;
1638 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1639
63a6d0bb
LE
1640 if (was_io_error && eh == EP_CALL_HELPER)
1641 drbd_khelper(mdev, "local-io-error");
1642
383606e0
LE
1643 /* Immediately allow completion of all application IO,
1644 * that waits for completion from the local disk,
1645 * if this was a force-detach due to disk_timeout
1646 * or administrator request (drbdsetup detach --force).
1647 * Do NOT abort otherwise.
1648 * Aborting local requests may cause serious problems,
1649 * if requests are completed to upper layers already,
1650 * and then later the already submitted local bio completes.
1651 * This can cause DMA into former bio pages that meanwhile
1652 * have been re-used for other things.
1653 * So aborting local requests may cause crashes,
1654 * or even worse, silent data corruption.
1655 */
1656 if (test_and_clear_bit(FORCE_DETACH, &mdev->flags))
1657 tl_abort_disk_io(mdev);
7caacb69
PR
1658
1659 /* current state still has to be D_FAILED,
1660 * there is only one way out: to D_DISKLESS,
1661 * and that may only happen after our put_ldev below. */
1662 if (mdev->state.disk != D_FAILED)
1663 dev_err(DEV,
1664 "ASSERT FAILED: disk is %s during detach\n",
1665 drbd_disk_str(mdev->state.disk));
1666
1667 if (ns.conn >= C_CONNECTED)
1668 drbd_send_state(mdev, ns);
1669
1670 drbd_rs_cancel_all(mdev);
1671
1672 /* In case we want to get something to stable storage still,
1673 * this may be the last chance.
1674 * Following put_ldev may transition to D_DISKLESS. */
1675 drbd_md_sync(mdev);
1676 }
82f59cc6 1677 put_ldev(mdev);
e9e6f3ec 1678 }
b411b363 1679
82f59cc6
LE
1680 /* second half of local IO error, failure to attach,
1681 * or administrative detach,
1682 * after local_cnt references have reached zero again */
1683 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1684 /* We must still be diskless,
1685 * re-attach has to be serialized with this! */
1686 if (mdev->state.disk != D_DISKLESS)
1687 dev_err(DEV,
1688 "ASSERT FAILED: disk is %s while going diskless\n",
1689 drbd_disk_str(mdev->state.disk));
e9e6f3ec 1690
4afc433c
PR
1691 if (ns.conn >= C_CONNECTED)
1692 drbd_send_state(mdev, ns);
1693
82f59cc6 1694 /* corresponding get_ldev in __drbd_set_state
25985edc 1695 * this may finally trigger drbd_ldev_destroy. */
82f59cc6 1696 put_ldev(mdev);
b411b363
PR
1697 }
1698
738a84b2 1699 /* Notify peer that I had a local IO error, and did not detached.. */
4afc433c 1700 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
f479ea06 1701 drbd_send_state(mdev, ns);
738a84b2 1702
b411b363
PR
1703 /* Disks got bigger while they were detached */
1704 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1705 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1706 if (ns.conn == C_CONNECTED)
1707 resync_after_online_grow(mdev);
1708 }
1709
1710 /* A resync finished or aborted, wake paused devices... */
1711 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1712 (os.peer_isp && !ns.peer_isp) ||
1713 (os.user_isp && !ns.user_isp))
1714 resume_next_sg(mdev);
1715
af85e8e8
LE
1716 /* sync target done with resync. Explicitly notify peer, even though
1717 * it should (at least for non-empty resyncs) already know itself. */
1718 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
f479ea06 1719 drbd_send_state(mdev, ns);
af85e8e8 1720
197296ff
PR
1721 /* Wake up role changes, that were delayed because of connection establishing */
1722 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
1723 clear_bit(STATE_SENT, &mdev->flags);
1724 wake_up(&mdev->state_wait);
1725 }
1726
79a30d2d
LE
1727 /* This triggers bitmap writeout of potentially still unwritten pages
1728 * if the resync finished cleanly, or aborted because of peer disk
20ceb2b2 1729 * failure, or because of connection loss.
79a30d2d
LE
1730 * For resync aborted because of local disk failure, we cannot do
1731 * any bitmap writeout anymore.
20ceb2b2 1732 * No harm done if some bits change during this phase.
79a30d2d 1733 */
20ceb2b2 1734 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
0e8488ad
LE
1735 drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
1736 "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
79a30d2d
LE
1737 put_ldev(mdev);
1738 }
02851e9f 1739
f70b3511 1740 /* free tl_hash if we Got thawed and are C_STANDALONE */
fb22c402 1741 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
f70b3511
PR
1742 drbd_free_tl_hash(mdev);
1743
b411b363
PR
1744 /* Upon network connection, we need to start the receiver */
1745 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1746 drbd_thread_start(&mdev->receiver);
1747
1748 /* Terminate worker thread if we are unconfigured - it will be
1749 restarted as needed... */
1750 if (ns.disk == D_DISKLESS &&
1751 ns.conn == C_STANDALONE &&
1752 ns.role == R_SECONDARY) {
1753 if (os.aftr_isp != ns.aftr_isp)
1754 resume_next_sg(mdev);
1755 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1756 if (test_bit(DEVICE_DYING, &mdev->flags))
1757 drbd_thread_stop_nowait(&mdev->worker);
1758 }
1759
1760 drbd_md_sync(mdev);
1761}
1762
1763
1764static int drbd_thread_setup(void *arg)
1765{
1766 struct drbd_thread *thi = (struct drbd_thread *) arg;
1767 struct drbd_conf *mdev = thi->mdev;
1768 unsigned long flags;
1769 int retval;
1770
1771restart:
1772 retval = thi->function(thi);
1773
1774 spin_lock_irqsave(&thi->t_lock, flags);
1775
1776 /* if the receiver has been "Exiting", the last thing it did
1777 * was set the conn state to "StandAlone",
1778 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1779 * and receiver thread will be "started".
1780 * drbd_thread_start needs to set "Restarting" in that case.
1781 * t_state check and assignment needs to be within the same spinlock,
1782 * so either thread_start sees Exiting, and can remap to Restarting,
1783 * or thread_start see None, and can proceed as normal.
1784 */
1785
1786 if (thi->t_state == Restarting) {
1787 dev_info(DEV, "Restarting %s\n", current->comm);
1788 thi->t_state = Running;
1789 spin_unlock_irqrestore(&thi->t_lock, flags);
1790 goto restart;
1791 }
1792
1793 thi->task = NULL;
1794 thi->t_state = None;
1795 smp_mb();
1796 complete(&thi->stop);
1797 spin_unlock_irqrestore(&thi->t_lock, flags);
1798
1799 dev_info(DEV, "Terminating %s\n", current->comm);
1800
1801 /* Release mod reference taken when thread was started */
1802 module_put(THIS_MODULE);
1803 return retval;
1804}
1805
1806static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1807 int (*func) (struct drbd_thread *))
1808{
1809 spin_lock_init(&thi->t_lock);
1810 thi->task = NULL;
1811 thi->t_state = None;
1812 thi->function = func;
1813 thi->mdev = mdev;
1814}
1815
1816int drbd_thread_start(struct drbd_thread *thi)
1817{
1818 struct drbd_conf *mdev = thi->mdev;
1819 struct task_struct *nt;
1820 unsigned long flags;
1821
1822 const char *me =
1823 thi == &mdev->receiver ? "receiver" :
1824 thi == &mdev->asender ? "asender" :
1825 thi == &mdev->worker ? "worker" : "NONSENSE";
1826
1827 /* is used from state engine doing drbd_thread_stop_nowait,
1828 * while holding the req lock irqsave */
1829 spin_lock_irqsave(&thi->t_lock, flags);
1830
1831 switch (thi->t_state) {
1832 case None:
1833 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1834 me, current->comm, current->pid);
1835
1836 /* Get ref on module for thread - this is released when thread exits */
1837 if (!try_module_get(THIS_MODULE)) {
1838 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1839 spin_unlock_irqrestore(&thi->t_lock, flags);
81e84650 1840 return false;
b411b363
PR
1841 }
1842
1843 init_completion(&thi->stop);
1844 D_ASSERT(thi->task == NULL);
1845 thi->reset_cpu_mask = 1;
1846 thi->t_state = Running;
1847 spin_unlock_irqrestore(&thi->t_lock, flags);
1848 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1849
1850 nt = kthread_create(drbd_thread_setup, (void *) thi,
1851 "drbd%d_%s", mdev_to_minor(mdev), me);
1852
1853 if (IS_ERR(nt)) {
1854 dev_err(DEV, "Couldn't start thread\n");
1855
1856 module_put(THIS_MODULE);
81e84650 1857 return false;
b411b363
PR
1858 }
1859 spin_lock_irqsave(&thi->t_lock, flags);
1860 thi->task = nt;
1861 thi->t_state = Running;
1862 spin_unlock_irqrestore(&thi->t_lock, flags);
1863 wake_up_process(nt);
1864 break;
1865 case Exiting:
1866 thi->t_state = Restarting;
1867 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1868 me, current->comm, current->pid);
1869 /* fall through */
1870 case Running:
1871 case Restarting:
1872 default:
1873 spin_unlock_irqrestore(&thi->t_lock, flags);
1874 break;
1875 }
1876
81e84650 1877 return true;
b411b363
PR
1878}
1879
1880
1881void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1882{
1883 unsigned long flags;
1884
1885 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1886
1887 /* may be called from state engine, holding the req lock irqsave */
1888 spin_lock_irqsave(&thi->t_lock, flags);
1889
1890 if (thi->t_state == None) {
1891 spin_unlock_irqrestore(&thi->t_lock, flags);
1892 if (restart)
1893 drbd_thread_start(thi);
1894 return;
1895 }
1896
1897 if (thi->t_state != ns) {
1898 if (thi->task == NULL) {
1899 spin_unlock_irqrestore(&thi->t_lock, flags);
1900 return;
1901 }
1902
1903 thi->t_state = ns;
1904 smp_mb();
1905 init_completion(&thi->stop);
1906 if (thi->task != current)
1907 force_sig(DRBD_SIGKILL, thi->task);
1908
1909 }
1910
1911 spin_unlock_irqrestore(&thi->t_lock, flags);
1912
1913 if (wait)
1914 wait_for_completion(&thi->stop);
1915}
1916
1917#ifdef CONFIG_SMP
1918/**
1919 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1920 * @mdev: DRBD device.
1921 *
1922 * Forces all threads of a device onto the same CPU. This is beneficial for
1923 * DRBD's performance. May be overwritten by user's configuration.
1924 */
1925void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1926{
1927 int ord, cpu;
1928
1929 /* user override. */
1930 if (cpumask_weight(mdev->cpu_mask))
1931 return;
1932
1933 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1934 for_each_online_cpu(cpu) {
1935 if (ord-- == 0) {
1936 cpumask_set_cpu(cpu, mdev->cpu_mask);
1937 return;
1938 }
1939 }
1940 /* should not be reached */
1941 cpumask_setall(mdev->cpu_mask);
1942}
1943
1944/**
1945 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1946 * @mdev: DRBD device.
1947 *
1948 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1949 * prematurely.
1950 */
1951void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1952{
1953 struct task_struct *p = current;
1954 struct drbd_thread *thi =
1955 p == mdev->asender.task ? &mdev->asender :
1956 p == mdev->receiver.task ? &mdev->receiver :
1957 p == mdev->worker.task ? &mdev->worker :
1958 NULL;
1959 ERR_IF(thi == NULL)
1960 return;
1961 if (!thi->reset_cpu_mask)
1962 return;
1963 thi->reset_cpu_mask = 0;
1964 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1965}
1966#endif
1967
1968/* the appropriate socket mutex must be held already */
1969int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
0b70a13d 1970 enum drbd_packets cmd, struct p_header80 *h,
b411b363
PR
1971 size_t size, unsigned msg_flags)
1972{
1973 int sent, ok;
1974
81e84650
AG
1975 ERR_IF(!h) return false;
1976 ERR_IF(!size) return false;
b411b363
PR
1977
1978 h->magic = BE_DRBD_MAGIC;
1979 h->command = cpu_to_be16(cmd);
0b70a13d 1980 h->length = cpu_to_be16(size-sizeof(struct p_header80));
b411b363 1981
b411b363
PR
1982 sent = drbd_send(mdev, sock, h, size, msg_flags);
1983
1984 ok = (sent == size);
0ddc5549
LE
1985 if (!ok && !signal_pending(current))
1986 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
b411b363
PR
1987 cmdname(cmd), (int)size, sent);
1988 return ok;
1989}
1990
1991/* don't pass the socket. we may only look at it
1992 * when we hold the appropriate socket mutex.
1993 */
1994int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
0b70a13d 1995 enum drbd_packets cmd, struct p_header80 *h, size_t size)
b411b363
PR
1996{
1997 int ok = 0;
1998 struct socket *sock;
1999
2000 if (use_data_socket) {
2001 mutex_lock(&mdev->data.mutex);
2002 sock = mdev->data.socket;
2003 } else {
2004 mutex_lock(&mdev->meta.mutex);
2005 sock = mdev->meta.socket;
2006 }
2007
2008 /* drbd_disconnect() could have called drbd_free_sock()
2009 * while we were waiting in down()... */
2010 if (likely(sock != NULL))
2011 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
2012
2013 if (use_data_socket)
2014 mutex_unlock(&mdev->data.mutex);
2015 else
2016 mutex_unlock(&mdev->meta.mutex);
2017 return ok;
2018}
2019
2020int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
2021 size_t size)
2022{
0b70a13d 2023 struct p_header80 h;
b411b363
PR
2024 int ok;
2025
2026 h.magic = BE_DRBD_MAGIC;
2027 h.command = cpu_to_be16(cmd);
2028 h.length = cpu_to_be16(size);
2029
2030 if (!drbd_get_data_sock(mdev))
2031 return 0;
2032
b411b363
PR
2033 ok = (sizeof(h) ==
2034 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
2035 ok = ok && (size ==
2036 drbd_send(mdev, mdev->data.socket, data, size, 0));
2037
2038 drbd_put_data_sock(mdev);
2039
2040 return ok;
2041}
2042
2043int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
2044{
8e26f9cc 2045 struct p_rs_param_95 *p;
b411b363
PR
2046 struct socket *sock;
2047 int size, rv;
2048 const int apv = mdev->agreed_pro_version;
2049
2050 size = apv <= 87 ? sizeof(struct p_rs_param)
2051 : apv == 88 ? sizeof(struct p_rs_param)
2052 + strlen(mdev->sync_conf.verify_alg) + 1
8e26f9cc
PR
2053 : apv <= 94 ? sizeof(struct p_rs_param_89)
2054 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
b411b363
PR
2055
2056 /* used from admin command context and receiver/worker context.
2057 * to avoid kmalloc, grab the socket right here,
2058 * then use the pre-allocated sbuf there */
2059 mutex_lock(&mdev->data.mutex);
2060 sock = mdev->data.socket;
2061
2062 if (likely(sock != NULL)) {
2063 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
2064
8e26f9cc 2065 p = &mdev->data.sbuf.rs_param_95;
b411b363
PR
2066
2067 /* initialize verify_alg and csums_alg */
2068 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2069
2070 p->rate = cpu_to_be32(sc->rate);
8e26f9cc
PR
2071 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
2072 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
2073 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
2074 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
b411b363
PR
2075
2076 if (apv >= 88)
2077 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
2078 if (apv >= 89)
2079 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
2080
2081 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
2082 } else
2083 rv = 0; /* not ok */
2084
2085 mutex_unlock(&mdev->data.mutex);
2086
2087 return rv;
2088}
2089
2090int drbd_send_protocol(struct drbd_conf *mdev)
2091{
2092 struct p_protocol *p;
cf14c2e9 2093 int size, cf, rv;
b411b363
PR
2094
2095 size = sizeof(struct p_protocol);
2096
2097 if (mdev->agreed_pro_version >= 87)
2098 size += strlen(mdev->net_conf->integrity_alg) + 1;
2099
2100 /* we must not recurse into our own queue,
2101 * as that is blocked during handshake */
2102 p = kmalloc(size, GFP_NOIO);
2103 if (p == NULL)
2104 return 0;
2105
2106 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
2107 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
2108 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
2109 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
b411b363
PR
2110 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2111
cf14c2e9
PR
2112 cf = 0;
2113 if (mdev->net_conf->want_lose)
2114 cf |= CF_WANT_LOSE;
2115 if (mdev->net_conf->dry_run) {
2116 if (mdev->agreed_pro_version >= 92)
2117 cf |= CF_DRY_RUN;
2118 else {
2119 dev_err(DEV, "--dry-run is not supported by peer");
7ac314c8 2120 kfree(p);
148efa16 2121 return -1;
cf14c2e9
PR
2122 }
2123 }
2124 p->conn_flags = cpu_to_be32(cf);
2125
b411b363
PR
2126 if (mdev->agreed_pro_version >= 87)
2127 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2128
2129 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
0b70a13d 2130 (struct p_header80 *)p, size);
b411b363
PR
2131 kfree(p);
2132 return rv;
2133}
2134
2135int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2136{
2137 struct p_uuids p;
2138 int i;
2139
2140 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2141 return 1;
2142
2143 for (i = UI_CURRENT; i < UI_SIZE; i++)
2144 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2145
2146 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2147 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2148 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2149 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2150 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2151 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2152
2153 put_ldev(mdev);
2154
2155 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
0b70a13d 2156 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2157}
2158
2159int drbd_send_uuids(struct drbd_conf *mdev)
2160{
2161 return _drbd_send_uuids(mdev, 0);
2162}
2163
2164int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2165{
2166 return _drbd_send_uuids(mdev, 8);
2167}
2168
62b0da3a
LE
2169void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2170{
2171 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2172 u64 *uuid = mdev->ldev->md.uuid;
2173 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2174 text,
2175 (unsigned long long)uuid[UI_CURRENT],
2176 (unsigned long long)uuid[UI_BITMAP],
2177 (unsigned long long)uuid[UI_HISTORY_START],
2178 (unsigned long long)uuid[UI_HISTORY_END]);
2179 put_ldev(mdev);
2180 } else {
2181 dev_info(DEV, "%s effective data uuid: %016llX\n",
2182 text,
2183 (unsigned long long)mdev->ed_uuid);
2184 }
2185}
2186
5a22db89 2187int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
b411b363
PR
2188{
2189 struct p_rs_uuid p;
5a22db89
LE
2190 u64 uuid;
2191
2192 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
b411b363 2193
5ba3dac5
PR
2194 uuid = mdev->ldev->md.uuid[UI_BITMAP];
2195 if (uuid && uuid != UUID_JUST_CREATED)
2196 uuid = uuid + UUID_NEW_BM_OFFSET;
2197 else
2198 get_random_bytes(&uuid, sizeof(u64));
5a22db89 2199 drbd_uuid_set(mdev, UI_BITMAP, uuid);
62b0da3a 2200 drbd_print_uuids(mdev, "updated sync UUID");
5a22db89
LE
2201 drbd_md_sync(mdev);
2202 p.uuid = cpu_to_be64(uuid);
b411b363
PR
2203
2204 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
0b70a13d 2205 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2206}
2207
e89b591c 2208int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
b411b363
PR
2209{
2210 struct p_sizes p;
2211 sector_t d_size, u_size;
99432fcc 2212 int q_order_type, max_bio_size;
b411b363
PR
2213 int ok;
2214
2215 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2216 D_ASSERT(mdev->ldev->backing_bdev);
2217 d_size = drbd_get_max_capacity(mdev->ldev);
2218 u_size = mdev->ldev->dc.disk_size;
2219 q_order_type = drbd_queue_order_type(mdev);
99432fcc
PR
2220 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2221 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
b411b363
PR
2222 put_ldev(mdev);
2223 } else {
2224 d_size = 0;
2225 u_size = 0;
2226 q_order_type = QUEUE_ORDERED_NONE;
99432fcc 2227 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
b411b363
PR
2228 }
2229
6809384c
PR
2230 /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
2231 if (mdev->agreed_pro_version <= 94)
2232 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
2233
b411b363
PR
2234 p.d_size = cpu_to_be64(d_size);
2235 p.u_size = cpu_to_be64(u_size);
2236 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
99432fcc 2237 p.max_bio_size = cpu_to_be32(max_bio_size);
e89b591c
PR
2238 p.queue_order_type = cpu_to_be16(q_order_type);
2239 p.dds_flags = cpu_to_be16(flags);
b411b363
PR
2240
2241 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
0b70a13d 2242 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2243 return ok;
2244}
2245
2246/**
f479ea06 2247 * drbd_send_current_state() - Sends the drbd state to the peer
b411b363
PR
2248 * @mdev: DRBD device.
2249 */
f479ea06 2250int drbd_send_current_state(struct drbd_conf *mdev)
b411b363
PR
2251{
2252 struct socket *sock;
2253 struct p_state p;
2254 int ok = 0;
2255
2256 /* Grab state lock so we wont send state if we're in the middle
2257 * of a cluster wide state change on another thread */
2258 drbd_state_lock(mdev);
2259
2260 mutex_lock(&mdev->data.mutex);
2261
2262 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2263 sock = mdev->data.socket;
2264
2265 if (likely(sock != NULL)) {
2266 ok = _drbd_send_cmd(mdev, sock, P_STATE,
0b70a13d 2267 (struct p_header80 *)&p, sizeof(p), 0);
b411b363
PR
2268 }
2269
2270 mutex_unlock(&mdev->data.mutex);
2271
2272 drbd_state_unlock(mdev);
2273 return ok;
2274}
2275
f479ea06
LE
2276/**
2277 * drbd_send_state() - After a state change, sends the new state to the peer
2278 * @mdev: DRBD device.
2279 * @state: the state to send, not necessarily the current state.
2280 *
2281 * Each state change queues an "after_state_ch" work, which will eventually
2282 * send the resulting new state to the peer. If more state changes happen
2283 * between queuing and processing of the after_state_ch work, we still
2284 * want to send each intermediary state in the order it occurred.
2285 */
2286int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
2287{
2288 struct socket *sock;
2289 struct p_state p;
2290 int ok = 0;
2291
2292 mutex_lock(&mdev->data.mutex);
2293
2294 p.state = cpu_to_be32(state.i);
2295 sock = mdev->data.socket;
2296
2297 if (likely(sock != NULL)) {
2298 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2299 (struct p_header80 *)&p, sizeof(p), 0);
2300 }
2301
2302 mutex_unlock(&mdev->data.mutex);
2303
2304 return ok;
2305}
2306
b411b363
PR
2307int drbd_send_state_req(struct drbd_conf *mdev,
2308 union drbd_state mask, union drbd_state val)
2309{
2310 struct p_req_state p;
2311
2312 p.mask = cpu_to_be32(mask.i);
2313 p.val = cpu_to_be32(val.i);
2314
2315 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
0b70a13d 2316 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2317}
2318
bf885f8a 2319int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
b411b363
PR
2320{
2321 struct p_req_state_reply p;
2322
2323 p.retcode = cpu_to_be32(retcode);
2324
2325 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
0b70a13d 2326 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2327}
2328
2329int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2330 struct p_compressed_bm *p,
2331 struct bm_xfer_ctx *c)
2332{
2333 struct bitstream bs;
2334 unsigned long plain_bits;
2335 unsigned long tmp;
2336 unsigned long rl;
2337 unsigned len;
2338 unsigned toggle;
2339 int bits;
2340
2341 /* may we use this feature? */
2342 if ((mdev->sync_conf.use_rle == 0) ||
2343 (mdev->agreed_pro_version < 90))
2344 return 0;
2345
2346 if (c->bit_offset >= c->bm_bits)
2347 return 0; /* nothing to do. */
2348
2349 /* use at most thus many bytes */
2350 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2351 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2352 /* plain bits covered in this code string */
2353 plain_bits = 0;
2354
2355 /* p->encoding & 0x80 stores whether the first run length is set.
2356 * bit offset is implicit.
2357 * start with toggle == 2 to be able to tell the first iteration */
2358 toggle = 2;
2359
2360 /* see how much plain bits we can stuff into one packet
2361 * using RLE and VLI. */
2362 do {
2363 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2364 : _drbd_bm_find_next(mdev, c->bit_offset);
2365 if (tmp == -1UL)
2366 tmp = c->bm_bits;
2367 rl = tmp - c->bit_offset;
2368
2369 if (toggle == 2) { /* first iteration */
2370 if (rl == 0) {
2371 /* the first checked bit was set,
2372 * store start value, */
2373 DCBP_set_start(p, 1);
2374 /* but skip encoding of zero run length */
2375 toggle = !toggle;
2376 continue;
2377 }
2378 DCBP_set_start(p, 0);
2379 }
2380
2381 /* paranoia: catch zero runlength.
2382 * can only happen if bitmap is modified while we scan it. */
2383 if (rl == 0) {
2384 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2385 "t:%u bo:%lu\n", toggle, c->bit_offset);
2386 return -1;
2387 }
2388
2389 bits = vli_encode_bits(&bs, rl);
2390 if (bits == -ENOBUFS) /* buffer full */
2391 break;
2392 if (bits <= 0) {
2393 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2394 return 0;
2395 }
2396
2397 toggle = !toggle;
2398 plain_bits += rl;
2399 c->bit_offset = tmp;
2400 } while (c->bit_offset < c->bm_bits);
2401
2402 len = bs.cur.b - p->code + !!bs.cur.bit;
2403
2404 if (plain_bits < (len << 3)) {
2405 /* incompressible with this method.
2406 * we need to rewind both word and bit position. */
2407 c->bit_offset -= plain_bits;
2408 bm_xfer_ctx_bit_to_word_offset(c);
2409 c->bit_offset = c->word_offset * BITS_PER_LONG;
2410 return 0;
2411 }
2412
2413 /* RLE + VLI was able to compress it just fine.
2414 * update c->word_offset. */
2415 bm_xfer_ctx_bit_to_word_offset(c);
2416
2417 /* store pad_bits */
2418 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2419
2420 return len;
2421}
2422
f70af118
AG
2423/**
2424 * send_bitmap_rle_or_plain
2425 *
2426 * Return 0 when done, 1 when another iteration is needed, and a negative error
2427 * code upon failure.
2428 */
2429static int
b411b363 2430send_bitmap_rle_or_plain(struct drbd_conf *mdev,
f70af118 2431 struct p_header80 *h, struct bm_xfer_ctx *c)
b411b363
PR
2432{
2433 struct p_compressed_bm *p = (void*)h;
2434 unsigned long num_words;
2435 int len;
2436 int ok;
2437
2438 len = fill_bitmap_rle_bits(mdev, p, c);
2439
2440 if (len < 0)
f70af118 2441 return -EIO;
b411b363
PR
2442
2443 if (len) {
2444 DCBP_set_code(p, RLE_VLI_Bits);
2445 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2446 sizeof(*p) + len, 0);
2447
2448 c->packets[0]++;
2449 c->bytes[0] += sizeof(*p) + len;
2450
2451 if (c->bit_offset >= c->bm_bits)
2452 len = 0; /* DONE */
2453 } else {
2454 /* was not compressible.
2455 * send a buffer full of plain text bits instead. */
2456 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2457 len = num_words * sizeof(long);
2458 if (len)
2459 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2460 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
0b70a13d 2461 h, sizeof(struct p_header80) + len, 0);
b411b363
PR
2462 c->word_offset += num_words;
2463 c->bit_offset = c->word_offset * BITS_PER_LONG;
2464
2465 c->packets[1]++;
0b70a13d 2466 c->bytes[1] += sizeof(struct p_header80) + len;
b411b363
PR
2467
2468 if (c->bit_offset > c->bm_bits)
2469 c->bit_offset = c->bm_bits;
2470 }
f70af118
AG
2471 if (ok) {
2472 if (len == 0) {
2473 INFO_bm_xfer_stats(mdev, "send", c);
2474 return 0;
2475 } else
2476 return 1;
2477 }
2478 return -EIO;
b411b363
PR
2479}
2480
2481/* See the comment at receive_bitmap() */
2482int _drbd_send_bitmap(struct drbd_conf *mdev)
2483{
2484 struct bm_xfer_ctx c;
0b70a13d 2485 struct p_header80 *p;
f70af118 2486 int err;
b411b363 2487
81e84650 2488 ERR_IF(!mdev->bitmap) return false;
b411b363
PR
2489
2490 /* maybe we should use some per thread scratch page,
2491 * and allocate that during initial device creation? */
0b70a13d 2492 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
b411b363
PR
2493 if (!p) {
2494 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
81e84650 2495 return false;
b411b363
PR
2496 }
2497
2498 if (get_ldev(mdev)) {
2499 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2500 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2501 drbd_bm_set_all(mdev);
2502 if (drbd_bm_write(mdev)) {
2503 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2504 * but otherwise process as per normal - need to tell other
2505 * side that a full resync is required! */
2506 dev_err(DEV, "Failed to write bitmap to disk!\n");
2507 } else {
2508 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2509 drbd_md_sync(mdev);
2510 }
2511 }
2512 put_ldev(mdev);
2513 }
2514
2515 c = (struct bm_xfer_ctx) {
2516 .bm_bits = drbd_bm_bits(mdev),
2517 .bm_words = drbd_bm_words(mdev),
2518 };
2519
2520 do {
f70af118
AG
2521 err = send_bitmap_rle_or_plain(mdev, p, &c);
2522 } while (err > 0);
b411b363
PR
2523
2524 free_page((unsigned long) p);
f70af118 2525 return err == 0;
b411b363
PR
2526}
2527
2528int drbd_send_bitmap(struct drbd_conf *mdev)
2529{
2530 int err;
2531
2532 if (!drbd_get_data_sock(mdev))
2533 return -1;
2534 err = !_drbd_send_bitmap(mdev);
2535 drbd_put_data_sock(mdev);
2536 return err;
2537}
2538
2539int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2540{
2541 int ok;
2542 struct p_barrier_ack p;
2543
2544 p.barrier = barrier_nr;
2545 p.set_size = cpu_to_be32(set_size);
2546
2547 if (mdev->state.conn < C_CONNECTED)
81e84650 2548 return false;
b411b363 2549 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
0b70a13d 2550 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2551 return ok;
2552}
2553
2554/**
2555 * _drbd_send_ack() - Sends an ack packet
2556 * @mdev: DRBD device.
2557 * @cmd: Packet command code.
2558 * @sector: sector, needs to be in big endian byte order
2559 * @blksize: size in byte, needs to be in big endian byte order
2560 * @block_id: Id, big endian byte order
2561 */
2562static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2563 u64 sector,
2564 u32 blksize,
2565 u64 block_id)
2566{
2567 int ok;
2568 struct p_block_ack p;
2569
2570 p.sector = sector;
2571 p.block_id = block_id;
2572 p.blksize = blksize;
2573 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2574
2575 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
81e84650 2576 return false;
b411b363 2577 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
0b70a13d 2578 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2579 return ok;
2580}
2581
2b2bf214
LE
2582/* dp->sector and dp->block_id already/still in network byte order,
2583 * data_size is payload size according to dp->head,
2584 * and may need to be corrected for digest size. */
b411b363 2585int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2b2bf214 2586 struct p_data *dp, int data_size)
b411b363 2587{
2b2bf214
LE
2588 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2589 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
b411b363
PR
2590 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2591 dp->block_id);
2592}
2593
2594int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2595 struct p_block_req *rp)
2596{
2597 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2598}
2599
2600/**
2601 * drbd_send_ack() - Sends an ack packet
2602 * @mdev: DRBD device.
2603 * @cmd: Packet command code.
2604 * @e: Epoch entry.
2605 */
2606int drbd_send_ack(struct drbd_conf *mdev,
2607 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2608{
2609 return _drbd_send_ack(mdev, cmd,
2610 cpu_to_be64(e->sector),
2611 cpu_to_be32(e->size),
2612 e->block_id);
2613}
2614
2615/* This function misuses the block_id field to signal if the blocks
2616 * are is sync or not. */
2617int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2618 sector_t sector, int blksize, u64 block_id)
2619{
2620 return _drbd_send_ack(mdev, cmd,
2621 cpu_to_be64(sector),
2622 cpu_to_be32(blksize),
2623 cpu_to_be64(block_id));
2624}
2625
2626int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2627 sector_t sector, int size, u64 block_id)
2628{
2629 int ok;
2630 struct p_block_req p;
2631
2632 p.sector = cpu_to_be64(sector);
2633 p.block_id = block_id;
2634 p.blksize = cpu_to_be32(size);
2635
2636 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
0b70a13d 2637 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2638 return ok;
2639}
2640
2641int drbd_send_drequest_csum(struct drbd_conf *mdev,
2642 sector_t sector, int size,
2643 void *digest, int digest_size,
2644 enum drbd_packets cmd)
2645{
2646 int ok;
2647 struct p_block_req p;
2648
2649 p.sector = cpu_to_be64(sector);
2650 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2651 p.blksize = cpu_to_be32(size);
2652
2653 p.head.magic = BE_DRBD_MAGIC;
2654 p.head.command = cpu_to_be16(cmd);
0b70a13d 2655 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
b411b363
PR
2656
2657 mutex_lock(&mdev->data.mutex);
2658
2659 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2660 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2661
2662 mutex_unlock(&mdev->data.mutex);
2663
2664 return ok;
2665}
2666
2667int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2668{
2669 int ok;
2670 struct p_block_req p;
2671
2672 p.sector = cpu_to_be64(sector);
2673 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2674 p.blksize = cpu_to_be32(size);
2675
2676 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
0b70a13d 2677 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2678 return ok;
2679}
2680
2681/* called on sndtimeo
81e84650
AG
2682 * returns false if we should retry,
2683 * true if we think connection is dead
b411b363
PR
2684 */
2685static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2686{
2687 int drop_it;
2688 /* long elapsed = (long)(jiffies - mdev->last_received); */
2689
2690 drop_it = mdev->meta.socket == sock
2691 || !mdev->asender.task
2692 || get_t_state(&mdev->asender) != Running
2693 || mdev->state.conn < C_CONNECTED;
2694
2695 if (drop_it)
81e84650 2696 return true;
b411b363
PR
2697
2698 drop_it = !--mdev->ko_count;
2699 if (!drop_it) {
2700 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2701 current->comm, current->pid, mdev->ko_count);
2702 request_ping(mdev);
2703 }
2704
2705 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2706}
2707
2708/* The idea of sendpage seems to be to put some kind of reference
2709 * to the page into the skb, and to hand it over to the NIC. In
2710 * this process get_page() gets called.
2711 *
2712 * As soon as the page was really sent over the network put_page()
2713 * gets called by some part of the network layer. [ NIC driver? ]
2714 *
2715 * [ get_page() / put_page() increment/decrement the count. If count
2716 * reaches 0 the page will be freed. ]
2717 *
2718 * This works nicely with pages from FSs.
2719 * But this means that in protocol A we might signal IO completion too early!
2720 *
2721 * In order not to corrupt data during a resync we must make sure
2722 * that we do not reuse our own buffer pages (EEs) to early, therefore
2723 * we have the net_ee list.
2724 *
2725 * XFS seems to have problems, still, it submits pages with page_count == 0!
2726 * As a workaround, we disable sendpage on pages
2727 * with page_count == 0 or PageSlab.
2728 */
2729static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 2730 int offset, size_t size, unsigned msg_flags)
b411b363 2731{
ba11ad9a 2732 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
b411b363
PR
2733 kunmap(page);
2734 if (sent == size)
2735 mdev->send_cnt += size>>9;
2736 return sent == size;
2737}
2738
2739static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 2740 int offset, size_t size, unsigned msg_flags)
b411b363
PR
2741{
2742 mm_segment_t oldfs = get_fs();
2743 int sent, ok;
2744 int len = size;
2745
2746 /* e.g. XFS meta- & log-data is in slab pages, which have a
2747 * page_count of 0 and/or have PageSlab() set.
2748 * we cannot use send_page for those, as that does get_page();
2749 * put_page(); and would cause either a VM_BUG directly, or
2750 * __page_cache_release a page that would actually still be referenced
2751 * by someone, leading to some obscure delayed Oops somewhere else. */
2752 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
ba11ad9a 2753 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
b411b363 2754
ba11ad9a 2755 msg_flags |= MSG_NOSIGNAL;
b411b363
PR
2756 drbd_update_congested(mdev);
2757 set_fs(KERNEL_DS);
2758 do {
2759 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2760 offset, len,
ba11ad9a 2761 msg_flags);
b411b363
PR
2762 if (sent == -EAGAIN) {
2763 if (we_should_drop_the_connection(mdev,
2764 mdev->data.socket))
2765 break;
2766 else
2767 continue;
2768 }
2769 if (sent <= 0) {
2770 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2771 __func__, (int)size, len, sent);
2772 break;
2773 }
2774 len -= sent;
2775 offset += sent;
2776 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2777 set_fs(oldfs);
2778 clear_bit(NET_CONGESTED, &mdev->flags);
2779
2780 ok = (len == 0);
2781 if (likely(ok))
2782 mdev->send_cnt += size>>9;
2783 return ok;
2784}
2785
2786static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2787{
2788 struct bio_vec *bvec;
2789 int i;
ba11ad9a 2790 /* hint all but last page with MSG_MORE */
001a8868 2791 bio_for_each_segment(bvec, bio, i) {
b411b363 2792 if (!_drbd_no_send_page(mdev, bvec->bv_page,
ba11ad9a
LE
2793 bvec->bv_offset, bvec->bv_len,
2794 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
b411b363
PR
2795 return 0;
2796 }
2797 return 1;
2798}
2799
2800static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2801{
2802 struct bio_vec *bvec;
2803 int i;
ba11ad9a 2804 /* hint all but last page with MSG_MORE */
001a8868 2805 bio_for_each_segment(bvec, bio, i) {
b411b363 2806 if (!_drbd_send_page(mdev, bvec->bv_page,
ba11ad9a
LE
2807 bvec->bv_offset, bvec->bv_len,
2808 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
b411b363
PR
2809 return 0;
2810 }
b411b363
PR
2811 return 1;
2812}
2813
45bb912b
LE
2814static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2815{
2816 struct page *page = e->pages;
2817 unsigned len = e->size;
ba11ad9a 2818 /* hint all but last page with MSG_MORE */
45bb912b
LE
2819 page_chain_for_each(page) {
2820 unsigned l = min_t(unsigned, len, PAGE_SIZE);
ba11ad9a
LE
2821 if (!_drbd_send_page(mdev, page, 0, l,
2822 page_chain_next(page) ? MSG_MORE : 0))
45bb912b
LE
2823 return 0;
2824 len -= l;
2825 }
2826 return 1;
2827}
2828
76d2e7ec
PR
2829static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2830{
2831 if (mdev->agreed_pro_version >= 95)
2832 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
76d2e7ec
PR
2833 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2834 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2835 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2836 else
721a9602 2837 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
76d2e7ec
PR
2838}
2839
b411b363
PR
2840/* Used to send write requests
2841 * R_PRIMARY -> Peer (P_DATA)
2842 */
2843int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2844{
2845 int ok = 1;
2846 struct p_data p;
2847 unsigned int dp_flags = 0;
2848 void *dgb;
2849 int dgs;
2850
2851 if (!drbd_get_data_sock(mdev))
2852 return 0;
2853
2854 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2855 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2856
d5373389 2857 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
0b70a13d
PR
2858 p.head.h80.magic = BE_DRBD_MAGIC;
2859 p.head.h80.command = cpu_to_be16(P_DATA);
2860 p.head.h80.length =
2861 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2862 } else {
2863 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2864 p.head.h95.command = cpu_to_be16(P_DATA);
2865 p.head.h95.length =
2866 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2867 }
b411b363
PR
2868
2869 p.sector = cpu_to_be64(req->sector);
2870 p.block_id = (unsigned long)req;
671a74e7 2871 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
b411b363 2872
76d2e7ec
PR
2873 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2874
b411b363
PR
2875 if (mdev->state.conn >= C_SYNC_SOURCE &&
2876 mdev->state.conn <= C_PAUSED_SYNC_T)
2877 dp_flags |= DP_MAY_SET_IN_SYNC;
2878
2879 p.dp_flags = cpu_to_be32(dp_flags);
b411b363
PR
2880 set_bit(UNPLUG_REMOTE, &mdev->flags);
2881 ok = (sizeof(p) ==
ba11ad9a 2882 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
b411b363
PR
2883 if (ok && dgs) {
2884 dgb = mdev->int_dig_out;
45bb912b 2885 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
cab2f74b 2886 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
b411b363
PR
2887 }
2888 if (ok) {
470be44a
LE
2889 /* For protocol A, we have to memcpy the payload into
2890 * socket buffers, as we may complete right away
2891 * as soon as we handed it over to tcp, at which point the data
2892 * pages may become invalid.
2893 *
2894 * For data-integrity enabled, we copy it as well, so we can be
2895 * sure that even if the bio pages may still be modified, it
2896 * won't change the data on the wire, thus if the digest checks
2897 * out ok after sending on this side, but does not fit on the
2898 * receiving side, we sure have detected corruption elsewhere.
2899 */
2900 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
b411b363
PR
2901 ok = _drbd_send_bio(mdev, req->master_bio);
2902 else
2903 ok = _drbd_send_zc_bio(mdev, req->master_bio);
470be44a
LE
2904
2905 /* double check digest, sometimes buffers have been modified in flight. */
2906 if (dgs > 0 && dgs <= 64) {
24c4830c 2907 /* 64 byte, 512 bit, is the largest digest size
470be44a
LE
2908 * currently supported in kernel crypto. */
2909 unsigned char digest[64];
2910 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2911 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2912 dev_warn(DEV,
2913 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2914 (unsigned long long)req->sector, req->size);
2915 }
2916 } /* else if (dgs > 64) {
2917 ... Be noisy about digest too large ...
2918 } */
b411b363
PR
2919 }
2920
2921 drbd_put_data_sock(mdev);
bd26bfc5 2922
b411b363
PR
2923 return ok;
2924}
2925
2926/* answer packet, used to send data back for read requests:
2927 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2928 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2929 */
2930int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2931 struct drbd_epoch_entry *e)
2932{
2933 int ok;
2934 struct p_data p;
2935 void *dgb;
2936 int dgs;
2937
2938 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2939 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2940
d5373389 2941 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
0b70a13d
PR
2942 p.head.h80.magic = BE_DRBD_MAGIC;
2943 p.head.h80.command = cpu_to_be16(cmd);
2944 p.head.h80.length =
2945 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2946 } else {
2947 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2948 p.head.h95.command = cpu_to_be16(cmd);
2949 p.head.h95.length =
2950 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2951 }
b411b363
PR
2952
2953 p.sector = cpu_to_be64(e->sector);
2954 p.block_id = e->block_id;
2955 /* p.seq_num = 0; No sequence numbers here.. */
2956
2957 /* Only called by our kernel thread.
2958 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2959 * in response to admin command or module unload.
2960 */
2961 if (!drbd_get_data_sock(mdev))
2962 return 0;
2963
0b70a13d 2964 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
b411b363
PR
2965 if (ok && dgs) {
2966 dgb = mdev->int_dig_out;
45bb912b 2967 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
cab2f74b 2968 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
b411b363
PR
2969 }
2970 if (ok)
45bb912b 2971 ok = _drbd_send_zc_ee(mdev, e);
b411b363
PR
2972
2973 drbd_put_data_sock(mdev);
bd26bfc5 2974
b411b363
PR
2975 return ok;
2976}
2977
73a01a18
PR
2978int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2979{
2980 struct p_block_desc p;
2981
2982 p.sector = cpu_to_be64(req->sector);
2983 p.blksize = cpu_to_be32(req->size);
2984
2985 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2986}
2987
b411b363
PR
2988/*
2989 drbd_send distinguishes two cases:
2990
2991 Packets sent via the data socket "sock"
2992 and packets sent via the meta data socket "msock"
2993
2994 sock msock
2995 -----------------+-------------------------+------------------------------
2996 timeout conf.timeout / 2 conf.timeout / 2
2997 timeout action send a ping via msock Abort communication
2998 and close all sockets
2999*/
3000
3001/*
3002 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
3003 */
3004int drbd_send(struct drbd_conf *mdev, struct socket *sock,
3005 void *buf, size_t size, unsigned msg_flags)
3006{
3007 struct kvec iov;
3008 struct msghdr msg;
3009 int rv, sent = 0;
3010
3011 if (!sock)
3012 return -1000;
3013
3014 /* THINK if (signal_pending) return ... ? */
3015
3016 iov.iov_base = buf;
3017 iov.iov_len = size;
3018
3019 msg.msg_name = NULL;
3020 msg.msg_namelen = 0;
3021 msg.msg_control = NULL;
3022 msg.msg_controllen = 0;
3023 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
3024
3025 if (sock == mdev->data.socket) {
3026 mdev->ko_count = mdev->net_conf->ko_count;
3027 drbd_update_congested(mdev);
3028 }
3029 do {
3030 /* STRANGE
3031 * tcp_sendmsg does _not_ use its size parameter at all ?
3032 *
3033 * -EAGAIN on timeout, -EINTR on signal.
3034 */
3035/* THINK
3036 * do we need to block DRBD_SIG if sock == &meta.socket ??
3037 * otherwise wake_asender() might interrupt some send_*Ack !
3038 */
3039 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
3040 if (rv == -EAGAIN) {
3041 if (we_should_drop_the_connection(mdev, sock))
3042 break;
3043 else
3044 continue;
3045 }
3046 D_ASSERT(rv != 0);
3047 if (rv == -EINTR) {
3048 flush_signals(current);
3049 rv = 0;
3050 }
3051 if (rv < 0)
3052 break;
3053 sent += rv;
3054 iov.iov_base += rv;
3055 iov.iov_len -= rv;
3056 } while (sent < size);
3057
3058 if (sock == mdev->data.socket)
3059 clear_bit(NET_CONGESTED, &mdev->flags);
3060
3061 if (rv <= 0) {
3062 if (rv != -EAGAIN) {
3063 dev_err(DEV, "%s_sendmsg returned %d\n",
3064 sock == mdev->meta.socket ? "msock" : "sock",
3065 rv);
3066 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
3067 } else
3068 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
3069 }
3070
3071 return sent;
3072}
3073
3074static int drbd_open(struct block_device *bdev, fmode_t mode)
3075{
3076 struct drbd_conf *mdev = bdev->bd_disk->private_data;
3077 unsigned long flags;
3078 int rv = 0;
3079
2a48fc0a 3080 mutex_lock(&drbd_main_mutex);
b411b363
PR
3081 spin_lock_irqsave(&mdev->req_lock, flags);
3082 /* to have a stable mdev->state.role
3083 * and no race with updating open_cnt */
3084
3085 if (mdev->state.role != R_PRIMARY) {
3086 if (mode & FMODE_WRITE)
3087 rv = -EROFS;
3088 else if (!allow_oos)
3089 rv = -EMEDIUMTYPE;
3090 }
3091
3092 if (!rv)
3093 mdev->open_cnt++;
3094 spin_unlock_irqrestore(&mdev->req_lock, flags);
2a48fc0a 3095 mutex_unlock(&drbd_main_mutex);
b411b363
PR
3096
3097 return rv;
3098}
3099
3100static int drbd_release(struct gendisk *gd, fmode_t mode)
3101{
3102 struct drbd_conf *mdev = gd->private_data;
2a48fc0a 3103 mutex_lock(&drbd_main_mutex);
b411b363 3104 mdev->open_cnt--;
2a48fc0a 3105 mutex_unlock(&drbd_main_mutex);
b411b363
PR
3106 return 0;
3107}
3108
b411b363
PR
3109static void drbd_set_defaults(struct drbd_conf *mdev)
3110{
85f4cc17
PR
3111 /* This way we get a compile error when sync_conf grows,
3112 and we forgot to initialize it here */
3113 mdev->sync_conf = (struct syncer_conf) {
3114 /* .rate = */ DRBD_RATE_DEF,
3115 /* .after = */ DRBD_AFTER_DEF,
3116 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
85f4cc17
PR
3117 /* .verify_alg = */ {}, 0,
3118 /* .cpu_mask = */ {}, 0,
3119 /* .csums_alg = */ {}, 0,
e756414f 3120 /* .use_rle = */ 0,
9a31d716
PR
3121 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
3122 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
3123 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
3124 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
0f0601f4
LE
3125 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
3126 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
85f4cc17
PR
3127 };
3128
3129 /* Have to use that way, because the layout differs between
3130 big endian and little endian */
b411b363
PR
3131 mdev->state = (union drbd_state) {
3132 { .role = R_SECONDARY,
3133 .peer = R_UNKNOWN,
3134 .conn = C_STANDALONE,
3135 .disk = D_DISKLESS,
3136 .pdsk = D_UNKNOWN,
fb22c402
PR
3137 .susp = 0,
3138 .susp_nod = 0,
3139 .susp_fen = 0
b411b363
PR
3140 } };
3141}
3142
3143void drbd_init_set_defaults(struct drbd_conf *mdev)
3144{
3145 /* the memset(,0,) did most of this.
3146 * note: only assignments, no allocation in here */
3147
3148 drbd_set_defaults(mdev);
3149
b411b363
PR
3150 atomic_set(&mdev->ap_bio_cnt, 0);
3151 atomic_set(&mdev->ap_pending_cnt, 0);
3152 atomic_set(&mdev->rs_pending_cnt, 0);
3153 atomic_set(&mdev->unacked_cnt, 0);
3154 atomic_set(&mdev->local_cnt, 0);
3155 atomic_set(&mdev->net_cnt, 0);
3156 atomic_set(&mdev->packet_seq, 0);
3157 atomic_set(&mdev->pp_in_use, 0);
435f0740 3158 atomic_set(&mdev->pp_in_use_by_net, 0);
778f271d 3159 atomic_set(&mdev->rs_sect_in, 0);
0f0601f4 3160 atomic_set(&mdev->rs_sect_ev, 0);
759fbdfb 3161 atomic_set(&mdev->ap_in_flight, 0);
e1711731 3162 atomic_set(&mdev->md_io_in_use, 0);
b411b363 3163
b411b363
PR
3164 mutex_init(&mdev->data.mutex);
3165 mutex_init(&mdev->meta.mutex);
3166 sema_init(&mdev->data.work.s, 0);
3167 sema_init(&mdev->meta.work.s, 0);
3168 mutex_init(&mdev->state_mutex);
3169
3170 spin_lock_init(&mdev->data.work.q_lock);
3171 spin_lock_init(&mdev->meta.work.q_lock);
3172
3173 spin_lock_init(&mdev->al_lock);
3174 spin_lock_init(&mdev->req_lock);
3175 spin_lock_init(&mdev->peer_seq_lock);
3176 spin_lock_init(&mdev->epoch_lock);
3177
3178 INIT_LIST_HEAD(&mdev->active_ee);
3179 INIT_LIST_HEAD(&mdev->sync_ee);
3180 INIT_LIST_HEAD(&mdev->done_ee);
3181 INIT_LIST_HEAD(&mdev->read_ee);
3182 INIT_LIST_HEAD(&mdev->net_ee);
3183 INIT_LIST_HEAD(&mdev->resync_reads);
3184 INIT_LIST_HEAD(&mdev->data.work.q);
3185 INIT_LIST_HEAD(&mdev->meta.work.q);
3186 INIT_LIST_HEAD(&mdev->resync_work.list);
3187 INIT_LIST_HEAD(&mdev->unplug_work.list);
e9e6f3ec 3188 INIT_LIST_HEAD(&mdev->go_diskless.list);
b411b363 3189 INIT_LIST_HEAD(&mdev->md_sync_work.list);
c4752ef1 3190 INIT_LIST_HEAD(&mdev->start_resync_work.list);
b411b363 3191 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
0ced55a3 3192
794abb75 3193 mdev->resync_work.cb = w_resync_timer;
b411b363 3194 mdev->unplug_work.cb = w_send_write_hint;
e9e6f3ec 3195 mdev->go_diskless.cb = w_go_diskless;
b411b363
PR
3196 mdev->md_sync_work.cb = w_md_sync;
3197 mdev->bm_io_work.w.cb = w_bitmap_io;
370a43e7 3198 mdev->start_resync_work.cb = w_start_resync;
b411b363
PR
3199 init_timer(&mdev->resync_timer);
3200 init_timer(&mdev->md_sync_timer);
370a43e7 3201 init_timer(&mdev->start_resync_timer);
7fde2be9 3202 init_timer(&mdev->request_timer);
b411b363
PR
3203 mdev->resync_timer.function = resync_timer_fn;
3204 mdev->resync_timer.data = (unsigned long) mdev;
3205 mdev->md_sync_timer.function = md_sync_timer_fn;
3206 mdev->md_sync_timer.data = (unsigned long) mdev;
370a43e7
PR
3207 mdev->start_resync_timer.function = start_resync_timer_fn;
3208 mdev->start_resync_timer.data = (unsigned long) mdev;
7fde2be9
PR
3209 mdev->request_timer.function = request_timer_fn;
3210 mdev->request_timer.data = (unsigned long) mdev;
b411b363
PR
3211
3212 init_waitqueue_head(&mdev->misc_wait);
3213 init_waitqueue_head(&mdev->state_wait);
84dfb9f5 3214 init_waitqueue_head(&mdev->net_cnt_wait);
b411b363
PR
3215 init_waitqueue_head(&mdev->ee_wait);
3216 init_waitqueue_head(&mdev->al_wait);
3217 init_waitqueue_head(&mdev->seq_wait);
3218
3219 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3220 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3221 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3222
3223 mdev->agreed_pro_version = PRO_VERSION_MAX;
2451fc3b 3224 mdev->write_ordering = WO_bdev_flush;
b411b363 3225 mdev->resync_wenr = LC_FREE;
99432fcc
PR
3226 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3227 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
b411b363
PR
3228}
3229
3230void drbd_mdev_cleanup(struct drbd_conf *mdev)
3231{
1d7734a0 3232 int i;
b411b363
PR
3233 if (mdev->receiver.t_state != None)
3234 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3235 mdev->receiver.t_state);
3236
3237 /* no need to lock it, I'm the only thread alive */
3238 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3239 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3240 mdev->al_writ_cnt =
3241 mdev->bm_writ_cnt =
3242 mdev->read_cnt =
3243 mdev->recv_cnt =
3244 mdev->send_cnt =
3245 mdev->writ_cnt =
3246 mdev->p_size =
3247 mdev->rs_start =
3248 mdev->rs_total =
1d7734a0
LE
3249 mdev->rs_failed = 0;
3250 mdev->rs_last_events = 0;
0f0601f4 3251 mdev->rs_last_sect_ev = 0;
1d7734a0
LE
3252 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3253 mdev->rs_mark_left[i] = 0;
3254 mdev->rs_mark_time[i] = 0;
3255 }
b411b363
PR
3256 D_ASSERT(mdev->net_conf == NULL);
3257
3258 drbd_set_my_capacity(mdev, 0);
3259 if (mdev->bitmap) {
3260 /* maybe never allocated. */
02d9a94b 3261 drbd_bm_resize(mdev, 0, 1);
b411b363
PR
3262 drbd_bm_cleanup(mdev);
3263 }
3264
3265 drbd_free_resources(mdev);
0778286a 3266 clear_bit(AL_SUSPENDED, &mdev->flags);
b411b363
PR
3267
3268 /*
3269 * currently we drbd_init_ee only on module load, so
3270 * we may do drbd_release_ee only on module unload!
3271 */
3272 D_ASSERT(list_empty(&mdev->active_ee));
3273 D_ASSERT(list_empty(&mdev->sync_ee));
3274 D_ASSERT(list_empty(&mdev->done_ee));
3275 D_ASSERT(list_empty(&mdev->read_ee));
3276 D_ASSERT(list_empty(&mdev->net_ee));
3277 D_ASSERT(list_empty(&mdev->resync_reads));
3278 D_ASSERT(list_empty(&mdev->data.work.q));
3279 D_ASSERT(list_empty(&mdev->meta.work.q));
3280 D_ASSERT(list_empty(&mdev->resync_work.list));
3281 D_ASSERT(list_empty(&mdev->unplug_work.list));
e9e6f3ec 3282 D_ASSERT(list_empty(&mdev->go_diskless.list));
2265b473
LE
3283
3284 drbd_set_defaults(mdev);
b411b363
PR
3285}
3286
3287
3288static void drbd_destroy_mempools(void)
3289{
3290 struct page *page;
3291
3292 while (drbd_pp_pool) {
3293 page = drbd_pp_pool;
3294 drbd_pp_pool = (struct page *)page_private(page);
3295 __free_page(page);
3296 drbd_pp_vacant--;
3297 }
3298
3299 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3300
9476f39d
LE
3301 if (drbd_md_io_bio_set)
3302 bioset_free(drbd_md_io_bio_set);
4281808f
LE
3303 if (drbd_md_io_page_pool)
3304 mempool_destroy(drbd_md_io_page_pool);
b411b363
PR
3305 if (drbd_ee_mempool)
3306 mempool_destroy(drbd_ee_mempool);
3307 if (drbd_request_mempool)
3308 mempool_destroy(drbd_request_mempool);
3309 if (drbd_ee_cache)
3310 kmem_cache_destroy(drbd_ee_cache);
3311 if (drbd_request_cache)
3312 kmem_cache_destroy(drbd_request_cache);
3313 if (drbd_bm_ext_cache)
3314 kmem_cache_destroy(drbd_bm_ext_cache);
3315 if (drbd_al_ext_cache)
3316 kmem_cache_destroy(drbd_al_ext_cache);
3317
9476f39d 3318 drbd_md_io_bio_set = NULL;
4281808f 3319 drbd_md_io_page_pool = NULL;
b411b363
PR
3320 drbd_ee_mempool = NULL;
3321 drbd_request_mempool = NULL;
3322 drbd_ee_cache = NULL;
3323 drbd_request_cache = NULL;
3324 drbd_bm_ext_cache = NULL;
3325 drbd_al_ext_cache = NULL;
3326
3327 return;
3328}
3329
3330static int drbd_create_mempools(void)
3331{
3332 struct page *page;
1816a2b4 3333 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
b411b363
PR
3334 int i;
3335
3336 /* prepare our caches and mempools */
3337 drbd_request_mempool = NULL;
3338 drbd_ee_cache = NULL;
3339 drbd_request_cache = NULL;
3340 drbd_bm_ext_cache = NULL;
3341 drbd_al_ext_cache = NULL;
3342 drbd_pp_pool = NULL;
4281808f 3343 drbd_md_io_page_pool = NULL;
9476f39d 3344 drbd_md_io_bio_set = NULL;
b411b363
PR
3345
3346 /* caches */
3347 drbd_request_cache = kmem_cache_create(
3348 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3349 if (drbd_request_cache == NULL)
3350 goto Enomem;
3351
3352 drbd_ee_cache = kmem_cache_create(
3353 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3354 if (drbd_ee_cache == NULL)
3355 goto Enomem;
3356
3357 drbd_bm_ext_cache = kmem_cache_create(
3358 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3359 if (drbd_bm_ext_cache == NULL)
3360 goto Enomem;
3361
3362 drbd_al_ext_cache = kmem_cache_create(
3363 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3364 if (drbd_al_ext_cache == NULL)
3365 goto Enomem;
3366
3367 /* mempools */
9476f39d
LE
3368#ifdef COMPAT_HAVE_BIOSET_CREATE
3369 drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
3370 if (drbd_md_io_bio_set == NULL)
3371 goto Enomem;
3372#endif
3373
4281808f
LE
3374 drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
3375 if (drbd_md_io_page_pool == NULL)
3376 goto Enomem;
3377
b411b363
PR
3378 drbd_request_mempool = mempool_create(number,
3379 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3380 if (drbd_request_mempool == NULL)
3381 goto Enomem;
3382
3383 drbd_ee_mempool = mempool_create(number,
3384 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2027ae1f 3385 if (drbd_ee_mempool == NULL)
b411b363
PR
3386 goto Enomem;
3387
3388 /* drbd's page pool */
3389 spin_lock_init(&drbd_pp_lock);
3390
3391 for (i = 0; i < number; i++) {
3392 page = alloc_page(GFP_HIGHUSER);
3393 if (!page)
3394 goto Enomem;
3395 set_page_private(page, (unsigned long)drbd_pp_pool);
3396 drbd_pp_pool = page;
3397 }
3398 drbd_pp_vacant = number;
3399
3400 return 0;
3401
3402Enomem:
3403 drbd_destroy_mempools(); /* in case we allocated some */
3404 return -ENOMEM;
3405}
3406
3407static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3408 void *unused)
3409{
3410 /* just so we have it. you never know what interesting things we
3411 * might want to do here some day...
3412 */
3413
3414 return NOTIFY_DONE;
3415}
3416
3417static struct notifier_block drbd_notifier = {
3418 .notifier_call = drbd_notify_sys,
3419};
3420
3421static void drbd_release_ee_lists(struct drbd_conf *mdev)
3422{
3423 int rr;
3424
3425 rr = drbd_release_ee(mdev, &mdev->active_ee);
3426 if (rr)
3427 dev_err(DEV, "%d EEs in active list found!\n", rr);
3428
3429 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3430 if (rr)
3431 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3432
3433 rr = drbd_release_ee(mdev, &mdev->read_ee);
3434 if (rr)
3435 dev_err(DEV, "%d EEs in read list found!\n", rr);
3436
3437 rr = drbd_release_ee(mdev, &mdev->done_ee);
3438 if (rr)
3439 dev_err(DEV, "%d EEs in done list found!\n", rr);
3440
3441 rr = drbd_release_ee(mdev, &mdev->net_ee);
3442 if (rr)
3443 dev_err(DEV, "%d EEs in net list found!\n", rr);
3444}
3445
3446/* caution. no locking.
3447 * currently only used from module cleanup code. */
3448static void drbd_delete_device(unsigned int minor)
3449{
3450 struct drbd_conf *mdev = minor_to_mdev(minor);
3451
3452 if (!mdev)
3453 return;
3454
dfa8bedb
PR
3455 del_timer_sync(&mdev->request_timer);
3456
b411b363
PR
3457 /* paranoia asserts */
3458 if (mdev->open_cnt != 0)
3459 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3460 __FILE__ , __LINE__);
3461
3462 ERR_IF (!list_empty(&mdev->data.work.q)) {
3463 struct list_head *lp;
3464 list_for_each(lp, &mdev->data.work.q) {
3465 dev_err(DEV, "lp = %p\n", lp);
3466 }
3467 };
3468 /* end paranoia asserts */
3469
3470 del_gendisk(mdev->vdisk);
3471
3472 /* cleanup stuff that may have been allocated during
3473 * device (re-)configuration or state changes */
3474
3475 if (mdev->this_bdev)
3476 bdput(mdev->this_bdev);
3477
3478 drbd_free_resources(mdev);
3479
3480 drbd_release_ee_lists(mdev);
3481
24c4830c 3482 /* should be freed on disconnect? */
b411b363
PR
3483 kfree(mdev->ee_hash);
3484 /*
3485 mdev->ee_hash_s = 0;
3486 mdev->ee_hash = NULL;
3487 */
3488
3489 lc_destroy(mdev->act_log);
3490 lc_destroy(mdev->resync);
3491
3492 kfree(mdev->p_uuid);
3493 /* mdev->p_uuid = NULL; */
3494
3495 kfree(mdev->int_dig_out);
3496 kfree(mdev->int_dig_in);
3497 kfree(mdev->int_dig_vv);
3498
3499 /* cleanup the rest that has been
3500 * allocated from drbd_new_device
3501 * and actually free the mdev itself */
3502 drbd_free_mdev(mdev);
3503}
3504
3505static void drbd_cleanup(void)
3506{
3507 unsigned int i;
3508
3509 unregister_reboot_notifier(&drbd_notifier);
3510
17a93f30
LE
3511 /* first remove proc,
3512 * drbdsetup uses it's presence to detect
3513 * whether DRBD is loaded.
3514 * If we would get stuck in proc removal,
3515 * but have netlink already deregistered,
3516 * some drbdsetup commands may wait forever
3517 * for an answer.
3518 */
3519 if (drbd_proc)
3520 remove_proc_entry("drbd", NULL);
3521
b411b363
PR
3522 drbd_nl_cleanup();
3523
3524 if (minor_table) {
b411b363
PR
3525 i = minor_count;
3526 while (i--)
3527 drbd_delete_device(i);
3528 drbd_destroy_mempools();
3529 }
3530
3531 kfree(minor_table);
3532
3533 unregister_blkdev(DRBD_MAJOR, "drbd");
3534
3535 printk(KERN_INFO "drbd: module cleanup done.\n");
3536}
3537
3538/**
3539 * drbd_congested() - Callback for pdflush
3540 * @congested_data: User data
3541 * @bdi_bits: Bits pdflush is currently interested in
3542 *
3543 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3544 */
3545static int drbd_congested(void *congested_data, int bdi_bits)
3546{
3547 struct drbd_conf *mdev = congested_data;
3548 struct request_queue *q;
3549 char reason = '-';
3550 int r = 0;
3551
1b881ef7 3552 if (!may_inc_ap_bio(mdev)) {
b411b363
PR
3553 /* DRBD has frozen IO */
3554 r = bdi_bits;
3555 reason = 'd';
3556 goto out;
3557 }
3558
c2ba686f
LE
3559 if (test_bit(CALLBACK_PENDING, &mdev->flags)) {
3560 r |= (1 << BDI_async_congested);
3561 /* Without good local data, we would need to read from remote,
3562 * and that would need the worker thread as well, which is
3563 * currently blocked waiting for that usermode helper to
3564 * finish.
3565 */
3566 if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
3567 r |= (1 << BDI_sync_congested);
3568 else
3569 put_ldev(mdev);
3570 r &= bdi_bits;
3571 reason = 'c';
3572 goto out;
3573 }
3574
b411b363
PR
3575 if (get_ldev(mdev)) {
3576 q = bdev_get_queue(mdev->ldev->backing_bdev);
3577 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3578 put_ldev(mdev);
3579 if (r)
3580 reason = 'b';
3581 }
3582
3583 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3584 r |= (1 << BDI_async_congested);
3585 reason = reason == 'b' ? 'a' : 'n';
3586 }
3587
3588out:
3589 mdev->congestion_reason = reason;
3590 return r;
3591}
3592
3593struct drbd_conf *drbd_new_device(unsigned int minor)
3594{
3595 struct drbd_conf *mdev;
3596 struct gendisk *disk;
3597 struct request_queue *q;
3598
3599 /* GFP_KERNEL, we are outside of all write-out paths */
3600 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3601 if (!mdev)
3602 return NULL;
3603 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3604 goto out_no_cpumask;
3605
3606 mdev->minor = minor;
3607
3608 drbd_init_set_defaults(mdev);
3609
3610 q = blk_alloc_queue(GFP_KERNEL);
3611 if (!q)
3612 goto out_no_q;
3613 mdev->rq_queue = q;
3614 q->queuedata = mdev;
b411b363
PR
3615
3616 disk = alloc_disk(1);
3617 if (!disk)
3618 goto out_no_disk;
3619 mdev->vdisk = disk;
3620
81e84650 3621 set_disk_ro(disk, true);
b411b363
PR
3622
3623 disk->queue = q;
3624 disk->major = DRBD_MAJOR;
3625 disk->first_minor = minor;
3626 disk->fops = &drbd_ops;
3627 sprintf(disk->disk_name, "drbd%d", minor);
3628 disk->private_data = mdev;
3629
3630 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3631 /* we have no partitions. we contain only ourselves. */
3632 mdev->this_bdev->bd_contains = mdev->this_bdev;
3633
3634 q->backing_dev_info.congested_fn = drbd_congested;
3635 q->backing_dev_info.congested_data = mdev;
3636
2f58dcfc 3637 blk_queue_make_request(q, drbd_make_request);
99432fcc
PR
3638 /* Setting the max_hw_sectors to an odd value of 8kibyte here
3639 This triggers a max_bio_size message upon first attach or connect */
3640 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
b411b363
PR
3641 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3642 blk_queue_merge_bvec(q, drbd_merge_bvec);
7eaceacc 3643 q->queue_lock = &mdev->req_lock;
b411b363
PR
3644
3645 mdev->md_io_page = alloc_page(GFP_KERNEL);
3646 if (!mdev->md_io_page)
3647 goto out_no_io_page;
3648
3649 if (drbd_bm_init(mdev))
3650 goto out_no_bitmap;
3651 /* no need to lock access, we are still initializing this minor device. */
3652 if (!tl_init(mdev))
3653 goto out_no_tl;
3654
3655 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3656 if (!mdev->app_reads_hash)
3657 goto out_no_app_reads;
3658
3659 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3660 if (!mdev->current_epoch)
3661 goto out_no_epoch;
3662
3663 INIT_LIST_HEAD(&mdev->current_epoch->list);
3664 mdev->epochs = 1;
3665
3666 return mdev;
3667
3668/* out_whatever_else:
3669 kfree(mdev->current_epoch); */
3670out_no_epoch:
3671 kfree(mdev->app_reads_hash);
3672out_no_app_reads:
3673 tl_cleanup(mdev);
3674out_no_tl:
3675 drbd_bm_cleanup(mdev);
3676out_no_bitmap:
3677 __free_page(mdev->md_io_page);
3678out_no_io_page:
3679 put_disk(disk);
3680out_no_disk:
3681 blk_cleanup_queue(q);
3682out_no_q:
3683 free_cpumask_var(mdev->cpu_mask);
3684out_no_cpumask:
3685 kfree(mdev);
3686 return NULL;
3687}
3688
3689/* counterpart of drbd_new_device.
3690 * last part of drbd_delete_device. */
3691void drbd_free_mdev(struct drbd_conf *mdev)
3692{
3693 kfree(mdev->current_epoch);
3694 kfree(mdev->app_reads_hash);
3695 tl_cleanup(mdev);
3696 if (mdev->bitmap) /* should no longer be there. */
3697 drbd_bm_cleanup(mdev);
3698 __free_page(mdev->md_io_page);
3699 put_disk(mdev->vdisk);
3700 blk_cleanup_queue(mdev->rq_queue);
3701 free_cpumask_var(mdev->cpu_mask);
3719094e 3702 drbd_free_tl_hash(mdev);
b411b363
PR
3703 kfree(mdev);
3704}
3705
3706
3707int __init drbd_init(void)
3708{
3709 int err;
3710
3711 if (sizeof(struct p_handshake) != 80) {
3712 printk(KERN_ERR
3713 "drbd: never change the size or layout "
3714 "of the HandShake packet.\n");
3715 return -EINVAL;
3716 }
3717
2b8a90b5 3718 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
b411b363
PR
3719 printk(KERN_ERR
3720 "drbd: invalid minor_count (%d)\n", minor_count);
3721#ifdef MODULE
3722 return -EINVAL;
3723#else
3724 minor_count = 8;
3725#endif
3726 }
3727
3728 err = drbd_nl_init();
3729 if (err)
3730 return err;
3731
3732 err = register_blkdev(DRBD_MAJOR, "drbd");
3733 if (err) {
3734 printk(KERN_ERR
3735 "drbd: unable to register block device major %d\n",
3736 DRBD_MAJOR);
3737 return err;
3738 }
3739
3740 register_reboot_notifier(&drbd_notifier);
3741
3742 /*
3743 * allocate all necessary structs
3744 */
3745 err = -ENOMEM;
3746
3747 init_waitqueue_head(&drbd_pp_wait);
3748
3749 drbd_proc = NULL; /* play safe for drbd_cleanup */
3750 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3751 GFP_KERNEL);
3752 if (!minor_table)
3753 goto Enomem;
3754
3755 err = drbd_create_mempools();
3756 if (err)
3757 goto Enomem;
3758
8c484ee4 3759 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
b411b363
PR
3760 if (!drbd_proc) {
3761 printk(KERN_ERR "drbd: unable to register proc file\n");
3762 goto Enomem;
3763 }
3764
3765 rwlock_init(&global_state_lock);
3766
3767 printk(KERN_INFO "drbd: initialized. "
3768 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3769 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3770 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3771 printk(KERN_INFO "drbd: registered as block device major %d\n",
3772 DRBD_MAJOR);
3773 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3774
3775 return 0; /* Success! */
3776
3777Enomem:
3778 drbd_cleanup();
3779 if (err == -ENOMEM)
3780 /* currently always the case */
3781 printk(KERN_ERR "drbd: ran out of memory\n");
3782 else
3783 printk(KERN_ERR "drbd: initialization failure\n");
3784 return err;
3785}
3786
3787void drbd_free_bc(struct drbd_backing_dev *ldev)
3788{
3789 if (ldev == NULL)
3790 return;
3791
e525fd89
TH
3792 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3793 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
b411b363
PR
3794
3795 kfree(ldev);
3796}
3797
3798void drbd_free_sock(struct drbd_conf *mdev)
3799{
3800 if (mdev->data.socket) {
4589d7f8 3801 mutex_lock(&mdev->data.mutex);
b411b363
PR
3802 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3803 sock_release(mdev->data.socket);
3804 mdev->data.socket = NULL;
4589d7f8 3805 mutex_unlock(&mdev->data.mutex);
b411b363
PR
3806 }
3807 if (mdev->meta.socket) {
4589d7f8 3808 mutex_lock(&mdev->meta.mutex);
b411b363
PR
3809 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3810 sock_release(mdev->meta.socket);
3811 mdev->meta.socket = NULL;
4589d7f8 3812 mutex_unlock(&mdev->meta.mutex);
b411b363
PR
3813 }
3814}
3815
3816
3817void drbd_free_resources(struct drbd_conf *mdev)
3818{
3819 crypto_free_hash(mdev->csums_tfm);
3820 mdev->csums_tfm = NULL;
3821 crypto_free_hash(mdev->verify_tfm);
3822 mdev->verify_tfm = NULL;
3823 crypto_free_hash(mdev->cram_hmac_tfm);
3824 mdev->cram_hmac_tfm = NULL;
3825 crypto_free_hash(mdev->integrity_w_tfm);
3826 mdev->integrity_w_tfm = NULL;
3827 crypto_free_hash(mdev->integrity_r_tfm);
3828 mdev->integrity_r_tfm = NULL;
3829
3830 drbd_free_sock(mdev);
3831
3832 __no_warn(local,
3833 drbd_free_bc(mdev->ldev);
3834 mdev->ldev = NULL;);
3835}
3836
3837/* meta data management */
3838
3839struct meta_data_on_disk {
3840 u64 la_size; /* last agreed size. */
3841 u64 uuid[UI_SIZE]; /* UUIDs. */
3842 u64 device_uuid;
3843 u64 reserved_u64_1;
3844 u32 flags; /* MDF */
3845 u32 magic;
3846 u32 md_size_sect;
3847 u32 al_offset; /* offset to this block */
3848 u32 al_nr_extents; /* important for restoring the AL */
3849 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3850 u32 bm_offset; /* offset to the bitmap, from here */
3851 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
99432fcc
PR
3852 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3853 u32 reserved_u32[3];
b411b363
PR
3854
3855} __packed;
3856
3857/**
3858 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3859 * @mdev: DRBD device.
3860 */
3861void drbd_md_sync(struct drbd_conf *mdev)
3862{
3863 struct meta_data_on_disk *buffer;
3864 sector_t sector;
3865 int i;
3866
ee15b038
LE
3867 del_timer(&mdev->md_sync_timer);
3868 /* timer may be rearmed by drbd_md_mark_dirty() now. */
b411b363
PR
3869 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3870 return;
b411b363
PR
3871
3872 /* We use here D_FAILED and not D_ATTACHING because we try to write
3873 * metadata even if we detach due to a disk failure! */
3874 if (!get_ldev_if_state(mdev, D_FAILED))
3875 return;
3876
e1711731
PR
3877 buffer = drbd_md_get_buffer(mdev);
3878 if (!buffer)
3879 goto out;
3880
b411b363
PR
3881 memset(buffer, 0, 512);
3882
3883 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3884 for (i = UI_CURRENT; i < UI_SIZE; i++)
3885 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3886 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3887 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3888
3889 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3890 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3891 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3892 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3893 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3894
3895 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
99432fcc 3896 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
b411b363
PR
3897
3898 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3899 sector = mdev->ldev->md.md_offset;
3900
3f3a9b84 3901 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
b411b363
PR
3902 /* this was a try anyways ... */
3903 dev_err(DEV, "meta data update failed!\n");
383606e0 3904 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
b411b363
PR
3905 }
3906
3907 /* Update mdev->ldev->md.la_size_sect,
3908 * since we updated it on metadata. */
3909 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3910
e1711731
PR
3911 drbd_md_put_buffer(mdev);
3912out:
b411b363
PR
3913 put_ldev(mdev);
3914}
3915
3916/**
3917 * drbd_md_read() - Reads in the meta data super block
3918 * @mdev: DRBD device.
3919 * @bdev: Device from which the meta data should be read in.
3920 *
116676ca 3921 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
b411b363
PR
3922 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3923 */
3924int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3925{
3926 struct meta_data_on_disk *buffer;
3927 int i, rv = NO_ERROR;
3928
3929 if (!get_ldev_if_state(mdev, D_ATTACHING))
3930 return ERR_IO_MD_DISK;
3931
e1711731
PR
3932 buffer = drbd_md_get_buffer(mdev);
3933 if (!buffer)
3934 goto out;
b411b363
PR
3935
3936 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
25985edc 3937 /* NOTE: can't do normal error processing here as this is
b411b363
PR
3938 called BEFORE disk is attached */
3939 dev_err(DEV, "Error while reading metadata.\n");
3940 rv = ERR_IO_MD_DISK;
3941 goto err;
3942 }
3943
3944 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3945 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3946 rv = ERR_MD_INVALID;
3947 goto err;
3948 }
3949 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3950 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3951 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3952 rv = ERR_MD_INVALID;
3953 goto err;
3954 }
3955 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3956 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3957 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3958 rv = ERR_MD_INVALID;
3959 goto err;
3960 }
3961 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3962 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3963 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3964 rv = ERR_MD_INVALID;
3965 goto err;
3966 }
3967
3968 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3969 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3970 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3971 rv = ERR_MD_INVALID;
3972 goto err;
3973 }
3974
3975 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3976 for (i = UI_CURRENT; i < UI_SIZE; i++)
3977 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3978 bdev->md.flags = be32_to_cpu(buffer->flags);
3979 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3980 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3981
99432fcc
PR
3982 spin_lock_irq(&mdev->req_lock);
3983 if (mdev->state.conn < C_CONNECTED) {
3984 int peer;
3985 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3986 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3987 mdev->peer_max_bio_size = peer;
3988 }
3989 spin_unlock_irq(&mdev->req_lock);
3990
b411b363
PR
3991 if (mdev->sync_conf.al_extents < 7)
3992 mdev->sync_conf.al_extents = 127;
3993
3994 err:
e1711731
PR
3995 drbd_md_put_buffer(mdev);
3996 out:
b411b363
PR
3997 put_ldev(mdev);
3998
3999 return rv;
4000}
4001
4002/**
4003 * drbd_md_mark_dirty() - Mark meta data super block as dirty
4004 * @mdev: DRBD device.
4005 *
4006 * Call this function if you change anything that should be written to
4007 * the meta-data super block. This function sets MD_DIRTY, and starts a
4008 * timer that ensures that within five seconds you have to call drbd_md_sync().
4009 */
ca0e6098 4010#ifdef DEBUG
ee15b038
LE
4011void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
4012{
4013 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
4014 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
4015 mdev->last_md_mark_dirty.line = line;
4016 mdev->last_md_mark_dirty.func = func;
4017 }
4018}
4019#else
b411b363
PR
4020void drbd_md_mark_dirty(struct drbd_conf *mdev)
4021{
ee15b038 4022 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
ca0e6098 4023 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
b411b363 4024}
ee15b038 4025#endif
b411b363
PR
4026
4027static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
4028{
4029 int i;
4030
62b0da3a 4031 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
b411b363 4032 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
b411b363
PR
4033}
4034
4035void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4036{
4037 if (idx == UI_CURRENT) {
4038 if (mdev->state.role == R_PRIMARY)
4039 val |= 1;
4040 else
4041 val &= ~((u64)1);
4042
4043 drbd_set_ed_uuid(mdev, val);
4044 }
4045
4046 mdev->ldev->md.uuid[idx] = val;
b411b363
PR
4047 drbd_md_mark_dirty(mdev);
4048}
4049
4050
4051void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4052{
4053 if (mdev->ldev->md.uuid[idx]) {
4054 drbd_uuid_move_history(mdev);
4055 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
b411b363
PR
4056 }
4057 _drbd_uuid_set(mdev, idx, val);
4058}
4059
4060/**
4061 * drbd_uuid_new_current() - Creates a new current UUID
4062 * @mdev: DRBD device.
4063 *
4064 * Creates a new current UUID, and rotates the old current UUID into
4065 * the bitmap slot. Causes an incremental resync upon next connect.
4066 */
4067void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
4068{
4069 u64 val;
62b0da3a
LE
4070 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
4071
4072 if (bm_uuid)
4073 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 4074
b411b363 4075 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
b411b363
PR
4076
4077 get_random_bytes(&val, sizeof(u64));
4078 _drbd_uuid_set(mdev, UI_CURRENT, val);
62b0da3a 4079 drbd_print_uuids(mdev, "new current UUID");
aaa8e2b3
LE
4080 /* get it to stable storage _now_ */
4081 drbd_md_sync(mdev);
b411b363
PR
4082}
4083
4084void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
4085{
4086 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
4087 return;
4088
4089 if (val == 0) {
4090 drbd_uuid_move_history(mdev);
4091 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
4092 mdev->ldev->md.uuid[UI_BITMAP] = 0;
b411b363 4093 } else {
62b0da3a
LE
4094 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
4095 if (bm_uuid)
4096 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 4097
62b0da3a 4098 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
b411b363
PR
4099 }
4100 drbd_md_mark_dirty(mdev);
4101}
4102
4103/**
4104 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4105 * @mdev: DRBD device.
4106 *
4107 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
4108 */
4109int drbd_bmio_set_n_write(struct drbd_conf *mdev)
4110{
4111 int rv = -EIO;
4112
4113 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4114 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
4115 drbd_md_sync(mdev);
4116 drbd_bm_set_all(mdev);
4117
4118 rv = drbd_bm_write(mdev);
4119
4120 if (!rv) {
4121 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
4122 drbd_md_sync(mdev);
4123 }
4124
4125 put_ldev(mdev);
4126 }
4127
4128 return rv;
4129}
4130
4131/**
4132 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4133 * @mdev: DRBD device.
4134 *
4135 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
4136 */
4137int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
4138{
4139 int rv = -EIO;
4140
0778286a 4141 drbd_resume_al(mdev);
b411b363
PR
4142 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4143 drbd_bm_clear_all(mdev);
4144 rv = drbd_bm_write(mdev);
4145 put_ldev(mdev);
4146 }
4147
4148 return rv;
4149}
4150
4151static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4152{
4153 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
02851e9f 4154 int rv = -EIO;
b411b363
PR
4155
4156 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
4157
02851e9f 4158 if (get_ldev(mdev)) {
20ceb2b2 4159 drbd_bm_lock(mdev, work->why, work->flags);
02851e9f
LE
4160 rv = work->io_fn(mdev);
4161 drbd_bm_unlock(mdev);
4162 put_ldev(mdev);
4163 }
b411b363
PR
4164
4165 clear_bit(BITMAP_IO, &mdev->flags);
127b3178 4166 smp_mb__after_clear_bit();
b411b363
PR
4167 wake_up(&mdev->misc_wait);
4168
4169 if (work->done)
4170 work->done(mdev, rv);
4171
4172 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
4173 work->why = NULL;
20ceb2b2 4174 work->flags = 0;
b411b363
PR
4175
4176 return 1;
4177}
4178
82f59cc6
LE
4179void drbd_ldev_destroy(struct drbd_conf *mdev)
4180{
4181 lc_destroy(mdev->resync);
4182 mdev->resync = NULL;
4183 lc_destroy(mdev->act_log);
4184 mdev->act_log = NULL;
4185 __no_warn(local,
4186 drbd_free_bc(mdev->ldev);
4187 mdev->ldev = NULL;);
4188
4189 if (mdev->md_io_tmpp) {
4190 __free_page(mdev->md_io_tmpp);
4191 mdev->md_io_tmpp = NULL;
4192 }
4193 clear_bit(GO_DISKLESS, &mdev->flags);
4194}
4195
e9e6f3ec
LE
4196static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4197{
4198 D_ASSERT(mdev->state.disk == D_FAILED);
9d282875
LE
4199 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4200 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
82f59cc6
LE
4201 * the protected members anymore, though, so once put_ldev reaches zero
4202 * again, it will be safe to free them. */
e9e6f3ec 4203 drbd_force_state(mdev, NS(disk, D_DISKLESS));
e9e6f3ec
LE
4204 return 1;
4205}
4206
4207void drbd_go_diskless(struct drbd_conf *mdev)
4208{
4209 D_ASSERT(mdev->state.disk == D_FAILED);
4210 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
9d282875 4211 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
e9e6f3ec
LE
4212}
4213
b411b363
PR
4214/**
4215 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4216 * @mdev: DRBD device.
4217 * @io_fn: IO callback to be called when bitmap IO is possible
4218 * @done: callback to be called after the bitmap IO was performed
4219 * @why: Descriptive text of the reason for doing the IO
4220 *
4221 * While IO on the bitmap happens we freeze application IO thus we ensure
4222 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
4223 * called from worker context. It MUST NOT be used while a previous such
4224 * work is still pending!
4225 */
4226void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4227 int (*io_fn)(struct drbd_conf *),
4228 void (*done)(struct drbd_conf *, int),
20ceb2b2 4229 char *why, enum bm_flag flags)
b411b363
PR
4230{
4231 D_ASSERT(current == mdev->worker.task);
4232
4233 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4234 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4235 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4236 if (mdev->bm_io_work.why)
4237 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4238 why, mdev->bm_io_work.why);
4239
4240 mdev->bm_io_work.io_fn = io_fn;
4241 mdev->bm_io_work.done = done;
4242 mdev->bm_io_work.why = why;
20ceb2b2 4243 mdev->bm_io_work.flags = flags;
b411b363 4244
22afd7ee 4245 spin_lock_irq(&mdev->req_lock);
b411b363
PR
4246 set_bit(BITMAP_IO, &mdev->flags);
4247 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
127b3178 4248 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
b411b363 4249 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
b411b363 4250 }
22afd7ee 4251 spin_unlock_irq(&mdev->req_lock);
b411b363
PR
4252}
4253
4254/**
4255 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4256 * @mdev: DRBD device.
4257 * @io_fn: IO callback to be called when bitmap IO is possible
4258 * @why: Descriptive text of the reason for doing the IO
4259 *
4260 * freezes application IO while that the actual IO operations runs. This
4261 * functions MAY NOT be called from worker context.
4262 */
20ceb2b2
LE
4263int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4264 char *why, enum bm_flag flags)
b411b363
PR
4265{
4266 int rv;
4267
4268 D_ASSERT(current != mdev->worker.task);
4269
20ceb2b2
LE
4270 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4271 drbd_suspend_io(mdev);
b411b363 4272
20ceb2b2 4273 drbd_bm_lock(mdev, why, flags);
b411b363
PR
4274 rv = io_fn(mdev);
4275 drbd_bm_unlock(mdev);
4276
20ceb2b2
LE
4277 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4278 drbd_resume_io(mdev);
b411b363
PR
4279
4280 return rv;
4281}
4282
4283void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4284{
4285 if ((mdev->ldev->md.flags & flag) != flag) {
4286 drbd_md_mark_dirty(mdev);
4287 mdev->ldev->md.flags |= flag;
4288 }
4289}
4290
4291void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4292{
4293 if ((mdev->ldev->md.flags & flag) != 0) {
4294 drbd_md_mark_dirty(mdev);
4295 mdev->ldev->md.flags &= ~flag;
4296 }
4297}
4298int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4299{
4300 return (bdev->md.flags & flag) != 0;
4301}
4302
4303static void md_sync_timer_fn(unsigned long data)
4304{
4305 struct drbd_conf *mdev = (struct drbd_conf *) data;
4306
4307 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4308}
4309
4310static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4311{
4312 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
ee15b038
LE
4313#ifdef DEBUG
4314 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4315 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4316#endif
b411b363 4317 drbd_md_sync(mdev);
b411b363
PR
4318 return 1;
4319}
4320
4321#ifdef CONFIG_DRBD_FAULT_INJECTION
4322/* Fault insertion support including random number generator shamelessly
4323 * stolen from kernel/rcutorture.c */
4324struct fault_random_state {
4325 unsigned long state;
4326 unsigned long count;
4327};
4328
4329#define FAULT_RANDOM_MULT 39916801 /* prime */
4330#define FAULT_RANDOM_ADD 479001701 /* prime */
4331#define FAULT_RANDOM_REFRESH 10000
4332
4333/*
4334 * Crude but fast random-number generator. Uses a linear congruential
4335 * generator, with occasional help from get_random_bytes().
4336 */
4337static unsigned long
4338_drbd_fault_random(struct fault_random_state *rsp)
4339{
4340 long refresh;
4341
49829ea7 4342 if (!rsp->count--) {
b411b363
PR
4343 get_random_bytes(&refresh, sizeof(refresh));
4344 rsp->state += refresh;
4345 rsp->count = FAULT_RANDOM_REFRESH;
4346 }
4347 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4348 return swahw32(rsp->state);
4349}
4350
4351static char *
4352_drbd_fault_str(unsigned int type) {
4353 static char *_faults[] = {
4354 [DRBD_FAULT_MD_WR] = "Meta-data write",
4355 [DRBD_FAULT_MD_RD] = "Meta-data read",
4356 [DRBD_FAULT_RS_WR] = "Resync write",
4357 [DRBD_FAULT_RS_RD] = "Resync read",
4358 [DRBD_FAULT_DT_WR] = "Data write",
4359 [DRBD_FAULT_DT_RD] = "Data read",
4360 [DRBD_FAULT_DT_RA] = "Data read ahead",
4361 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
6b4388ac
PR
4362 [DRBD_FAULT_AL_EE] = "EE allocation",
4363 [DRBD_FAULT_RECEIVE] = "receive data corruption",
b411b363
PR
4364 };
4365
4366 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4367}
4368
4369unsigned int
4370_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4371{
4372 static struct fault_random_state rrs = {0, 0};
4373
4374 unsigned int ret = (
4375 (fault_devs == 0 ||
4376 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4377 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4378
4379 if (ret) {
4380 fault_count++;
4381
7383506c 4382 if (__ratelimit(&drbd_ratelimit_state))
b411b363
PR
4383 dev_warn(DEV, "***Simulating %s failure\n",
4384 _drbd_fault_str(type));
4385 }
4386
4387 return ret;
4388}
4389#endif
4390
4391const char *drbd_buildtag(void)
4392{
4393 /* DRBD built from external sources has here a reference to the
4394 git hash of the source code. */
4395
4396 static char buildtag[38] = "\0uilt-in";
4397
4398 if (buildtag[0] == 0) {
bc4854bc
CW
4399#ifdef MODULE
4400 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4401#else
4402 buildtag[0] = 'b';
b411b363 4403#endif
b411b363
PR
4404 }
4405
4406 return buildtag;
4407}
4408
4409module_init(drbd_init)
4410module_exit(drbd_cleanup)
4411
b411b363
PR
4412EXPORT_SYMBOL(drbd_conn_str);
4413EXPORT_SYMBOL(drbd_role_str);
4414EXPORT_SYMBOL(drbd_disk_str);
4415EXPORT_SYMBOL(drbd_set_st_err_str);
This page took 0.458245 seconds and 5 git commands to generate.