drivers/block/drbd/drbd_main.c
1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
 29#include <linux/module.h>
30#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
 35#include <linux/mutex.h>
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
59struct after_state_chg_work {
60 struct drbd_work w;
61 union drbd_state os;
62 union drbd_state ns;
63 enum chg_state_flags flags;
64 struct completion *done;
65};
66
 67static DEFINE_MUTEX(drbd_main_mutex);
68int drbdd_init(struct drbd_thread *);
69int drbd_worker(struct drbd_thread *);
70int drbd_asender(struct drbd_thread *);
71
72int drbd_init(void);
73static int drbd_open(struct block_device *bdev, fmode_t mode);
74static int drbd_release(struct gendisk *gd, fmode_t mode);
75static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77 union drbd_state ns, enum chg_state_flags flags);
78static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79static void md_sync_timer_fn(unsigned long data);
80static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
 81static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
 82static void _tl_clear(struct drbd_conf *mdev);
 83
84MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
85 "Lars Ellenberg <lars@linbit.com>");
86MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
87MODULE_VERSION(REL_VERSION);
88MODULE_LICENSE("GPL");
89MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
90 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
91MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
92
93#include <linux/moduleparam.h>
94/* allow_open_on_secondary */
95MODULE_PARM_DESC(allow_oos, "DONT USE!");
96/* thanks to these macros, if compiled into the kernel (not-module),
97 * this becomes the boot parameter drbd.minor_count */
98module_param(minor_count, uint, 0444);
99module_param(disable_sendpage, bool, 0644);
100module_param(allow_oos, bool, 0);
101module_param(cn_idx, uint, 0444);
102module_param(proc_details, int, 0644);
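/* Illustrative usage (sketch, not from the original source): as a module this
 * could be loaded e.g. with "modprobe drbd minor_count=8"; built into the
 * kernel, the equivalent setting is the boot parameter "drbd.minor_count=8"
 * mentioned in the comment above. The value 8 is only an example. */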
103
104#ifdef CONFIG_DRBD_FAULT_INJECTION
105int enable_faults;
106int fault_rate;
107static int fault_count;
108int fault_devs;
109/* bitmap of enabled faults */
110module_param(enable_faults, int, 0664);
111/* fault rate % value - applies to all enabled faults */
112module_param(fault_rate, int, 0664);
113/* count of faults inserted */
114module_param(fault_count, int, 0664);
115/* bitmap of devices to insert faults on */
116module_param(fault_devs, int, 0644);
117#endif
118
119/* module parameter, defined */
 120unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
121bool disable_sendpage;
122bool allow_oos;
123unsigned int cn_idx = CN_IDX_DRBD;
124int proc_details; /* Detail level in proc drbd*/
125
126/* Module parameter for setting the user mode helper program
127 * to run. Default is /sbin/drbdadm */
128char usermode_helper[80] = "/sbin/drbdadm";
129
130module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
131
132/* in 2.6.x, our device mapping and config info contains our virtual gendisks
133 * as member "struct gendisk *vdisk;"
134 */
135struct drbd_conf **minor_table;
136
137struct kmem_cache *drbd_request_cache;
138struct kmem_cache *drbd_ee_cache; /* epoch entries */
139struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
140struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
141mempool_t *drbd_request_mempool;
142mempool_t *drbd_ee_mempool;
 143mempool_t *drbd_md_io_page_pool;
 144struct bio_set *drbd_md_io_bio_set;
145
146/* I do not use a standard mempool, because:
147 1) I want to hand out the pre-allocated objects first.
148 2) I want to be able to interrupt sleeping allocation with a signal.
149 Note: This is a single linked list, the next pointer is the private
150 member of struct page.
151 */
152struct page *drbd_pp_pool;
153spinlock_t drbd_pp_lock;
154int drbd_pp_vacant;
155wait_queue_head_t drbd_pp_wait;
156
157DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
158
 159static const struct block_device_operations drbd_ops = {
160 .owner = THIS_MODULE,
161 .open = drbd_open,
162 .release = drbd_release,
163};
164
165static void bio_destructor_drbd(struct bio *bio)
166{
167 bio_free(bio, drbd_md_io_bio_set);
168}
169
170struct bio *bio_alloc_drbd(gfp_t gfp_mask)
171{
172 struct bio *bio;
173
174 if (!drbd_md_io_bio_set)
175 return bio_alloc(gfp_mask, 1);
176
177 bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
178 if (!bio)
179 return NULL;
180 bio->bi_destructor = bio_destructor_drbd;
181 return bio;
182}
183
184#ifdef __CHECKER__
185/* When checking with sparse, and this is an inline function, sparse will
 186 give tons of false positives. When this is a real function, sparse works.
187 */
188int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
189{
190 int io_allowed;
191
192 atomic_inc(&mdev->local_cnt);
193 io_allowed = (mdev->state.disk >= mins);
194 if (!io_allowed) {
195 if (atomic_dec_and_test(&mdev->local_cnt))
196 wake_up(&mdev->misc_wait);
197 }
198 return io_allowed;
199}
200
201#endif
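/* Typical caller pattern (illustrative sketch, mirroring the uses of
 * get_ldev_if_state()/put_ldev() later in this file):
 *
 *	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
 *		... mdev->ldev may be dereferenced here ...
 *		put_ldev(mdev);
 *	}
 */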
202
203/**
204 * DOC: The transfer log
205 *
206 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
207 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
208 * of the list. There is always at least one &struct drbd_tl_epoch object.
209 *
210 * Each &struct drbd_tl_epoch has a circular double linked list of requests
211 * attached.
212 */
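/* Rough picture of the transfer log (sketch, for orientation only):
 *
 *	mdev->oldest_tle -> epoch -> epoch -> ... -> epoch <- mdev->newest_tle
 *	(each epoch carries the write requests issued between two write
 *	 barrier packets)
 *
 * tl_release() retires the oldest epoch once the matching barrier ack
 * arrives from the peer; tl_clear() throws the whole log away after a
 * connection loss. */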
213static int tl_init(struct drbd_conf *mdev)
214{
215 struct drbd_tl_epoch *b;
216
217 /* during device minor initialization, we may well use GFP_KERNEL */
218 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
219 if (!b)
220 return 0;
221 INIT_LIST_HEAD(&b->requests);
222 INIT_LIST_HEAD(&b->w.list);
223 b->next = NULL;
224 b->br_number = 4711;
 225 b->n_writes = 0;
226 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
227
228 mdev->oldest_tle = b;
229 mdev->newest_tle = b;
230 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
 231 INIT_LIST_HEAD(&mdev->barrier_acked_requests);
232
233 mdev->tl_hash = NULL;
234 mdev->tl_hash_s = 0;
235
236 return 1;
237}
238
239static void tl_cleanup(struct drbd_conf *mdev)
240{
241 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
242 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
243 kfree(mdev->oldest_tle);
244 mdev->oldest_tle = NULL;
245 kfree(mdev->unused_spare_tle);
246 mdev->unused_spare_tle = NULL;
247 kfree(mdev->tl_hash);
248 mdev->tl_hash = NULL;
249 mdev->tl_hash_s = 0;
250}
251
252/**
253 * _tl_add_barrier() - Adds a barrier to the transfer log
254 * @mdev: DRBD device.
255 * @new: Barrier to be added before the current head of the TL.
256 *
257 * The caller must hold the req_lock.
258 */
259void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
260{
261 struct drbd_tl_epoch *newest_before;
262
263 INIT_LIST_HEAD(&new->requests);
264 INIT_LIST_HEAD(&new->w.list);
265 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
266 new->next = NULL;
 267 new->n_writes = 0;
268
269 newest_before = mdev->newest_tle;
 270 new->br_number = newest_before->br_number+1;
271 if (mdev->newest_tle != new) {
272 mdev->newest_tle->next = new;
273 mdev->newest_tle = new;
274 }
275}
276
277/**
278 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
279 * @mdev: DRBD device.
280 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
281 * @set_size: Expected number of requests before that barrier.
282 *
283 * In case the passed barrier_nr or set_size does not match the oldest
 284 * &struct drbd_tl_epoch object, this function will cause a termination
285 * of the connection.
286 */
287void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
288 unsigned int set_size)
289{
290 struct drbd_tl_epoch *b, *nob; /* next old barrier */
291 struct list_head *le, *tle;
292 struct drbd_request *r;
293
294 spin_lock_irq(&mdev->req_lock);
295
296 b = mdev->oldest_tle;
297
298 /* first some paranoia code */
299 if (b == NULL) {
300 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
301 barrier_nr);
302 goto bail;
303 }
304 if (b->br_number != barrier_nr) {
305 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
306 barrier_nr, b->br_number);
307 goto bail;
308 }
309 if (b->n_writes != set_size) {
310 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
311 barrier_nr, set_size, b->n_writes);
312 goto bail;
313 }
314
315 /* Clean up list of requests processed during current epoch */
316 list_for_each_safe(le, tle, &b->requests) {
317 r = list_entry(le, struct drbd_request, tl_requests);
318 _req_mod(r, barrier_acked);
319 }
320 /* There could be requests on the list waiting for completion
 321 of the write to the local disk. To avoid corruption of
 322 the slab's data structures we have to remove the list's head.
323
324 Also there could have been a barrier ack out of sequence, overtaking
 325 the write acks - which would be a bug and would violate write ordering.
326 To not deadlock in case we lose connection while such requests are
327 still pending, we need some way to find them for the
 328 _req_mod(connection_lost_while_pending).
329
330 These have been list_move'd to the out_of_sequence_requests list in
331 _req_mod(, barrier_acked) above.
332 */
 333 list_splice_init(&b->requests, &mdev->barrier_acked_requests);
334
335 nob = b->next;
336 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
337 _tl_add_barrier(mdev, b);
338 if (nob)
339 mdev->oldest_tle = nob;
340 /* if nob == NULL b was the only barrier, and becomes the new
341 barrier. Therefore mdev->oldest_tle points already to b */
342 } else {
343 D_ASSERT(nob != NULL);
344 mdev->oldest_tle = nob;
345 kfree(b);
346 }
347
348 spin_unlock_irq(&mdev->req_lock);
349 dec_ap_pending(mdev);
350
351 return;
352
353bail:
354 spin_unlock_irq(&mdev->req_lock);
355 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
356}
357
 358
 359/**
 360 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 361 * @mdev: DRBD device.
 362 * @what: The action/event to perform with all request objects
 363 *
 364 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 365 * restart_frozen_disk_io.
 366 */
 367static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 368{
 369 struct drbd_tl_epoch *b, *tmp, **pn;
 370 struct list_head *le, *tle, carry_reads;
371 struct drbd_request *req;
372 int rv, n_writes, n_reads;
373
374 b = mdev->oldest_tle;
 375 pn = &mdev->oldest_tle;
 376 while (b) {
377 n_writes = 0;
378 n_reads = 0;
 379 INIT_LIST_HEAD(&carry_reads);
 380 list_for_each_safe(le, tle, &b->requests) {
381 req = list_entry(le, struct drbd_request, tl_requests);
382 rv = _req_mod(req, what);
383
384 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
385 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
386 }
387 tmp = b->next;
388
 389 if (n_writes) {
390 if (what == resend) {
391 b->n_writes = n_writes;
392 if (b->w.cb == NULL) {
393 b->w.cb = w_send_barrier;
394 inc_ap_pending(mdev);
395 set_bit(CREATE_BARRIER, &mdev->flags);
396 }
397
398 drbd_queue_work(&mdev->data.work, &b->w);
399 }
400 pn = &b->next;
401 } else {
402 if (n_reads)
403 list_add(&carry_reads, &b->requests);
404 /* there could still be requests on that ring list,
405 * in case local io is still pending */
406 list_del(&b->requests);
407
408 /* dec_ap_pending corresponding to queue_barrier.
409 * the newest barrier may not have been queued yet,
410 * in which case w.cb is still NULL. */
411 if (b->w.cb != NULL)
412 dec_ap_pending(mdev);
413
414 if (b == mdev->newest_tle) {
415 /* recycle, but reinit! */
416 D_ASSERT(tmp == NULL);
417 INIT_LIST_HEAD(&b->requests);
 418 list_splice(&carry_reads, &b->requests);
419 INIT_LIST_HEAD(&b->w.list);
420 b->w.cb = NULL;
421 b->br_number = net_random();
422 b->n_writes = 0;
423
424 *pn = b;
425 break;
426 }
427 *pn = tmp;
428 kfree(b);
 429 }
 430 b = tmp;
 431 list_splice(&carry_reads, &b->requests);
 432 }
433
434 /* Actions operating on the disk state, also want to work on
435 requests that got barrier acked. */
 436
437 list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
438 req = list_entry(le, struct drbd_request, tl_requests);
439 _req_mod(req, what);
 440 }
441}
442
443
444/**
445 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
446 * @mdev: DRBD device.
447 *
448 * This is called after the connection to the peer was lost. The storage covered
 449 * by the requests on the transfer log gets marked as out of sync. Called from the
450 * receiver thread and the worker thread.
451 */
452void tl_clear(struct drbd_conf *mdev)
453{
454 spin_lock_irq(&mdev->req_lock);
455 _tl_clear(mdev);
456 spin_unlock_irq(&mdev->req_lock);
457}
458
459static void _tl_clear(struct drbd_conf *mdev)
 460{
461 struct list_head *le, *tle;
462 struct drbd_request *r;
 463
 464 _tl_restart(mdev, connection_lost_while_pending);
465
466 /* we expect this list to be empty. */
467 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
468
469 /* but just in case, clean it up anyways! */
470 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
471 r = list_entry(le, struct drbd_request, tl_requests);
472 /* It would be nice to complete outside of spinlock.
473 * But this is easier for now. */
474 _req_mod(r, connection_lost_while_pending);
475 }
476
477 /* ensure bit indicating barrier is required is clear */
478 clear_bit(CREATE_BARRIER, &mdev->flags);
479
480 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
481
482}
483
484void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
485{
486 spin_lock_irq(&mdev->req_lock);
487 _tl_restart(mdev, what);
488 spin_unlock_irq(&mdev->req_lock);
489}
490
491/**
492 * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
493 * @mdev: DRBD device.
494 */
495void tl_abort_disk_io(struct drbd_conf *mdev)
496{
497 struct drbd_tl_epoch *b;
498 struct list_head *le, *tle;
499 struct drbd_request *req;
500
501 spin_lock_irq(&mdev->req_lock);
502 b = mdev->oldest_tle;
503 while (b) {
504 list_for_each_safe(le, tle, &b->requests) {
505 req = list_entry(le, struct drbd_request, tl_requests);
506 if (!(req->rq_state & RQ_LOCAL_PENDING))
507 continue;
508 _req_mod(req, abort_disk_io);
509 }
510 b = b->next;
511 }
512
513 list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
514 req = list_entry(le, struct drbd_request, tl_requests);
515 if (!(req->rq_state & RQ_LOCAL_PENDING))
516 continue;
517 _req_mod(req, abort_disk_io);
518 }
519
520 spin_unlock_irq(&mdev->req_lock);
521}
522
 523/**
 524 * cl_wide_st_chg() - true if the state change is a cluster wide one
525 * @mdev: DRBD device.
526 * @os: old (current) state.
527 * @ns: new (wanted) state.
528 */
529static int cl_wide_st_chg(struct drbd_conf *mdev,
530 union drbd_state os, union drbd_state ns)
531{
532 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
533 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
534 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
535 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
 536 (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
537 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
538 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
539}
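/* Example (sketch): promoting to R_PRIMARY, or starting a resync or online
 * verify, while both nodes are connected is a cluster wide change and has to
 * be agreed upon with the peer; the same transitions on a disconnected node
 * are handled locally. */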
540
541enum drbd_state_rv
542drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
543 union drbd_state mask, union drbd_state val)
544{
545 unsigned long flags;
546 union drbd_state os, ns;
 547 enum drbd_state_rv rv;
548
549 spin_lock_irqsave(&mdev->req_lock, flags);
550 os = mdev->state;
551 ns.i = (os.i & ~mask.i) | val.i;
552 rv = _drbd_set_state(mdev, ns, f, NULL);
553 ns = mdev->state;
554 spin_unlock_irqrestore(&mdev->req_lock, flags);
555
556 return rv;
557}
558
559/**
560 * drbd_force_state() - Impose a change which happens outside our control on our state
561 * @mdev: DRBD device.
562 * @mask: mask of state bits to change.
563 * @val: value of new state bits.
564 */
565void drbd_force_state(struct drbd_conf *mdev,
566 union drbd_state mask, union drbd_state val)
567{
568 drbd_change_state(mdev, CS_HARD, mask, val);
569}
570
571static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
572static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
573 union drbd_state,
574 union drbd_state);
575enum sanitize_state_warnings {
576 NO_WARNING,
577 ABORTED_ONLINE_VERIFY,
578 ABORTED_RESYNC,
579 CONNECTION_LOST_NEGOTIATING,
580 IMPLICITLY_UPGRADED_DISK,
581 IMPLICITLY_UPGRADED_PDSK,
582};
 583static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
 584 union drbd_state ns, enum sanitize_state_warnings *warn);
585int drbd_send_state_req(struct drbd_conf *,
586 union drbd_state, union drbd_state);
587
588static enum drbd_state_rv
589_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
590 union drbd_state val)
591{
592 union drbd_state os, ns;
593 unsigned long flags;
 594 enum drbd_state_rv rv;
595
596 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
597 return SS_CW_SUCCESS;
598
599 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
600 return SS_CW_FAILED_BY_PEER;
601
602 rv = 0;
603 spin_lock_irqsave(&mdev->req_lock, flags);
604 os = mdev->state;
605 ns.i = (os.i & ~mask.i) | val.i;
606 ns = sanitize_state(mdev, os, ns, NULL);
607
608 if (!cl_wide_st_chg(mdev, os, ns))
609 rv = SS_CW_NO_NEED;
610 if (!rv) {
611 rv = is_valid_state(mdev, ns);
612 if (rv == SS_SUCCESS) {
613 rv = is_valid_state_transition(mdev, ns, os);
614 if (rv == SS_SUCCESS)
 615 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
616 }
617 }
618 spin_unlock_irqrestore(&mdev->req_lock, flags);
619
620 return rv;
621}
622
623/**
624 * drbd_req_state() - Perform an eventually cluster wide state change
625 * @mdev: DRBD device.
626 * @mask: mask of state bits to change.
627 * @val: value of new state bits.
628 * @f: flags
629 *
630 * Should not be called directly, use drbd_request_state() or
631 * _drbd_request_state().
632 */
633static enum drbd_state_rv
634drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
635 union drbd_state val, enum chg_state_flags f)
636{
637 struct completion done;
638 unsigned long flags;
639 union drbd_state os, ns;
 640 enum drbd_state_rv rv;
641
642 init_completion(&done);
643
644 if (f & CS_SERIALIZE)
645 mutex_lock(&mdev->state_mutex);
646
647 spin_lock_irqsave(&mdev->req_lock, flags);
648 os = mdev->state;
649 ns.i = (os.i & ~mask.i) | val.i;
650 ns = sanitize_state(mdev, os, ns, NULL);
651
652 if (cl_wide_st_chg(mdev, os, ns)) {
653 rv = is_valid_state(mdev, ns);
654 if (rv == SS_SUCCESS)
655 rv = is_valid_state_transition(mdev, ns, os);
656 spin_unlock_irqrestore(&mdev->req_lock, flags);
657
658 if (rv < SS_SUCCESS) {
659 if (f & CS_VERBOSE)
660 print_st_err(mdev, os, ns, rv);
661 goto abort;
662 }
663
664 drbd_state_lock(mdev);
665 if (!drbd_send_state_req(mdev, mask, val)) {
666 drbd_state_unlock(mdev);
667 rv = SS_CW_FAILED_BY_PEER;
668 if (f & CS_VERBOSE)
669 print_st_err(mdev, os, ns, rv);
670 goto abort;
671 }
672
673 wait_event(mdev->state_wait,
674 (rv = _req_st_cond(mdev, mask, val)));
675
676 if (rv < SS_SUCCESS) {
677 drbd_state_unlock(mdev);
678 if (f & CS_VERBOSE)
679 print_st_err(mdev, os, ns, rv);
680 goto abort;
681 }
682 spin_lock_irqsave(&mdev->req_lock, flags);
683 os = mdev->state;
684 ns.i = (os.i & ~mask.i) | val.i;
685 rv = _drbd_set_state(mdev, ns, f, &done);
686 drbd_state_unlock(mdev);
687 } else {
688 rv = _drbd_set_state(mdev, ns, f, &done);
689 }
690
691 spin_unlock_irqrestore(&mdev->req_lock, flags);
692
693 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
694 D_ASSERT(current != mdev->worker.task);
695 wait_for_completion(&done);
696 }
697
698abort:
699 if (f & CS_SERIALIZE)
700 mutex_unlock(&mdev->state_mutex);
701
702 return rv;
703}
704
705/**
706 * _drbd_request_state() - Request a state change (with flags)
707 * @mdev: DRBD device.
708 * @mask: mask of state bits to change.
709 * @val: value of new state bits.
710 * @f: flags
711 *
712 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
713 * flag, or when logging of failed state change requests is not desired.
714 */
715enum drbd_state_rv
716_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
717 union drbd_state val, enum chg_state_flags f)
 718{
 719 enum drbd_state_rv rv;
720
721 wait_event(mdev->state_wait,
722 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
723
724 return rv;
725}
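/* Illustrative call (sketch; compare abw_start_sync() further down, which
 * uses the same form):
 *
 *	_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
 *
 * where NS() builds the mask/val pair for a single state field. */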
726
727static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
728{
729 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
730 name,
731 drbd_conn_str(ns.conn),
732 drbd_role_str(ns.role),
733 drbd_role_str(ns.peer),
734 drbd_disk_str(ns.disk),
735 drbd_disk_str(ns.pdsk),
 736 is_susp(ns) ? 's' : 'r',
737 ns.aftr_isp ? 'a' : '-',
738 ns.peer_isp ? 'p' : '-',
739 ns.user_isp ? 'u' : '-'
740 );
741}
742
743void print_st_err(struct drbd_conf *mdev, union drbd_state os,
744 union drbd_state ns, enum drbd_state_rv err)
745{
746 if (err == SS_IN_TRANSIENT_STATE)
747 return;
748 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
749 print_st(mdev, " state", os);
750 print_st(mdev, "wanted", ns);
751}
752
753
754/**
755 * is_valid_state() - Returns an SS_ error code if ns is not valid
756 * @mdev: DRBD device.
757 * @ns: State to consider.
758 */
759static enum drbd_state_rv
760is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
761{
762 /* See drbd_state_sw_errors in drbd_strings.c */
763
764 enum drbd_fencing_p fp;
 765 enum drbd_state_rv rv = SS_SUCCESS;
766
767 fp = FP_DONT_CARE;
768 if (get_ldev(mdev)) {
769 fp = mdev->ldev->dc.fencing;
770 put_ldev(mdev);
771 }
772
773 if (get_net_conf(mdev)) {
774 if (!mdev->net_conf->two_primaries &&
775 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
776 rv = SS_TWO_PRIMARIES;
777 put_net_conf(mdev);
778 }
779
780 if (rv <= 0)
781 /* already found a reason to abort */;
782 else if (ns.role == R_SECONDARY && mdev->open_cnt)
783 rv = SS_DEVICE_IN_USE;
784
785 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
786 rv = SS_NO_UP_TO_DATE_DISK;
787
788 else if (fp >= FP_RESOURCE &&
789 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
790 rv = SS_PRIMARY_NOP;
791
792 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
793 rv = SS_NO_UP_TO_DATE_DISK;
794
795 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
796 rv = SS_NO_LOCAL_DISK;
797
798 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
799 rv = SS_NO_REMOTE_DISK;
800
801 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
802 rv = SS_NO_UP_TO_DATE_DISK;
803
804 else if ((ns.conn == C_CONNECTED ||
805 ns.conn == C_WF_BITMAP_S ||
806 ns.conn == C_SYNC_SOURCE ||
807 ns.conn == C_PAUSED_SYNC_S) &&
808 ns.disk == D_OUTDATED)
809 rv = SS_CONNECTED_OUTDATES;
810
811 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
812 (mdev->sync_conf.verify_alg[0] == 0))
813 rv = SS_NO_VERIFY_ALG;
814
815 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
816 mdev->agreed_pro_version < 88)
817 rv = SS_NOT_SUPPORTED;
818
819 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
820 rv = SS_CONNECTED_OUTDATES;
821
822 return rv;
823}
824
825/**
826 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
827 * @mdev: DRBD device.
828 * @ns: new state.
829 * @os: old state.
830 */
831static enum drbd_state_rv
832is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
833 union drbd_state os)
 834{
 835 enum drbd_state_rv rv = SS_SUCCESS;
836
837 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
838 os.conn > C_CONNECTED)
839 rv = SS_RESYNC_RUNNING;
840
841 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
842 rv = SS_ALREADY_STANDALONE;
843
844 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
845 rv = SS_IS_DISKLESS;
846
847 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
848 rv = SS_NO_NET_CONFIG;
849
850 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
851 rv = SS_LOWER_THAN_OUTDATED;
852
853 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
854 rv = SS_IN_TRANSIENT_STATE;
855
856 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
857 rv = SS_IN_TRANSIENT_STATE;
858
859 /* While establishing a connection only allow cstate to change.
860 Delay/refuse role changes, detach attach etc... */
861 if (test_bit(STATE_SENT, &mdev->flags) &&
862 !(os.conn == C_WF_REPORT_PARAMS ||
863 (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
864 rv = SS_IN_TRANSIENT_STATE;
865
866 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
867 rv = SS_NEED_CONNECTION;
868
869 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
870 ns.conn != os.conn && os.conn > C_CONNECTED)
871 rv = SS_RESYNC_RUNNING;
872
873 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
874 os.conn < C_CONNECTED)
875 rv = SS_NEED_CONNECTION;
876
877 if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
878 && os.conn < C_WF_REPORT_PARAMS)
879 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
880
881 return rv;
882}
883
884static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
885{
886 static const char *msg_table[] = {
887 [NO_WARNING] = "",
888 [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
889 [ABORTED_RESYNC] = "Resync aborted.",
890 [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
891 [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
892 [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
893 };
894
895 if (warn != NO_WARNING)
896 dev_warn(DEV, "%s\n", msg_table[warn]);
897}
898
899/**
900 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
901 * @mdev: DRBD device.
902 * @os: old state.
903 * @ns: new state.
 904 * @warn: where to store an optional warning (may be NULL)
 905 *
 906 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
907 * to D_UNKNOWN. This rule and many more along those lines are in this function.
908 */
909static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
 910 union drbd_state ns, enum sanitize_state_warnings *warn)
911{
912 enum drbd_fencing_p fp;
 913 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
 914
915 if (warn)
916 *warn = NO_WARNING;
917
918 fp = FP_DONT_CARE;
919 if (get_ldev(mdev)) {
920 fp = mdev->ldev->dc.fencing;
921 put_ldev(mdev);
922 }
923
 924 /* Do not allow a network error to configure a device's network part */
925 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
926 os.conn <= C_DISCONNECTING)
927 ns.conn = os.conn;
928
929 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
930 * If you try to go into some Sync* state, that shall fail (elsewhere). */
 931 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
 932 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
933 ns.conn = os.conn;
934
935 /* we cannot fail (again) if we already detached */
936 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
937 ns.disk = D_DISKLESS;
938
939 /* After C_DISCONNECTING only C_STANDALONE may follow */
940 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
941 ns.conn = os.conn;
942
943 if (ns.conn < C_CONNECTED) {
944 ns.peer_isp = 0;
945 ns.peer = R_UNKNOWN;
946 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
947 ns.pdsk = D_UNKNOWN;
948 }
949
950 /* Clear the aftr_isp when becoming unconfigured */
951 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
952 ns.aftr_isp = 0;
953
954 /* Abort resync if a disk fails/detaches */
955 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
956 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
957 if (warn)
958 *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
959 ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
960 ns.conn = C_CONNECTED;
961 }
962
963 /* Connection breaks down before we finished "Negotiating" */
964 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
965 get_ldev_if_state(mdev, D_NEGOTIATING)) {
966 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
967 ns.disk = mdev->new_state_tmp.disk;
968 ns.pdsk = mdev->new_state_tmp.pdsk;
969 } else {
970 if (warn)
971 *warn = CONNECTION_LOST_NEGOTIATING;
972 ns.disk = D_DISKLESS;
973 ns.pdsk = D_UNKNOWN;
974 }
975 put_ldev(mdev);
976 }
977
978 /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
979 if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
980 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
981 ns.disk = D_UP_TO_DATE;
982 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
983 ns.pdsk = D_UP_TO_DATE;
984 }
985
 986 /* Implications of the connection state on the disk states */
987 disk_min = D_DISKLESS;
988 disk_max = D_UP_TO_DATE;
989 pdsk_min = D_INCONSISTENT;
990 pdsk_max = D_UNKNOWN;
991 switch ((enum drbd_conns)ns.conn) {
992 case C_WF_BITMAP_T:
993 case C_PAUSED_SYNC_T:
994 case C_STARTING_SYNC_T:
995 case C_WF_SYNC_UUID:
996 case C_BEHIND:
997 disk_min = D_INCONSISTENT;
998 disk_max = D_OUTDATED;
999 pdsk_min = D_UP_TO_DATE;
1000 pdsk_max = D_UP_TO_DATE;
1001 break;
1002 case C_VERIFY_S:
1003 case C_VERIFY_T:
1004 disk_min = D_UP_TO_DATE;
1005 disk_max = D_UP_TO_DATE;
1006 pdsk_min = D_UP_TO_DATE;
1007 pdsk_max = D_UP_TO_DATE;
1008 break;
1009 case C_CONNECTED:
1010 disk_min = D_DISKLESS;
1011 disk_max = D_UP_TO_DATE;
1012 pdsk_min = D_DISKLESS;
1013 pdsk_max = D_UP_TO_DATE;
1014 break;
1015 case C_WF_BITMAP_S:
1016 case C_PAUSED_SYNC_S:
1017 case C_STARTING_SYNC_S:
1018 case C_AHEAD:
1019 disk_min = D_UP_TO_DATE;
1020 disk_max = D_UP_TO_DATE;
1021 pdsk_min = D_INCONSISTENT;
1022 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
1023 break;
1024 case C_SYNC_TARGET:
1025 disk_min = D_INCONSISTENT;
1026 disk_max = D_INCONSISTENT;
1027 pdsk_min = D_UP_TO_DATE;
1028 pdsk_max = D_UP_TO_DATE;
1029 break;
1030 case C_SYNC_SOURCE:
1031 disk_min = D_UP_TO_DATE;
1032 disk_max = D_UP_TO_DATE;
1033 pdsk_min = D_INCONSISTENT;
1034 pdsk_max = D_INCONSISTENT;
1035 break;
1036 case C_STANDALONE:
1037 case C_DISCONNECTING:
1038 case C_UNCONNECTED:
1039 case C_TIMEOUT:
1040 case C_BROKEN_PIPE:
1041 case C_NETWORK_FAILURE:
1042 case C_PROTOCOL_ERROR:
1043 case C_TEAR_DOWN:
1044 case C_WF_CONNECTION:
1045 case C_WF_REPORT_PARAMS:
1046 case C_MASK:
1047 break;
1048 }
1049 if (ns.disk > disk_max)
1050 ns.disk = disk_max;
1051
1052 if (ns.disk < disk_min) {
1053 if (warn)
1054 *warn = IMPLICITLY_UPGRADED_DISK;
1055 ns.disk = disk_min;
1056 }
1057 if (ns.pdsk > pdsk_max)
1058 ns.pdsk = pdsk_max;
1059
1060 if (ns.pdsk < pdsk_min) {
1061 if (warn)
1062 *warn = IMPLICITLY_UPGRADED_PDSK;
1063 ns.pdsk = pdsk_min;
1064 }
1065
 1066 if (fp == FP_STONITH &&
1067 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
1068 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
 1069 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
1070
1071 if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
1072 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
1073 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
 1074 ns.susp_nod = 1; /* Suspend IO while no accessible data is available */
1075
1076 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
1077 if (ns.conn == C_SYNC_SOURCE)
1078 ns.conn = C_PAUSED_SYNC_S;
1079 if (ns.conn == C_SYNC_TARGET)
1080 ns.conn = C_PAUSED_SYNC_T;
1081 } else {
1082 if (ns.conn == C_PAUSED_SYNC_S)
1083 ns.conn = C_SYNC_SOURCE;
1084 if (ns.conn == C_PAUSED_SYNC_T)
1085 ns.conn = C_SYNC_TARGET;
1086 }
1087
1088 return ns;
1089}
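/* Worked example (sketch): if the connection to a peer with an up to date
 * disk is lost, the requested ns.conn drops below C_CONNECTED, and
 * sanitize_state() above clears peer_isp and forces ns.peer = R_UNKNOWN and
 * ns.pdsk = D_UNKNOWN before the new state is committed. */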
1090
1091/* helper for __drbd_set_state */
1092static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
1093{
1094 if (mdev->agreed_pro_version < 90)
1095 mdev->ov_start_sector = 0;
1096 mdev->rs_total = drbd_bm_bits(mdev);
1097 mdev->ov_position = 0;
1098 if (cs == C_VERIFY_T) {
1099 /* starting online verify from an arbitrary position
1100 * does not fit well into the existing protocol.
1101 * on C_VERIFY_T, we initialize ov_left and friends
1102 * implicitly in receive_DataRequest once the
1103 * first P_OV_REQUEST is received */
1104 mdev->ov_start_sector = ~(sector_t)0;
1105 } else {
1106 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
 1107 if (bit >= mdev->rs_total) {
1108 mdev->ov_start_sector =
1109 BM_BIT_TO_SECT(mdev->rs_total - 1);
1110 mdev->rs_total = 1;
1111 } else
1112 mdev->rs_total -= bit;
1113 mdev->ov_position = mdev->ov_start_sector;
1114 }
 1115 mdev->ov_left = mdev->rs_total;
1116}
1117
1118static void drbd_resume_al(struct drbd_conf *mdev)
1119{
1120 if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1121 dev_info(DEV, "Resumed AL updates\n");
1122}
1123
1124/**
1125 * __drbd_set_state() - Set a new DRBD state
1126 * @mdev: DRBD device.
1127 * @ns: new state.
1128 * @flags: Flags
1129 * @done: Optional completion, that will get completed after the after_state_ch() finished
1130 *
1131 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1132 */
1133enum drbd_state_rv
1134__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1135 enum chg_state_flags flags, struct completion *done)
1136{
1137 union drbd_state os;
 1138 enum drbd_state_rv rv = SS_SUCCESS;
 1139 enum sanitize_state_warnings ssw;
1140 struct after_state_chg_work *ascw;
1141
1142 os = mdev->state;
1143
 1144 ns = sanitize_state(mdev, os, ns, &ssw);
1145
1146 if (ns.i == os.i)
1147 return SS_NOTHING_TO_DO;
1148
1149 if (!(flags & CS_HARD)) {
1150 /* pre-state-change checks ; only look at ns */
1151 /* See drbd_state_sw_errors in drbd_strings.c */
1152
1153 rv = is_valid_state(mdev, ns);
1154 if (rv < SS_SUCCESS) {
1155 /* If the old state was illegal as well, then let
1156 this happen...*/
1157
 1158 if (is_valid_state(mdev, os) == rv)
 1159 rv = is_valid_state_transition(mdev, ns, os);
1160 } else
1161 rv = is_valid_state_transition(mdev, ns, os);
1162 }
1163
1164 if (rv < SS_SUCCESS) {
1165 if (flags & CS_VERBOSE)
1166 print_st_err(mdev, os, ns, rv);
1167 return rv;
1168 }
1169
 1170 print_sanitize_warnings(mdev, ssw);
1171
1172 {
1173 char *pbp, pb[300];
1174 pbp = pb;
1175 *pbp = 0;
1176 if (ns.role != os.role)
1177 pbp += sprintf(pbp, "role( %s -> %s ) ",
1178 drbd_role_str(os.role),
1179 drbd_role_str(ns.role));
1180 if (ns.peer != os.peer)
1181 pbp += sprintf(pbp, "peer( %s -> %s ) ",
1182 drbd_role_str(os.peer),
1183 drbd_role_str(ns.peer));
1184 if (ns.conn != os.conn)
1185 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1186 drbd_conn_str(os.conn),
1187 drbd_conn_str(ns.conn));
1188 if (ns.disk != os.disk)
1189 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1190 drbd_disk_str(os.disk),
1191 drbd_disk_str(ns.disk));
1192 if (ns.pdsk != os.pdsk)
1193 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1194 drbd_disk_str(os.pdsk),
1195 drbd_disk_str(ns.pdsk));
1196 if (is_susp(ns) != is_susp(os))
1197 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1198 is_susp(os),
1199 is_susp(ns));
1200 if (ns.aftr_isp != os.aftr_isp)
1201 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1202 os.aftr_isp,
1203 ns.aftr_isp);
1204 if (ns.peer_isp != os.peer_isp)
1205 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1206 os.peer_isp,
1207 ns.peer_isp);
1208 if (ns.user_isp != os.user_isp)
1209 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1210 os.user_isp,
1211 ns.user_isp);
1212 dev_info(DEV, "%s\n", pb);
1213 }
1214
1215 /* solve the race between becoming unconfigured,
1216 * worker doing the cleanup, and
1217 * admin reconfiguring us:
1218 * on (re)configure, first set CONFIG_PENDING,
1219 * then wait for a potentially exiting worker,
1220 * start the worker, and schedule one no_op.
1221 * then proceed with configuration.
1222 */
1223 if (ns.disk == D_DISKLESS &&
1224 ns.conn == C_STANDALONE &&
1225 ns.role == R_SECONDARY &&
1226 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1227 set_bit(DEVICE_DYING, &mdev->flags);
1228
1229 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1230 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1231 * drbd_ldev_destroy() won't happen before our corresponding
1232 * after_state_ch works run, where we put_ldev again. */
1233 if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1234 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1235 atomic_inc(&mdev->local_cnt);
1236
1237 mdev->state = ns;
1238
1239 if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1240 drbd_print_uuids(mdev, "attached to UUIDs");
1241
1242 wake_up(&mdev->misc_wait);
1243 wake_up(&mdev->state_wait);
1244
1245 /* aborted verify run. log the last position */
1246 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1247 ns.conn < C_CONNECTED) {
1248 mdev->ov_start_sector =
 1249 BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1250 dev_info(DEV, "Online Verify reached sector %llu\n",
1251 (unsigned long long)mdev->ov_start_sector);
1252 }
1253
1254 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1255 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1256 dev_info(DEV, "Syncer continues.\n");
1257 mdev->rs_paused += (long)jiffies
1258 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1259 if (ns.conn == C_SYNC_TARGET)
1260 mod_timer(&mdev->resync_timer, jiffies);
1261 }
1262
1263 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1264 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1265 dev_info(DEV, "Resync suspended\n");
 1266 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1267 }
1268
1269 if (os.conn == C_CONNECTED &&
1270 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1271 unsigned long now = jiffies;
1272 int i;
1273
 1274 set_ov_position(mdev, ns.conn);
 1275 mdev->rs_start = now;
1276 mdev->rs_last_events = 0;
1277 mdev->rs_last_sect_ev = 0;
1278 mdev->ov_last_oos_size = 0;
1279 mdev->ov_last_oos_start = 0;
1280
 1281 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
 1282 mdev->rs_mark_left[i] = mdev->ov_left;
1283 mdev->rs_mark_time[i] = now;
1284 }
1285
1286 drbd_rs_controller_reset(mdev);
1287
1288 if (ns.conn == C_VERIFY_S) {
1289 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1290 (unsigned long long)mdev->ov_position);
1291 mod_timer(&mdev->resync_timer, jiffies);
1292 }
1293 }
1294
1295 if (get_ldev(mdev)) {
1296 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1297 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1298 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1299
1300 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1301 mdf |= MDF_CRASHED_PRIMARY;
1302 if (mdev->state.role == R_PRIMARY ||
1303 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1304 mdf |= MDF_PRIMARY_IND;
1305 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1306 mdf |= MDF_CONNECTED_IND;
1307 if (mdev->state.disk > D_INCONSISTENT)
1308 mdf |= MDF_CONSISTENT;
1309 if (mdev->state.disk > D_OUTDATED)
1310 mdf |= MDF_WAS_UP_TO_DATE;
1311 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1312 mdf |= MDF_PEER_OUT_DATED;
1313 if (mdf != mdev->ldev->md.flags) {
1314 mdev->ldev->md.flags = mdf;
1315 drbd_md_mark_dirty(mdev);
1316 }
1317 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1318 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1319 put_ldev(mdev);
1320 }
1321
1322 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1323 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1324 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1325 set_bit(CONSIDER_RESYNC, &mdev->flags);
1326
1327 /* Receiver should clean up itself */
1328 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1329 drbd_thread_stop_nowait(&mdev->receiver);
1330
1331 /* Now the receiver finished cleaning up itself, it should die */
1332 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1333 drbd_thread_stop_nowait(&mdev->receiver);
1334
1335 /* Upon network failure, we need to restart the receiver. */
 1336 if (os.conn > C_WF_CONNECTION &&
1337 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1338 drbd_thread_restart_nowait(&mdev->receiver);
1339
1340 /* Resume AL writing if we get a connection */
1341 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1342 drbd_resume_al(mdev);
1343
1344 /* remember last connect and attach times so request_timer_fn() won't
1345 * kill newly established sessions while we are still trying to thaw
1346 * previously frozen IO */
1347 if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
1348 mdev->last_reconnect_jif = jiffies;
1349 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1350 ns.disk > D_NEGOTIATING)
1351 mdev->last_reattach_jif = jiffies;
1352
1353 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1354 if (ascw) {
1355 ascw->os = os;
1356 ascw->ns = ns;
1357 ascw->flags = flags;
1358 ascw->w.cb = w_after_state_ch;
1359 ascw->done = done;
1360 drbd_queue_work(&mdev->data.work, &ascw->w);
1361 } else {
1362 dev_warn(DEV, "Could not kmalloc an ascw\n");
1363 }
1364
1365 return rv;
1366}
1367
1368static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1369{
1370 struct after_state_chg_work *ascw =
1371 container_of(w, struct after_state_chg_work, w);
1372 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1373 if (ascw->flags & CS_WAIT_COMPLETE) {
1374 D_ASSERT(ascw->done != NULL);
1375 complete(ascw->done);
1376 }
1377 kfree(ascw);
1378
1379 return 1;
1380}
1381
1382static void abw_start_sync(struct drbd_conf *mdev, int rv)
1383{
1384 if (rv) {
1385 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1386 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1387 return;
1388 }
1389
1390 switch (mdev->state.conn) {
1391 case C_STARTING_SYNC_T:
1392 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1393 break;
1394 case C_STARTING_SYNC_S:
1395 drbd_start_resync(mdev, C_SYNC_SOURCE);
1396 break;
1397 }
1398}
1399
1400int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1401 int (*io_fn)(struct drbd_conf *),
1402 char *why, enum bm_flag flags)
1403{
1404 int rv;
1405
1406 D_ASSERT(current == mdev->worker.task);
1407
1408 /* open coded non-blocking drbd_suspend_io(mdev); */
1409 set_bit(SUSPEND_IO, &mdev->flags);
 1410
 1411 drbd_bm_lock(mdev, why, flags);
1412 rv = io_fn(mdev);
1413 drbd_bm_unlock(mdev);
1414
1415 drbd_resume_io(mdev);
1416
1417 return rv;
1418}
1419
1420/**
1421 * after_state_ch() - Perform after state change actions that may sleep
1422 * @mdev: DRBD device.
1423 * @os: old state.
1424 * @ns: new state.
1425 * @flags: Flags
1426 */
1427static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1428 union drbd_state ns, enum chg_state_flags flags)
1429{
1430 enum drbd_fencing_p fp;
 1431 enum drbd_req_event what = nothing;
 1432 union drbd_state nsm = (union drbd_state){ .i = -1 };
1433
1434 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1435 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1436 if (mdev->p_uuid)
1437 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1438 }
1439
1440 fp = FP_DONT_CARE;
1441 if (get_ldev(mdev)) {
1442 fp = mdev->ldev->dc.fencing;
1443 put_ldev(mdev);
1444 }
1445
1446 /* Inform userspace about the change... */
1447 drbd_bcast_state(mdev, ns);
1448
1449 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1450 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1451 drbd_khelper(mdev, "pri-on-incon-degr");
1452
1453 /* Here we have the actions that are performed after a
1454 state change. This function might sleep */
1455
1456 if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
1457 mod_timer(&mdev->request_timer, jiffies + HZ);
1458
1459 nsm.i = -1;
1460 if (ns.susp_nod) {
1461 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1462 what = resend;
 1463
1464 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1465 ns.disk > D_NEGOTIATING)
 1466 what = restart_frozen_disk_io;
 1467
1468 if (what != nothing)
1469 nsm.susp_nod = 0;
1470 }
1471
 1472 if (ns.susp_fen) {
1473 /* case1: The outdate peer handler is successful: */
1474 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
1475 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1476 drbd_uuid_new_current(mdev);
1477 clear_bit(NEW_CUR_UUID, &mdev->flags);
 1478 }
 1479 spin_lock_irq(&mdev->req_lock);
 1480 _tl_clear(mdev);
 1481 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1482 spin_unlock_irq(&mdev->req_lock);
1483 }
1484 /* case2: The connection was established again: */
1485 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1486 clear_bit(NEW_CUR_UUID, &mdev->flags);
 1487 what = resend;
 1488 nsm.susp_fen = 0;
 1489 }
 1490 }
1491
1492 if (what != nothing) {
1493 spin_lock_irq(&mdev->req_lock);
1494 _tl_restart(mdev, what);
1495 nsm.i &= mdev->state.i;
1496 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
 1497 spin_unlock_irq(&mdev->req_lock);
 1498 }
 1499
1500 /* Became sync source. With protocol >= 96, we still need to send out
1501 * the sync uuid now. Need to do that before any drbd_send_state, or
1502 * the other side may go "paused sync" before receiving the sync uuids,
1503 * which is unexpected. */
1504 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1505 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1506 mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1507 drbd_gen_and_send_sync_uuid(mdev);
1508 put_ldev(mdev);
1509 }
1510
1511 /* Do not change the order of the if above and the two below... */
1512 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1513 /* we probably will start a resync soon.
1514 * make sure those things are properly reset. */
1515 mdev->rs_total = 0;
1516 mdev->rs_failed = 0;
1517 atomic_set(&mdev->rs_pending_cnt, 0);
1518 drbd_rs_cancel_all(mdev);
1519
 1520 drbd_send_uuids(mdev);
 1521 drbd_send_state(mdev, ns);
 1522 }
1523 /* No point in queuing send_bitmap if we don't have a connection
1524 * anymore, so check also the _current_ state, not only the new state
1525 * at the time this work was queued. */
1526 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1527 mdev->state.conn == C_WF_BITMAP_S)
1528 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
1529 "send_bitmap (WFBitMapS)",
1530 BM_LOCKED_TEST_ALLOWED);
1531
1532 /* Lost contact to peer's copy of the data */
1533 if ((os.pdsk >= D_INCONSISTENT &&
1534 os.pdsk != D_UNKNOWN &&
1535 os.pdsk != D_OUTDATED)
1536 && (ns.pdsk < D_INCONSISTENT ||
1537 ns.pdsk == D_UNKNOWN ||
1538 ns.pdsk == D_OUTDATED)) {
1539 if (get_ldev(mdev)) {
1540 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
 1541 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
 1542 if (is_susp(mdev->state)) {
1543 set_bit(NEW_CUR_UUID, &mdev->flags);
1544 } else {
1545 drbd_uuid_new_current(mdev);
1546 drbd_send_uuids(mdev);
1547 }
 1548 }
1549 put_ldev(mdev);
1550 }
1551 }
1552
1553 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1554 if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
1555 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
 1556 drbd_uuid_new_current(mdev);
1557 drbd_send_uuids(mdev);
1558 }
1559 /* D_DISKLESS Peer becomes secondary */
1560 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1561 /* We may still be Primary ourselves.
1562 * No harm done if the bitmap still changes,
1563 * redirtied pages will follow later. */
1564 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1565 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
1566 put_ldev(mdev);
1567 }
1568
1569 /* Write out all changed bits on demote.
 1570 * Though, no need to do that just yet
1571 * if there is a resync going on still */
1572 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1573 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
1574 /* No changes to the bitmap expected this time, so assert that,
1575 * even though no harm was done if it did change. */
1576 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1577 "demote", BM_LOCKED_TEST_ALLOWED);
1578 put_ldev(mdev);
1579 }
1580
1581 /* Last part of the attaching process ... */
1582 if (ns.conn >= C_CONNECTED &&
1583 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
 1584 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
 1585 drbd_send_uuids(mdev);
 1586 drbd_send_state(mdev, ns);
1587 }
1588
1589 /* We want to pause/continue resync, tell peer. */
1590 if (ns.conn >= C_CONNECTED &&
1591 ((os.aftr_isp != ns.aftr_isp) ||
1592 (os.user_isp != ns.user_isp)))
 1593 drbd_send_state(mdev, ns);
1594
1595 /* In case one of the isp bits got set, suspend other devices. */
1596 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1597 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1598 suspend_other_sg(mdev);
1599
1600 /* Make sure the peer gets informed about eventual state
1601 changes (ISP bits) while we were in WFReportParams. */
1602 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
 1603 drbd_send_state(mdev, ns);
 1604
 1605 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
 1606 drbd_send_state(mdev, ns);
 1607
 1608 /* We are in the process of starting a full sync... */
1609 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1610 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1611 /* no other bitmap changes expected during this phase */
1612 drbd_queue_bitmap_io(mdev,
1613 &drbd_bmio_set_n_write, &abw_start_sync,
1614 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
1615
1616 /* We are invalidating our self... */
1617 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1618 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1619 /* other bitmap operation expected during this phase */
1620 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1621 "set_n_write from invalidate", BM_LOCKED_MASK);
 1622
1623 /* first half of local IO error, failure to attach,
1624 * or administrative detach */
1625 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1626 enum drbd_io_error_p eh = EP_PASS_ON;
1627 int was_io_error = 0;
 1628 /* corresponding get_ldev was in __drbd_set_state, to serialize
1629 * our cleanup here with the transition to D_DISKLESS.
 1630 * But it is still not safe to dereference ldev here, since
 1631 * we might come from a failed Attach before ldev was set. */
1632 if (mdev->ldev) {
1633 eh = mdev->ldev->dc.on_io_error;
1634 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1635
1636 if (was_io_error && eh == EP_CALL_HELPER)
1637 drbd_khelper(mdev, "local-io-error");
1638
1639 /* Immediately allow completion of all application IO,
1640 * that waits for completion from the local disk,
1641 * if this was a force-detach due to disk_timeout
1642 * or administrator request (drbdsetup detach --force).
1643 * Do NOT abort otherwise.
1644 * Aborting local requests may cause serious problems,
1645 * if requests are completed to upper layers already,
1646 * and then later the already submitted local bio completes.
1647 * This can cause DMA into former bio pages that meanwhile
1648 * have been re-used for other things.
1649 * So aborting local requests may cause crashes,
1650 * or even worse, silent data corruption.
1651 */
1652 if (test_and_clear_bit(FORCE_DETACH, &mdev->flags))
1653 tl_abort_disk_io(mdev);
1654
1655 /* current state still has to be D_FAILED,
1656 * there is only one way out: to D_DISKLESS,
1657 * and that may only happen after our put_ldev below. */
1658 if (mdev->state.disk != D_FAILED)
1659 dev_err(DEV,
1660 "ASSERT FAILED: disk is %s during detach\n",
1661 drbd_disk_str(mdev->state.disk));
1662
1663 if (ns.conn >= C_CONNECTED)
1664 drbd_send_state(mdev, ns);
1665
1666 drbd_rs_cancel_all(mdev);
1667
1668 /* In case we want to get something to stable storage still,
1669 * this may be the last chance.
1670 * Following put_ldev may transition to D_DISKLESS. */
1671 drbd_md_sync(mdev);
1672 }
82f59cc6 1673 put_ldev(mdev);
e9e6f3ec 1674 }
b411b363 1675
82f59cc6
LE
1676 /* second half of local IO error, failure to attach,
1677 * or administrative detach,
1678 * after local_cnt references have reached zero again */
1679 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1680 /* We must still be diskless,
1681 * re-attach has to be serialized with this! */
1682 if (mdev->state.disk != D_DISKLESS)
1683 dev_err(DEV,
1684 "ASSERT FAILED: disk is %s while going diskless\n",
1685 drbd_disk_str(mdev->state.disk));
e9e6f3ec 1686
4afc433c
PR
1687 if (ns.conn >= C_CONNECTED)
1688 drbd_send_state(mdev, ns);
1689
82f59cc6 1690 /* corresponding get_ldev in __drbd_set_state
25985edc 1691 * this may finally trigger drbd_ldev_destroy. */
82f59cc6 1692 put_ldev(mdev);
b411b363
PR
1693 }
1694
738a84b2 1696	/* Notify peer that I had a local IO error, and did not detach. */
4afc433c 1696 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
f479ea06 1697 drbd_send_state(mdev, ns);
738a84b2 1698
b411b363
PR
1699 /* Disks got bigger while they were detached */
1700 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1701 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1702 if (ns.conn == C_CONNECTED)
1703 resync_after_online_grow(mdev);
1704 }
1705
1706 /* A resync finished or aborted, wake paused devices... */
1707 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1708 (os.peer_isp && !ns.peer_isp) ||
1709 (os.user_isp && !ns.user_isp))
1710 resume_next_sg(mdev);
1711
af85e8e8
LE
1712 /* sync target done with resync. Explicitly notify peer, even though
 1713	 * it should (at least for non-empty resyncs) already know it by itself. */
1714 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
f479ea06 1715 drbd_send_state(mdev, ns);
af85e8e8 1716
197296ff
PR
 1717	/* Wake up role changes that were delayed while the connection was being established */
1718 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
1719 clear_bit(STATE_SENT, &mdev->flags);
1720 wake_up(&mdev->state_wait);
1721 }
1722
79a30d2d
LE
1723 /* This triggers bitmap writeout of potentially still unwritten pages
1724 * if the resync finished cleanly, or aborted because of peer disk
20ceb2b2 1725 * failure, or because of connection loss.
79a30d2d
LE
1726 * For resync aborted because of local disk failure, we cannot do
1727 * any bitmap writeout anymore.
20ceb2b2 1728 * No harm done if some bits change during this phase.
79a30d2d 1729 */
20ceb2b2 1730 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
0e8488ad
LE
1731 drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
1732 "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
79a30d2d
LE
1733 put_ldev(mdev);
1734 }
02851e9f 1735
f70b3511 1736	/* free tl_hash if we got thawed and are C_STANDALONE */
fb22c402 1737 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
f70b3511
PR
1738 drbd_free_tl_hash(mdev);
1739
b411b363
PR
1740 /* Upon network connection, we need to start the receiver */
1741 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1742 drbd_thread_start(&mdev->receiver);
1743
1744 /* Terminate worker thread if we are unconfigured - it will be
1745 restarted as needed... */
1746 if (ns.disk == D_DISKLESS &&
1747 ns.conn == C_STANDALONE &&
1748 ns.role == R_SECONDARY) {
1749 if (os.aftr_isp != ns.aftr_isp)
1750 resume_next_sg(mdev);
1751 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1752 if (test_bit(DEVICE_DYING, &mdev->flags))
1753 drbd_thread_stop_nowait(&mdev->worker);
1754 }
1755
1756 drbd_md_sync(mdev);
1757}
1758
1759
1760static int drbd_thread_setup(void *arg)
1761{
1762 struct drbd_thread *thi = (struct drbd_thread *) arg;
1763 struct drbd_conf *mdev = thi->mdev;
1764 unsigned long flags;
1765 int retval;
1766
1767restart:
1768 retval = thi->function(thi);
1769
1770 spin_lock_irqsave(&thi->t_lock, flags);
1771
1772 /* if the receiver has been "Exiting", the last thing it did
1773 * was set the conn state to "StandAlone",
1774 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1775 * and receiver thread will be "started".
1776 * drbd_thread_start needs to set "Restarting" in that case.
1777 * t_state check and assignment needs to be within the same spinlock,
1778 * so either thread_start sees Exiting, and can remap to Restarting,
 1779	 * or thread_start sees None, and can proceed as normal.
1780 */
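	/* Rough sketch of the thread states handled below and in
	 * drbd_thread_start()/_drbd_thread_stop(), for orientation only:
	 *
	 *   None --drbd_thread_start()--> Running
	 *   Running --_drbd_thread_stop()--> Exiting --(thi->function returns)--> None
	 *   Exiting --drbd_thread_start() raced in--> Restarting --> Running (restart loop)
	 */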
1781
1782 if (thi->t_state == Restarting) {
1783 dev_info(DEV, "Restarting %s\n", current->comm);
1784 thi->t_state = Running;
1785 spin_unlock_irqrestore(&thi->t_lock, flags);
1786 goto restart;
1787 }
1788
1789 thi->task = NULL;
1790 thi->t_state = None;
1791 smp_mb();
1792 complete(&thi->stop);
1793 spin_unlock_irqrestore(&thi->t_lock, flags);
1794
1795 dev_info(DEV, "Terminating %s\n", current->comm);
1796
1797 /* Release mod reference taken when thread was started */
1798 module_put(THIS_MODULE);
1799 return retval;
1800}
1801
1802static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1803 int (*func) (struct drbd_thread *))
1804{
1805 spin_lock_init(&thi->t_lock);
1806 thi->task = NULL;
1807 thi->t_state = None;
1808 thi->function = func;
1809 thi->mdev = mdev;
1810}
1811
1812int drbd_thread_start(struct drbd_thread *thi)
1813{
1814 struct drbd_conf *mdev = thi->mdev;
1815 struct task_struct *nt;
1816 unsigned long flags;
1817
1818 const char *me =
1819 thi == &mdev->receiver ? "receiver" :
1820 thi == &mdev->asender ? "asender" :
1821 thi == &mdev->worker ? "worker" : "NONSENSE";
1822
1823 /* is used from state engine doing drbd_thread_stop_nowait,
1824 * while holding the req lock irqsave */
1825 spin_lock_irqsave(&thi->t_lock, flags);
1826
1827 switch (thi->t_state) {
1828 case None:
1829 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1830 me, current->comm, current->pid);
1831
1832 /* Get ref on module for thread - this is released when thread exits */
1833 if (!try_module_get(THIS_MODULE)) {
1834 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1835 spin_unlock_irqrestore(&thi->t_lock, flags);
81e84650 1836 return false;
b411b363
PR
1837 }
1838
1839 init_completion(&thi->stop);
1840 D_ASSERT(thi->task == NULL);
1841 thi->reset_cpu_mask = 1;
1842 thi->t_state = Running;
1843 spin_unlock_irqrestore(&thi->t_lock, flags);
1844 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1845
1846 nt = kthread_create(drbd_thread_setup, (void *) thi,
1847 "drbd%d_%s", mdev_to_minor(mdev), me);
1848
1849 if (IS_ERR(nt)) {
1850 dev_err(DEV, "Couldn't start thread\n");
1851
1852 module_put(THIS_MODULE);
81e84650 1853 return false;
b411b363
PR
1854 }
1855 spin_lock_irqsave(&thi->t_lock, flags);
1856 thi->task = nt;
1857 thi->t_state = Running;
1858 spin_unlock_irqrestore(&thi->t_lock, flags);
1859 wake_up_process(nt);
1860 break;
1861 case Exiting:
1862 thi->t_state = Restarting;
1863 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1864 me, current->comm, current->pid);
1865 /* fall through */
1866 case Running:
1867 case Restarting:
1868 default:
1869 spin_unlock_irqrestore(&thi->t_lock, flags);
1870 break;
1871 }
1872
81e84650 1873 return true;
b411b363
PR
1874}
1875
1876
1877void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1878{
1879 unsigned long flags;
1880
1881 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1882
1883 /* may be called from state engine, holding the req lock irqsave */
1884 spin_lock_irqsave(&thi->t_lock, flags);
1885
1886 if (thi->t_state == None) {
1887 spin_unlock_irqrestore(&thi->t_lock, flags);
1888 if (restart)
1889 drbd_thread_start(thi);
1890 return;
1891 }
1892
1893 if (thi->t_state != ns) {
1894 if (thi->task == NULL) {
1895 spin_unlock_irqrestore(&thi->t_lock, flags);
1896 return;
1897 }
1898
1899 thi->t_state = ns;
1900 smp_mb();
1901 init_completion(&thi->stop);
1902 if (thi->task != current)
1903 force_sig(DRBD_SIGKILL, thi->task);
1904
1905 }
1906
1907 spin_unlock_irqrestore(&thi->t_lock, flags);
1908
1909 if (wait)
1910 wait_for_completion(&thi->stop);
1911}
1912
1913#ifdef CONFIG_SMP
1914/**
1915 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1916 * @mdev: DRBD device.
1917 *
1918 * Forces all threads of a device onto the same CPU. This is beneficial for
 1919	 * DRBD's performance. May be overridden by the user's configuration.
1920 */
1921void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1922{
1923 int ord, cpu;
1924
1925 /* user override. */
1926 if (cpumask_weight(mdev->cpu_mask))
1927 return;
1928
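	/* Example: with four online CPUs, minor 0 is pinned to the first
	 * online CPU, minor 1 to the second, minor 4 to the first again,
	 * and so on. */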
1929 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1930 for_each_online_cpu(cpu) {
1931 if (ord-- == 0) {
1932 cpumask_set_cpu(cpu, mdev->cpu_mask);
1933 return;
1934 }
1935 }
1936 /* should not be reached */
1937 cpumask_setall(mdev->cpu_mask);
1938}
1939
1940/**
1941 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1942 * @mdev: DRBD device.
1943 *
1944 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1945 * prematurely.
1946 */
1947void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1948{
1949 struct task_struct *p = current;
1950 struct drbd_thread *thi =
1951 p == mdev->asender.task ? &mdev->asender :
1952 p == mdev->receiver.task ? &mdev->receiver :
1953 p == mdev->worker.task ? &mdev->worker :
1954 NULL;
1955 ERR_IF(thi == NULL)
1956 return;
1957 if (!thi->reset_cpu_mask)
1958 return;
1959 thi->reset_cpu_mask = 0;
1960 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1961}
1962#endif
1963
1964/* the appropriate socket mutex must be held already */
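/* Fill in the p_header80 (magic, command, and payload length, i.e. the
 * packet size minus the header itself) and send the whole packet on the
 * given socket. */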
1965int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
0b70a13d 1966 enum drbd_packets cmd, struct p_header80 *h,
b411b363
PR
1967 size_t size, unsigned msg_flags)
1968{
1969 int sent, ok;
1970
81e84650
AG
1971 ERR_IF(!h) return false;
1972 ERR_IF(!size) return false;
b411b363
PR
1973
1974 h->magic = BE_DRBD_MAGIC;
1975 h->command = cpu_to_be16(cmd);
0b70a13d 1976 h->length = cpu_to_be16(size-sizeof(struct p_header80));
b411b363 1977
b411b363
PR
1978 sent = drbd_send(mdev, sock, h, size, msg_flags);
1979
1980 ok = (sent == size);
0ddc5549
LE
1981 if (!ok && !signal_pending(current))
1982 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
b411b363
PR
1983 cmdname(cmd), (int)size, sent);
1984 return ok;
1985}
1986
1987/* don't pass the socket. we may only look at it
1988 * when we hold the appropriate socket mutex.
1989 */
1990int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
0b70a13d 1991 enum drbd_packets cmd, struct p_header80 *h, size_t size)
b411b363
PR
1992{
1993 int ok = 0;
1994 struct socket *sock;
1995
1996 if (use_data_socket) {
1997 mutex_lock(&mdev->data.mutex);
1998 sock = mdev->data.socket;
1999 } else {
2000 mutex_lock(&mdev->meta.mutex);
2001 sock = mdev->meta.socket;
2002 }
2003
2004 /* drbd_disconnect() could have called drbd_free_sock()
2005 * while we were waiting in down()... */
2006 if (likely(sock != NULL))
2007 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
2008
2009 if (use_data_socket)
2010 mutex_unlock(&mdev->data.mutex);
2011 else
2012 mutex_unlock(&mdev->meta.mutex);
2013 return ok;
2014}
2015
2016int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
2017 size_t size)
2018{
0b70a13d 2019 struct p_header80 h;
b411b363
PR
2020 int ok;
2021
2022 h.magic = BE_DRBD_MAGIC;
2023 h.command = cpu_to_be16(cmd);
2024 h.length = cpu_to_be16(size);
2025
2026 if (!drbd_get_data_sock(mdev))
2027 return 0;
2028
b411b363
PR
2029 ok = (sizeof(h) ==
2030 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
2031 ok = ok && (size ==
2032 drbd_send(mdev, mdev->data.socket, data, size, 0));
2033
2034 drbd_put_data_sock(mdev);
2035
2036 return ok;
2037}
2038
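/* The wire size of the sync parameter packet depends on the agreed protocol
 * version (see the size computation below): up to apv 87 a plain p_rs_param,
 * apv 88 additionally carries the verify_alg string, apv 89..94 use
 * p_rs_param_89, and apv 95 and later use p_rs_param_95 with the c_*
 * resync controller settings. */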
2039int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
2040{
8e26f9cc 2041 struct p_rs_param_95 *p;
b411b363
PR
2042 struct socket *sock;
2043 int size, rv;
2044 const int apv = mdev->agreed_pro_version;
2045
2046 size = apv <= 87 ? sizeof(struct p_rs_param)
2047 : apv == 88 ? sizeof(struct p_rs_param)
2048 + strlen(mdev->sync_conf.verify_alg) + 1
8e26f9cc
PR
2049 : apv <= 94 ? sizeof(struct p_rs_param_89)
2050 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
b411b363
PR
2051
2052 /* used from admin command context and receiver/worker context.
2053 * to avoid kmalloc, grab the socket right here,
2054 * then use the pre-allocated sbuf there */
2055 mutex_lock(&mdev->data.mutex);
2056 sock = mdev->data.socket;
2057
2058 if (likely(sock != NULL)) {
2059 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
2060
8e26f9cc 2061 p = &mdev->data.sbuf.rs_param_95;
b411b363
PR
2062
2063 /* initialize verify_alg and csums_alg */
2064 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2065
2066 p->rate = cpu_to_be32(sc->rate);
8e26f9cc
PR
2067 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
2068 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
2069 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
2070 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
b411b363
PR
2071
2072 if (apv >= 88)
2073 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
2074 if (apv >= 89)
2075 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
2076
2077 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
2078 } else
2079 rv = 0; /* not ok */
2080
2081 mutex_unlock(&mdev->data.mutex);
2082
2083 return rv;
2084}
2085
2086int drbd_send_protocol(struct drbd_conf *mdev)
2087{
2088 struct p_protocol *p;
cf14c2e9 2089 int size, cf, rv;
b411b363
PR
2090
2091 size = sizeof(struct p_protocol);
2092
2093 if (mdev->agreed_pro_version >= 87)
2094 size += strlen(mdev->net_conf->integrity_alg) + 1;
2095
2096 /* we must not recurse into our own queue,
2097 * as that is blocked during handshake */
2098 p = kmalloc(size, GFP_NOIO);
2099 if (p == NULL)
2100 return 0;
2101
2102 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
2103 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
2104 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
2105 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
b411b363
PR
2106 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2107
cf14c2e9
PR
2108 cf = 0;
2109 if (mdev->net_conf->want_lose)
2110 cf |= CF_WANT_LOSE;
2111 if (mdev->net_conf->dry_run) {
2112 if (mdev->agreed_pro_version >= 92)
2113 cf |= CF_DRY_RUN;
2114 else {
2115 dev_err(DEV, "--dry-run is not supported by peer");
7ac314c8 2116 kfree(p);
148efa16 2117 return -1;
cf14c2e9
PR
2118 }
2119 }
2120 p->conn_flags = cpu_to_be32(cf);
2121
b411b363
PR
2122 if (mdev->agreed_pro_version >= 87)
2123 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2124
2125 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
0b70a13d 2126 (struct p_header80 *)p, size);
b411b363
PR
2127 kfree(p);
2128 return rv;
2129}
2130
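/* uuid_flags bits set below and by the callers further down:
 *   1 -- net_conf->want_lose is set
 *   2 -- the CRASHED_PRIMARY flag was set
 *   4 -- new_state_tmp.disk == D_INCONSISTENT
 *   8 -- passed in by drbd_send_uuids_skip_initial_sync()
 */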
2131int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2132{
2133 struct p_uuids p;
2134 int i;
2135
2136 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2137 return 1;
2138
2139 for (i = UI_CURRENT; i < UI_SIZE; i++)
2140 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2141
2142 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2143 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2144 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2145 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2146 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2147 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2148
2149 put_ldev(mdev);
2150
2151 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
0b70a13d 2152 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2153}
2154
2155int drbd_send_uuids(struct drbd_conf *mdev)
2156{
2157 return _drbd_send_uuids(mdev, 0);
2158}
2159
2160int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2161{
2162 return _drbd_send_uuids(mdev, 8);
2163}
2164
62b0da3a
LE
2165void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2166{
2167 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2168 u64 *uuid = mdev->ldev->md.uuid;
2169 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2170 text,
2171 (unsigned long long)uuid[UI_CURRENT],
2172 (unsigned long long)uuid[UI_BITMAP],
2173 (unsigned long long)uuid[UI_HISTORY_START],
2174 (unsigned long long)uuid[UI_HISTORY_END]);
2175 put_ldev(mdev);
2176 } else {
2177 dev_info(DEV, "%s effective data uuid: %016llX\n",
2178 text,
2179 (unsigned long long)mdev->ed_uuid);
2180 }
2181}
2182
5a22db89 2183int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
b411b363
PR
2184{
2185 struct p_rs_uuid p;
5a22db89
LE
2186 u64 uuid;
2187
2188 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
b411b363 2189
5ba3dac5
PR
2190 uuid = mdev->ldev->md.uuid[UI_BITMAP];
2191 if (uuid && uuid != UUID_JUST_CREATED)
2192 uuid = uuid + UUID_NEW_BM_OFFSET;
2193 else
2194 get_random_bytes(&uuid, sizeof(u64));
5a22db89 2195 drbd_uuid_set(mdev, UI_BITMAP, uuid);
62b0da3a 2196 drbd_print_uuids(mdev, "updated sync UUID");
5a22db89
LE
2197 drbd_md_sync(mdev);
2198 p.uuid = cpu_to_be64(uuid);
b411b363
PR
2199
2200 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
0b70a13d 2201 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2202}
2203
e89b591c 2204int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
b411b363
PR
2205{
2206 struct p_sizes p;
2207 sector_t d_size, u_size;
db141b2f
LE
2208 int q_order_type;
2209 unsigned int max_bio_size;
b411b363
PR
2210 int ok;
2211
2212 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2213 D_ASSERT(mdev->ldev->backing_bdev);
2214 d_size = drbd_get_max_capacity(mdev->ldev);
2215 u_size = mdev->ldev->dc.disk_size;
2216 q_order_type = drbd_queue_order_type(mdev);
99432fcc 2217 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
db141b2f 2218 max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
b411b363
PR
2219 put_ldev(mdev);
2220 } else {
2221 d_size = 0;
2222 u_size = 0;
2223 q_order_type = QUEUE_ORDERED_NONE;
99432fcc 2224 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
b411b363
PR
2225 }
2226
6809384c
PR
2227 /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
2228 if (mdev->agreed_pro_version <= 94)
db141b2f 2229 max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
6809384c 2230
b411b363
PR
2231 p.d_size = cpu_to_be64(d_size);
2232 p.u_size = cpu_to_be64(u_size);
2233 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
99432fcc 2234 p.max_bio_size = cpu_to_be32(max_bio_size);
e89b591c
PR
2235 p.queue_order_type = cpu_to_be16(q_order_type);
2236 p.dds_flags = cpu_to_be16(flags);
b411b363
PR
2237
2238 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
0b70a13d 2239 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2240 return ok;
2241}
2242
2243/**
f479ea06 2244 * drbd_send_current_state() - Sends the drbd state to the peer
b411b363
PR
2245 * @mdev: DRBD device.
2246 */
f479ea06 2247int drbd_send_current_state(struct drbd_conf *mdev)
b411b363
PR
2248{
2249 struct socket *sock;
2250 struct p_state p;
2251 int ok = 0;
2252
 2253	/* Grab state lock so we won't send state if we're in the middle
2254 * of a cluster wide state change on another thread */
2255 drbd_state_lock(mdev);
2256
2257 mutex_lock(&mdev->data.mutex);
2258
2259 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2260 sock = mdev->data.socket;
2261
2262 if (likely(sock != NULL)) {
2263 ok = _drbd_send_cmd(mdev, sock, P_STATE,
0b70a13d 2264 (struct p_header80 *)&p, sizeof(p), 0);
b411b363
PR
2265 }
2266
2267 mutex_unlock(&mdev->data.mutex);
2268
2269 drbd_state_unlock(mdev);
2270 return ok;
2271}
2272
f479ea06
LE
2273/**
2274 * drbd_send_state() - After a state change, sends the new state to the peer
2275 * @mdev: DRBD device.
2276 * @state: the state to send, not necessarily the current state.
2277 *
2278 * Each state change queues an "after_state_ch" work, which will eventually
2279 * send the resulting new state to the peer. If more state changes happen
2280 * between queuing and processing of the after_state_ch work, we still
2281 * want to send each intermediary state in the order it occurred.
2282 */
2283int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
2284{
2285 struct socket *sock;
2286 struct p_state p;
2287 int ok = 0;
2288
2289 mutex_lock(&mdev->data.mutex);
2290
2291 p.state = cpu_to_be32(state.i);
2292 sock = mdev->data.socket;
2293
2294 if (likely(sock != NULL)) {
2295 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2296 (struct p_header80 *)&p, sizeof(p), 0);
2297 }
2298
2299 mutex_unlock(&mdev->data.mutex);
2300
2301 return ok;
2302}
2303
b411b363
PR
2304int drbd_send_state_req(struct drbd_conf *mdev,
2305 union drbd_state mask, union drbd_state val)
2306{
2307 struct p_req_state p;
2308
2309 p.mask = cpu_to_be32(mask.i);
2310 p.val = cpu_to_be32(val.i);
2311
2312 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
0b70a13d 2313 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2314}
2315
bf885f8a 2316int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
b411b363
PR
2317{
2318 struct p_req_state_reply p;
2319
2320 p.retcode = cpu_to_be32(retcode);
2321
2322 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
0b70a13d 2323 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2324}
2325
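/* Try to RLE+VLI compress (part of) the bitmap into p->code.
 * Returns the number of code bytes written on success, 0 when plain text
 * should be used instead (RLE disabled, peer too old, nothing left to send,
 * an encoding problem, or the compressed form would not be smaller), and -1
 * if an unexpected zero run length is found, i.e. the bitmap was modified
 * while we were scanning it. */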
2326int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2327 struct p_compressed_bm *p,
2328 struct bm_xfer_ctx *c)
2329{
2330 struct bitstream bs;
2331 unsigned long plain_bits;
2332 unsigned long tmp;
2333 unsigned long rl;
2334 unsigned len;
2335 unsigned toggle;
2336 int bits;
2337
2338 /* may we use this feature? */
2339 if ((mdev->sync_conf.use_rle == 0) ||
2340 (mdev->agreed_pro_version < 90))
2341 return 0;
2342
2343 if (c->bit_offset >= c->bm_bits)
2344 return 0; /* nothing to do. */
2345
2346 /* use at most thus many bytes */
2347 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2348 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2349 /* plain bits covered in this code string */
2350 plain_bits = 0;
2351
2352 /* p->encoding & 0x80 stores whether the first run length is set.
2353 * bit offset is implicit.
2354 * start with toggle == 2 to be able to tell the first iteration */
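	/* For illustration: a chunk beginning 0,0,0,1,1,0,... is encoded as
	 * start bit 0 and run lengths 3, 2, ...; a chunk beginning 1,1,0,...
	 * is encoded as start bit 1 and run lengths 2, ... (the zero-length
	 * run of clear bits in front is skipped). */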
2355 toggle = 2;
2356
 2357	/* see how many plain bits we can stuff into one packet
2358 * using RLE and VLI. */
2359 do {
2360 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2361 : _drbd_bm_find_next(mdev, c->bit_offset);
2362 if (tmp == -1UL)
2363 tmp = c->bm_bits;
2364 rl = tmp - c->bit_offset;
2365
2366 if (toggle == 2) { /* first iteration */
2367 if (rl == 0) {
2368 /* the first checked bit was set,
2369 * store start value, */
2370 DCBP_set_start(p, 1);
2371 /* but skip encoding of zero run length */
2372 toggle = !toggle;
2373 continue;
2374 }
2375 DCBP_set_start(p, 0);
2376 }
2377
2378 /* paranoia: catch zero runlength.
2379 * can only happen if bitmap is modified while we scan it. */
2380 if (rl == 0) {
2381 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2382 "t:%u bo:%lu\n", toggle, c->bit_offset);
2383 return -1;
2384 }
2385
2386 bits = vli_encode_bits(&bs, rl);
2387 if (bits == -ENOBUFS) /* buffer full */
2388 break;
2389 if (bits <= 0) {
2390 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2391 return 0;
2392 }
2393
2394 toggle = !toggle;
2395 plain_bits += rl;
2396 c->bit_offset = tmp;
2397 } while (c->bit_offset < c->bm_bits);
2398
2399 len = bs.cur.b - p->code + !!bs.cur.bit;
2400
2401 if (plain_bits < (len << 3)) {
2402 /* incompressible with this method.
2403 * we need to rewind both word and bit position. */
2404 c->bit_offset -= plain_bits;
2405 bm_xfer_ctx_bit_to_word_offset(c);
2406 c->bit_offset = c->word_offset * BITS_PER_LONG;
2407 return 0;
2408 }
2409
2410 /* RLE + VLI was able to compress it just fine.
2411 * update c->word_offset. */
2412 bm_xfer_ctx_bit_to_word_offset(c);
2413
2414 /* store pad_bits */
2415 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2416
2417 return len;
2418}
2419
f70af118
AG
2420/**
2421 * send_bitmap_rle_or_plain
2422 *
2423 * Return 0 when done, 1 when another iteration is needed, and a negative error
2424 * code upon failure.
2425 */
2426static int
b411b363 2427send_bitmap_rle_or_plain(struct drbd_conf *mdev,
f70af118 2428 struct p_header80 *h, struct bm_xfer_ctx *c)
b411b363
PR
2429{
2430 struct p_compressed_bm *p = (void*)h;
2431 unsigned long num_words;
2432 int len;
2433 int ok;
2434
2435 len = fill_bitmap_rle_bits(mdev, p, c);
2436
2437 if (len < 0)
f70af118 2438 return -EIO;
b411b363
PR
2439
2440 if (len) {
2441 DCBP_set_code(p, RLE_VLI_Bits);
2442 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2443 sizeof(*p) + len, 0);
2444
2445 c->packets[0]++;
2446 c->bytes[0] += sizeof(*p) + len;
2447
2448 if (c->bit_offset >= c->bm_bits)
2449 len = 0; /* DONE */
2450 } else {
2451 /* was not compressible.
2452 * send a buffer full of plain text bits instead. */
2453 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2454 len = num_words * sizeof(long);
2455 if (len)
2456 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2457 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
0b70a13d 2458 h, sizeof(struct p_header80) + len, 0);
b411b363
PR
2459 c->word_offset += num_words;
2460 c->bit_offset = c->word_offset * BITS_PER_LONG;
2461
2462 c->packets[1]++;
0b70a13d 2463 c->bytes[1] += sizeof(struct p_header80) + len;
b411b363
PR
2464
2465 if (c->bit_offset > c->bm_bits)
2466 c->bit_offset = c->bm_bits;
2467 }
f70af118
AG
2468 if (ok) {
2469 if (len == 0) {
2470 INFO_bm_xfer_stats(mdev, "send", c);
2471 return 0;
2472 } else
2473 return 1;
2474 }
2475 return -EIO;
b411b363
PR
2476}
2477
2478/* See the comment at receive_bitmap() */
2479int _drbd_send_bitmap(struct drbd_conf *mdev)
2480{
2481 struct bm_xfer_ctx c;
0b70a13d 2482 struct p_header80 *p;
f70af118 2483 int err;
b411b363 2484
81e84650 2485 ERR_IF(!mdev->bitmap) return false;
b411b363
PR
2486
2487 /* maybe we should use some per thread scratch page,
2488 * and allocate that during initial device creation? */
0b70a13d 2489 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
b411b363
PR
2490 if (!p) {
2491 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
81e84650 2492 return false;
b411b363
PR
2493 }
2494
2495 if (get_ldev(mdev)) {
2496 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2497 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2498 drbd_bm_set_all(mdev);
2499 if (drbd_bm_write(mdev)) {
2500 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2501 * but otherwise process as per normal - need to tell other
2502 * side that a full resync is required! */
2503 dev_err(DEV, "Failed to write bitmap to disk!\n");
2504 } else {
2505 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2506 drbd_md_sync(mdev);
2507 }
2508 }
2509 put_ldev(mdev);
2510 }
2511
2512 c = (struct bm_xfer_ctx) {
2513 .bm_bits = drbd_bm_bits(mdev),
2514 .bm_words = drbd_bm_words(mdev),
2515 };
2516
2517 do {
f70af118
AG
2518 err = send_bitmap_rle_or_plain(mdev, p, &c);
2519 } while (err > 0);
b411b363
PR
2520
2521 free_page((unsigned long) p);
f70af118 2522 return err == 0;
b411b363
PR
2523}
2524
2525int drbd_send_bitmap(struct drbd_conf *mdev)
2526{
2527 int err;
2528
2529 if (!drbd_get_data_sock(mdev))
2530 return -1;
2531 err = !_drbd_send_bitmap(mdev);
2532 drbd_put_data_sock(mdev);
2533 return err;
2534}
2535
2536int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2537{
2538 int ok;
2539 struct p_barrier_ack p;
2540
2541 p.barrier = barrier_nr;
2542 p.set_size = cpu_to_be32(set_size);
2543
2544 if (mdev->state.conn < C_CONNECTED)
81e84650 2545 return false;
b411b363 2546 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
0b70a13d 2547 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2548 return ok;
2549}
2550
2551/**
2552 * _drbd_send_ack() - Sends an ack packet
2553 * @mdev: DRBD device.
2554 * @cmd: Packet command code.
2555 * @sector: sector, needs to be in big endian byte order
2556 * @blksize: size in byte, needs to be in big endian byte order
2557 * @block_id: Id, big endian byte order
2558 */
2559static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2560 u64 sector,
2561 u32 blksize,
2562 u64 block_id)
2563{
2564 int ok;
2565 struct p_block_ack p;
2566
2567 p.sector = sector;
2568 p.block_id = block_id;
2569 p.blksize = blksize;
2570 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2571
2572 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
81e84650 2573 return false;
b411b363 2574 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
0b70a13d 2575 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2576 return ok;
2577}
2578
2b2bf214
LE
2579/* dp->sector and dp->block_id already/still in network byte order,
2580 * data_size is payload size according to dp->head,
2581 * and may need to be corrected for digest size. */
b411b363 2582int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2b2bf214 2583 struct p_data *dp, int data_size)
b411b363 2584{
2b2bf214
LE
2585 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2586 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
b411b363
PR
2587 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2588 dp->block_id);
2589}
2590
2591int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2592 struct p_block_req *rp)
2593{
2594 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2595}
2596
2597/**
2598 * drbd_send_ack() - Sends an ack packet
2599 * @mdev: DRBD device.
2600 * @cmd: Packet command code.
2601 * @e: Epoch entry.
2602 */
2603int drbd_send_ack(struct drbd_conf *mdev,
2604 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2605{
2606 return _drbd_send_ack(mdev, cmd,
2607 cpu_to_be64(e->sector),
2608 cpu_to_be32(e->size),
2609 e->block_id);
2610}
2611
2612/* This function misuses the block_id field to signal if the blocks
 2613	 * are in sync or not. */
2614int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2615 sector_t sector, int blksize, u64 block_id)
2616{
2617 return _drbd_send_ack(mdev, cmd,
2618 cpu_to_be64(sector),
2619 cpu_to_be32(blksize),
2620 cpu_to_be64(block_id));
2621}
2622
2623int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2624 sector_t sector, int size, u64 block_id)
2625{
2626 int ok;
2627 struct p_block_req p;
2628
2629 p.sector = cpu_to_be64(sector);
2630 p.block_id = block_id;
2631 p.blksize = cpu_to_be32(size);
2632
2633 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
0b70a13d 2634 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2635 return ok;
2636}
2637
2638int drbd_send_drequest_csum(struct drbd_conf *mdev,
2639 sector_t sector, int size,
2640 void *digest, int digest_size,
2641 enum drbd_packets cmd)
2642{
2643 int ok;
2644 struct p_block_req p;
2645
2646 p.sector = cpu_to_be64(sector);
2647 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2648 p.blksize = cpu_to_be32(size);
2649
2650 p.head.magic = BE_DRBD_MAGIC;
2651 p.head.command = cpu_to_be16(cmd);
0b70a13d 2652 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
b411b363
PR
2653
2654 mutex_lock(&mdev->data.mutex);
2655
2656 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2657 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2658
2659 mutex_unlock(&mdev->data.mutex);
2660
2661 return ok;
2662}
2663
2664int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2665{
2666 int ok;
2667 struct p_block_req p;
2668
2669 p.sector = cpu_to_be64(sector);
2670 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2671 p.blksize = cpu_to_be32(size);
2672
2673 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
0b70a13d 2674 (struct p_header80 *)&p, sizeof(p));
b411b363
PR
2675 return ok;
2676}
2677
2678/* called on sndtimeo
81e84650
AG
2679 * returns false if we should retry,
2680 * true if we think connection is dead
b411b363
PR
2681 */
2682static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2683{
2684 int drop_it;
2685 /* long elapsed = (long)(jiffies - mdev->last_received); */
2686
2687 drop_it = mdev->meta.socket == sock
2688 || !mdev->asender.task
2689 || get_t_state(&mdev->asender) != Running
2690 || mdev->state.conn < C_CONNECTED;
2691
2692 if (drop_it)
81e84650 2693 return true;
b411b363
PR
2694
2695 drop_it = !--mdev->ko_count;
2696 if (!drop_it) {
2697 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2698 current->comm, current->pid, mdev->ko_count);
2699 request_ping(mdev);
2700 }
2701
2702 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2703}
2704
2705/* The idea of sendpage seems to be to put some kind of reference
2706 * to the page into the skb, and to hand it over to the NIC. In
2707 * this process get_page() gets called.
2708 *
2709 * As soon as the page was really sent over the network put_page()
2710 * gets called by some part of the network layer. [ NIC driver? ]
2711 *
2712 * [ get_page() / put_page() increment/decrement the count. If count
2713 * reaches 0 the page will be freed. ]
2714 *
2715 * This works nicely with pages from FSs.
2716 * But this means that in protocol A we might signal IO completion too early!
2717 *
2718 * In order not to corrupt data during a resync we must make sure
 2719	 * that we do not reuse our own buffer pages (EEs) too early, therefore
2720 * we have the net_ee list.
2721 *
2722 * XFS seems to have problems, still, it submits pages with page_count == 0!
2723 * As a workaround, we disable sendpage on pages
2724 * with page_count == 0 or PageSlab.
2725 */
2726static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 2727 int offset, size_t size, unsigned msg_flags)
b411b363 2728{
ba11ad9a 2729 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
b411b363
PR
2730 kunmap(page);
2731 if (sent == size)
2732 mdev->send_cnt += size>>9;
2733 return sent == size;
2734}
2735
2736static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 2737 int offset, size_t size, unsigned msg_flags)
b411b363
PR
2738{
2739 mm_segment_t oldfs = get_fs();
2740 int sent, ok;
2741 int len = size;
2742
2743 /* e.g. XFS meta- & log-data is in slab pages, which have a
2744 * page_count of 0 and/or have PageSlab() set.
2745 * we cannot use send_page for those, as that does get_page();
2746 * put_page(); and would cause either a VM_BUG directly, or
2747 * __page_cache_release a page that would actually still be referenced
2748 * by someone, leading to some obscure delayed Oops somewhere else. */
2749 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
ba11ad9a 2750 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
b411b363 2751
ba11ad9a 2752 msg_flags |= MSG_NOSIGNAL;
b411b363
PR
2753 drbd_update_congested(mdev);
2754 set_fs(KERNEL_DS);
2755 do {
2756 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2757 offset, len,
ba11ad9a 2758 msg_flags);
b411b363
PR
2759 if (sent == -EAGAIN) {
2760 if (we_should_drop_the_connection(mdev,
2761 mdev->data.socket))
2762 break;
2763 else
2764 continue;
2765 }
2766 if (sent <= 0) {
2767 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2768 __func__, (int)size, len, sent);
2769 break;
2770 }
2771 len -= sent;
2772 offset += sent;
2773 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2774 set_fs(oldfs);
2775 clear_bit(NET_CONGESTED, &mdev->flags);
2776
2777 ok = (len == 0);
2778 if (likely(ok))
2779 mdev->send_cnt += size>>9;
2780 return ok;
2781}
2782
2783static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2784{
2785 struct bio_vec *bvec;
2786 int i;
ba11ad9a 2787 /* hint all but last page with MSG_MORE */
001a8868 2788 bio_for_each_segment(bvec, bio, i) {
b411b363 2789 if (!_drbd_no_send_page(mdev, bvec->bv_page,
ba11ad9a
LE
2790 bvec->bv_offset, bvec->bv_len,
2791 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
b411b363
PR
2792 return 0;
2793 }
2794 return 1;
2795}
2796
2797static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2798{
2799 struct bio_vec *bvec;
2800 int i;
ba11ad9a 2801 /* hint all but last page with MSG_MORE */
001a8868 2802 bio_for_each_segment(bvec, bio, i) {
b411b363 2803 if (!_drbd_send_page(mdev, bvec->bv_page,
ba11ad9a
LE
2804 bvec->bv_offset, bvec->bv_len,
2805 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
b411b363
PR
2806 return 0;
2807 }
b411b363
PR
2808 return 1;
2809}
2810
45bb912b
LE
2811static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2812{
2813 struct page *page = e->pages;
2814 unsigned len = e->size;
ba11ad9a 2815 /* hint all but last page with MSG_MORE */
45bb912b
LE
2816 page_chain_for_each(page) {
2817 unsigned l = min_t(unsigned, len, PAGE_SIZE);
ba11ad9a
LE
2818 if (!_drbd_send_page(mdev, page, 0, l,
2819 page_chain_next(page) ? MSG_MORE : 0))
45bb912b
LE
2820 return 0;
2821 len -= l;
2822 }
2823 return 1;
2824}
2825
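/* Translate the master bio's rw flags into the DP_* flags we put on the
 * wire. Peers speaking protocol version 95 or later understand DP_FUA,
 * DP_FLUSH and DP_DISCARD in addition to DP_RW_SYNC; for older peers only
 * DP_RW_SYNC is sent. */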
76d2e7ec
PR
2826static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2827{
2828 if (mdev->agreed_pro_version >= 95)
2829 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
76d2e7ec
PR
2830 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2831 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2832 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2833 else
721a9602 2834 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
76d2e7ec
PR
2835}
2836
b411b363
PR
2837/* Used to send write requests
2838 * R_PRIMARY -> Peer (P_DATA)
2839 */
2840int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2841{
2842 int ok = 1;
2843 struct p_data p;
2844 unsigned int dp_flags = 0;
2845 void *dgb;
2846 int dgs;
2847
2848 if (!drbd_get_data_sock(mdev))
2849 return 0;
2850
2851 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2852 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2853
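	/* Pick the header variant: the old h80 header has only a 16-bit
	 * length field, so requests larger than DRBD_MAX_SIZE_H80_PACKET
	 * need the h95 header with its 32-bit length. */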
d5373389 2854 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
0b70a13d
PR
2855 p.head.h80.magic = BE_DRBD_MAGIC;
2856 p.head.h80.command = cpu_to_be16(P_DATA);
2857 p.head.h80.length =
2858 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2859 } else {
2860 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2861 p.head.h95.command = cpu_to_be16(P_DATA);
2862 p.head.h95.length =
2863 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2864 }
b411b363
PR
2865
2866 p.sector = cpu_to_be64(req->sector);
2867 p.block_id = (unsigned long)req;
671a74e7 2868 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
b411b363 2869
76d2e7ec
PR
2870 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2871
b411b363
PR
2872 if (mdev->state.conn >= C_SYNC_SOURCE &&
2873 mdev->state.conn <= C_PAUSED_SYNC_T)
2874 dp_flags |= DP_MAY_SET_IN_SYNC;
2875
2876 p.dp_flags = cpu_to_be32(dp_flags);
b411b363
PR
2877 set_bit(UNPLUG_REMOTE, &mdev->flags);
2878 ok = (sizeof(p) ==
ba11ad9a 2879 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
b411b363
PR
2880 if (ok && dgs) {
2881 dgb = mdev->int_dig_out;
45bb912b 2882 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
cab2f74b 2883 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
b411b363
PR
2884 }
2885 if (ok) {
470be44a
LE
2886 /* For protocol A, we have to memcpy the payload into
2887 * socket buffers, as we may complete right away
2888 * as soon as we handed it over to tcp, at which point the data
2889 * pages may become invalid.
2890 *
2891 * For data-integrity enabled, we copy it as well, so we can be
2892 * sure that even if the bio pages may still be modified, it
2893 * won't change the data on the wire, thus if the digest checks
2894 * out ok after sending on this side, but does not fit on the
2895 * receiving side, we sure have detected corruption elsewhere.
2896 */
2897 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
b411b363
PR
2898 ok = _drbd_send_bio(mdev, req->master_bio);
2899 else
2900 ok = _drbd_send_zc_bio(mdev, req->master_bio);
470be44a
LE
2901
2902 /* double check digest, sometimes buffers have been modified in flight. */
2903 if (dgs > 0 && dgs <= 64) {
24c4830c 2904 /* 64 byte, 512 bit, is the largest digest size
470be44a
LE
2905 * currently supported in kernel crypto. */
2906 unsigned char digest[64];
2907 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2908 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2909 dev_warn(DEV,
2910 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2911 (unsigned long long)req->sector, req->size);
2912 }
2913 } /* else if (dgs > 64) {
2914 ... Be noisy about digest too large ...
2915 } */
b411b363
PR
2916 }
2917
2918 drbd_put_data_sock(mdev);
bd26bfc5 2919
b411b363
PR
2920 return ok;
2921}
2922
2923/* answer packet, used to send data back for read requests:
2924 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2925 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2926 */
2927int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2928 struct drbd_epoch_entry *e)
2929{
2930 int ok;
2931 struct p_data p;
2932 void *dgb;
2933 int dgs;
2934
2935 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2936 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2937
d5373389 2938 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
0b70a13d
PR
2939 p.head.h80.magic = BE_DRBD_MAGIC;
2940 p.head.h80.command = cpu_to_be16(cmd);
2941 p.head.h80.length =
2942 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2943 } else {
2944 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2945 p.head.h95.command = cpu_to_be16(cmd);
2946 p.head.h95.length =
2947 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2948 }
b411b363
PR
2949
2950 p.sector = cpu_to_be64(e->sector);
2951 p.block_id = e->block_id;
2952 /* p.seq_num = 0; No sequence numbers here.. */
2953
2954 /* Only called by our kernel thread.
2955 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2956 * in response to admin command or module unload.
2957 */
2958 if (!drbd_get_data_sock(mdev))
2959 return 0;
2960
0b70a13d 2961 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
b411b363
PR
2962 if (ok && dgs) {
2963 dgb = mdev->int_dig_out;
45bb912b 2964 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
cab2f74b 2965 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
b411b363
PR
2966 }
2967 if (ok)
45bb912b 2968 ok = _drbd_send_zc_ee(mdev, e);
b411b363
PR
2969
2970 drbd_put_data_sock(mdev);
bd26bfc5 2971
b411b363
PR
2972 return ok;
2973}
2974
73a01a18
PR
2975int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2976{
2977 struct p_block_desc p;
2978
2979 p.sector = cpu_to_be64(req->sector);
2980 p.blksize = cpu_to_be32(req->size);
2981
2982 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2983}
2984
b411b363
PR
2985/*
2986 drbd_send distinguishes two cases:
2987
2988 Packets sent via the data socket "sock"
2989 and packets sent via the meta data socket "msock"
2990
2991 sock msock
2992 -----------------+-------------------------+------------------------------
2993 timeout conf.timeout / 2 conf.timeout / 2
2994 timeout action send a ping via msock Abort communication
2995 and close all sockets
2996*/
2997
2998/*
2999 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
3000 */
3001int drbd_send(struct drbd_conf *mdev, struct socket *sock,
3002 void *buf, size_t size, unsigned msg_flags)
3003{
3004 struct kvec iov;
3005 struct msghdr msg;
3006 int rv, sent = 0;
3007
3008 if (!sock)
3009 return -1000;
3010
3011 /* THINK if (signal_pending) return ... ? */
3012
3013 iov.iov_base = buf;
3014 iov.iov_len = size;
3015
3016 msg.msg_name = NULL;
3017 msg.msg_namelen = 0;
3018 msg.msg_control = NULL;
3019 msg.msg_controllen = 0;
3020 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
3021
3022 if (sock == mdev->data.socket) {
3023 mdev->ko_count = mdev->net_conf->ko_count;
3024 drbd_update_congested(mdev);
3025 }
3026 do {
3027 /* STRANGE
3028 * tcp_sendmsg does _not_ use its size parameter at all ?
3029 *
3030 * -EAGAIN on timeout, -EINTR on signal.
3031 */
3032/* THINK
3033 * do we need to block DRBD_SIG if sock == &meta.socket ??
3034 * otherwise wake_asender() might interrupt some send_*Ack !
3035 */
3036 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
3037 if (rv == -EAGAIN) {
3038 if (we_should_drop_the_connection(mdev, sock))
3039 break;
3040 else
3041 continue;
3042 }
3043 D_ASSERT(rv != 0);
3044 if (rv == -EINTR) {
3045 flush_signals(current);
3046 rv = 0;
3047 }
3048 if (rv < 0)
3049 break;
3050 sent += rv;
3051 iov.iov_base += rv;
3052 iov.iov_len -= rv;
3053 } while (sent < size);
3054
3055 if (sock == mdev->data.socket)
3056 clear_bit(NET_CONGESTED, &mdev->flags);
3057
3058 if (rv <= 0) {
3059 if (rv != -EAGAIN) {
3060 dev_err(DEV, "%s_sendmsg returned %d\n",
3061 sock == mdev->meta.socket ? "msock" : "sock",
3062 rv);
3063 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
3064 } else
3065 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
3066 }
3067
3068 return sent;
3069}
3070
3071static int drbd_open(struct block_device *bdev, fmode_t mode)
3072{
3073 struct drbd_conf *mdev = bdev->bd_disk->private_data;
3074 unsigned long flags;
3075 int rv = 0;
3076
2a48fc0a 3077 mutex_lock(&drbd_main_mutex);
b411b363
PR
3078 spin_lock_irqsave(&mdev->req_lock, flags);
3079 /* to have a stable mdev->state.role
3080 * and no race with updating open_cnt */
3081
3082 if (mdev->state.role != R_PRIMARY) {
3083 if (mode & FMODE_WRITE)
3084 rv = -EROFS;
3085 else if (!allow_oos)
3086 rv = -EMEDIUMTYPE;
3087 }
3088
3089 if (!rv)
3090 mdev->open_cnt++;
3091 spin_unlock_irqrestore(&mdev->req_lock, flags);
2a48fc0a 3092 mutex_unlock(&drbd_main_mutex);
b411b363
PR
3093
3094 return rv;
3095}
3096
3097static int drbd_release(struct gendisk *gd, fmode_t mode)
3098{
3099 struct drbd_conf *mdev = gd->private_data;
2a48fc0a 3100 mutex_lock(&drbd_main_mutex);
b411b363 3101 mdev->open_cnt--;
2a48fc0a 3102 mutex_unlock(&drbd_main_mutex);
b411b363
PR
3103 return 0;
3104}
3105
b411b363
PR
3106static void drbd_set_defaults(struct drbd_conf *mdev)
3107{
85f4cc17
PR
3108 /* This way we get a compile error when sync_conf grows,
 3109	   and we forget to initialize it here */
3110 mdev->sync_conf = (struct syncer_conf) {
3111 /* .rate = */ DRBD_RATE_DEF,
3112 /* .after = */ DRBD_AFTER_DEF,
3113 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
85f4cc17
PR
3114 /* .verify_alg = */ {}, 0,
3115 /* .cpu_mask = */ {}, 0,
3116 /* .csums_alg = */ {}, 0,
e756414f 3117 /* .use_rle = */ 0,
9a31d716
PR
3118 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
3119 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
3120 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
3121 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
0f0601f4
LE
3122 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
3123 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
85f4cc17
PR
3124 };
3125
 3126	/* Have to do it this way, because the layout differs between
3127 big endian and little endian */
b411b363
PR
3128 mdev->state = (union drbd_state) {
3129 { .role = R_SECONDARY,
3130 .peer = R_UNKNOWN,
3131 .conn = C_STANDALONE,
3132 .disk = D_DISKLESS,
3133 .pdsk = D_UNKNOWN,
fb22c402
PR
3134 .susp = 0,
3135 .susp_nod = 0,
3136 .susp_fen = 0
b411b363
PR
3137 } };
3138}
3139
3140void drbd_init_set_defaults(struct drbd_conf *mdev)
3141{
3142 /* the memset(,0,) did most of this.
3143 * note: only assignments, no allocation in here */
3144
3145 drbd_set_defaults(mdev);
3146
b411b363
PR
3147 atomic_set(&mdev->ap_bio_cnt, 0);
3148 atomic_set(&mdev->ap_pending_cnt, 0);
3149 atomic_set(&mdev->rs_pending_cnt, 0);
3150 atomic_set(&mdev->unacked_cnt, 0);
3151 atomic_set(&mdev->local_cnt, 0);
3152 atomic_set(&mdev->net_cnt, 0);
3153 atomic_set(&mdev->packet_seq, 0);
3154 atomic_set(&mdev->pp_in_use, 0);
435f0740 3155 atomic_set(&mdev->pp_in_use_by_net, 0);
778f271d 3156 atomic_set(&mdev->rs_sect_in, 0);
0f0601f4 3157 atomic_set(&mdev->rs_sect_ev, 0);
759fbdfb 3158 atomic_set(&mdev->ap_in_flight, 0);
e1711731 3159 atomic_set(&mdev->md_io_in_use, 0);
b411b363 3160
b411b363
PR
3161 mutex_init(&mdev->data.mutex);
3162 mutex_init(&mdev->meta.mutex);
3163 sema_init(&mdev->data.work.s, 0);
3164 sema_init(&mdev->meta.work.s, 0);
3165 mutex_init(&mdev->state_mutex);
3166
3167 spin_lock_init(&mdev->data.work.q_lock);
3168 spin_lock_init(&mdev->meta.work.q_lock);
3169
3170 spin_lock_init(&mdev->al_lock);
3171 spin_lock_init(&mdev->req_lock);
3172 spin_lock_init(&mdev->peer_seq_lock);
3173 spin_lock_init(&mdev->epoch_lock);
3174
3175 INIT_LIST_HEAD(&mdev->active_ee);
3176 INIT_LIST_HEAD(&mdev->sync_ee);
3177 INIT_LIST_HEAD(&mdev->done_ee);
3178 INIT_LIST_HEAD(&mdev->read_ee);
3179 INIT_LIST_HEAD(&mdev->net_ee);
3180 INIT_LIST_HEAD(&mdev->resync_reads);
3181 INIT_LIST_HEAD(&mdev->data.work.q);
3182 INIT_LIST_HEAD(&mdev->meta.work.q);
3183 INIT_LIST_HEAD(&mdev->resync_work.list);
3184 INIT_LIST_HEAD(&mdev->unplug_work.list);
e9e6f3ec 3185 INIT_LIST_HEAD(&mdev->go_diskless.list);
b411b363 3186 INIT_LIST_HEAD(&mdev->md_sync_work.list);
c4752ef1 3187 INIT_LIST_HEAD(&mdev->start_resync_work.list);
b411b363 3188 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
0ced55a3 3189
794abb75 3190 mdev->resync_work.cb = w_resync_timer;
b411b363 3191 mdev->unplug_work.cb = w_send_write_hint;
e9e6f3ec 3192 mdev->go_diskless.cb = w_go_diskless;
b411b363
PR
3193 mdev->md_sync_work.cb = w_md_sync;
3194 mdev->bm_io_work.w.cb = w_bitmap_io;
370a43e7 3195 mdev->start_resync_work.cb = w_start_resync;
b411b363
PR
3196 init_timer(&mdev->resync_timer);
3197 init_timer(&mdev->md_sync_timer);
370a43e7 3198 init_timer(&mdev->start_resync_timer);
7fde2be9 3199 init_timer(&mdev->request_timer);
b411b363
PR
3200 mdev->resync_timer.function = resync_timer_fn;
3201 mdev->resync_timer.data = (unsigned long) mdev;
3202 mdev->md_sync_timer.function = md_sync_timer_fn;
3203 mdev->md_sync_timer.data = (unsigned long) mdev;
370a43e7
PR
3204 mdev->start_resync_timer.function = start_resync_timer_fn;
3205 mdev->start_resync_timer.data = (unsigned long) mdev;
7fde2be9
PR
3206 mdev->request_timer.function = request_timer_fn;
3207 mdev->request_timer.data = (unsigned long) mdev;
b411b363
PR
3208
3209 init_waitqueue_head(&mdev->misc_wait);
3210 init_waitqueue_head(&mdev->state_wait);
84dfb9f5 3211 init_waitqueue_head(&mdev->net_cnt_wait);
b411b363
PR
3212 init_waitqueue_head(&mdev->ee_wait);
3213 init_waitqueue_head(&mdev->al_wait);
3214 init_waitqueue_head(&mdev->seq_wait);
3215
3216 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3217 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3218 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3219
3220 mdev->agreed_pro_version = PRO_VERSION_MAX;
2451fc3b 3221 mdev->write_ordering = WO_bdev_flush;
b411b363 3222 mdev->resync_wenr = LC_FREE;
99432fcc
PR
3223 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3224 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
b411b363
PR
3225}
3226
3227void drbd_mdev_cleanup(struct drbd_conf *mdev)
3228{
1d7734a0 3229 int i;
b411b363
PR
3230 if (mdev->receiver.t_state != None)
3231 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3232 mdev->receiver.t_state);
3233
3234 /* no need to lock it, I'm the only thread alive */
3235 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3236 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3237 mdev->al_writ_cnt =
3238 mdev->bm_writ_cnt =
3239 mdev->read_cnt =
3240 mdev->recv_cnt =
3241 mdev->send_cnt =
3242 mdev->writ_cnt =
3243 mdev->p_size =
3244 mdev->rs_start =
3245 mdev->rs_total =
1d7734a0
LE
3246 mdev->rs_failed = 0;
3247 mdev->rs_last_events = 0;
0f0601f4 3248 mdev->rs_last_sect_ev = 0;
1d7734a0
LE
3249 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3250 mdev->rs_mark_left[i] = 0;
3251 mdev->rs_mark_time[i] = 0;
3252 }
b411b363
PR
3253 D_ASSERT(mdev->net_conf == NULL);
3254
3255 drbd_set_my_capacity(mdev, 0);
3256 if (mdev->bitmap) {
3257 /* maybe never allocated. */
02d9a94b 3258 drbd_bm_resize(mdev, 0, 1);
b411b363
PR
3259 drbd_bm_cleanup(mdev);
3260 }
3261
3262 drbd_free_resources(mdev);
0778286a 3263 clear_bit(AL_SUSPENDED, &mdev->flags);
b411b363
PR
3264
3265 /*
3266 * currently we drbd_init_ee only on module load, so
3267 * we may do drbd_release_ee only on module unload!
3268 */
3269 D_ASSERT(list_empty(&mdev->active_ee));
3270 D_ASSERT(list_empty(&mdev->sync_ee));
3271 D_ASSERT(list_empty(&mdev->done_ee));
3272 D_ASSERT(list_empty(&mdev->read_ee));
3273 D_ASSERT(list_empty(&mdev->net_ee));
3274 D_ASSERT(list_empty(&mdev->resync_reads));
3275 D_ASSERT(list_empty(&mdev->data.work.q));
3276 D_ASSERT(list_empty(&mdev->meta.work.q));
3277 D_ASSERT(list_empty(&mdev->resync_work.list));
3278 D_ASSERT(list_empty(&mdev->unplug_work.list));
e9e6f3ec 3279 D_ASSERT(list_empty(&mdev->go_diskless.list));
2265b473
LE
3280
3281 drbd_set_defaults(mdev);
b411b363
PR
3282}
3283
3284
3285static void drbd_destroy_mempools(void)
3286{
3287 struct page *page;
3288
3289 while (drbd_pp_pool) {
3290 page = drbd_pp_pool;
3291 drbd_pp_pool = (struct page *)page_private(page);
3292 __free_page(page);
3293 drbd_pp_vacant--;
3294 }
3295
3296 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3297
9476f39d
LE
3298 if (drbd_md_io_bio_set)
3299 bioset_free(drbd_md_io_bio_set);
4281808f
LE
3300 if (drbd_md_io_page_pool)
3301 mempool_destroy(drbd_md_io_page_pool);
b411b363
PR
3302 if (drbd_ee_mempool)
3303 mempool_destroy(drbd_ee_mempool);
3304 if (drbd_request_mempool)
3305 mempool_destroy(drbd_request_mempool);
3306 if (drbd_ee_cache)
3307 kmem_cache_destroy(drbd_ee_cache);
3308 if (drbd_request_cache)
3309 kmem_cache_destroy(drbd_request_cache);
3310 if (drbd_bm_ext_cache)
3311 kmem_cache_destroy(drbd_bm_ext_cache);
3312 if (drbd_al_ext_cache)
3313 kmem_cache_destroy(drbd_al_ext_cache);
3314
9476f39d 3315 drbd_md_io_bio_set = NULL;
4281808f 3316 drbd_md_io_page_pool = NULL;
b411b363
PR
3317 drbd_ee_mempool = NULL;
3318 drbd_request_mempool = NULL;
3319 drbd_ee_cache = NULL;
3320 drbd_request_cache = NULL;
3321 drbd_bm_ext_cache = NULL;
3322 drbd_al_ext_cache = NULL;
3323
3324 return;
3325}
3326
3327static int drbd_create_mempools(void)
3328{
3329 struct page *page;
1816a2b4 3330 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
b411b363
PR
3331 int i;
3332
3333 /* prepare our caches and mempools */
3334 drbd_request_mempool = NULL;
3335 drbd_ee_cache = NULL;
3336 drbd_request_cache = NULL;
3337 drbd_bm_ext_cache = NULL;
3338 drbd_al_ext_cache = NULL;
3339 drbd_pp_pool = NULL;
4281808f 3340 drbd_md_io_page_pool = NULL;
9476f39d 3341 drbd_md_io_bio_set = NULL;
b411b363
PR
3342
3343 /* caches */
3344 drbd_request_cache = kmem_cache_create(
3345 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3346 if (drbd_request_cache == NULL)
3347 goto Enomem;
3348
3349 drbd_ee_cache = kmem_cache_create(
3350 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3351 if (drbd_ee_cache == NULL)
3352 goto Enomem;
3353
3354 drbd_bm_ext_cache = kmem_cache_create(
3355 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3356 if (drbd_bm_ext_cache == NULL)
3357 goto Enomem;
3358
3359 drbd_al_ext_cache = kmem_cache_create(
3360 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3361 if (drbd_al_ext_cache == NULL)
3362 goto Enomem;
3363
3364 /* mempools */
9476f39d
LE
3365#ifdef COMPAT_HAVE_BIOSET_CREATE
3366 drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
3367 if (drbd_md_io_bio_set == NULL)
3368 goto Enomem;
3369#endif
3370
4281808f
LE
3371 drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
3372 if (drbd_md_io_page_pool == NULL)
3373 goto Enomem;
3374
b411b363
PR
3375 drbd_request_mempool = mempool_create(number,
3376 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3377 if (drbd_request_mempool == NULL)
3378 goto Enomem;
3379
3380 drbd_ee_mempool = mempool_create(number,
3381 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2027ae1f 3382 if (drbd_ee_mempool == NULL)
b411b363
PR
3383 goto Enomem;
3384
3385 /* drbd's page pool */
3386 spin_lock_init(&drbd_pp_lock);
3387
3388 for (i = 0; i < number; i++) {
3389 page = alloc_page(GFP_HIGHUSER);
3390 if (!page)
3391 goto Enomem;
3392 set_page_private(page, (unsigned long)drbd_pp_pool);
3393 drbd_pp_pool = page;
3394 }
3395 drbd_pp_vacant = number;
3396
3397 return 0;
3398
3399Enomem:
3400 drbd_destroy_mempools(); /* in case we allocated some */
3401 return -ENOMEM;
3402}
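/*
 * Illustrative sketch, not part of the driver: drbd_pp_pool above is a simple
 * LIFO of pages chained through page_private() and protected by drbd_pp_lock.
 * Popping one page could look roughly like this (the real allocator lives in
 * the receiver code and is more involved):
 *
 *	static struct page *pp_pop_example(void)
 *	{
 *		struct page *page;
 *
 *		spin_lock(&drbd_pp_lock);
 *		page = drbd_pp_pool;
 *		if (page) {
 *			drbd_pp_pool = (struct page *)page_private(page);
 *			set_page_private(page, 0);
 *			drbd_pp_vacant--;
 *		}
 *		spin_unlock(&drbd_pp_lock);
 *		return page;
 *	}
 */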
3403
3404static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3405 void *unused)
3406{
3407 /* just so we have it. you never know what interesting things we
3408 * might want to do here some day...
3409 */
3410
3411 return NOTIFY_DONE;
3412}
3413
3414static struct notifier_block drbd_notifier = {
3415 .notifier_call = drbd_notify_sys,
3416};
3417
3418static void drbd_release_ee_lists(struct drbd_conf *mdev)
3419{
3420 int rr;
3421
3422 rr = drbd_release_ee(mdev, &mdev->active_ee);
3423 if (rr)
3424 dev_err(DEV, "%d EEs in active list found!\n", rr);
3425
3426 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3427 if (rr)
3428 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3429
3430 rr = drbd_release_ee(mdev, &mdev->read_ee);
3431 if (rr)
3432 dev_err(DEV, "%d EEs in read list found!\n", rr);
3433
3434 rr = drbd_release_ee(mdev, &mdev->done_ee);
3435 if (rr)
3436 dev_err(DEV, "%d EEs in done list found!\n", rr);
3437
3438 rr = drbd_release_ee(mdev, &mdev->net_ee);
3439 if (rr)
3440 dev_err(DEV, "%d EEs in net list found!\n", rr);
3441}
3442
3443/* caution. no locking.
3444 * currently only used from module cleanup code. */
3445static void drbd_delete_device(unsigned int minor)
3446{
3447 struct drbd_conf *mdev = minor_to_mdev(minor);
3448
3449 if (!mdev)
3450 return;
3451
dfa8bedb
PR
3452 del_timer_sync(&mdev->request_timer);
3453
b411b363
PR
3454 /* paranoia asserts */
3455 if (mdev->open_cnt != 0)
3456 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3457 __FILE__ , __LINE__);
3458
3459 ERR_IF (!list_empty(&mdev->data.work.q)) {
3460 struct list_head *lp;
3461 list_for_each(lp, &mdev->data.work.q) {
3462 dev_err(DEV, "lp = %p\n", lp);
3463 }
3464 };
3465 /* end paranoia asserts */
3466
3467 del_gendisk(mdev->vdisk);
3468
3469 /* cleanup stuff that may have been allocated during
3470 * device (re-)configuration or state changes */
3471
3472 if (mdev->this_bdev)
3473 bdput(mdev->this_bdev);
3474
3475 drbd_free_resources(mdev);
3476
3477 drbd_release_ee_lists(mdev);
3478
24c4830c 3479 /* should be freed on disconnect? */
b411b363
PR
3480 kfree(mdev->ee_hash);
3481 /*
3482 mdev->ee_hash_s = 0;
3483 mdev->ee_hash = NULL;
3484 */
3485
3486 lc_destroy(mdev->act_log);
3487 lc_destroy(mdev->resync);
3488
3489 kfree(mdev->p_uuid);
3490 /* mdev->p_uuid = NULL; */
3491
3492 kfree(mdev->int_dig_out);
3493 kfree(mdev->int_dig_in);
3494 kfree(mdev->int_dig_vv);
3495
3496 /* cleanup the rest that has been
3497 * allocated from drbd_new_device
3498 * and actually free the mdev itself */
3499 drbd_free_mdev(mdev);
3500}
3501
3502static void drbd_cleanup(void)
3503{
3504 unsigned int i;
3505
3506 unregister_reboot_notifier(&drbd_notifier);
3507
17a93f30
LE
3508 /* first remove proc,
3510 * drbdsetup uses its presence to detect
3510 * whether DRBD is loaded.
3511 * If we got stuck in proc removal,
3512 * but have netlink already deregistered,
3513 * some drbdsetup commands may wait forever
3514 * for an answer.
3515 */
3516 if (drbd_proc)
3517 remove_proc_entry("drbd", NULL);
3518
b411b363
PR
3519 drbd_nl_cleanup();
3520
3521 if (minor_table) {
b411b363
PR
3522 i = minor_count;
3523 while (i--)
3524 drbd_delete_device(i);
3525 drbd_destroy_mempools();
3526 }
3527
3528 kfree(minor_table);
3529
3530 unregister_blkdev(DRBD_MAJOR, "drbd");
3531
3532 printk(KERN_INFO "drbd: module cleanup done.\n");
3533}
3534
3535/**
d97482ed 3536 * drbd_congested() - Callback for the flusher thread
b411b363 3537 * @congested_data: User data
d97482ed 3538 * @bdi_bits: Bits the BDI flusher thread is currently interested in
b411b363
PR
3539 *
3540 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3541 */
3542static int drbd_congested(void *congested_data, int bdi_bits)
3543{
3544 struct drbd_conf *mdev = congested_data;
3545 struct request_queue *q;
3546 char reason = '-';
3547 int r = 0;
3548
1b881ef7 3549 if (!may_inc_ap_bio(mdev)) {
b411b363
PR
3550 /* DRBD has frozen IO */
3551 r = bdi_bits;
3552 reason = 'd';
3553 goto out;
3554 }
3555
c2ba686f
LE
3556 if (test_bit(CALLBACK_PENDING, &mdev->flags)) {
3557 r |= (1 << BDI_async_congested);
3558 /* Without good local data, we would need to read from remote,
3559 * and that would need the worker thread as well, which is
3560 * currently blocked waiting for that usermode helper to
3561 * finish.
3562 */
3563 if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
3564 r |= (1 << BDI_sync_congested);
3565 else
3566 put_ldev(mdev);
3567 r &= bdi_bits;
3568 reason = 'c';
3569 goto out;
3570 }
3571
b411b363
PR
3572 if (get_ldev(mdev)) {
3573 q = bdev_get_queue(mdev->ldev->backing_bdev);
3574 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3575 put_ldev(mdev);
3576 if (r)
3577 reason = 'b';
3578 }
3579
3580 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3581 r |= (1 << BDI_async_congested);
3582 reason = reason == 'b' ? 'a' : 'n';
3583 }
3584
3585out:
3586 mdev->congestion_reason = reason;
3587 return r;
3588}
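/*
 * Illustrative only: the writeback code calls this congested_fn with the bits
 * it is interested in and masks the result, e.g. a hypothetical caller:
 *
 *	int bits = (1 << BDI_async_congested) | (1 << BDI_sync_congested);
 *	int r = drbd_congested(mdev, bits);
 *
 * A set BDI_sync_congested bit in r then means "even synchronous writeback
 * should back off"; mdev->congestion_reason keeps the one-character reason
 * ('d', 'c', 'b', 'a' or 'n') for later diagnostic output.
 */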
3589
3590struct drbd_conf *drbd_new_device(unsigned int minor)
3591{
3592 struct drbd_conf *mdev;
3593 struct gendisk *disk;
3594 struct request_queue *q;
3595
3596 /* GFP_KERNEL, we are outside of all write-out paths */
3597 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3598 if (!mdev)
3599 return NULL;
3600 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3601 goto out_no_cpumask;
3602
3603 mdev->minor = minor;
3604
3605 drbd_init_set_defaults(mdev);
3606
3607 q = blk_alloc_queue(GFP_KERNEL);
3608 if (!q)
3609 goto out_no_q;
3610 mdev->rq_queue = q;
3611 q->queuedata = mdev;
b411b363
PR
3612
3613 disk = alloc_disk(1);
3614 if (!disk)
3615 goto out_no_disk;
3616 mdev->vdisk = disk;
3617
81e84650 3618 set_disk_ro(disk, true);
b411b363
PR
3619
3620 disk->queue = q;
3621 disk->major = DRBD_MAJOR;
3622 disk->first_minor = minor;
3623 disk->fops = &drbd_ops;
3624 sprintf(disk->disk_name, "drbd%d", minor);
3625 disk->private_data = mdev;
3626
3627 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3628 /* we have no partitions. we contain only ourselves. */
3629 mdev->this_bdev->bd_contains = mdev->this_bdev;
3630
3631 q->backing_dev_info.congested_fn = drbd_congested;
3632 q->backing_dev_info.congested_data = mdev;
3633
2f58dcfc 3634 blk_queue_make_request(q, drbd_make_request);
a73ff323 3635 blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
99432fcc
PR
3636	/* Setting max_hw_sectors to the deliberately odd value of 8 KiB here
3637	   triggers a max_bio_size message upon first attach or connect */
3638 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
b411b363
PR
3639 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3640 blk_queue_merge_bvec(q, drbd_merge_bvec);
7eaceacc 3641 q->queue_lock = &mdev->req_lock;
b411b363
PR
3642
3643 mdev->md_io_page = alloc_page(GFP_KERNEL);
3644 if (!mdev->md_io_page)
3645 goto out_no_io_page;
3646
3647 if (drbd_bm_init(mdev))
3648 goto out_no_bitmap;
3649 /* no need to lock access, we are still initializing this minor device. */
3650 if (!tl_init(mdev))
3651 goto out_no_tl;
3652
3653 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3654 if (!mdev->app_reads_hash)
3655 goto out_no_app_reads;
3656
3657 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3658 if (!mdev->current_epoch)
3659 goto out_no_epoch;
3660
3661 INIT_LIST_HEAD(&mdev->current_epoch->list);
3662 mdev->epochs = 1;
3663
3664 return mdev;
3665
3666/* out_whatever_else:
3667 kfree(mdev->current_epoch); */
3668out_no_epoch:
3669 kfree(mdev->app_reads_hash);
3670out_no_app_reads:
3671 tl_cleanup(mdev);
3672out_no_tl:
3673 drbd_bm_cleanup(mdev);
3674out_no_bitmap:
3675 __free_page(mdev->md_io_page);
3676out_no_io_page:
3677 put_disk(disk);
3678out_no_disk:
3679 blk_cleanup_queue(q);
3680out_no_q:
3681 free_cpumask_var(mdev->cpu_mask);
3682out_no_cpumask:
3683 kfree(mdev);
3684 return NULL;
3685}
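/*
 * The error path above is the usual kernel "goto unwind" idiom: each
 * out_no_* label releases exactly what was allocated before the failing step,
 * in reverse allocation order.  Condensed, hypothetical shape of the pattern:
 *
 *	a = alloc_a();
 *	if (!a)
 *		goto out;
 *	b = alloc_b();
 *	if (!b)
 *		goto out_free_a;
 *	return 0;
 *
 * out_free_a:
 *	free_a(a);
 * out:
 *	return -ENOMEM;
 */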
3686
3687/* counterpart of drbd_new_device.
3688 * last part of drbd_delete_device. */
3689void drbd_free_mdev(struct drbd_conf *mdev)
3690{
3691 kfree(mdev->current_epoch);
3692 kfree(mdev->app_reads_hash);
3693 tl_cleanup(mdev);
3694 if (mdev->bitmap) /* should no longer be there. */
3695 drbd_bm_cleanup(mdev);
3696 __free_page(mdev->md_io_page);
3697 put_disk(mdev->vdisk);
3698 blk_cleanup_queue(mdev->rq_queue);
3699 free_cpumask_var(mdev->cpu_mask);
3719094e 3700 drbd_free_tl_hash(mdev);
b411b363
PR
3701 kfree(mdev);
3702}
3703
3704
3705int __init drbd_init(void)
3706{
3707 int err;
3708
3709 if (sizeof(struct p_handshake) != 80) {
3710 printk(KERN_ERR
3711 "drbd: never change the size or layout "
3712 "of the HandShake packet.\n");
3713 return -EINVAL;
3714 }
3715
2b8a90b5 3716 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
b411b363
PR
3717 printk(KERN_ERR
3718 "drbd: invalid minor_count (%d)\n", minor_count);
3719#ifdef MODULE
3720 return -EINVAL;
3721#else
3722 minor_count = 8;
3723#endif
3724 }
3725
3726 err = drbd_nl_init();
3727 if (err)
3728 return err;
3729
3730 err = register_blkdev(DRBD_MAJOR, "drbd");
3731 if (err) {
3732 printk(KERN_ERR
3733 "drbd: unable to register block device major %d\n",
3734 DRBD_MAJOR);
3735 return err;
3736 }
3737
3738 register_reboot_notifier(&drbd_notifier);
3739
3740 /*
3741 * allocate all necessary structs
3742 */
3743 err = -ENOMEM;
3744
3745 init_waitqueue_head(&drbd_pp_wait);
3746
3747 drbd_proc = NULL; /* play safe for drbd_cleanup */
3748 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3749 GFP_KERNEL);
3750 if (!minor_table)
3751 goto Enomem;
3752
3753 err = drbd_create_mempools();
3754 if (err)
3755 goto Enomem;
3756
8c484ee4 3757 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
b411b363
PR
3758 if (!drbd_proc) {
3759 printk(KERN_ERR "drbd: unable to register proc file\n");
3760 goto Enomem;
3761 }
3762
3763 rwlock_init(&global_state_lock);
3764
3765 printk(KERN_INFO "drbd: initialized. "
3766 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3767 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3768 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3769 printk(KERN_INFO "drbd: registered as block device major %d\n",
3770 DRBD_MAJOR);
3771 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3772
3773 return 0; /* Success! */
3774
3775Enomem:
3776 drbd_cleanup();
3777 if (err == -ENOMEM)
3778 /* currently always the case */
3779 printk(KERN_ERR "drbd: ran out of memory\n");
3780 else
3781 printk(KERN_ERR "drbd: initialization failure\n");
3782 return err;
3783}
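/*
 * minor_count checked above is a module parameter (see MODULE_PARM_DESC near
 * the top of this file); minor_table and the mempool sizes are derived from
 * it.  Illustrative use when loading the module:
 *
 *	modprobe drbd minor_count=16
 *
 * which reserves 16 minors and sizes the request/ee mempools accordingly.
 */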
3784
3785void drbd_free_bc(struct drbd_backing_dev *ldev)
3786{
3787 if (ldev == NULL)
3788 return;
3789
e525fd89
TH
3790 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3791 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
b411b363
PR
3792
3793 kfree(ldev);
3794}
3795
3796void drbd_free_sock(struct drbd_conf *mdev)
3797{
3798 if (mdev->data.socket) {
4589d7f8 3799 mutex_lock(&mdev->data.mutex);
b411b363
PR
3800 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3801 sock_release(mdev->data.socket);
3802 mdev->data.socket = NULL;
4589d7f8 3803 mutex_unlock(&mdev->data.mutex);
b411b363
PR
3804 }
3805 if (mdev->meta.socket) {
4589d7f8 3806 mutex_lock(&mdev->meta.mutex);
b411b363
PR
3807 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3808 sock_release(mdev->meta.socket);
3809 mdev->meta.socket = NULL;
4589d7f8 3810 mutex_unlock(&mdev->meta.mutex);
b411b363
PR
3811 }
3812}
3813
3814
3815void drbd_free_resources(struct drbd_conf *mdev)
3816{
3817 crypto_free_hash(mdev->csums_tfm);
3818 mdev->csums_tfm = NULL;
3819 crypto_free_hash(mdev->verify_tfm);
3820 mdev->verify_tfm = NULL;
3821 crypto_free_hash(mdev->cram_hmac_tfm);
3822 mdev->cram_hmac_tfm = NULL;
3823 crypto_free_hash(mdev->integrity_w_tfm);
3824 mdev->integrity_w_tfm = NULL;
3825 crypto_free_hash(mdev->integrity_r_tfm);
3826 mdev->integrity_r_tfm = NULL;
3827
3828 drbd_free_sock(mdev);
3829
3830 __no_warn(local,
3831 drbd_free_bc(mdev->ldev);
3832 mdev->ldev = NULL;);
3833}
3834
3835/* meta data management */
3836
3837struct meta_data_on_disk {
3838 u64 la_size; /* last agreed size. */
3839 u64 uuid[UI_SIZE]; /* UUIDs. */
3840 u64 device_uuid;
3841 u64 reserved_u64_1;
3842 u32 flags; /* MDF */
3843 u32 magic;
3844 u32 md_size_sect;
3845 u32 al_offset; /* offset to this block */
3846 u32 al_nr_extents; /* important for restoring the AL */
3847 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3848 u32 bm_offset; /* offset to the bitmap, from here */
3849 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
99432fcc
PR
3850 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3851 u32 reserved_u32[3];
b411b363
PR
3852
3853} __packed;
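/*
 * All multi-byte fields of this on-disk structure are stored big-endian;
 * drbd_md_sync()/drbd_md_read() below convert on the way out and in, e.g.:
 *
 *	buffer->flags = cpu_to_be32(mdev->ldev->md.flags);	// write side
 *	bdev->md.flags = be32_to_cpu(buffer->flags);		// read side
 *
 * which keeps the meta data portable between hosts of different endianness.
 */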
3854
3855/**
3856 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3857 * @mdev: DRBD device.
3858 */
3859void drbd_md_sync(struct drbd_conf *mdev)
3860{
3861 struct meta_data_on_disk *buffer;
3862 sector_t sector;
3863 int i;
3864
ee15b038
LE
3865 del_timer(&mdev->md_sync_timer);
3866 /* timer may be rearmed by drbd_md_mark_dirty() now. */
b411b363
PR
3867 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3868 return;
b411b363
PR
3869
3870	/* We use D_FAILED here and not D_ATTACHING because we try to write
3871 * metadata even if we detach due to a disk failure! */
3872 if (!get_ldev_if_state(mdev, D_FAILED))
3873 return;
3874
e1711731
PR
3875 buffer = drbd_md_get_buffer(mdev);
3876 if (!buffer)
3877 goto out;
3878
b411b363
PR
3879 memset(buffer, 0, 512);
3880
3881 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3882 for (i = UI_CURRENT; i < UI_SIZE; i++)
3883 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3884 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3885 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3886
3887 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3888 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3889 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3890 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3891 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3892
3893 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
99432fcc 3894 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
b411b363
PR
3895
3896 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3897 sector = mdev->ldev->md.md_offset;
3898
3f3a9b84 3899 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
b411b363
PR
3900		/* this was a try anyway ... */
3901 dev_err(DEV, "meta data update failed!\n");
383606e0 3902 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
b411b363
PR
3903 }
3904
3905 /* Update mdev->ldev->md.la_size_sect,
3906 * since we updated it on metadata. */
3907 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3908
e1711731
PR
3909 drbd_md_put_buffer(mdev);
3910out:
b411b363
PR
3911 put_ldev(mdev);
3912}
3913
3914/**
3915 * drbd_md_read() - Reads in the meta data super block
3916 * @mdev: DRBD device.
3917 * @bdev: Device from which the meta data should be read in.
3918 *
116676ca 3919 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
b411b363
PR
3920 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3921 */
3922int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3923{
3924 struct meta_data_on_disk *buffer;
3925 int i, rv = NO_ERROR;
3926
3927 if (!get_ldev_if_state(mdev, D_ATTACHING))
3928 return ERR_IO_MD_DISK;
3929
e1711731
PR
3930 buffer = drbd_md_get_buffer(mdev);
3931 if (!buffer)
3932 goto out;
b411b363
PR
3933
3934 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
25985edc 3935 /* NOTE: can't do normal error processing here as this is
b411b363
PR
3936 called BEFORE disk is attached */
3937 dev_err(DEV, "Error while reading metadata.\n");
3938 rv = ERR_IO_MD_DISK;
3939 goto err;
3940 }
3941
3942 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3943 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3944 rv = ERR_MD_INVALID;
3945 goto err;
3946 }
3947 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3948 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3949 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3950 rv = ERR_MD_INVALID;
3951 goto err;
3952 }
3953 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3954 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3955 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3956 rv = ERR_MD_INVALID;
3957 goto err;
3958 }
3959 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3960 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3961 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3962 rv = ERR_MD_INVALID;
3963 goto err;
3964 }
3965
3966 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3967 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3968 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3969 rv = ERR_MD_INVALID;
3970 goto err;
3971 }
3972
3973 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3974 for (i = UI_CURRENT; i < UI_SIZE; i++)
3975 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3976 bdev->md.flags = be32_to_cpu(buffer->flags);
3977 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3978 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3979
99432fcc
PR
3980 spin_lock_irq(&mdev->req_lock);
3981 if (mdev->state.conn < C_CONNECTED) {
db141b2f 3982 unsigned int peer;
99432fcc 3983 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
db141b2f 3984 peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
99432fcc
PR
3985 mdev->peer_max_bio_size = peer;
3986 }
3987 spin_unlock_irq(&mdev->req_lock);
3988
b411b363
PR
3989 if (mdev->sync_conf.al_extents < 7)
3990 mdev->sync_conf.al_extents = 127;
3991
3992 err:
e1711731
PR
3993 drbd_md_put_buffer(mdev);
3994 out:
b411b363
PR
3995 put_ldev(mdev);
3996
3997 return rv;
3998}
3999
4000/**
4001 * drbd_md_mark_dirty() - Mark meta data super block as dirty
4002 * @mdev: DRBD device.
4003 *
4004 * Call this function if you change anything that should be written to
4005 * the meta-data super block. This function sets MD_DIRTY, and starts a
4006 * timer that makes sure drbd_md_sync() gets called within five seconds.
4007 */
ca0e6098 4008#ifdef DEBUG
ee15b038
LE
4009void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
4010{
4011 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
4012 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
4013 mdev->last_md_mark_dirty.line = line;
4014 mdev->last_md_mark_dirty.func = func;
4015 }
4016}
4017#else
b411b363
PR
4018void drbd_md_mark_dirty(struct drbd_conf *mdev)
4019{
ee15b038 4020 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
ca0e6098 4021 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
b411b363 4022}
ee15b038 4023#endif
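/*
 * Typical, illustrative use of the lazy meta data writeout: change the
 * in-core copy, mark it dirty, and either sync explicitly or let the
 * md_sync_timer catch up within five seconds (drbd_md_set_flag() below is a
 * real caller of this pattern):
 *
 *	mdev->ldev->md.flags |= MDF_FULL_SYNC;	// some in-core change
 *	drbd_md_mark_dirty(mdev);
 *	...
 *	drbd_md_sync(mdev);	// optional; the timer forces it eventually
 */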
b411b363
PR
4024
4025static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
4026{
4027 int i;
4028
62b0da3a 4029 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
b411b363 4030 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
b411b363
PR
4031}
4032
4033void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4034{
4035 if (idx == UI_CURRENT) {
4036 if (mdev->state.role == R_PRIMARY)
4037 val |= 1;
4038 else
4039 val &= ~((u64)1);
4040
4041 drbd_set_ed_uuid(mdev, val);
4042 }
4043
4044 mdev->ldev->md.uuid[idx] = val;
b411b363
PR
4045 drbd_md_mark_dirty(mdev);
4046}
4047
4048
4049void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4050{
4051 if (mdev->ldev->md.uuid[idx]) {
4052 drbd_uuid_move_history(mdev);
4053 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
b411b363
PR
4054 }
4055 _drbd_uuid_set(mdev, idx, val);
4056}
4057
4058/**
4059 * drbd_uuid_new_current() - Creates a new current UUID
4060 * @mdev: DRBD device.
4061 *
4062 * Creates a new current UUID, and rotates the old current UUID into
4063 * the bitmap slot. Causes an incremental resync upon next connect.
4064 */
4065void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
4066{
4067 u64 val;
62b0da3a
LE
4068 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
4069
4070 if (bm_uuid)
4071 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 4072
b411b363 4073 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
b411b363
PR
4074
4075 get_random_bytes(&val, sizeof(u64));
4076 _drbd_uuid_set(mdev, UI_CURRENT, val);
62b0da3a 4077 drbd_print_uuids(mdev, "new current UUID");
aaa8e2b3
LE
4078 /* get it to stable storage _now_ */
4079 drbd_md_sync(mdev);
b411b363
PR
4080}
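/*
 * Schematically, the rotation above does (illustrative, not literal code):
 *
 *	md.uuid[UI_BITMAP]  = old md.uuid[UI_CURRENT];
 *	md.uuid[UI_CURRENT] = 64 bit random value;	// low bit encodes R_PRIMARY
 *
 * so a peer that still knows the old current UUID finds it in our bitmap
 * slot on the next handshake, which is what makes the resync incremental.
 */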
4081
4082void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
4083{
4084 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
4085 return;
4086
4087 if (val == 0) {
4088 drbd_uuid_move_history(mdev);
4089 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
4090 mdev->ldev->md.uuid[UI_BITMAP] = 0;
b411b363 4091 } else {
62b0da3a
LE
4092 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
4093 if (bm_uuid)
4094 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 4095
62b0da3a 4096 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
b411b363
PR
4097 }
4098 drbd_md_mark_dirty(mdev);
4099}
4100
4101/**
4102 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4103 * @mdev: DRBD device.
4104 *
4105 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
4106 */
4107int drbd_bmio_set_n_write(struct drbd_conf *mdev)
4108{
4109 int rv = -EIO;
4110
4111 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4112 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
4113 drbd_md_sync(mdev);
4114 drbd_bm_set_all(mdev);
4115
4116 rv = drbd_bm_write(mdev);
4117
4118 if (!rv) {
4119 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
4120 drbd_md_sync(mdev);
4121 }
4122
4123 put_ldev(mdev);
4124 }
4125
4126 return rv;
4127}
4128
4129/**
4130 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4131 * @mdev: DRBD device.
4132 *
4133 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
4134 */
4135int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
4136{
4137 int rv = -EIO;
4138
0778286a 4139 drbd_resume_al(mdev);
b411b363
PR
4140 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4141 drbd_bm_clear_all(mdev);
4142 rv = drbd_bm_write(mdev);
4143 put_ldev(mdev);
4144 }
4145
4146 return rv;
4147}
4148
4149static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4150{
4151 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
02851e9f 4152 int rv = -EIO;
b411b363
PR
4153
4154 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
4155
02851e9f 4156 if (get_ldev(mdev)) {
20ceb2b2 4157 drbd_bm_lock(mdev, work->why, work->flags);
02851e9f
LE
4158 rv = work->io_fn(mdev);
4159 drbd_bm_unlock(mdev);
4160 put_ldev(mdev);
4161 }
b411b363
PR
4162
4163 clear_bit(BITMAP_IO, &mdev->flags);
127b3178 4164 smp_mb__after_clear_bit();
b411b363
PR
4165 wake_up(&mdev->misc_wait);
4166
4167 if (work->done)
4168 work->done(mdev, rv);
4169
4170 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
4171 work->why = NULL;
20ceb2b2 4172 work->flags = 0;
b411b363
PR
4173
4174 return 1;
4175}
4176
82f59cc6
LE
4177void drbd_ldev_destroy(struct drbd_conf *mdev)
4178{
4179 lc_destroy(mdev->resync);
4180 mdev->resync = NULL;
4181 lc_destroy(mdev->act_log);
4182 mdev->act_log = NULL;
4183 __no_warn(local,
4184 drbd_free_bc(mdev->ldev);
4185 mdev->ldev = NULL;);
4186
4187 if (mdev->md_io_tmpp) {
4188 __free_page(mdev->md_io_tmpp);
4189 mdev->md_io_tmpp = NULL;
4190 }
4191 clear_bit(GO_DISKLESS, &mdev->flags);
4192}
4193
e9e6f3ec
LE
4194static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4195{
4196 D_ASSERT(mdev->state.disk == D_FAILED);
9d282875
LE
4197 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4198 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
82f59cc6
LE
4199 * the protected members anymore, though, so once put_ldev reaches zero
4200 * again, it will be safe to free them. */
e9e6f3ec 4201 drbd_force_state(mdev, NS(disk, D_DISKLESS));
e9e6f3ec
LE
4202 return 1;
4203}
4204
4205void drbd_go_diskless(struct drbd_conf *mdev)
4206{
4207 D_ASSERT(mdev->state.disk == D_FAILED);
4208 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
9d282875 4209 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
e9e6f3ec
LE
4210}
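/*
 * The test_and_set_bit() above is the usual "queue this work at most once"
 * idiom: the GO_DISKLESS bit is set atomically before queueing and cleared
 * again in drbd_ldev_destroy(), so racing callers cannot queue go_diskless
 * twice.  Generic shape of the pattern (illustrative names):
 *
 *	if (!test_and_set_bit(SOME_FLAG, &mdev->flags))
 *		drbd_queue_work(&mdev->data.work, &some_work);
 */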
4211
b411b363
PR
4212/**
4213 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4214 * @mdev: DRBD device.
4215 * @io_fn: IO callback to be called when bitmap IO is possible
4216 * @done: callback to be called after the bitmap IO was performed
4217 * @why: Descriptive text of the reason for doing the IO
4218 *
4219 * While IO on the bitmap happens, application IO is frozen; this ensures
4220 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
4221 * called from worker context. It MUST NOT be used while a previous such
4222 * work is still pending!
4223 */
4224void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4225 int (*io_fn)(struct drbd_conf *),
4226 void (*done)(struct drbd_conf *, int),
20ceb2b2 4227 char *why, enum bm_flag flags)
b411b363
PR
4228{
4229 D_ASSERT(current == mdev->worker.task);
4230
4231 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4232 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4233 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4234 if (mdev->bm_io_work.why)
4235 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4236 why, mdev->bm_io_work.why);
4237
4238 mdev->bm_io_work.io_fn = io_fn;
4239 mdev->bm_io_work.done = done;
4240 mdev->bm_io_work.why = why;
20ceb2b2 4241 mdev->bm_io_work.flags = flags;
b411b363 4242
22afd7ee 4243 spin_lock_irq(&mdev->req_lock);
b411b363
PR
4244 set_bit(BITMAP_IO, &mdev->flags);
4245 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
127b3178 4246 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
b411b363 4247 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
b411b363 4248 }
22afd7ee 4249 spin_unlock_irq(&mdev->req_lock);
b411b363
PR
4250}
4251
4252/**
4253 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4254 * @mdev: DRBD device.
4255 * @io_fn: IO callback to be called when bitmap IO is possible
4256 * @why: Descriptive text of the reason for doing the IO
4257 *
4258 * Freezes application IO while the actual IO operation runs. This
4259 * function MAY NOT be called from worker context.
4260 */
20ceb2b2
LE
4261int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4262 char *why, enum bm_flag flags)
b411b363
PR
4263{
4264 int rv;
4265
4266 D_ASSERT(current != mdev->worker.task);
4267
20ceb2b2
LE
4268 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4269 drbd_suspend_io(mdev);
b411b363 4270
20ceb2b2 4271 drbd_bm_lock(mdev, why, flags);
b411b363
PR
4272 rv = io_fn(mdev);
4273 drbd_bm_unlock(mdev);
4274
20ceb2b2
LE
4275 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4276 drbd_resume_io(mdev);
b411b363
PR
4277
4278 return rv;
4279}
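/*
 * Illustrative call, roughly what an attach/invalidate path might issue (the
 * flags value is an assumption here, not prescribed by this function):
 *
 *	rv = drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			"set_n_write from invalidate", BM_LOCKED_MASK);
 *
 * From worker context, drbd_queue_bitmap_io() above is the asynchronous
 * counterpart and additionally takes a completion callback.
 */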
4280
4281void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4282{
4283 if ((mdev->ldev->md.flags & flag) != flag) {
4284 drbd_md_mark_dirty(mdev);
4285 mdev->ldev->md.flags |= flag;
4286 }
4287}
4288
4289void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4290{
4291 if ((mdev->ldev->md.flags & flag) != 0) {
4292 drbd_md_mark_dirty(mdev);
4293 mdev->ldev->md.flags &= ~flag;
4294 }
4295}
4296int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4297{
4298 return (bdev->md.flags & flag) != 0;
4299}
4300
4301static void md_sync_timer_fn(unsigned long data)
4302{
4303 struct drbd_conf *mdev = (struct drbd_conf *) data;
4304
4305 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4306}
4307
4308static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4309{
4310 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
ee15b038
LE
4311#ifdef DEBUG
4312 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4313 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4314#endif
b411b363 4315 drbd_md_sync(mdev);
b411b363
PR
4316 return 1;
4317}
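/*
 * md_sync_timer_fn() runs in timer (softirq) context where the meta data
 * write must not happen, so it only queues work to the worker, which then
 * performs the actual drbd_md_sync() in process context.  Schematically:
 *
 *	timer context:	drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
 *	worker thread:	w_md_sync(mdev, &mdev->md_sync_work, 0);	// may sleep, may do IO
 */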
4318
4319#ifdef CONFIG_DRBD_FAULT_INJECTION
4320/* Fault insertion support including random number generator shamelessly
4321 * stolen from kernel/rcutorture.c */
4322struct fault_random_state {
4323 unsigned long state;
4324 unsigned long count;
4325};
4326
4327#define FAULT_RANDOM_MULT 39916801 /* prime */
4328#define FAULT_RANDOM_ADD 479001701 /* prime */
4329#define FAULT_RANDOM_REFRESH 10000
4330
4331/*
4332 * Crude but fast random-number generator. Uses a linear congruential
4333 * generator, with occasional help from get_random_bytes().
4334 */
4335static unsigned long
4336_drbd_fault_random(struct fault_random_state *rsp)
4337{
4338 long refresh;
4339
49829ea7 4340 if (!rsp->count--) {
b411b363
PR
4341 get_random_bytes(&refresh, sizeof(refresh));
4342 rsp->state += refresh;
4343 rsp->count = FAULT_RANDOM_REFRESH;
4344 }
4345 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4346 return swahw32(rsp->state);
4347}
4348
4349static char *
4350_drbd_fault_str(unsigned int type) {
4351 static char *_faults[] = {
4352 [DRBD_FAULT_MD_WR] = "Meta-data write",
4353 [DRBD_FAULT_MD_RD] = "Meta-data read",
4354 [DRBD_FAULT_RS_WR] = "Resync write",
4355 [DRBD_FAULT_RS_RD] = "Resync read",
4356 [DRBD_FAULT_DT_WR] = "Data write",
4357 [DRBD_FAULT_DT_RD] = "Data read",
4358 [DRBD_FAULT_DT_RA] = "Data read ahead",
4359 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
6b4388ac
PR
4360 [DRBD_FAULT_AL_EE] = "EE allocation",
4361 [DRBD_FAULT_RECEIVE] = "receive data corruption",
b411b363
PR
4362 };
4363
4364 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4365}
4366
4367unsigned int
4368_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4369{
4370 static struct fault_random_state rrs = {0, 0};
4371
4372 unsigned int ret = (
4373 (fault_devs == 0 ||
4374 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4375 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4376
4377 if (ret) {
4378 fault_count++;
4379
7383506c 4380 if (__ratelimit(&drbd_ratelimit_state))
b411b363
PR
4381 dev_warn(DEV, "***Simulating %s failure\n",
4382 _drbd_fault_str(type));
4383 }
4384
4385 return ret;
4386}
4387#endif
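/*
 * fault_rate, fault_devs and fault_count used above are module parameters,
 * only present when built with CONFIG_DRBD_FAULT_INJECTION.  One plausible
 * way to exercise them, assuming the usual sysfs module parameter layout:
 *
 *	echo 5 > /sys/module/drbd/parameters/fault_rate    # ~5% of eligible IOs
 *	echo 1 > /sys/module/drbd/parameters/fault_devs    # restrict to minor 0
 */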
4388
4389const char *drbd_buildtag(void)
4390{
4391	/* When DRBD is built from external sources, this holds a reference
4392	   to the git hash of the source code. */
4393
4394 static char buildtag[38] = "\0uilt-in";
4395
4396 if (buildtag[0] == 0) {
bc4854bc
CW
4397#ifdef MODULE
4398 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4399#else
4400 buildtag[0] = 'b';
b411b363 4401#endif
b411b363
PR
4402 }
4403
4404 return buildtag;
4405}
4406
4407module_init(drbd_init)
4408module_exit(drbd_cleanup)
4409
b411b363
PR
4410EXPORT_SYMBOL(drbd_conn_str);
4411EXPORT_SYMBOL(drbd_role_str);
4412EXPORT_SYMBOL(drbd_disk_str);
4413EXPORT_SYMBOL(drbd_set_st_err_str);