drbd: Move sequence number logic into drbd_receiver.c and simplify it
drivers/block/drbd/drbd_main.c
/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */
#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

struct after_state_chg_work {
	struct drbd_work w;
	union drbd_state os;
	union drbd_state ns;
	enum chg_state_flags flags;
	struct completion *done;
};

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
int disable_sendpage;
int allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;	/* Detail level in proc drbd*/

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;
struct list_head drbd_tconns;  /* list of struct drbd_tconn */

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a single linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t drbd_pp_lock;
int drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real functions sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
 * mdev->tconn->newest_tle points to the head, mdev->tconn->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular double linked list of requests
 * attached.
 */
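/*
 * Illustrative sketch (not part of the original file): with the shape
 * described above, walking the transfer log is a plain pointer chase from
 * tconn->oldest_tle along ->next up to tconn->newest_tle.  The helper below
 * is hypothetical, only meant to visualize the structure, and would have to
 * run under tconn->req_lock; it uses the drbd_tl_epoch fields (next,
 * n_writes) that appear elsewhere in this file.
 */
#if 0	/* example only, never compiled */
static unsigned int tl_count_pending_writes_sketch(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;
	unsigned int n = 0;

	for (b = mdev->tconn->oldest_tle; b; b = b->next)
		n += b->n_writes;	/* writes accounted to each epoch */
	return n;
}
#endif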
static int tl_init(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	mdev->tconn->oldest_tle = b;
	mdev->tconn->newest_tle = b;
	INIT_LIST_HEAD(&mdev->tconn->out_of_sequence_requests);

	return 1;
}

static void tl_cleanup(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->tconn->oldest_tle == mdev->tconn->newest_tle);
	D_ASSERT(list_empty(&mdev->tconn->out_of_sequence_requests));
	kfree(mdev->tconn->oldest_tle);
	mdev->tconn->oldest_tle = NULL;
	kfree(mdev->tconn->unused_spare_tle);
	mdev->tconn->unused_spare_tle = NULL;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = mdev->tconn->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (mdev->tconn->newest_tle != new) {
		mdev->tconn->newest_tle->next = new;
		mdev->tconn->newest_tle = new;
	}
}
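/*
 * Note (illustration, not original code): the GNU "?:" shorthand used above
 * behaves like "a ? a : b", so when br_number wraps around from 0xffffffff
 * the increment yields 0 and the expression falls back to 1 -- a barrier
 * number of 0 is never handed out.
 */
#if 0	/* example only, never compiled */
static unsigned int next_barrier_nr_sketch(unsigned int cur)
{
	return (cur + 1) ?: 1;	/* 0xffffffff wraps to 0, falls back to 1 */
}
#endif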
/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
		unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->tconn->req_lock);

	b = mdev->tconn->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
			barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, BARRIER_ACKED);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruptions of
	   slab's data structures we have to remove the lists head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violating write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(CONNECTION_LOST_WHILE_PENDING).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, BARRIER_ACKED) above.
	   */
	list_del_init(&b->requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->tconn->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->tconn->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->tconn->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->tconn->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->tconn->req_lock);
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}
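/*
 * Illustration (not original code): the peer's BarrierAck carries both the
 * barrier number and the count of write requests it covers.  tl_release()
 * only recycles the oldest epoch when both match what was queued locally,
 * e.g. an epoch created with br_number == 4712 and n_writes == 3 is released
 * by a BarrierAck reporting exactly (4712, 3); any mismatch is treated as a
 * protocol error and tears down the connection via C_PROTOCOL_ERROR.
 */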
/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
 * RESTART_FROZEN_DISK_IO.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = mdev->tconn->oldest_tle;
	pn = &mdev->tconn->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (n_writes) {
			if (what == RESEND) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(mdev);
					set_bit(CREATE_BARRIER, &mdev->flags);
				}

				drbd_queue_work(&mdev->tconn->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(mdev);

			if (b == mdev->tconn->newest_tle) {
				/* recycle, but reinit! */
				D_ASSERT(tmp == NULL);
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}
}

/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->tconn->req_lock);

	_tl_restart(mdev, CONNECTION_LOST_WHILE_PENDING);

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->tconn->out_of_sequence_requests));

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &mdev->tconn->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, CONNECTION_LOST_WHILE_PENDING);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	spin_unlock_irq(&mdev->tconn->req_lock);
}

void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	spin_lock_irq(&mdev->tconn->req_lock);
	_tl_restart(mdev, what);
	spin_unlock_irq(&mdev->tconn->req_lock);
}

/**
 * cl_wide_st_chg() - true if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
			  union drbd_state os, union drbd_state ns)
{
	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}

enum drbd_state_rv
drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
		  union drbd_state mask, union drbd_state val)
{
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	rv = _drbd_set_state(mdev, ns, f, NULL);
	ns = mdev->state;
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);

	return rv;
}

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}

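/*
 * Illustration (not original code): drbd_force_state()/drbd_change_state()
 * take a (mask, val) pair of union drbd_state values and only replace the
 * bits selected by the mask; everything else in mdev->state is kept:
 *
 *	ns.i = (os.i & ~mask.i) | val.i;
 *
 * The NS(conn, C_PROTOCOL_ERROR) use in tl_release() above is assumed to
 * expand to such a pair with only the .conn field set in the mask.
 */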
static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
						    union drbd_state,
						    union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);

static enum drbd_state_rv
_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
	     union drbd_state val)
{
	union drbd_state os, ns;
	unsigned long flags;
	enum drbd_state_rv rv;

	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
		return SS_CW_SUCCESS;

	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
		return SS_CW_FAILED_BY_PEER;

	rv = 0;
	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (!cl_wide_st_chg(mdev, os, ns))
		rv = SS_CW_NO_NEED;
	if (!rv) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS) {
			rv = is_valid_state_transition(mdev, ns, os);
			if (rv == SS_SUCCESS)
				rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
		}
	}
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);

	return rv;
}

545/**
546 * drbd_req_state() - Perform an eventually cluster wide state change
547 * @mdev: DRBD device.
548 * @mask: mask of state bits to change.
549 * @val: value of new state bits.
550 * @f: flags
551 *
552 * Should not be called directly, use drbd_request_state() or
553 * _drbd_request_state().
554 */
bf885f8a
AG
555static enum drbd_state_rv
556drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
557 union drbd_state val, enum chg_state_flags f)
b411b363
PR
558{
559 struct completion done;
560 unsigned long flags;
561 union drbd_state os, ns;
bf885f8a 562 enum drbd_state_rv rv;
b411b363
PR
563
564 init_completion(&done);
565
566 if (f & CS_SERIALIZE)
567 mutex_lock(&mdev->state_mutex);
568
87eeee41 569 spin_lock_irqsave(&mdev->tconn->req_lock, flags);
b411b363
PR
570 os = mdev->state;
571 ns.i = (os.i & ~mask.i) | val.i;
572 ns = sanitize_state(mdev, os, ns, NULL);
573
574 if (cl_wide_st_chg(mdev, os, ns)) {
575 rv = is_valid_state(mdev, ns);
576 if (rv == SS_SUCCESS)
577 rv = is_valid_state_transition(mdev, ns, os);
87eeee41 578 spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
b411b363
PR
579
580 if (rv < SS_SUCCESS) {
581 if (f & CS_VERBOSE)
582 print_st_err(mdev, os, ns, rv);
583 goto abort;
584 }
585
586 drbd_state_lock(mdev);
587 if (!drbd_send_state_req(mdev, mask, val)) {
588 drbd_state_unlock(mdev);
589 rv = SS_CW_FAILED_BY_PEER;
590 if (f & CS_VERBOSE)
591 print_st_err(mdev, os, ns, rv);
592 goto abort;
593 }
594
595 wait_event(mdev->state_wait,
596 (rv = _req_st_cond(mdev, mask, val)));
597
598 if (rv < SS_SUCCESS) {
599 drbd_state_unlock(mdev);
600 if (f & CS_VERBOSE)
601 print_st_err(mdev, os, ns, rv);
602 goto abort;
603 }
87eeee41 604 spin_lock_irqsave(&mdev->tconn->req_lock, flags);
b411b363
PR
605 os = mdev->state;
606 ns.i = (os.i & ~mask.i) | val.i;
607 rv = _drbd_set_state(mdev, ns, f, &done);
608 drbd_state_unlock(mdev);
609 } else {
610 rv = _drbd_set_state(mdev, ns, f, &done);
611 }
612
87eeee41 613 spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
b411b363
PR
614
615 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
e6b3ea83 616 D_ASSERT(current != mdev->tconn->worker.task);
b411b363
PR
617 wait_for_completion(&done);
618 }
619
620abort:
621 if (f & CS_SERIALIZE)
622 mutex_unlock(&mdev->state_mutex);
623
624 return rv;
625}
626
627/**
628 * _drbd_request_state() - Request a state change (with flags)
629 * @mdev: DRBD device.
630 * @mask: mask of state bits to change.
631 * @val: value of new state bits.
632 * @f: flags
633 *
634 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
635 * flag, or when logging of failed state change requests is not desired.
636 */
bf885f8a
AG
637enum drbd_state_rv
638_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
639 union drbd_state val, enum chg_state_flags f)
b411b363 640{
bf885f8a 641 enum drbd_state_rv rv;
b411b363
PR
642
643 wait_event(mdev->state_wait,
644 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
645
646 return rv;
647}
648
649static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
650{
651 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
652 name,
653 drbd_conn_str(ns.conn),
654 drbd_role_str(ns.role),
655 drbd_role_str(ns.peer),
656 drbd_disk_str(ns.disk),
657 drbd_disk_str(ns.pdsk),
fb22c402 658 is_susp(ns) ? 's' : 'r',
b411b363
PR
659 ns.aftr_isp ? 'a' : '-',
660 ns.peer_isp ? 'p' : '-',
661 ns.user_isp ? 'u' : '-'
662 );
663}
664
bf885f8a
AG
665void print_st_err(struct drbd_conf *mdev, union drbd_state os,
666 union drbd_state ns, enum drbd_state_rv err)
b411b363
PR
667{
668 if (err == SS_IN_TRANSIENT_STATE)
669 return;
670 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
671 print_st(mdev, " state", os);
672 print_st(mdev, "wanted", ns);
673}
674
675
b411b363
PR
676/**
677 * is_valid_state() - Returns an SS_ error code if ns is not valid
678 * @mdev: DRBD device.
679 * @ns: State to consider.
680 */
bf885f8a
AG
681static enum drbd_state_rv
682is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
b411b363
PR
683{
684 /* See drbd_state_sw_errors in drbd_strings.c */
685
686 enum drbd_fencing_p fp;
bf885f8a 687 enum drbd_state_rv rv = SS_SUCCESS;
b411b363
PR
688
689 fp = FP_DONT_CARE;
690 if (get_ldev(mdev)) {
691 fp = mdev->ldev->dc.fencing;
692 put_ldev(mdev);
693 }
694
b2fb6dbe 695 if (get_net_conf(mdev->tconn)) {
89e58e75 696 if (!mdev->tconn->net_conf->two_primaries &&
b411b363
PR
697 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
698 rv = SS_TWO_PRIMARIES;
b2fb6dbe 699 put_net_conf(mdev->tconn);
b411b363
PR
700 }
701
702 if (rv <= 0)
703 /* already found a reason to abort */;
704 else if (ns.role == R_SECONDARY && mdev->open_cnt)
705 rv = SS_DEVICE_IN_USE;
706
707 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
708 rv = SS_NO_UP_TO_DATE_DISK;
709
710 else if (fp >= FP_RESOURCE &&
711 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
712 rv = SS_PRIMARY_NOP;
713
714 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
715 rv = SS_NO_UP_TO_DATE_DISK;
716
717 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
718 rv = SS_NO_LOCAL_DISK;
719
720 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
721 rv = SS_NO_REMOTE_DISK;
722
8d4ce82b
LE
723 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
724 rv = SS_NO_UP_TO_DATE_DISK;
725
b411b363
PR
726 else if ((ns.conn == C_CONNECTED ||
727 ns.conn == C_WF_BITMAP_S ||
728 ns.conn == C_SYNC_SOURCE ||
729 ns.conn == C_PAUSED_SYNC_S) &&
730 ns.disk == D_OUTDATED)
731 rv = SS_CONNECTED_OUTDATES;
732
733 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
734 (mdev->sync_conf.verify_alg[0] == 0))
735 rv = SS_NO_VERIFY_ALG;
736
737 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
31890f4a 738 mdev->tconn->agreed_pro_version < 88)
b411b363
PR
739 rv = SS_NOT_SUPPORTED;
740
fa7d9396
PR
741 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
742 rv = SS_CONNECTED_OUTDATES;
743
b411b363
PR
744 return rv;
745}
746
747/**
748 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
749 * @mdev: DRBD device.
750 * @ns: new state.
751 * @os: old state.
752 */
bf885f8a
AG
753static enum drbd_state_rv
754is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
755 union drbd_state os)
b411b363 756{
bf885f8a 757 enum drbd_state_rv rv = SS_SUCCESS;
b411b363
PR
758
759 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
760 os.conn > C_CONNECTED)
761 rv = SS_RESYNC_RUNNING;
762
763 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
764 rv = SS_ALREADY_STANDALONE;
765
766 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
767 rv = SS_IS_DISKLESS;
768
769 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
770 rv = SS_NO_NET_CONFIG;
771
772 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
773 rv = SS_LOWER_THAN_OUTDATED;
774
775 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
776 rv = SS_IN_TRANSIENT_STATE;
777
778 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
779 rv = SS_IN_TRANSIENT_STATE;
780
781 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
782 rv = SS_NEED_CONNECTION;
783
784 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
785 ns.conn != os.conn && os.conn > C_CONNECTED)
786 rv = SS_RESYNC_RUNNING;
787
788 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
789 os.conn < C_CONNECTED)
790 rv = SS_NEED_CONNECTION;
791
1fc80cf3
PR
792 if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
793 && os.conn < C_WF_REPORT_PARAMS)
794 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
795
b411b363
PR
796 return rv;
797}
798
799/**
800 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
801 * @mdev: DRBD device.
802 * @os: old state.
803 * @ns: new state.
804 * @warn_sync_abort:
805 *
806 * When we loose connection, we have to set the state of the peers disk (pdsk)
807 * to D_UNKNOWN. This rule and many more along those lines are in this function.
808 */
809static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
02bc7174 810 union drbd_state ns, const char **warn_sync_abort)
b411b363
PR
811{
812 enum drbd_fencing_p fp;
ab17b68f 813 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
b411b363
PR
814
815 fp = FP_DONT_CARE;
816 if (get_ldev(mdev)) {
817 fp = mdev->ldev->dc.fencing;
818 put_ldev(mdev);
819 }
820
821 /* Disallow Network errors to configure a device's network part */
822 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
823 os.conn <= C_DISCONNECTING)
824 ns.conn = os.conn;
825
f2906e18
LE
826 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
827 * If you try to go into some Sync* state, that shall fail (elsewhere). */
b411b363 828 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
f2906e18 829 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
b411b363
PR
830 ns.conn = os.conn;
831
82f59cc6
LE
832 /* we cannot fail (again) if we already detached */
833 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
834 ns.disk = D_DISKLESS;
835
836 /* if we are only D_ATTACHING yet,
837 * we can (and should) go directly to D_DISKLESS. */
838 if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
839 ns.disk = D_DISKLESS;
840
b411b363
PR
841 /* After C_DISCONNECTING only C_STANDALONE may follow */
842 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
843 ns.conn = os.conn;
844
845 if (ns.conn < C_CONNECTED) {
846 ns.peer_isp = 0;
847 ns.peer = R_UNKNOWN;
848 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
849 ns.pdsk = D_UNKNOWN;
850 }
851
852 /* Clear the aftr_isp when becoming unconfigured */
853 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
854 ns.aftr_isp = 0;
855
b411b363
PR
856 /* Abort resync if a disk fails/detaches */
857 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
858 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
859 if (warn_sync_abort)
02bc7174
LE
860 *warn_sync_abort =
861 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
862 "Online-verify" : "Resync";
b411b363
PR
863 ns.conn = C_CONNECTED;
864 }
865
b411b363
PR
866 /* Connection breaks down before we finished "Negotiating" */
867 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
868 get_ldev_if_state(mdev, D_NEGOTIATING)) {
869 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
870 ns.disk = mdev->new_state_tmp.disk;
871 ns.pdsk = mdev->new_state_tmp.pdsk;
872 } else {
873 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
874 ns.disk = D_DISKLESS;
875 ns.pdsk = D_UNKNOWN;
876 }
877 put_ldev(mdev);
878 }
879
ab17b68f
PR
880 /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
881 if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
882 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
883 ns.disk = D_UP_TO_DATE;
884 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
885 ns.pdsk = D_UP_TO_DATE;
886 }
887
888 /* Implications of the connection stat on the disk states */
889 disk_min = D_DISKLESS;
890 disk_max = D_UP_TO_DATE;
891 pdsk_min = D_INCONSISTENT;
892 pdsk_max = D_UNKNOWN;
893 switch ((enum drbd_conns)ns.conn) {
894 case C_WF_BITMAP_T:
895 case C_PAUSED_SYNC_T:
896 case C_STARTING_SYNC_T:
897 case C_WF_SYNC_UUID:
898 case C_BEHIND:
899 disk_min = D_INCONSISTENT;
900 disk_max = D_OUTDATED;
901 pdsk_min = D_UP_TO_DATE;
902 pdsk_max = D_UP_TO_DATE;
903 break;
904 case C_VERIFY_S:
905 case C_VERIFY_T:
906 disk_min = D_UP_TO_DATE;
907 disk_max = D_UP_TO_DATE;
908 pdsk_min = D_UP_TO_DATE;
909 pdsk_max = D_UP_TO_DATE;
910 break;
911 case C_CONNECTED:
912 disk_min = D_DISKLESS;
913 disk_max = D_UP_TO_DATE;
914 pdsk_min = D_DISKLESS;
915 pdsk_max = D_UP_TO_DATE;
916 break;
917 case C_WF_BITMAP_S:
918 case C_PAUSED_SYNC_S:
919 case C_STARTING_SYNC_S:
920 case C_AHEAD:
921 disk_min = D_UP_TO_DATE;
922 disk_max = D_UP_TO_DATE;
923 pdsk_min = D_INCONSISTENT;
924 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
925 break;
926 case C_SYNC_TARGET:
927 disk_min = D_INCONSISTENT;
928 disk_max = D_INCONSISTENT;
929 pdsk_min = D_UP_TO_DATE;
930 pdsk_max = D_UP_TO_DATE;
931 break;
932 case C_SYNC_SOURCE:
933 disk_min = D_UP_TO_DATE;
934 disk_max = D_UP_TO_DATE;
935 pdsk_min = D_INCONSISTENT;
936 pdsk_max = D_INCONSISTENT;
937 break;
938 case C_STANDALONE:
939 case C_DISCONNECTING:
940 case C_UNCONNECTED:
941 case C_TIMEOUT:
942 case C_BROKEN_PIPE:
943 case C_NETWORK_FAILURE:
944 case C_PROTOCOL_ERROR:
945 case C_TEAR_DOWN:
946 case C_WF_CONNECTION:
947 case C_WF_REPORT_PARAMS:
948 case C_MASK:
949 break;
950 }
951 if (ns.disk > disk_max)
952 ns.disk = disk_max;
953
954 if (ns.disk < disk_min) {
955 dev_warn(DEV, "Implicitly set disk from %s to %s\n",
956 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
957 ns.disk = disk_min;
958 }
959 if (ns.pdsk > pdsk_max)
960 ns.pdsk = pdsk_max;
961
962 if (ns.pdsk < pdsk_min) {
963 dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
964 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
965 ns.pdsk = pdsk_min;
966 }
967
b411b363 968 if (fp == FP_STONITH &&
0a492166
PR
969 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
970 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
fb22c402 971 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
265be2d0
PR
972
973 if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
974 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
975 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
fb22c402 976 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
b411b363
PR
977
978 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
979 if (ns.conn == C_SYNC_SOURCE)
980 ns.conn = C_PAUSED_SYNC_S;
981 if (ns.conn == C_SYNC_TARGET)
982 ns.conn = C_PAUSED_SYNC_T;
983 } else {
984 if (ns.conn == C_PAUSED_SYNC_S)
985 ns.conn = C_SYNC_SOURCE;
986 if (ns.conn == C_PAUSED_SYNC_T)
987 ns.conn = C_SYNC_TARGET;
988 }
989
990 return ns;
991}
992
993/* helper for __drbd_set_state */
994static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
995{
31890f4a 996 if (mdev->tconn->agreed_pro_version < 90)
30b743a2
LE
997 mdev->ov_start_sector = 0;
998 mdev->rs_total = drbd_bm_bits(mdev);
999 mdev->ov_position = 0;
b411b363
PR
1000 if (cs == C_VERIFY_T) {
1001 /* starting online verify from an arbitrary position
1002 * does not fit well into the existing protocol.
1003 * on C_VERIFY_T, we initialize ov_left and friends
1004 * implicitly in receive_DataRequest once the
1005 * first P_OV_REQUEST is received */
1006 mdev->ov_start_sector = ~(sector_t)0;
1007 } else {
1008 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
30b743a2 1009 if (bit >= mdev->rs_total) {
b411b363
PR
1010 mdev->ov_start_sector =
1011 BM_BIT_TO_SECT(mdev->rs_total - 1);
30b743a2
LE
1012 mdev->rs_total = 1;
1013 } else
1014 mdev->rs_total -= bit;
b411b363
PR
1015 mdev->ov_position = mdev->ov_start_sector;
1016 }
30b743a2 1017 mdev->ov_left = mdev->rs_total;
b411b363
PR
1018}
1019
0778286a
PR
1020static void drbd_resume_al(struct drbd_conf *mdev)
1021{
1022 if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1023 dev_info(DEV, "Resumed AL updates\n");
1024}
1025
b411b363
PR
1026/**
1027 * __drbd_set_state() - Set a new DRBD state
1028 * @mdev: DRBD device.
1029 * @ns: new state.
1030 * @flags: Flags
1031 * @done: Optional completion, that will get completed after the after_state_ch() finished
1032 *
1033 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1034 */
bf885f8a
AG
1035enum drbd_state_rv
1036__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1037 enum chg_state_flags flags, struct completion *done)
b411b363
PR
1038{
1039 union drbd_state os;
bf885f8a 1040 enum drbd_state_rv rv = SS_SUCCESS;
02bc7174 1041 const char *warn_sync_abort = NULL;
b411b363
PR
1042 struct after_state_chg_work *ascw;
1043
1044 os = mdev->state;
1045
1046 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
1047
1048 if (ns.i == os.i)
1049 return SS_NOTHING_TO_DO;
1050
1051 if (!(flags & CS_HARD)) {
1052 /* pre-state-change checks ; only look at ns */
1053 /* See drbd_state_sw_errors in drbd_strings.c */
1054
1055 rv = is_valid_state(mdev, ns);
1056 if (rv < SS_SUCCESS) {
1057 /* If the old state was illegal as well, then let
1058 this happen...*/
1059
1616a254 1060 if (is_valid_state(mdev, os) == rv)
b411b363 1061 rv = is_valid_state_transition(mdev, ns, os);
b411b363
PR
1062 } else
1063 rv = is_valid_state_transition(mdev, ns, os);
1064 }
1065
1066 if (rv < SS_SUCCESS) {
1067 if (flags & CS_VERBOSE)
1068 print_st_err(mdev, os, ns, rv);
1069 return rv;
1070 }
1071
1072 if (warn_sync_abort)
02bc7174 1073 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
b411b363
PR
1074
1075 {
662d91a2
AG
1076 char *pbp, pb[300];
1077 pbp = pb;
1078 *pbp = 0;
1079 if (ns.role != os.role)
1080 pbp += sprintf(pbp, "role( %s -> %s ) ",
1081 drbd_role_str(os.role),
1082 drbd_role_str(ns.role));
1083 if (ns.peer != os.peer)
1084 pbp += sprintf(pbp, "peer( %s -> %s ) ",
1085 drbd_role_str(os.peer),
1086 drbd_role_str(ns.peer));
1087 if (ns.conn != os.conn)
1088 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1089 drbd_conn_str(os.conn),
1090 drbd_conn_str(ns.conn));
1091 if (ns.disk != os.disk)
1092 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1093 drbd_disk_str(os.disk),
1094 drbd_disk_str(ns.disk));
1095 if (ns.pdsk != os.pdsk)
1096 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1097 drbd_disk_str(os.pdsk),
1098 drbd_disk_str(ns.pdsk));
1099 if (is_susp(ns) != is_susp(os))
1100 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1101 is_susp(os),
1102 is_susp(ns));
1103 if (ns.aftr_isp != os.aftr_isp)
1104 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1105 os.aftr_isp,
1106 ns.aftr_isp);
1107 if (ns.peer_isp != os.peer_isp)
1108 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1109 os.peer_isp,
1110 ns.peer_isp);
1111 if (ns.user_isp != os.user_isp)
1112 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1113 os.user_isp,
1114 ns.user_isp);
1115 dev_info(DEV, "%s\n", pb);
b411b363
PR
1116 }
1117
1118 /* solve the race between becoming unconfigured,
1119 * worker doing the cleanup, and
1120 * admin reconfiguring us:
1121 * on (re)configure, first set CONFIG_PENDING,
1122 * then wait for a potentially exiting worker,
1123 * start the worker, and schedule one no_op.
1124 * then proceed with configuration.
1125 */
1126 if (ns.disk == D_DISKLESS &&
1127 ns.conn == C_STANDALONE &&
1128 ns.role == R_SECONDARY &&
1129 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1130 set_bit(DEVICE_DYING, &mdev->flags);
1131
82f59cc6
LE
1132 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1133 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1134 * drbd_ldev_destroy() won't happen before our corresponding
1135 * after_state_ch works run, where we put_ldev again. */
1136 if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1137 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1138 atomic_inc(&mdev->local_cnt);
1139
1140 mdev->state = ns;
62b0da3a
LE
1141
1142 if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1143 drbd_print_uuids(mdev, "attached to UUIDs");
1144
b411b363
PR
1145 wake_up(&mdev->misc_wait);
1146 wake_up(&mdev->state_wait);
1147
b411b363
PR
1148 /* aborted verify run. log the last position */
1149 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1150 ns.conn < C_CONNECTED) {
1151 mdev->ov_start_sector =
30b743a2 1152 BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
b411b363
PR
1153 dev_info(DEV, "Online Verify reached sector %llu\n",
1154 (unsigned long long)mdev->ov_start_sector);
1155 }
1156
1157 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1158 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1159 dev_info(DEV, "Syncer continues.\n");
1d7734a0
LE
1160 mdev->rs_paused += (long)jiffies
1161 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
63106d3c
PR
1162 if (ns.conn == C_SYNC_TARGET)
1163 mod_timer(&mdev->resync_timer, jiffies);
b411b363
PR
1164 }
1165
1166 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1167 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1168 dev_info(DEV, "Resync suspended\n");
1d7734a0 1169 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
b411b363
PR
1170 }
1171
1172 if (os.conn == C_CONNECTED &&
1173 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1d7734a0
LE
1174 unsigned long now = jiffies;
1175 int i;
1176
30b743a2 1177 set_ov_position(mdev, ns.conn);
1d7734a0 1178 mdev->rs_start = now;
0f0601f4
LE
1179 mdev->rs_last_events = 0;
1180 mdev->rs_last_sect_ev = 0;
b411b363
PR
1181 mdev->ov_last_oos_size = 0;
1182 mdev->ov_last_oos_start = 0;
1183
1d7734a0 1184 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
30b743a2 1185 mdev->rs_mark_left[i] = mdev->ov_left;
1d7734a0
LE
1186 mdev->rs_mark_time[i] = now;
1187 }
1188
2649f080
LE
1189 drbd_rs_controller_reset(mdev);
1190
b411b363
PR
1191 if (ns.conn == C_VERIFY_S) {
1192 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1193 (unsigned long long)mdev->ov_position);
1194 mod_timer(&mdev->resync_timer, jiffies);
1195 }
1196 }
1197
1198 if (get_ldev(mdev)) {
1199 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1200 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1201 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1202
1203 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1204 mdf |= MDF_CRASHED_PRIMARY;
1205 if (mdev->state.role == R_PRIMARY ||
1206 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1207 mdf |= MDF_PRIMARY_IND;
1208 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1209 mdf |= MDF_CONNECTED_IND;
1210 if (mdev->state.disk > D_INCONSISTENT)
1211 mdf |= MDF_CONSISTENT;
1212 if (mdev->state.disk > D_OUTDATED)
1213 mdf |= MDF_WAS_UP_TO_DATE;
1214 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1215 mdf |= MDF_PEER_OUT_DATED;
1216 if (mdf != mdev->ldev->md.flags) {
1217 mdev->ldev->md.flags = mdf;
1218 drbd_md_mark_dirty(mdev);
1219 }
1220 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1221 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1222 put_ldev(mdev);
1223 }
1224
1225 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1226 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1227 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1228 set_bit(CONSIDER_RESYNC, &mdev->flags);
1229
1230 /* Receiver should clean up itself */
1231 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
e6b3ea83 1232 drbd_thread_stop_nowait(&mdev->tconn->receiver);
b411b363
PR
1233
1234 /* Now the receiver finished cleaning up itself, it should die */
1235 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
e6b3ea83 1236 drbd_thread_stop_nowait(&mdev->tconn->receiver);
b411b363
PR
1237
1238 /* Upon network failure, we need to restart the receiver. */
1239 if (os.conn > C_TEAR_DOWN &&
1240 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
e6b3ea83 1241 drbd_thread_restart_nowait(&mdev->tconn->receiver);
b411b363 1242
0778286a
PR
1243 /* Resume AL writing if we get a connection */
1244 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1245 drbd_resume_al(mdev);
1246
b411b363
PR
1247 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1248 if (ascw) {
1249 ascw->os = os;
1250 ascw->ns = ns;
1251 ascw->flags = flags;
1252 ascw->w.cb = w_after_state_ch;
1253 ascw->done = done;
e42325a5 1254 drbd_queue_work(&mdev->tconn->data.work, &ascw->w);
b411b363
PR
1255 } else {
1256 dev_warn(DEV, "Could not kmalloc an ascw\n");
1257 }
1258
1259 return rv;
1260}
1261
1262static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1263{
1264 struct after_state_chg_work *ascw =
1265 container_of(w, struct after_state_chg_work, w);
1266 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1267 if (ascw->flags & CS_WAIT_COMPLETE) {
1268 D_ASSERT(ascw->done != NULL);
1269 complete(ascw->done);
1270 }
1271 kfree(ascw);
1272
1273 return 1;
1274}
1275
1276static void abw_start_sync(struct drbd_conf *mdev, int rv)
1277{
1278 if (rv) {
1279 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1280 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1281 return;
1282 }
1283
1284 switch (mdev->state.conn) {
1285 case C_STARTING_SYNC_T:
1286 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1287 break;
1288 case C_STARTING_SYNC_S:
1289 drbd_start_resync(mdev, C_SYNC_SOURCE);
1290 break;
1291 }
1292}
1293
20ceb2b2
LE
1294int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1295 int (*io_fn)(struct drbd_conf *),
1296 char *why, enum bm_flag flags)
19f843aa
LE
1297{
1298 int rv;
1299
e6b3ea83 1300 D_ASSERT(current == mdev->tconn->worker.task);
19f843aa
LE
1301
1302 /* open coded non-blocking drbd_suspend_io(mdev); */
1303 set_bit(SUSPEND_IO, &mdev->flags);
19f843aa 1304
20ceb2b2 1305 drbd_bm_lock(mdev, why, flags);
19f843aa
LE
1306 rv = io_fn(mdev);
1307 drbd_bm_unlock(mdev);
1308
1309 drbd_resume_io(mdev);
1310
1311 return rv;
1312}
1313
b411b363
PR
1314/**
1315 * after_state_ch() - Perform after state change actions that may sleep
1316 * @mdev: DRBD device.
1317 * @os: old state.
1318 * @ns: new state.
1319 * @flags: Flags
1320 */
1321static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1322 union drbd_state ns, enum chg_state_flags flags)
1323{
1324 enum drbd_fencing_p fp;
8554df1c 1325 enum drbd_req_event what = NOTHING;
fb22c402 1326 union drbd_state nsm = (union drbd_state){ .i = -1 };
b411b363
PR
1327
1328 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1329 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1330 if (mdev->p_uuid)
1331 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1332 }
1333
1334 fp = FP_DONT_CARE;
1335 if (get_ldev(mdev)) {
1336 fp = mdev->ldev->dc.fencing;
1337 put_ldev(mdev);
1338 }
1339
1340 /* Inform userspace about the change... */
1341 drbd_bcast_state(mdev, ns);
1342
1343 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1344 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1345 drbd_khelper(mdev, "pri-on-incon-degr");
1346
1347 /* Here we have the actions that are performed after a
1348 state change. This function might sleep */
1349
fb22c402
PR
1350 nsm.i = -1;
1351 if (ns.susp_nod) {
3f98688a 1352 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
8554df1c 1353 what = RESEND;
265be2d0 1354
67098930 1355 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
8554df1c 1356 what = RESTART_FROZEN_DISK_IO;
fb22c402 1357
8554df1c 1358 if (what != NOTHING)
3f98688a 1359 nsm.susp_nod = 0;
265be2d0
PR
1360 }
1361
fb22c402 1362 if (ns.susp_fen) {
43a5182c
PR
1363 /* case1: The outdate peer handler is successful: */
1364 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
b411b363 1365 tl_clear(mdev);
43a5182c
PR
1366 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1367 drbd_uuid_new_current(mdev);
1368 clear_bit(NEW_CUR_UUID, &mdev->flags);
43a5182c 1369 }
87eeee41 1370 spin_lock_irq(&mdev->tconn->req_lock);
fb22c402 1371 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
87eeee41 1372 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 1373 }
43a5182c
PR
1374 /* case2: The connection was established again: */
1375 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1376 clear_bit(NEW_CUR_UUID, &mdev->flags);
8554df1c 1377 what = RESEND;
fb22c402 1378 nsm.susp_fen = 0;
43a5182c 1379 }
b411b363 1380 }
67098930 1381
8554df1c 1382 if (what != NOTHING) {
87eeee41 1383 spin_lock_irq(&mdev->tconn->req_lock);
67098930 1384 _tl_restart(mdev, what);
fb22c402
PR
1385 nsm.i &= mdev->state.i;
1386 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
87eeee41 1387 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363 1388 }
67098930 1389
5a22db89
LE
1390 /* Became sync source. With protocol >= 96, we still need to send out
1391 * the sync uuid now. Need to do that before any drbd_send_state, or
1392 * the other side may go "paused sync" before receiving the sync uuids,
1393 * which is unexpected. */
1394 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1395 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
31890f4a 1396 mdev->tconn->agreed_pro_version >= 96 && get_ldev(mdev)) {
5a22db89
LE
1397 drbd_gen_and_send_sync_uuid(mdev);
1398 put_ldev(mdev);
1399 }
1400
b411b363
PR
1401 /* Do not change the order of the if above and the two below... */
1402 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1403 drbd_send_uuids(mdev);
1404 drbd_send_state(mdev);
1405 }
54b956ab
LE
1406 /* No point in queuing send_bitmap if we don't have a connection
1407 * anymore, so check also the _current_ state, not only the new state
1408 * at the time this work was queued. */
1409 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1410 mdev->state.conn == C_WF_BITMAP_S)
1411 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
20ceb2b2
LE
1412 "send_bitmap (WFBitMapS)",
1413 BM_LOCKED_TEST_ALLOWED);
b411b363
PR
1414
1415 /* Lost contact to peer's copy of the data */
1416 if ((os.pdsk >= D_INCONSISTENT &&
1417 os.pdsk != D_UNKNOWN &&
1418 os.pdsk != D_OUTDATED)
1419 && (ns.pdsk < D_INCONSISTENT ||
1420 ns.pdsk == D_UNKNOWN ||
1421 ns.pdsk == D_OUTDATED)) {
b411b363
PR
1422 if (get_ldev(mdev)) {
1423 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
2c8d1967 1424 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
fb22c402 1425 if (is_susp(mdev->state)) {
43a5182c
PR
1426 set_bit(NEW_CUR_UUID, &mdev->flags);
1427 } else {
1428 drbd_uuid_new_current(mdev);
1429 drbd_send_uuids(mdev);
1430 }
2c8d1967 1431 }
b411b363
PR
1432 put_ldev(mdev);
1433 }
1434 }
1435
1436 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
18a50fa2 1437 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
2c8d1967 1438 drbd_uuid_new_current(mdev);
18a50fa2
PR
1439 drbd_send_uuids(mdev);
1440 }
b411b363
PR
1441
1442 /* D_DISKLESS Peer becomes secondary */
1443 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
20ceb2b2
LE
1444 /* We may still be Primary ourselves.
1445 * No harm done if the bitmap still changes,
1446 * redirtied pages will follow later. */
1447 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1448 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
19f843aa
LE
1449 put_ldev(mdev);
1450 }
1451
06d33e96
LE
1452 /* Write out all changed bits on demote.
1453 * Though, no need to da that just yet
1454 * if there is a resync going on still */
1455 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1456 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
20ceb2b2
LE
1457 /* No changes to the bitmap expected this time, so assert that,
1458 * even though no harm was done if it did change. */
1459 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1460 "demote", BM_LOCKED_TEST_ALLOWED);
b411b363
PR
1461 put_ldev(mdev);
1462 }
1463
1464 /* Last part of the attaching process ... */
1465 if (ns.conn >= C_CONNECTED &&
1466 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
e89b591c 1467 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
b411b363
PR
1468 drbd_send_uuids(mdev);
1469 drbd_send_state(mdev);
1470 }
1471
1472 /* We want to pause/continue resync, tell peer. */
1473 if (ns.conn >= C_CONNECTED &&
1474 ((os.aftr_isp != ns.aftr_isp) ||
1475 (os.user_isp != ns.user_isp)))
1476 drbd_send_state(mdev);
1477
1478 /* In case one of the isp bits got set, suspend other devices. */
1479 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1480 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1481 suspend_other_sg(mdev);
1482
1483 /* Make sure the peer gets informed about eventual state
1484 changes (ISP bits) while we were in WFReportParams. */
1485 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1486 drbd_send_state(mdev);
1487
67531718
PR
1488 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1489 drbd_send_state(mdev);
1490
b411b363
PR
1491 /* We are in the progress to start a full sync... */
1492 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1493 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
20ceb2b2
LE
1494 /* no other bitmap changes expected during this phase */
1495 drbd_queue_bitmap_io(mdev,
1496 &drbd_bmio_set_n_write, &abw_start_sync,
1497 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
b411b363
PR
1498
1499 /* We are invalidating our self... */
1500 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1501 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
20ceb2b2
LE
1502 /* other bitmap operation expected during this phase */
1503 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1504 "set_n_write from invalidate", BM_LOCKED_MASK);
b411b363 1505
82f59cc6
LE
1506 /* first half of local IO error, failure to attach,
1507 * or administrative detach */
1508 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1509 enum drbd_io_error_p eh;
1510 int was_io_error;
1511 /* corresponding get_ldev was in __drbd_set_state, to serialize
1512 * our cleanup here with the transition to D_DISKLESS,
1513 * so it is safe to dreference ldev here. */
1514 eh = mdev->ldev->dc.on_io_error;
1515 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1516
1517 /* current state still has to be D_FAILED,
1518 * there is only one way out: to D_DISKLESS,
1519 * and that may only happen after our put_ldev below. */
1520 if (mdev->state.disk != D_FAILED)
1521 dev_err(DEV,
1522 "ASSERT FAILED: disk is %s during detach\n",
1523 drbd_disk_str(mdev->state.disk));
e9e6f3ec
LE
1524
1525 if (drbd_send_state(mdev))
82f59cc6 1526 dev_warn(DEV, "Notified peer that I am detaching my disk\n");
e9e6f3ec 1527 else
82f59cc6 1528 dev_err(DEV, "Sending state for detaching disk failed\n");
e9e6f3ec
LE
1529
1530 drbd_rs_cancel_all(mdev);
b411b363 1531
82f59cc6
LE
1532 /* In case we want to get something to stable storage still,
1533 * this may be the last chance.
1534 * Following put_ldev may transition to D_DISKLESS. */
1535 drbd_md_sync(mdev);
1536 put_ldev(mdev);
1537
1538 if (was_io_error && eh == EP_CALL_HELPER)
e9e6f3ec
LE
1539 drbd_khelper(mdev, "local-io-error");
1540 }
b411b363 1541
82f59cc6
LE
1542 /* second half of local IO error, failure to attach,
1543 * or administrative detach,
1544 * after local_cnt references have reached zero again */
1545 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1546 /* We must still be diskless,
1547 * re-attach has to be serialized with this! */
1548 if (mdev->state.disk != D_DISKLESS)
1549 dev_err(DEV,
1550 "ASSERT FAILED: disk is %s while going diskless\n",
1551 drbd_disk_str(mdev->state.disk));
e9e6f3ec 1552
82f59cc6
LE
1553 mdev->rs_total = 0;
1554 mdev->rs_failed = 0;
1555 atomic_set(&mdev->rs_pending_cnt, 0);
9d282875 1556
e9e6f3ec 1557 if (drbd_send_state(mdev))
82f59cc6 1558 dev_warn(DEV, "Notified peer that I'm now diskless.\n");
82f59cc6 1559 /* corresponding get_ldev in __drbd_set_state
25985edc 1560 * this may finally trigger drbd_ldev_destroy. */
82f59cc6 1561 put_ldev(mdev);
b411b363
PR
1562 }
1563
738a84b2
PR
1564 /* Notify peer that I had a local IO error, and did not detached.. */
1565 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
1566 drbd_send_state(mdev);
1567
b411b363
PR
1568 /* Disks got bigger while they were detached */
1569 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1570 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1571 if (ns.conn == C_CONNECTED)
1572 resync_after_online_grow(mdev);
1573 }
1574
1575 /* A resync finished or aborted, wake paused devices... */
1576 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1577 (os.peer_isp && !ns.peer_isp) ||
1578 (os.user_isp && !ns.user_isp))
1579 resume_next_sg(mdev);
1580
af85e8e8
LE
1581 /* sync target done with resync. Explicitly notify peer, even though
1582 * it should (at least for non-empty resyncs) already know itself. */
1583 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1584 drbd_send_state(mdev);
1585
79a30d2d
LE
1586 /* This triggers bitmap writeout of potentially still unwritten pages
1587 * if the resync finished cleanly, or aborted because of peer disk
20ceb2b2 1588 * failure, or because of connection loss.
79a30d2d
LE
1589 * For resync aborted because of local disk failure, we cannot do
1590 * any bitmap writeout anymore.
20ceb2b2 1591 * No harm done if some bits change during this phase.
79a30d2d 1592 */
20ceb2b2
LE
1593 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1594 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1595 "write from resync_finished", BM_LOCKED_SET_ALLOWED);
79a30d2d
LE
1596 put_ldev(mdev);
1597 }
02851e9f 1598
b411b363
PR
1599 /* Upon network connection, we need to start the receiver */
1600 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
e6b3ea83 1601 drbd_thread_start(&mdev->tconn->receiver);
b411b363
PR
1602
1603 /* Terminate worker thread if we are unconfigured - it will be
1604 restarted as needed... */
1605 if (ns.disk == D_DISKLESS &&
1606 ns.conn == C_STANDALONE &&
1607 ns.role == R_SECONDARY) {
1608 if (os.aftr_isp != ns.aftr_isp)
1609 resume_next_sg(mdev);
1610 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1611 if (test_bit(DEVICE_DYING, &mdev->flags))
e6b3ea83 1612 drbd_thread_stop_nowait(&mdev->tconn->worker);
b411b363
PR
1613 }
1614
1615 drbd_md_sync(mdev);
1616}
1617
1618
1619static int drbd_thread_setup(void *arg)
1620{
1621 struct drbd_thread *thi = (struct drbd_thread *) arg;
1622 struct drbd_conf *mdev = thi->mdev;
1623 unsigned long flags;
1624 int retval;
1625
1626restart:
1627 retval = thi->function(thi);
1628
1629 spin_lock_irqsave(&thi->t_lock, flags);
1630
e77a0a5c 1631 /* if the receiver has been "EXITING", the last thing it did
b411b363
PR
1632 * was set the conn state to "StandAlone",
1633 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1634 * and receiver thread will be "started".
e77a0a5c 1635 * drbd_thread_start needs to set "RESTARTING" in that case.
b411b363 1636 * t_state check and assignment needs to be within the same spinlock,
e77a0a5c
AG
1637 * so either thread_start sees EXITING, and can remap to RESTARTING,
1638 * or thread_start sees NONE, and can proceed as normal.
b411b363
PR
1639 */
1640
e77a0a5c 1641 if (thi->t_state == RESTARTING) {
b411b363 1642 dev_info(DEV, "Restarting %s\n", current->comm);
e77a0a5c 1643 thi->t_state = RUNNING;
b411b363
PR
1644 spin_unlock_irqrestore(&thi->t_lock, flags);
1645 goto restart;
1646 }
1647
1648 thi->task = NULL;
e77a0a5c 1649 thi->t_state = NONE;
b411b363
PR
1650 smp_mb();
1651 complete(&thi->stop);
1652 spin_unlock_irqrestore(&thi->t_lock, flags);
1653
1654 dev_info(DEV, "Terminating %s\n", current->comm);
1655
1656 /* Release mod reference taken when thread was started */
1657 module_put(THIS_MODULE);
1658 return retval;
1659}
1660
1661static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1662 int (*func) (struct drbd_thread *))
1663{
1664 spin_lock_init(&thi->t_lock);
1665 thi->task = NULL;
e77a0a5c 1666 thi->t_state = NONE;
b411b363
PR
1667 thi->function = func;
1668 thi->mdev = mdev;
1669}
1670
1671int drbd_thread_start(struct drbd_thread *thi)
1672{
1673 struct drbd_conf *mdev = thi->mdev;
1674 struct task_struct *nt;
1675 unsigned long flags;
1676
1677 const char *me =
e6b3ea83
PR
1678 thi == &mdev->tconn->receiver ? "receiver" :
1679 thi == &mdev->tconn->asender ? "asender" :
1680 thi == &mdev->tconn->worker ? "worker" : "NONSENSE";
b411b363
PR
1681
1682 /* is used from state engine doing drbd_thread_stop_nowait,
1683 * while holding the req lock irqsave */
1684 spin_lock_irqsave(&thi->t_lock, flags);
1685
1686 switch (thi->t_state) {
e77a0a5c 1687 case NONE:
b411b363
PR
1688 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1689 me, current->comm, current->pid);
1690
1691 /* Get ref on module for thread - this is released when thread exits */
1692 if (!try_module_get(THIS_MODULE)) {
1693 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1694 spin_unlock_irqrestore(&thi->t_lock, flags);
81e84650 1695 return false;
b411b363
PR
1696 }
1697
1698 init_completion(&thi->stop);
1699 D_ASSERT(thi->task == NULL);
1700 thi->reset_cpu_mask = 1;
e77a0a5c 1701 thi->t_state = RUNNING;
b411b363
PR
1702 spin_unlock_irqrestore(&thi->t_lock, flags);
1703 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1704
1705 nt = kthread_create(drbd_thread_setup, (void *) thi,
1706 "drbd%d_%s", mdev_to_minor(mdev), me);
1707
1708 if (IS_ERR(nt)) {
1709 dev_err(DEV, "Couldn't start thread\n");
1710
1711 module_put(THIS_MODULE);
81e84650 1712 return false;
b411b363
PR
1713 }
1714 spin_lock_irqsave(&thi->t_lock, flags);
1715 thi->task = nt;
e77a0a5c 1716 thi->t_state = RUNNING;
b411b363
PR
1717 spin_unlock_irqrestore(&thi->t_lock, flags);
1718 wake_up_process(nt);
1719 break;
e77a0a5c
AG
1720 case EXITING:
1721 thi->t_state = RESTARTING;
b411b363
PR
1722 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1723 me, current->comm, current->pid);
1724 /* fall through */
e77a0a5c
AG
1725 case RUNNING:
1726 case RESTARTING:
b411b363
PR
1727 default:
1728 spin_unlock_irqrestore(&thi->t_lock, flags);
1729 break;
1730 }
1731
81e84650 1732 return true;
b411b363
PR
1733}
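
/*
 * Illustrative sketch (not part of drbd_main.c): the t_state transitions that
 * drbd_thread_start() above implements, collapsed into one pure function so
 * the restart handshake with drbd_thread_setup() is easy to see.  The helper
 * name next_state_on_start() is invented for this sketch; the state names
 * mirror enum drbd_thread_state (NONE/RUNNING/EXITING/RESTARTING).
 */
#include <stdio.h>

enum state { S_NONE, S_RUNNING, S_EXITING, S_RESTARTING };

/* what a start request does to t_state, depending on what it finds */
static enum state next_state_on_start(enum state cur)
{
	switch (cur) {
	case S_NONE:    return S_RUNNING;	/* spawn a new kthread */
	case S_EXITING: return S_RESTARTING;	/* setup loop will restart it */
	default:        return cur;		/* RUNNING/RESTARTING: no-op */
	}
}

int main(void)
{
	static const char *name[] = { "NONE", "RUNNING", "EXITING", "RESTARTING" };
	int s;

	for (s = S_NONE; s <= S_RESTARTING; s++)
		printf("start while %-10s -> %s\n",
		       name[s], name[next_state_on_start(s)]);
	return 0;
}
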
1734
1735
1736void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1737{
1738 unsigned long flags;
1739
e77a0a5c 1740 enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
b411b363
PR
1741
1742 /* may be called from state engine, holding the req lock irqsave */
1743 spin_lock_irqsave(&thi->t_lock, flags);
1744
e77a0a5c 1745 if (thi->t_state == NONE) {
b411b363
PR
1746 spin_unlock_irqrestore(&thi->t_lock, flags);
1747 if (restart)
1748 drbd_thread_start(thi);
1749 return;
1750 }
1751
1752 if (thi->t_state != ns) {
1753 if (thi->task == NULL) {
1754 spin_unlock_irqrestore(&thi->t_lock, flags);
1755 return;
1756 }
1757
1758 thi->t_state = ns;
1759 smp_mb();
1760 init_completion(&thi->stop);
1761 if (thi->task != current)
1762 force_sig(DRBD_SIGKILL, thi->task);
1763
1764 }
1765
1766 spin_unlock_irqrestore(&thi->t_lock, flags);
1767
1768 if (wait)
1769 wait_for_completion(&thi->stop);
1770}
1771
1772#ifdef CONFIG_SMP
1773/**
1774 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1775 * @mdev: DRBD device.
1776 *
1777 * Forces all threads of a device onto the same CPU. This is beneficial for
1778 * DRBD's performance. May be overridden by the user's configuration.
1779 */
1780void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1781{
1782 int ord, cpu;
1783
1784 /* user override. */
1785 if (cpumask_weight(mdev->cpu_mask))
1786 return;
1787
1788 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1789 for_each_online_cpu(cpu) {
1790 if (ord-- == 0) {
1791 cpumask_set_cpu(cpu, mdev->cpu_mask);
1792 return;
1793 }
1794 }
1795 /* should not be reached */
1796 cpumask_setall(mdev->cpu_mask);
1797}
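
/*
 * Worked example (not part of drbd_main.c): drbd_calc_cpu_mask() above picks
 * one online CPU per device as minor % number-of-online-CPUs, unless the user
 * configured a mask.  The user-space stand-in below only prints that mapping;
 * the CPU count of 4 is an arbitrary assumption for illustration.
 */
#include <stdio.h>

int main(void)
{
	const unsigned nr_online_cpus = 4;	/* assumed for this example */
	unsigned minor;

	/* drbd0..drbd3 land on cpu 0..3, drbd4 wraps around to cpu 0, ... */
	for (minor = 0; minor < 6; minor++)
		printf("drbd%u -> cpu %u\n", minor, minor % nr_online_cpus);
	return 0;
}
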
1798
1799/**
1800 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1801 * @mdev: DRBD device.
1802 *
1803 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1804 * prematurely.
1805 */
1806void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1807{
1808 struct task_struct *p = current;
1809 struct drbd_thread *thi =
e6b3ea83
PR
1810 p == mdev->tconn->asender.task ? &mdev->tconn->asender :
1811 p == mdev->tconn->receiver.task ? &mdev->tconn->receiver :
1812 p == mdev->tconn->worker.task ? &mdev->tconn->worker :
b411b363 1813 NULL;
841ce241 1814 if (!expect(thi != NULL))
b411b363
PR
1815 return;
1816 if (!thi->reset_cpu_mask)
1817 return;
1818 thi->reset_cpu_mask = 0;
1819 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1820}
1821#endif
1822
fd340c12 1823static void prepare_header80(struct drbd_conf *mdev, struct p_header80 *h,
d8763023 1824 enum drbd_packet cmd, int size)
fd340c12
PR
1825{
1826 h->magic = cpu_to_be32(DRBD_MAGIC);
1827 h->command = cpu_to_be16(cmd);
1828 h->length = cpu_to_be16(size);
1829}
1830
1831static void prepare_header95(struct drbd_conf *mdev, struct p_header95 *h,
d8763023 1832 enum drbd_packet cmd, int size)
fd340c12
PR
1833{
1834 h->magic = cpu_to_be16(DRBD_MAGIC_BIG);
1835 h->command = cpu_to_be16(cmd);
1836 h->length = cpu_to_be32(size);
1837}
1838
1839static void prepare_header(struct drbd_conf *mdev, struct p_header *h,
d8763023 1840 enum drbd_packet cmd, int size)
fd340c12
PR
1841{
1842 if (mdev->tconn->agreed_pro_version >= 100 || size > DRBD_MAX_SIZE_H80_PACKET)
1843 prepare_header95(mdev, &h->h95, cmd, size);
1844 else
1845 prepare_header80(mdev, &h->h80, cmd, size);
1846}
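
/*
 * Minimal user-space sketch (not part of drbd_main.c) of the choice made by
 * prepare_header() above: the old-style header carries only a 16-bit length,
 * so a peer with protocol version >= 100, or any payload larger than
 * DRBD_MAX_SIZE_H80_PACKET, gets the 95-style header with a 32-bit length.
 * htons()/htonl() stand in for cpu_to_be16()/cpu_to_be32(); the magic values
 * and the size limit below are placeholders, not the real DRBD constants.
 */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

struct hdr80 { uint32_t magic; uint16_t command; uint16_t length; };
struct hdr95 { uint16_t magic; uint16_t command; uint32_t length; };

#define FAKE_MAGIC	0x12345678u	/* placeholder for DRBD_MAGIC */
#define FAKE_MAGIC_BIG	0x1234u		/* placeholder for DRBD_MAGIC_BIG */
#define MAX_H80_PAYLOAD	0xffffu		/* placeholder for DRBD_MAX_SIZE_H80_PACKET */

static int fill_header(int agreed_pro_version, uint32_t size, uint16_t cmd,
		       struct hdr80 *h80, struct hdr95 *h95)
{
	int use95 = agreed_pro_version >= 100 || size > MAX_H80_PAYLOAD;

	if (use95) {
		h95->magic   = htons(FAKE_MAGIC_BIG);
		h95->command = htons(cmd);
		h95->length  = htonl(size);
	} else {
		h80->magic   = htonl(FAKE_MAGIC);
		h80->command = htons(cmd);
		h80->length  = htons((uint16_t)size);
	}
	return use95;
}

int main(void)
{
	struct hdr80 h80;
	struct hdr95 h95;

	printf("4 KiB payload, protocol 96   -> %s header\n",
	       fill_header(96, 4096, 0x15, &h80, &h95) ? "h95" : "h80");
	printf("128 KiB payload, protocol 96 -> %s header\n",
	       fill_header(96, 128 * 1024, 0x15, &h80, &h95) ? "h95" : "h80");
	return 0;
}
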
1847
b411b363
PR
1848/* the appropriate socket mutex must be held already */
1849int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
d8763023
AG
1850 enum drbd_packet cmd, struct p_header *h, size_t size,
1851 unsigned msg_flags)
b411b363
PR
1852{
1853 int sent, ok;
1854
841ce241
AG
1855 if (!expect(h))
1856 return false;
1857 if (!expect(size))
1858 return false;
b411b363 1859
fd340c12 1860 prepare_header(mdev, h, cmd, size - sizeof(struct p_header));
b411b363 1861
b411b363
PR
1862 sent = drbd_send(mdev, sock, h, size, msg_flags);
1863
1864 ok = (sent == size);
0ddc5549
LE
1865 if (!ok && !signal_pending(current))
1866 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
b411b363
PR
1867 cmdname(cmd), (int)size, sent);
1868 return ok;
1869}
1870
1871/* don't pass the socket. we may only look at it
1872 * when we hold the appropriate socket mutex.
1873 */
1874int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
d8763023 1875 enum drbd_packet cmd, struct p_header *h, size_t size)
b411b363
PR
1876{
1877 int ok = 0;
1878 struct socket *sock;
1879
1880 if (use_data_socket) {
e42325a5
PR
1881 mutex_lock(&mdev->tconn->data.mutex);
1882 sock = mdev->tconn->data.socket;
b411b363 1883 } else {
e42325a5
PR
1884 mutex_lock(&mdev->tconn->meta.mutex);
1885 sock = mdev->tconn->meta.socket;
b411b363
PR
1886 }
1887
1888 /* drbd_disconnect() could have called drbd_free_sock()
1889 * while we were waiting in down()... */
1890 if (likely(sock != NULL))
1891 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1892
1893 if (use_data_socket)
e42325a5 1894 mutex_unlock(&mdev->tconn->data.mutex);
b411b363 1895 else
e42325a5 1896 mutex_unlock(&mdev->tconn->meta.mutex);
b411b363
PR
1897 return ok;
1898}
1899
d8763023 1900int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packet cmd, char *data,
b411b363
PR
1901 size_t size)
1902{
fd340c12 1903 struct p_header h;
b411b363
PR
1904 int ok;
1905
fd340c12 1906 prepare_header(mdev, &h, cmd, size);
b411b363
PR
1907
1908 if (!drbd_get_data_sock(mdev))
1909 return 0;
1910
b411b363 1911 ok = (sizeof(h) ==
e42325a5 1912 drbd_send(mdev, mdev->tconn->data.socket, &h, sizeof(h), 0));
b411b363 1913 ok = ok && (size ==
e42325a5 1914 drbd_send(mdev, mdev->tconn->data.socket, data, size, 0));
b411b363
PR
1915
1916 drbd_put_data_sock(mdev);
1917
1918 return ok;
1919}
1920
1921int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1922{
8e26f9cc 1923 struct p_rs_param_95 *p;
b411b363
PR
1924 struct socket *sock;
1925 int size, rv;
31890f4a 1926 const int apv = mdev->tconn->agreed_pro_version;
b411b363
PR
1927
1928 size = apv <= 87 ? sizeof(struct p_rs_param)
1929 : apv == 88 ? sizeof(struct p_rs_param)
1930 + strlen(mdev->sync_conf.verify_alg) + 1
8e26f9cc
PR
1931 : apv <= 94 ? sizeof(struct p_rs_param_89)
1932 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
b411b363
PR
1933
1934 /* used from admin command context and receiver/worker context.
1935 * to avoid kmalloc, grab the socket right here,
1936 * then use the pre-allocated sbuf there */
e42325a5
PR
1937 mutex_lock(&mdev->tconn->data.mutex);
1938 sock = mdev->tconn->data.socket;
b411b363
PR
1939
1940 if (likely(sock != NULL)) {
d8763023
AG
1941 enum drbd_packet cmd =
1942 apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
b411b363 1943
e42325a5 1944 p = &mdev->tconn->data.sbuf.rs_param_95;
b411b363
PR
1945
1946 /* initialize verify_alg and csums_alg */
1947 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1948
1949 p->rate = cpu_to_be32(sc->rate);
8e26f9cc
PR
1950 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1951 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1952 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1953 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
b411b363
PR
1954
1955 if (apv >= 88)
1956 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1957 if (apv >= 89)
1958 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1959
1960 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1961 } else
1962 rv = 0; /* not ok */
1963
e42325a5 1964 mutex_unlock(&mdev->tconn->data.mutex);
b411b363
PR
1965
1966 return rv;
1967}
1968
1969int drbd_send_protocol(struct drbd_conf *mdev)
1970{
1971 struct p_protocol *p;
cf14c2e9 1972 int size, cf, rv;
b411b363
PR
1973
1974 size = sizeof(struct p_protocol);
1975
31890f4a 1976 if (mdev->tconn->agreed_pro_version >= 87)
89e58e75 1977 size += strlen(mdev->tconn->net_conf->integrity_alg) + 1;
b411b363
PR
1978
1979 /* we must not recurse into our own queue,
1980 * as that is blocked during handshake */
1981 p = kmalloc(size, GFP_NOIO);
1982 if (p == NULL)
1983 return 0;
1984
89e58e75
PR
1985 p->protocol = cpu_to_be32(mdev->tconn->net_conf->wire_protocol);
1986 p->after_sb_0p = cpu_to_be32(mdev->tconn->net_conf->after_sb_0p);
1987 p->after_sb_1p = cpu_to_be32(mdev->tconn->net_conf->after_sb_1p);
1988 p->after_sb_2p = cpu_to_be32(mdev->tconn->net_conf->after_sb_2p);
1989 p->two_primaries = cpu_to_be32(mdev->tconn->net_conf->two_primaries);
b411b363 1990
cf14c2e9 1991 cf = 0;
89e58e75 1992 if (mdev->tconn->net_conf->want_lose)
cf14c2e9 1993 cf |= CF_WANT_LOSE;
89e58e75 1994 if (mdev->tconn->net_conf->dry_run) {
31890f4a 1995 if (mdev->tconn->agreed_pro_version >= 92)
cf14c2e9
PR
1996 cf |= CF_DRY_RUN;
1997 else {
1998 dev_err(DEV, "--dry-run is not supported by peer");
7ac314c8 1999 kfree(p);
148efa16 2000 return -1;
cf14c2e9
PR
2001 }
2002 }
2003 p->conn_flags = cpu_to_be32(cf);
2004
31890f4a 2005 if (mdev->tconn->agreed_pro_version >= 87)
89e58e75 2006 strcpy(p->integrity_alg, mdev->tconn->net_conf->integrity_alg);
b411b363 2007
c012949a 2008 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, &p->head, size);
b411b363
PR
2009 kfree(p);
2010 return rv;
2011}
2012
2013int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2014{
2015 struct p_uuids p;
2016 int i;
2017
2018 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2019 return 1;
2020
2021 for (i = UI_CURRENT; i < UI_SIZE; i++)
2022 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2023
2024 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2025 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
89e58e75 2026 uuid_flags |= mdev->tconn->net_conf->want_lose ? 1 : 0;
b411b363
PR
2027 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2028 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2029 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2030
2031 put_ldev(mdev);
2032
c012949a 2033 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, &p.head, sizeof(p));
b411b363
PR
2034}
2035
2036int drbd_send_uuids(struct drbd_conf *mdev)
2037{
2038 return _drbd_send_uuids(mdev, 0);
2039}
2040
2041int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2042{
2043 return _drbd_send_uuids(mdev, 8);
2044}
2045
62b0da3a
LE
2046void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2047{
2048 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2049 u64 *uuid = mdev->ldev->md.uuid;
2050 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2051 text,
2052 (unsigned long long)uuid[UI_CURRENT],
2053 (unsigned long long)uuid[UI_BITMAP],
2054 (unsigned long long)uuid[UI_HISTORY_START],
2055 (unsigned long long)uuid[UI_HISTORY_END]);
2056 put_ldev(mdev);
2057 } else {
2058 dev_info(DEV, "%s effective data uuid: %016llX\n",
2059 text,
2060 (unsigned long long)mdev->ed_uuid);
2061 }
2062}
2063
5a22db89 2064int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
b411b363
PR
2065{
2066 struct p_rs_uuid p;
5a22db89
LE
2067 u64 uuid;
2068
2069 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
b411b363 2070
4a23f264 2071 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
5a22db89 2072 drbd_uuid_set(mdev, UI_BITMAP, uuid);
62b0da3a 2073 drbd_print_uuids(mdev, "updated sync UUID");
5a22db89
LE
2074 drbd_md_sync(mdev);
2075 p.uuid = cpu_to_be64(uuid);
b411b363 2076
c012949a 2077 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, &p.head, sizeof(p));
b411b363
PR
2078}
2079
e89b591c 2080int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
b411b363
PR
2081{
2082 struct p_sizes p;
2083 sector_t d_size, u_size;
99432fcc 2084 int q_order_type, max_bio_size;
b411b363
PR
2085 int ok;
2086
2087 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2088 D_ASSERT(mdev->ldev->backing_bdev);
2089 d_size = drbd_get_max_capacity(mdev->ldev);
2090 u_size = mdev->ldev->dc.disk_size;
2091 q_order_type = drbd_queue_order_type(mdev);
99432fcc
PR
2092 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2093 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
b411b363
PR
2094 put_ldev(mdev);
2095 } else {
2096 d_size = 0;
2097 u_size = 0;
2098 q_order_type = QUEUE_ORDERED_NONE;
99432fcc 2099 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
b411b363
PR
2100 }
2101
2102 p.d_size = cpu_to_be64(d_size);
2103 p.u_size = cpu_to_be64(u_size);
2104 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
99432fcc 2105 p.max_bio_size = cpu_to_be32(max_bio_size);
e89b591c
PR
2106 p.queue_order_type = cpu_to_be16(q_order_type);
2107 p.dds_flags = cpu_to_be16(flags);
b411b363 2108
c012949a 2109 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, &p.head, sizeof(p));
b411b363
PR
2110 return ok;
2111}
2112
2113/**
2114 * drbd_send_state() - Sends the drbd state to the peer
2115 * @mdev: DRBD device.
2116 */
2117int drbd_send_state(struct drbd_conf *mdev)
2118{
2119 struct socket *sock;
2120 struct p_state p;
2121 int ok = 0;
2122
2123 /* Grab state lock so we won't send state if we're in the middle
2124 * of a cluster wide state change on another thread */
2125 drbd_state_lock(mdev);
2126
e42325a5 2127 mutex_lock(&mdev->tconn->data.mutex);
b411b363
PR
2128
2129 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
e42325a5 2130 sock = mdev->tconn->data.socket;
b411b363
PR
2131
2132 if (likely(sock != NULL)) {
c012949a 2133 ok = _drbd_send_cmd(mdev, sock, P_STATE, &p.head, sizeof(p), 0);
b411b363
PR
2134 }
2135
e42325a5 2136 mutex_unlock(&mdev->tconn->data.mutex);
b411b363
PR
2137
2138 drbd_state_unlock(mdev);
2139 return ok;
2140}
2141
2142int drbd_send_state_req(struct drbd_conf *mdev,
2143 union drbd_state mask, union drbd_state val)
2144{
2145 struct p_req_state p;
2146
2147 p.mask = cpu_to_be32(mask.i);
2148 p.val = cpu_to_be32(val.i);
2149
c012949a 2150 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, &p.head, sizeof(p));
b411b363
PR
2151}
2152
bf885f8a 2153int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
b411b363
PR
2154{
2155 struct p_req_state_reply p;
2156
2157 p.retcode = cpu_to_be32(retcode);
2158
c012949a 2159 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, &p.head, sizeof(p));
b411b363
PR
2160}
2161
2162int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2163 struct p_compressed_bm *p,
2164 struct bm_xfer_ctx *c)
2165{
2166 struct bitstream bs;
2167 unsigned long plain_bits;
2168 unsigned long tmp;
2169 unsigned long rl;
2170 unsigned len;
2171 unsigned toggle;
2172 int bits;
2173
2174 /* may we use this feature? */
2175 if ((mdev->sync_conf.use_rle == 0) ||
31890f4a 2176 (mdev->tconn->agreed_pro_version < 90))
b411b363
PR
2177 return 0;
2178
2179 if (c->bit_offset >= c->bm_bits)
2180 return 0; /* nothing to do. */
2181
2182 /* use at most this many bytes */
2183 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2184 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2185 /* plain bits covered in this code string */
2186 plain_bits = 0;
2187
2188 /* p->encoding & 0x80 stores whether the first run length is set.
2189 * bit offset is implicit.
2190 * start with toggle == 2 to be able to tell the first iteration */
2191 toggle = 2;
2192
2193 /* see how many plain bits we can stuff into one packet
2194 * using RLE and VLI. */
2195 do {
2196 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2197 : _drbd_bm_find_next(mdev, c->bit_offset);
2198 if (tmp == -1UL)
2199 tmp = c->bm_bits;
2200 rl = tmp - c->bit_offset;
2201
2202 if (toggle == 2) { /* first iteration */
2203 if (rl == 0) {
2204 /* the first checked bit was set,
2205 * store start value, */
2206 DCBP_set_start(p, 1);
2207 /* but skip encoding of zero run length */
2208 toggle = !toggle;
2209 continue;
2210 }
2211 DCBP_set_start(p, 0);
2212 }
2213
2214 /* paranoia: catch zero runlength.
2215 * can only happen if bitmap is modified while we scan it. */
2216 if (rl == 0) {
2217 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2218 "t:%u bo:%lu\n", toggle, c->bit_offset);
2219 return -1;
2220 }
2221
2222 bits = vli_encode_bits(&bs, rl);
2223 if (bits == -ENOBUFS) /* buffer full */
2224 break;
2225 if (bits <= 0) {
2226 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2227 return 0;
2228 }
2229
2230 toggle = !toggle;
2231 plain_bits += rl;
2232 c->bit_offset = tmp;
2233 } while (c->bit_offset < c->bm_bits);
2234
2235 len = bs.cur.b - p->code + !!bs.cur.bit;
2236
2237 if (plain_bits < (len << 3)) {
2238 /* incompressible with this method.
2239 * we need to rewind both word and bit position. */
2240 c->bit_offset -= plain_bits;
2241 bm_xfer_ctx_bit_to_word_offset(c);
2242 c->bit_offset = c->word_offset * BITS_PER_LONG;
2243 return 0;
2244 }
2245
2246 /* RLE + VLI was able to compress it just fine.
2247 * update c->word_offset. */
2248 bm_xfer_ctx_bit_to_word_offset(c);
2249
2250 /* store pad_bits */
2251 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2252
2253 return len;
2254}
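
/*
 * Stand-alone sketch (not part of drbd_main.c) of the run-length walk that
 * fill_bitmap_rle_bits() above performs: alternating runs of clear/set bits
 * are measured, and the encoding is only kept when it covers at least as
 * many plain bits as it costs on the wire (the "plain_bits < (len << 3)"
 * rewind above).  Counting one byte per run is a crude stand-in for the
 * real VLI encoder, used here only to make the pay-off check concrete.
 */
#include <stdio.h>

int main(void)
{
	unsigned char bits[64];
	unsigned i, runs = 0, plain_bits = 0, encoded_bytes;

	for (i = 0; i < 64; i++)	/* one run of set bits in the middle */
		bits[i] = (i >= 40 && i < 60);

	i = 0;
	while (i < 64) {
		unsigned char cur = bits[i];
		unsigned rl = 0;

		while (i < 64 && bits[i] == cur) {
			i++;
			rl++;
		}
		runs++;
		plain_bits += rl;
		printf("run of %2u %s bit(s)\n", rl, cur ? "set" : "clear");
	}

	encoded_bytes = runs;	/* pretend each run costs one encoded byte */
	printf("%u plain bits, ~%u encoded bytes -> %s\n",
	       plain_bits, encoded_bytes,
	       plain_bits >= encoded_bytes * 8 ? "compressible" : "send plain");
	return 0;
}
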
2255
f70af118
AG
2256/**
2257 * send_bitmap_rle_or_plain
2258 *
2259 * Return 0 when done, 1 when another iteration is needed, and a negative error
2260 * code upon failure.
2261 */
2262static int
b411b363 2263send_bitmap_rle_or_plain(struct drbd_conf *mdev,
c012949a 2264 struct p_header *h, struct bm_xfer_ctx *c)
b411b363
PR
2265{
2266 struct p_compressed_bm *p = (void*)h;
2267 unsigned long num_words;
2268 int len;
2269 int ok;
2270
2271 len = fill_bitmap_rle_bits(mdev, p, c);
2272
2273 if (len < 0)
f70af118 2274 return -EIO;
b411b363
PR
2275
2276 if (len) {
2277 DCBP_set_code(p, RLE_VLI_Bits);
e42325a5 2278 ok = _drbd_send_cmd(mdev, mdev->tconn->data.socket, P_COMPRESSED_BITMAP, h,
b411b363
PR
2279 sizeof(*p) + len, 0);
2280
2281 c->packets[0]++;
2282 c->bytes[0] += sizeof(*p) + len;
2283
2284 if (c->bit_offset >= c->bm_bits)
2285 len = 0; /* DONE */
2286 } else {
2287 /* was not compressible.
2288 * send a buffer full of plain text bits instead. */
2289 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2290 len = num_words * sizeof(long);
2291 if (len)
2292 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
e42325a5 2293 ok = _drbd_send_cmd(mdev, mdev->tconn->data.socket, P_BITMAP,
0b70a13d 2294 h, sizeof(struct p_header80) + len, 0);
b411b363
PR
2295 c->word_offset += num_words;
2296 c->bit_offset = c->word_offset * BITS_PER_LONG;
2297
2298 c->packets[1]++;
0b70a13d 2299 c->bytes[1] += sizeof(struct p_header80) + len;
b411b363
PR
2300
2301 if (c->bit_offset > c->bm_bits)
2302 c->bit_offset = c->bm_bits;
2303 }
f70af118
AG
2304 if (ok) {
2305 if (len == 0) {
2306 INFO_bm_xfer_stats(mdev, "send", c);
2307 return 0;
2308 } else
2309 return 1;
2310 }
2311 return -EIO;
b411b363
PR
2312}
2313
2314/* See the comment at receive_bitmap() */
2315int _drbd_send_bitmap(struct drbd_conf *mdev)
2316{
2317 struct bm_xfer_ctx c;
c012949a 2318 struct p_header *p;
f70af118 2319 int err;
b411b363 2320
841ce241
AG
2321 if (!expect(mdev->bitmap))
2322 return false;
b411b363
PR
2323
2324 /* maybe we should use some per thread scratch page,
2325 * and allocate that during initial device creation? */
c012949a 2326 p = (struct p_header *) __get_free_page(GFP_NOIO);
b411b363
PR
2327 if (!p) {
2328 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
81e84650 2329 return false;
b411b363
PR
2330 }
2331
2332 if (get_ldev(mdev)) {
2333 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2334 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2335 drbd_bm_set_all(mdev);
2336 if (drbd_bm_write(mdev)) {
2337 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2338 * but otherwise process as per normal - need to tell other
2339 * side that a full resync is required! */
2340 dev_err(DEV, "Failed to write bitmap to disk!\n");
2341 } else {
2342 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2343 drbd_md_sync(mdev);
2344 }
2345 }
2346 put_ldev(mdev);
2347 }
2348
2349 c = (struct bm_xfer_ctx) {
2350 .bm_bits = drbd_bm_bits(mdev),
2351 .bm_words = drbd_bm_words(mdev),
2352 };
2353
2354 do {
f70af118
AG
2355 err = send_bitmap_rle_or_plain(mdev, p, &c);
2356 } while (err > 0);
b411b363
PR
2357
2358 free_page((unsigned long) p);
f70af118 2359 return err == 0;
b411b363
PR
2360}
2361
2362int drbd_send_bitmap(struct drbd_conf *mdev)
2363{
2364 int err;
2365
2366 if (!drbd_get_data_sock(mdev))
2367 return -1;
2368 err = !_drbd_send_bitmap(mdev);
2369 drbd_put_data_sock(mdev);
2370 return err;
2371}
2372
2373int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2374{
2375 int ok;
2376 struct p_barrier_ack p;
2377
2378 p.barrier = barrier_nr;
2379 p.set_size = cpu_to_be32(set_size);
2380
2381 if (mdev->state.conn < C_CONNECTED)
81e84650 2382 return false;
c012949a 2383 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, &p.head, sizeof(p));
b411b363
PR
2384 return ok;
2385}
2386
2387/**
2388 * _drbd_send_ack() - Sends an ack packet
2389 * @mdev: DRBD device.
2390 * @cmd: Packet command code.
2391 * @sector: sector, needs to be in big endian byte order
2392 * @blksize: size in byte, needs to be in big endian byte order
2393 * @block_id: Id, big endian byte order
2394 */
d8763023
AG
2395static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
2396 u64 sector, u32 blksize, u64 block_id)
b411b363
PR
2397{
2398 int ok;
2399 struct p_block_ack p;
2400
2401 p.sector = sector;
2402 p.block_id = block_id;
2403 p.blksize = blksize;
2404 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2405
e42325a5 2406 if (!mdev->tconn->meta.socket || mdev->state.conn < C_CONNECTED)
81e84650 2407 return false;
c012949a 2408 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, &p.head, sizeof(p));
b411b363
PR
2409 return ok;
2410}
2411
2b2bf214
LE
2412/* dp->sector and dp->block_id already/still in network byte order,
2413 * data_size is payload size according to dp->head,
2414 * and may need to be corrected for digest size. */
d8763023 2415int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
2b2bf214 2416 struct p_data *dp, int data_size)
b411b363 2417{
a0638456
PR
2418 data_size -= (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_r_tfm) ?
2419 crypto_hash_digestsize(mdev->tconn->integrity_r_tfm) : 0;
b411b363
PR
2420 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2421 dp->block_id);
2422}
2423
d8763023 2424int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
b411b363
PR
2425 struct p_block_req *rp)
2426{
2427 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2428}
2429
2430/**
2431 * drbd_send_ack() - Sends an ack packet
2432 * @mdev: DRBD device.
2433 * @cmd: Packet command code.
2434 * @e: Epoch entry.
2435 */
d8763023
AG
2436int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
2437 struct drbd_epoch_entry *e)
b411b363
PR
2438{
2439 return _drbd_send_ack(mdev, cmd,
010f6e67
AG
2440 cpu_to_be64(e->i.sector),
2441 cpu_to_be32(e->i.size),
b411b363
PR
2442 e->block_id);
2443}
2444
2445/* This function misuses the block_id field to signal if the blocks
2446 * are in sync or not. */
d8763023 2447int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
b411b363
PR
2448 sector_t sector, int blksize, u64 block_id)
2449{
2450 return _drbd_send_ack(mdev, cmd,
2451 cpu_to_be64(sector),
2452 cpu_to_be32(blksize),
2453 cpu_to_be64(block_id));
2454}
2455
2456int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2457 sector_t sector, int size, u64 block_id)
2458{
2459 int ok;
2460 struct p_block_req p;
2461
2462 p.sector = cpu_to_be64(sector);
2463 p.block_id = block_id;
2464 p.blksize = cpu_to_be32(size);
2465
c012949a 2466 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &p.head, sizeof(p));
b411b363
PR
2467 return ok;
2468}
2469
d8763023
AG
2470int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size,
2471 void *digest, int digest_size, enum drbd_packet cmd)
b411b363
PR
2472{
2473 int ok;
2474 struct p_block_req p;
2475
fd340c12 2476 prepare_header(mdev, &p.head, cmd, sizeof(p) - sizeof(struct p_header) + digest_size);
b411b363 2477 p.sector = cpu_to_be64(sector);
9a8e7753 2478 p.block_id = ID_SYNCER /* unused */;
b411b363
PR
2479 p.blksize = cpu_to_be32(size);
2480
e42325a5 2481 mutex_lock(&mdev->tconn->data.mutex);
b411b363 2482
e42325a5
PR
2483 ok = (sizeof(p) == drbd_send(mdev, mdev->tconn->data.socket, &p, sizeof(p), 0));
2484 ok = ok && (digest_size == drbd_send(mdev, mdev->tconn->data.socket, digest, digest_size, 0));
b411b363 2485
e42325a5 2486 mutex_unlock(&mdev->tconn->data.mutex);
b411b363
PR
2487
2488 return ok;
2489}
2490
2491int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2492{
2493 int ok;
2494 struct p_block_req p;
2495
2496 p.sector = cpu_to_be64(sector);
9a8e7753 2497 p.block_id = ID_SYNCER /* unused */;
b411b363
PR
2498 p.blksize = cpu_to_be32(size);
2499
c012949a 2500 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, &p.head, sizeof(p));
b411b363
PR
2501 return ok;
2502}
2503
2504/* called on sndtimeo
81e84650
AG
2505 * returns false if we should retry,
2506 * true if we think connection is dead
b411b363
PR
2507 */
2508static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2509{
2510 int drop_it;
2511 /* long elapsed = (long)(jiffies - mdev->last_received); */
2512
e42325a5 2513 drop_it = mdev->tconn->meta.socket == sock
e6b3ea83
PR
2514 || !mdev->tconn->asender.task
2515 || get_t_state(&mdev->tconn->asender) != RUNNING
b411b363
PR
2516 || mdev->state.conn < C_CONNECTED;
2517
2518 if (drop_it)
81e84650 2519 return true;
b411b363 2520
31890f4a 2521 drop_it = !--mdev->tconn->ko_count;
b411b363
PR
2522 if (!drop_it) {
2523 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
31890f4a 2524 current->comm, current->pid, mdev->tconn->ko_count);
b411b363
PR
2525 request_ping(mdev);
2526 }
2527
2528 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2529}
2530
2531/* The idea of sendpage seems to be to put some kind of reference
2532 * to the page into the skb, and to hand it over to the NIC. In
2533 * this process get_page() gets called.
2534 *
2535 * As soon as the page was really sent over the network put_page()
2536 * gets called by some part of the network layer. [ NIC driver? ]
2537 *
2538 * [ get_page() / put_page() increment/decrement the count. If count
2539 * reaches 0 the page will be freed. ]
2540 *
2541 * This works nicely with pages from FSs.
2542 * But this means that in protocol A we might signal IO completion too early!
2543 *
2544 * In order not to corrupt data during a resync we must make sure
2545 * that we do not reuse our own buffer pages (EEs) too early, therefore
2546 * we have the net_ee list.
2547 *
2548 * XFS still seems to have problems: it submits pages with page_count == 0!
2549 * As a workaround, we disable sendpage on pages
2550 * with page_count == 0 or PageSlab.
2551 */
2552static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 2553 int offset, size_t size, unsigned msg_flags)
b411b363 2554{
e42325a5 2555 int sent = drbd_send(mdev, mdev->tconn->data.socket, kmap(page) + offset, size, msg_flags);
b411b363
PR
2556 kunmap(page);
2557 if (sent == size)
2558 mdev->send_cnt += size>>9;
2559 return sent == size;
2560}
2561
2562static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 2563 int offset, size_t size, unsigned msg_flags)
b411b363
PR
2564{
2565 mm_segment_t oldfs = get_fs();
2566 int sent, ok;
2567 int len = size;
2568
2569 /* e.g. XFS meta- & log-data is in slab pages, which have a
2570 * page_count of 0 and/or have PageSlab() set.
2571 * we cannot use send_page for those, as that does get_page();
2572 * put_page(); and would cause either a VM_BUG directly, or
2573 * __page_cache_release a page that would actually still be referenced
2574 * by someone, leading to some obscure delayed Oops somewhere else. */
2575 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
ba11ad9a 2576 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
b411b363 2577
ba11ad9a 2578 msg_flags |= MSG_NOSIGNAL;
b411b363
PR
2579 drbd_update_congested(mdev);
2580 set_fs(KERNEL_DS);
2581 do {
e42325a5 2582 sent = mdev->tconn->data.socket->ops->sendpage(mdev->tconn->data.socket, page,
b411b363 2583 offset, len,
ba11ad9a 2584 msg_flags);
b411b363
PR
2585 if (sent == -EAGAIN) {
2586 if (we_should_drop_the_connection(mdev,
e42325a5 2587 mdev->tconn->data.socket))
b411b363
PR
2588 break;
2589 else
2590 continue;
2591 }
2592 if (sent <= 0) {
2593 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2594 __func__, (int)size, len, sent);
2595 break;
2596 }
2597 len -= sent;
2598 offset += sent;
2599 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2600 set_fs(oldfs);
2601 clear_bit(NET_CONGESTED, &mdev->flags);
2602
2603 ok = (len == 0);
2604 if (likely(ok))
2605 mdev->send_cnt += size>>9;
2606 return ok;
2607}
2608
2609static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2610{
2611 struct bio_vec *bvec;
2612 int i;
ba11ad9a 2613 /* hint all but last page with MSG_MORE */
b411b363
PR
2614 __bio_for_each_segment(bvec, bio, i, 0) {
2615 if (!_drbd_no_send_page(mdev, bvec->bv_page,
ba11ad9a
LE
2616 bvec->bv_offset, bvec->bv_len,
2617 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
b411b363
PR
2618 return 0;
2619 }
2620 return 1;
2621}
2622
2623static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2624{
2625 struct bio_vec *bvec;
2626 int i;
ba11ad9a 2627 /* hint all but last page with MSG_MORE */
b411b363
PR
2628 __bio_for_each_segment(bvec, bio, i, 0) {
2629 if (!_drbd_send_page(mdev, bvec->bv_page,
ba11ad9a
LE
2630 bvec->bv_offset, bvec->bv_len,
2631 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
b411b363
PR
2632 return 0;
2633 }
b411b363
PR
2634 return 1;
2635}
2636
45bb912b
LE
2637static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2638{
2639 struct page *page = e->pages;
010f6e67 2640 unsigned len = e->i.size;
ba11ad9a 2641 /* hint all but last page with MSG_MORE */
45bb912b
LE
2642 page_chain_for_each(page) {
2643 unsigned l = min_t(unsigned, len, PAGE_SIZE);
ba11ad9a
LE
2644 if (!_drbd_send_page(mdev, page, 0, l,
2645 page_chain_next(page) ? MSG_MORE : 0))
45bb912b
LE
2646 return 0;
2647 len -= l;
2648 }
2649 return 1;
2650}
2651
76d2e7ec
PR
2652static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2653{
31890f4a 2654 if (mdev->tconn->agreed_pro_version >= 95)
76d2e7ec 2655 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
76d2e7ec
PR
2656 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2657 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2658 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2659 else
721a9602 2660 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
76d2e7ec
PR
2661}
2662
b411b363
PR
2663/* Used to send write requests
2664 * R_PRIMARY -> Peer (P_DATA)
2665 */
2666int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2667{
2668 int ok = 1;
2669 struct p_data p;
2670 unsigned int dp_flags = 0;
2671 void *dgb;
2672 int dgs;
2673
2674 if (!drbd_get_data_sock(mdev))
2675 return 0;
2676
a0638456
PR
2677 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_w_tfm) ?
2678 crypto_hash_digestsize(mdev->tconn->integrity_w_tfm) : 0;
b411b363 2679
fd340c12 2680 prepare_header(mdev, &p.head, P_DATA, sizeof(p) - sizeof(struct p_header) + dgs + req->i.size);
ace652ac 2681 p.sector = cpu_to_be64(req->i.sector);
b411b363 2682 p.block_id = (unsigned long)req;
fd340c12 2683 p.seq_num = cpu_to_be32(req->seq_num = atomic_add_return(1, &mdev->packet_seq));
b411b363 2684
76d2e7ec
PR
2685 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2686
b411b363
PR
2687 if (mdev->state.conn >= C_SYNC_SOURCE &&
2688 mdev->state.conn <= C_PAUSED_SYNC_T)
2689 dp_flags |= DP_MAY_SET_IN_SYNC;
2690
2691 p.dp_flags = cpu_to_be32(dp_flags);
b411b363
PR
2692 set_bit(UNPLUG_REMOTE, &mdev->flags);
2693 ok = (sizeof(p) ==
e42325a5 2694 drbd_send(mdev, mdev->tconn->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
b411b363 2695 if (ok && dgs) {
a0638456
PR
2696 dgb = mdev->tconn->int_dig_out;
2697 drbd_csum_bio(mdev, mdev->tconn->integrity_w_tfm, req->master_bio, dgb);
e42325a5 2698 ok = dgs == drbd_send(mdev, mdev->tconn->data.socket, dgb, dgs, 0);
b411b363
PR
2699 }
2700 if (ok) {
470be44a
LE
2701 /* For protocol A, we have to memcpy the payload into
2702 * socket buffers, as we may complete right away
2703 * as soon as we handed it over to tcp, at which point the data
2704 * pages may become invalid.
2705 *
2706 * For data-integrity enabled, we copy it as well, so we can be
2707 * sure that even if the bio pages may still be modified, it
2708 * won't change the data on the wire, thus if the digest checks
2709 * out ok after sending on this side, but does not fit on the
2710 * receiving side, we sure have detected corruption elsewhere.
2711 */
89e58e75 2712 if (mdev->tconn->net_conf->wire_protocol == DRBD_PROT_A || dgs)
b411b363
PR
2713 ok = _drbd_send_bio(mdev, req->master_bio);
2714 else
2715 ok = _drbd_send_zc_bio(mdev, req->master_bio);
470be44a
LE
2716
2717 /* double check digest, sometimes buffers have been modified in flight. */
2718 if (dgs > 0 && dgs <= 64) {
24c4830c 2719 /* 64 byte, 512 bit, is the largest digest size
470be44a
LE
2720 * currently supported in kernel crypto. */
2721 unsigned char digest[64];
a0638456
PR
2722 drbd_csum_bio(mdev, mdev->tconn->integrity_w_tfm, req->master_bio, digest);
2723 if (memcmp(mdev->tconn->int_dig_out, digest, dgs)) {
470be44a
LE
2724 dev_warn(DEV,
2725 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
ace652ac 2726 (unsigned long long)req->i.sector, req->i.size);
470be44a
LE
2727 }
2728 } /* else if (dgs > 64) {
2729 ... Be noisy about digest too large ...
2730 } */
b411b363
PR
2731 }
2732
2733 drbd_put_data_sock(mdev);
bd26bfc5 2734
b411b363
PR
2735 return ok;
2736}
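
/*
 * User-space sketch (not part of drbd_main.c) of the "double check digest"
 * pattern in drbd_send_dblock() above: hash the payload before it is handed
 * to the transport, hash it again afterwards, and warn if upper layers
 * modified the buffer while it was in flight.  A trivial rolling checksum
 * stands in for the real crypto_hash digest.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t toy_digest(const unsigned char *buf, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < len; i++)
		sum = sum * 31 + buf[i];
	return sum;
}

int main(void)
{
	unsigned char payload[512];
	uint32_t before, after;

	memset(payload, 0xaa, sizeof(payload));
	before = toy_digest(payload, sizeof(payload));

	/* ...payload handed to the transport here; with protocol A the upper
	 * layers may still scribble on the pages while they are in flight... */
	payload[100] = 0x55;		/* simulate such a modification */

	after = toy_digest(payload, sizeof(payload));
	if (before != after)
		printf("Digest mismatch, buffer modified during write\n");
	return 0;
}
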
2737
2738/* answer packet, used to send data back for read requests:
2739 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2740 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2741 */
d8763023 2742int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd,
b411b363
PR
2743 struct drbd_epoch_entry *e)
2744{
2745 int ok;
2746 struct p_data p;
2747 void *dgb;
2748 int dgs;
2749
a0638456
PR
2750 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_w_tfm) ?
2751 crypto_hash_digestsize(mdev->tconn->integrity_w_tfm) : 0;
b411b363 2752
fd340c12 2753 prepare_header(mdev, &p.head, cmd, sizeof(p) - sizeof(struct p_header80) + dgs + e->i.size);
010f6e67 2754 p.sector = cpu_to_be64(e->i.sector);
b411b363 2755 p.block_id = e->block_id;
cc378270 2756 p.seq_num = 0; /* unused */
b411b363
PR
2757
2758 /* Only called by our kernel thread.
2759 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2760 * in response to admin command or module unload.
2761 */
2762 if (!drbd_get_data_sock(mdev))
2763 return 0;
2764
e42325a5 2765 ok = sizeof(p) == drbd_send(mdev, mdev->tconn->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
b411b363 2766 if (ok && dgs) {
a0638456
PR
2767 dgb = mdev->tconn->int_dig_out;
2768 drbd_csum_ee(mdev, mdev->tconn->integrity_w_tfm, e, dgb);
e42325a5 2769 ok = dgs == drbd_send(mdev, mdev->tconn->data.socket, dgb, dgs, 0);
b411b363
PR
2770 }
2771 if (ok)
45bb912b 2772 ok = _drbd_send_zc_ee(mdev, e);
b411b363
PR
2773
2774 drbd_put_data_sock(mdev);
bd26bfc5 2775
b411b363
PR
2776 return ok;
2777}
2778
73a01a18
PR
2779int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2780{
2781 struct p_block_desc p;
2782
ace652ac
AG
2783 p.sector = cpu_to_be64(req->i.sector);
2784 p.blksize = cpu_to_be32(req->i.size);
73a01a18
PR
2785
2786 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2787}
2788
b411b363
PR
2789/*
2790 drbd_send distinguishes two cases:
2791
2792 Packets sent via the data socket "sock"
2793 and packets sent via the meta data socket "msock"
2794
2795 sock msock
2796 -----------------+-------------------------+------------------------------
2797 timeout conf.timeout / 2 conf.timeout / 2
2798 timeout action send a ping via msock Abort communication
2799 and close all sockets
2800*/
2801
2802/*
2803 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2804 */
2805int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2806 void *buf, size_t size, unsigned msg_flags)
2807{
2808 struct kvec iov;
2809 struct msghdr msg;
2810 int rv, sent = 0;
2811
2812 if (!sock)
2813 return -1000;
2814
2815 /* THINK if (signal_pending) return ... ? */
2816
2817 iov.iov_base = buf;
2818 iov.iov_len = size;
2819
2820 msg.msg_name = NULL;
2821 msg.msg_namelen = 0;
2822 msg.msg_control = NULL;
2823 msg.msg_controllen = 0;
2824 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2825
e42325a5 2826 if (sock == mdev->tconn->data.socket) {
31890f4a 2827 mdev->tconn->ko_count = mdev->tconn->net_conf->ko_count;
b411b363
PR
2828 drbd_update_congested(mdev);
2829 }
2830 do {
2831 /* STRANGE
2832 * tcp_sendmsg does _not_ use its size parameter at all ?
2833 *
2834 * -EAGAIN on timeout, -EINTR on signal.
2835 */
2836/* THINK
2837 * do we need to block DRBD_SIG if sock == &meta.socket ??
2838 * otherwise wake_asender() might interrupt some send_*Ack !
2839 */
2840 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2841 if (rv == -EAGAIN) {
2842 if (we_should_drop_the_connection(mdev, sock))
2843 break;
2844 else
2845 continue;
2846 }
2847 D_ASSERT(rv != 0);
2848 if (rv == -EINTR) {
2849 flush_signals(current);
2850 rv = 0;
2851 }
2852 if (rv < 0)
2853 break;
2854 sent += rv;
2855 iov.iov_base += rv;
2856 iov.iov_len -= rv;
2857 } while (sent < size);
2858
e42325a5 2859 if (sock == mdev->tconn->data.socket)
b411b363
PR
2860 clear_bit(NET_CONGESTED, &mdev->flags);
2861
2862 if (rv <= 0) {
2863 if (rv != -EAGAIN) {
2864 dev_err(DEV, "%s_sendmsg returned %d\n",
e42325a5 2865 sock == mdev->tconn->meta.socket ? "msock" : "sock",
b411b363
PR
2866 rv);
2867 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2868 } else
2869 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2870 }
2871
2872 return sent;
2873}
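
/*
 * User-space sketch (not part of drbd_main.c) of the partial-send loop in
 * drbd_send() above: keep calling the send primitive until the whole buffer
 * is out, advance by however much actually went through, retry on EINTR,
 * and decide on EAGAIN whether the peer should be considered dead.  send(2)
 * stands in for kernel_sendmsg(); give_up_on_eagain() is a made-up
 * placeholder for we_should_drop_the_connection().
 */
#include <errno.h>
#include <stddef.h>
#include <sys/socket.h>
#include <sys/types.h>

static int give_up_on_eagain(void)
{
	return 0;			/* placeholder policy: always retry */
}

/* returns bytes sent; short on errors, like drbd_send() */
static ssize_t send_all(int fd, const void *buf, size_t size)
{
	const char *p = buf;
	size_t sent = 0;

	while (sent < size) {
		ssize_t rv = send(fd, p + sent, size - sent, MSG_NOSIGNAL);

		if (rv == -1 && errno == EAGAIN) {
			if (give_up_on_eagain())
				break;
			continue;
		}
		if (rv == -1 && errno == EINTR)
			continue;	/* drbd_send() flushes signals here */
		if (rv <= 0)
			break;		/* hard error: caller changes conn state */
		sent += rv;
	}
	return (ssize_t)sent;
}

int main(void)
{
	(void)send_all;			/* compile-only example, no real socket */
	return 0;
}
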
2874
2875static int drbd_open(struct block_device *bdev, fmode_t mode)
2876{
2877 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2878 unsigned long flags;
2879 int rv = 0;
2880
2a48fc0a 2881 mutex_lock(&drbd_main_mutex);
87eeee41 2882 spin_lock_irqsave(&mdev->tconn->req_lock, flags);
b411b363
PR
2883 /* to have a stable mdev->state.role
2884 * and no race with updating open_cnt */
2885
2886 if (mdev->state.role != R_PRIMARY) {
2887 if (mode & FMODE_WRITE)
2888 rv = -EROFS;
2889 else if (!allow_oos)
2890 rv = -EMEDIUMTYPE;
2891 }
2892
2893 if (!rv)
2894 mdev->open_cnt++;
87eeee41 2895 spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
2a48fc0a 2896 mutex_unlock(&drbd_main_mutex);
b411b363
PR
2897
2898 return rv;
2899}
2900
2901static int drbd_release(struct gendisk *gd, fmode_t mode)
2902{
2903 struct drbd_conf *mdev = gd->private_data;
2a48fc0a 2904 mutex_lock(&drbd_main_mutex);
b411b363 2905 mdev->open_cnt--;
2a48fc0a 2906 mutex_unlock(&drbd_main_mutex);
b411b363
PR
2907 return 0;
2908}
2909
b411b363
PR
2910static void drbd_set_defaults(struct drbd_conf *mdev)
2911{
85f4cc17
PR
2912 /* This way we get a compile error when sync_conf grows,
2913 and we forgot to initialize it here */
2914 mdev->sync_conf = (struct syncer_conf) {
2915 /* .rate = */ DRBD_RATE_DEF,
2916 /* .after = */ DRBD_AFTER_DEF,
2917 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
85f4cc17
PR
2918 /* .verify_alg = */ {}, 0,
2919 /* .cpu_mask = */ {}, 0,
2920 /* .csums_alg = */ {}, 0,
e756414f 2921 /* .use_rle = */ 0,
9a31d716
PR
2922 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2923 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2924 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2925 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
0f0601f4
LE
2926 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2927 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
85f4cc17
PR
2928 };
2929
2930 /* Have to use that way, because the layout differs between
2931 big endian and little endian */
b411b363
PR
2932 mdev->state = (union drbd_state) {
2933 { .role = R_SECONDARY,
2934 .peer = R_UNKNOWN,
2935 .conn = C_STANDALONE,
2936 .disk = D_DISKLESS,
2937 .pdsk = D_UNKNOWN,
fb22c402
PR
2938 .susp = 0,
2939 .susp_nod = 0,
2940 .susp_fen = 0
b411b363
PR
2941 } };
2942}
2943
2944void drbd_init_set_defaults(struct drbd_conf *mdev)
2945{
2946 /* the memset(,0,) did most of this.
2947 * note: only assignments, no allocation in here */
2948
2949 drbd_set_defaults(mdev);
2950
b411b363
PR
2951 atomic_set(&mdev->ap_bio_cnt, 0);
2952 atomic_set(&mdev->ap_pending_cnt, 0);
2953 atomic_set(&mdev->rs_pending_cnt, 0);
2954 atomic_set(&mdev->unacked_cnt, 0);
2955 atomic_set(&mdev->local_cnt, 0);
b411b363 2956 atomic_set(&mdev->pp_in_use, 0);
435f0740 2957 atomic_set(&mdev->pp_in_use_by_net, 0);
778f271d 2958 atomic_set(&mdev->rs_sect_in, 0);
0f0601f4 2959 atomic_set(&mdev->rs_sect_ev, 0);
759fbdfb 2960 atomic_set(&mdev->ap_in_flight, 0);
b411b363
PR
2961
2962 mutex_init(&mdev->md_io_mutex);
e42325a5
PR
2963 mutex_init(&mdev->tconn->data.mutex);
2964 mutex_init(&mdev->tconn->meta.mutex);
2965 sema_init(&mdev->tconn->data.work.s, 0);
2966 sema_init(&mdev->tconn->meta.work.s, 0);
b411b363
PR
2967 mutex_init(&mdev->state_mutex);
2968
e42325a5
PR
2969 spin_lock_init(&mdev->tconn->data.work.q_lock);
2970 spin_lock_init(&mdev->tconn->meta.work.q_lock);
b411b363
PR
2971
2972 spin_lock_init(&mdev->al_lock);
87eeee41 2973 spin_lock_init(&mdev->tconn->req_lock);
b411b363
PR
2974 spin_lock_init(&mdev->peer_seq_lock);
2975 spin_lock_init(&mdev->epoch_lock);
2976
2977 INIT_LIST_HEAD(&mdev->active_ee);
2978 INIT_LIST_HEAD(&mdev->sync_ee);
2979 INIT_LIST_HEAD(&mdev->done_ee);
2980 INIT_LIST_HEAD(&mdev->read_ee);
2981 INIT_LIST_HEAD(&mdev->net_ee);
2982 INIT_LIST_HEAD(&mdev->resync_reads);
e42325a5
PR
2983 INIT_LIST_HEAD(&mdev->tconn->data.work.q);
2984 INIT_LIST_HEAD(&mdev->tconn->meta.work.q);
b411b363
PR
2985 INIT_LIST_HEAD(&mdev->resync_work.list);
2986 INIT_LIST_HEAD(&mdev->unplug_work.list);
e9e6f3ec 2987 INIT_LIST_HEAD(&mdev->go_diskless.list);
b411b363 2988 INIT_LIST_HEAD(&mdev->md_sync_work.list);
c4752ef1 2989 INIT_LIST_HEAD(&mdev->start_resync_work.list);
b411b363 2990 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
0ced55a3 2991
794abb75 2992 mdev->resync_work.cb = w_resync_timer;
b411b363 2993 mdev->unplug_work.cb = w_send_write_hint;
e9e6f3ec 2994 mdev->go_diskless.cb = w_go_diskless;
b411b363
PR
2995 mdev->md_sync_work.cb = w_md_sync;
2996 mdev->bm_io_work.w.cb = w_bitmap_io;
370a43e7 2997 mdev->start_resync_work.cb = w_start_resync;
b411b363
PR
2998 init_timer(&mdev->resync_timer);
2999 init_timer(&mdev->md_sync_timer);
370a43e7 3000 init_timer(&mdev->start_resync_timer);
7fde2be9 3001 init_timer(&mdev->request_timer);
b411b363
PR
3002 mdev->resync_timer.function = resync_timer_fn;
3003 mdev->resync_timer.data = (unsigned long) mdev;
3004 mdev->md_sync_timer.function = md_sync_timer_fn;
3005 mdev->md_sync_timer.data = (unsigned long) mdev;
370a43e7
PR
3006 mdev->start_resync_timer.function = start_resync_timer_fn;
3007 mdev->start_resync_timer.data = (unsigned long) mdev;
7fde2be9
PR
3008 mdev->request_timer.function = request_timer_fn;
3009 mdev->request_timer.data = (unsigned long) mdev;
b411b363
PR
3010
3011 init_waitqueue_head(&mdev->misc_wait);
3012 init_waitqueue_head(&mdev->state_wait);
3013 init_waitqueue_head(&mdev->ee_wait);
3014 init_waitqueue_head(&mdev->al_wait);
3015 init_waitqueue_head(&mdev->seq_wait);
3016
e6b3ea83
PR
3017 drbd_thread_init(mdev, &mdev->tconn->receiver, drbdd_init);
3018 drbd_thread_init(mdev, &mdev->tconn->worker, drbd_worker);
3019 drbd_thread_init(mdev, &mdev->tconn->asender, drbd_asender);
b411b363 3020
fd340c12 3021 /* mdev->tconn->agreed_pro_version gets initialized in drbd_connect() */
2451fc3b 3022 mdev->write_ordering = WO_bdev_flush;
b411b363 3023 mdev->resync_wenr = LC_FREE;
99432fcc
PR
3024 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3025 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
b411b363
PR
3026}
3027
3028void drbd_mdev_cleanup(struct drbd_conf *mdev)
3029{
1d7734a0 3030 int i;
e6b3ea83 3031 if (mdev->tconn->receiver.t_state != NONE)
b411b363 3032 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
e6b3ea83 3033 mdev->tconn->receiver.t_state);
b411b363
PR
3034
3035 /* no need to lock it, I'm the only thread alive */
3036 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3037 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3038 mdev->al_writ_cnt =
3039 mdev->bm_writ_cnt =
3040 mdev->read_cnt =
3041 mdev->recv_cnt =
3042 mdev->send_cnt =
3043 mdev->writ_cnt =
3044 mdev->p_size =
3045 mdev->rs_start =
3046 mdev->rs_total =
1d7734a0
LE
3047 mdev->rs_failed = 0;
3048 mdev->rs_last_events = 0;
0f0601f4 3049 mdev->rs_last_sect_ev = 0;
1d7734a0
LE
3050 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3051 mdev->rs_mark_left[i] = 0;
3052 mdev->rs_mark_time[i] = 0;
3053 }
89e58e75 3054 D_ASSERT(mdev->tconn->net_conf == NULL);
b411b363
PR
3055
3056 drbd_set_my_capacity(mdev, 0);
3057 if (mdev->bitmap) {
3058 /* maybe never allocated. */
02d9a94b 3059 drbd_bm_resize(mdev, 0, 1);
b411b363
PR
3060 drbd_bm_cleanup(mdev);
3061 }
3062
3063 drbd_free_resources(mdev);
0778286a 3064 clear_bit(AL_SUSPENDED, &mdev->flags);
b411b363
PR
3065
3066 /*
3067 * currently we call drbd_init_ee only on module load, so
3068 * we may call drbd_release_ee only on module unload!
3069 */
3070 D_ASSERT(list_empty(&mdev->active_ee));
3071 D_ASSERT(list_empty(&mdev->sync_ee));
3072 D_ASSERT(list_empty(&mdev->done_ee));
3073 D_ASSERT(list_empty(&mdev->read_ee));
3074 D_ASSERT(list_empty(&mdev->net_ee));
3075 D_ASSERT(list_empty(&mdev->resync_reads));
e42325a5
PR
3076 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
3077 D_ASSERT(list_empty(&mdev->tconn->meta.work.q));
b411b363
PR
3078 D_ASSERT(list_empty(&mdev->resync_work.list));
3079 D_ASSERT(list_empty(&mdev->unplug_work.list));
e9e6f3ec 3080 D_ASSERT(list_empty(&mdev->go_diskless.list));
2265b473
LE
3081
3082 drbd_set_defaults(mdev);
b411b363
PR
3083}
3084
3085
3086static void drbd_destroy_mempools(void)
3087{
3088 struct page *page;
3089
3090 while (drbd_pp_pool) {
3091 page = drbd_pp_pool;
3092 drbd_pp_pool = (struct page *)page_private(page);
3093 __free_page(page);
3094 drbd_pp_vacant--;
3095 }
3096
3097 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3098
3099 if (drbd_ee_mempool)
3100 mempool_destroy(drbd_ee_mempool);
3101 if (drbd_request_mempool)
3102 mempool_destroy(drbd_request_mempool);
3103 if (drbd_ee_cache)
3104 kmem_cache_destroy(drbd_ee_cache);
3105 if (drbd_request_cache)
3106 kmem_cache_destroy(drbd_request_cache);
3107 if (drbd_bm_ext_cache)
3108 kmem_cache_destroy(drbd_bm_ext_cache);
3109 if (drbd_al_ext_cache)
3110 kmem_cache_destroy(drbd_al_ext_cache);
3111
3112 drbd_ee_mempool = NULL;
3113 drbd_request_mempool = NULL;
3114 drbd_ee_cache = NULL;
3115 drbd_request_cache = NULL;
3116 drbd_bm_ext_cache = NULL;
3117 drbd_al_ext_cache = NULL;
3118
3119 return;
3120}
3121
3122static int drbd_create_mempools(void)
3123{
3124 struct page *page;
1816a2b4 3125 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
b411b363
PR
3126 int i;
3127
3128 /* prepare our caches and mempools */
3129 drbd_request_mempool = NULL;
3130 drbd_ee_cache = NULL;
3131 drbd_request_cache = NULL;
3132 drbd_bm_ext_cache = NULL;
3133 drbd_al_ext_cache = NULL;
3134 drbd_pp_pool = NULL;
3135
3136 /* caches */
3137 drbd_request_cache = kmem_cache_create(
3138 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3139 if (drbd_request_cache == NULL)
3140 goto Enomem;
3141
3142 drbd_ee_cache = kmem_cache_create(
3143 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3144 if (drbd_ee_cache == NULL)
3145 goto Enomem;
3146
3147 drbd_bm_ext_cache = kmem_cache_create(
3148 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3149 if (drbd_bm_ext_cache == NULL)
3150 goto Enomem;
3151
3152 drbd_al_ext_cache = kmem_cache_create(
3153 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3154 if (drbd_al_ext_cache == NULL)
3155 goto Enomem;
3156
3157 /* mempools */
3158 drbd_request_mempool = mempool_create(number,
3159 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3160 if (drbd_request_mempool == NULL)
3161 goto Enomem;
3162
3163 drbd_ee_mempool = mempool_create(number,
3164 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2027ae1f 3165 if (drbd_ee_mempool == NULL)
b411b363
PR
3166 goto Enomem;
3167
3168 /* drbd's page pool */
3169 spin_lock_init(&drbd_pp_lock);
3170
3171 for (i = 0; i < number; i++) {
3172 page = alloc_page(GFP_HIGHUSER);
3173 if (!page)
3174 goto Enomem;
3175 set_page_private(page, (unsigned long)drbd_pp_pool);
3176 drbd_pp_pool = page;
3177 }
3178 drbd_pp_vacant = number;
3179
3180 return 0;
3181
3182Enomem:
3183 drbd_destroy_mempools(); /* in case we allocated some */
3184 return -ENOMEM;
3185}
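
/*
 * Back-of-the-envelope example (not part of drbd_main.c) for the page-pool
 * size computed in drbd_create_mempools() above.  The constants below are
 * assumptions made purely for illustration; the real values come from
 * drbd_limits.h, PAGE_SIZE, and the minor_count module parameter.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long max_bio_size = 1024 * 1024;	/* assumed 1 MiB */
	const unsigned long page_size    = 4096;	/* assumed 4 KiB */
	const unsigned long minor_count  = 32;		/* assumed value */
	unsigned long number = (max_bio_size / page_size) * minor_count;

	printf("%lu preallocated pages (%lu MiB)\n",
	       number, (number * page_size) >> 20);
	return 0;
}
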
3186
3187static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3188 void *unused)
3189{
3190 /* just so we have it. you never know what interesting things we
3191 * might want to do here some day...
3192 */
3193
3194 return NOTIFY_DONE;
3195}
3196
3197static struct notifier_block drbd_notifier = {
3198 .notifier_call = drbd_notify_sys,
3199};
3200
3201static void drbd_release_ee_lists(struct drbd_conf *mdev)
3202{
3203 int rr;
3204
3205 rr = drbd_release_ee(mdev, &mdev->active_ee);
3206 if (rr)
3207 dev_err(DEV, "%d EEs in active list found!\n", rr);
3208
3209 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3210 if (rr)
3211 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3212
3213 rr = drbd_release_ee(mdev, &mdev->read_ee);
3214 if (rr)
3215 dev_err(DEV, "%d EEs in read list found!\n", rr);
3216
3217 rr = drbd_release_ee(mdev, &mdev->done_ee);
3218 if (rr)
3219 dev_err(DEV, "%d EEs in done list found!\n", rr);
3220
3221 rr = drbd_release_ee(mdev, &mdev->net_ee);
3222 if (rr)
3223 dev_err(DEV, "%d EEs in net list found!\n", rr);
3224}
3225
3226/* caution. no locking.
3227 * currently only used from module cleanup code. */
3228static void drbd_delete_device(unsigned int minor)
3229{
3230 struct drbd_conf *mdev = minor_to_mdev(minor);
3231
3232 if (!mdev)
3233 return;
3234
3235 /* paranoia asserts */
70dc65e1 3236 D_ASSERT(mdev->open_cnt == 0);
e42325a5 3237 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
b411b363
PR
3238 /* end paranoia asserts */
3239
3240 del_gendisk(mdev->vdisk);
3241
3242 /* cleanup stuff that may have been allocated during
3243 * device (re-)configuration or state changes */
3244
3245 if (mdev->this_bdev)
3246 bdput(mdev->this_bdev);
3247
3248 drbd_free_resources(mdev);
2111438b 3249 drbd_free_tconn(mdev->tconn);
b411b363
PR
3250
3251 drbd_release_ee_lists(mdev);
3252
b411b363
PR
3253 lc_destroy(mdev->act_log);
3254 lc_destroy(mdev->resync);
3255
3256 kfree(mdev->p_uuid);
3257 /* mdev->p_uuid = NULL; */
3258
b411b363
PR
3259 /* cleanup the rest that has been
3260 * allocated from drbd_new_device
3261 * and actually free the mdev itself */
3262 drbd_free_mdev(mdev);
3263}
3264
3265static void drbd_cleanup(void)
3266{
3267 unsigned int i;
3268
3269 unregister_reboot_notifier(&drbd_notifier);
3270
17a93f30
LE
3271 /* first remove proc,
3272 * drbdsetup uses its presence to detect
3273 * whether DRBD is loaded.
3274 * If we would get stuck in proc removal,
3275 * but have netlink already deregistered,
3276 * some drbdsetup commands may wait forever
3277 * for an answer.
3278 */
3279 if (drbd_proc)
3280 remove_proc_entry("drbd", NULL);
3281
b411b363
PR
3282 drbd_nl_cleanup();
3283
3284 if (minor_table) {
b411b363
PR
3285 i = minor_count;
3286 while (i--)
3287 drbd_delete_device(i);
3288 drbd_destroy_mempools();
3289 }
3290
3291 kfree(minor_table);
3292
3293 unregister_blkdev(DRBD_MAJOR, "drbd");
3294
3295 printk(KERN_INFO "drbd: module cleanup done.\n");
3296}
3297
3298/**
3299 * drbd_congested() - Callback for pdflush
3300 * @congested_data: User data
3301 * @bdi_bits: Bits pdflush is currently interested in
3302 *
3303 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3304 */
3305static int drbd_congested(void *congested_data, int bdi_bits)
3306{
3307 struct drbd_conf *mdev = congested_data;
3308 struct request_queue *q;
3309 char reason = '-';
3310 int r = 0;
3311
1b881ef7 3312 if (!may_inc_ap_bio(mdev)) {
b411b363
PR
3313 /* DRBD has frozen IO */
3314 r = bdi_bits;
3315 reason = 'd';
3316 goto out;
3317 }
3318
3319 if (get_ldev(mdev)) {
3320 q = bdev_get_queue(mdev->ldev->backing_bdev);
3321 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3322 put_ldev(mdev);
3323 if (r)
3324 reason = 'b';
3325 }
3326
3327 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3328 r |= (1 << BDI_async_congested);
3329 reason = reason == 'b' ? 'a' : 'n';
3330 }
3331
3332out:
3333 mdev->congestion_reason = reason;
3334 return r;
3335}
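
A minimal sketch, assuming an mdev pointer is in scope, of how a congested_fn like the one above gets attached to a request queue; it simply mirrors the hookup that drbd_new_device() performs further down, so nothing here is new API:

        struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

        if (q) {
                /* same two assignments as in drbd_new_device() below */
                q->backing_dev_info.congested_fn   = drbd_congested;
                q->backing_dev_info.congested_data = mdev;
        }
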
3336
2111438b
PR
3337struct drbd_tconn *drbd_new_tconn(char *name)
3338{
3339 struct drbd_tconn *tconn;
3340
3341 tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL);
3342 if (!tconn)
3343 return NULL;
3344
3345 tconn->name = kstrdup(name, GFP_KERNEL);
3346 if (!tconn->name)
3347 goto fail;
3348
b2fb6dbe
PR
3349 atomic_set(&tconn->net_cnt, 0);
3350 init_waitqueue_head(&tconn->net_cnt_wait);
3351
2111438b
PR
3352 write_lock_irq(&global_state_lock);
3353 list_add(&tconn->all_tconn, &drbd_tconns);
3354 write_unlock_irq(&global_state_lock);
3355
3356 return tconn;
3357
3358fail:
3359 kfree(tconn->name);
3360 kfree(tconn);
3361
3362 return NULL;
3363}
3364
3365void drbd_free_tconn(struct drbd_tconn *tconn)
3366{
3367 write_lock_irq(&global_state_lock);
3368 list_del(&tconn->all_tconn);
3369 write_unlock_irq(&global_state_lock);
3370
3371 kfree(tconn->name);
b42a70ad
PR
3372 kfree(tconn->int_dig_out);
3373 kfree(tconn->int_dig_in);
3374 kfree(tconn->int_dig_vv);
2111438b
PR
3375 kfree(tconn);
3376}
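
A hedged usage sketch for the two helpers above; the connection name is made up, and the error handling mirrors what drbd_new_device() does below:

        struct drbd_tconn *tconn = drbd_new_tconn("example");  /* name is illustrative */

        if (!tconn)
                return NULL;            /* allocation or kstrdup() failed */
        /* ... attach one or more mdevs to tconn ... */
        drbd_free_tconn(tconn);         /* unlinks from drbd_tconns, frees name and digests */
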
3377
b411b363
PR
3378struct drbd_conf *drbd_new_device(unsigned int minor)
3379{
3380 struct drbd_conf *mdev;
3381 struct gendisk *disk;
3382 struct request_queue *q;
3383
3384 /* GFP_KERNEL, we are outside of all write-out paths */
3385 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3386 if (!mdev)
3387 return NULL;
2111438b
PR
3388 mdev->tconn = drbd_new_tconn("dummy");
3389 if (!mdev->tconn)
3390 goto out_no_tconn;
3391
b411b363
PR
3392 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3393 goto out_no_cpumask;
3394
2111438b 3395 mdev->tconn->volume0 = mdev;
b411b363
PR
3396 mdev->minor = minor;
3397
3398 drbd_init_set_defaults(mdev);
3399
3400 q = blk_alloc_queue(GFP_KERNEL);
3401 if (!q)
3402 goto out_no_q;
3403 mdev->rq_queue = q;
3404 q->queuedata = mdev;
b411b363
PR
3405
3406 disk = alloc_disk(1);
3407 if (!disk)
3408 goto out_no_disk;
3409 mdev->vdisk = disk;
3410
81e84650 3411 set_disk_ro(disk, true);
b411b363
PR
3412
3413 disk->queue = q;
3414 disk->major = DRBD_MAJOR;
3415 disk->first_minor = minor;
3416 disk->fops = &drbd_ops;
3417 sprintf(disk->disk_name, "drbd%d", minor);
3418 disk->private_data = mdev;
3419
3420 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3421 /* we have no partitions. we contain only ourselves. */
3422 mdev->this_bdev->bd_contains = mdev->this_bdev;
3423
3424 q->backing_dev_info.congested_fn = drbd_congested;
3425 q->backing_dev_info.congested_data = mdev;
3426
2f58dcfc 3427 blk_queue_make_request(q, drbd_make_request);
99432fcc
PR
3428 /* Setting max_hw_sectors to an odd value of 8 KiB here.
3429 This triggers a max_bio_size message upon first attach or connect. */
3430 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
b411b363
PR
3431 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3432 blk_queue_merge_bvec(q, drbd_merge_bvec);
87eeee41 3433 q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */
b411b363
PR
3434
3435 mdev->md_io_page = alloc_page(GFP_KERNEL);
3436 if (!mdev->md_io_page)
3437 goto out_no_io_page;
3438
3439 if (drbd_bm_init(mdev))
3440 goto out_no_bitmap;
3441 /* no need to lock access, we are still initializing this minor device. */
3442 if (!tl_init(mdev))
3443 goto out_no_tl;
dac1389c 3444 mdev->read_requests = RB_ROOT;
de696716 3445 mdev->write_requests = RB_ROOT;
8b946255 3446 mdev->epoch_entries = RB_ROOT;
b411b363 3447
b411b363
PR
3448 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3449 if (!mdev->current_epoch)
3450 goto out_no_epoch;
3451
3452 INIT_LIST_HEAD(&mdev->current_epoch->list);
3453 mdev->epochs = 1;
3454
3455 return mdev;
3456
3457/* out_whatever_else:
3458 kfree(mdev->current_epoch); */
3459out_no_epoch:
b411b363
PR
3460 tl_cleanup(mdev);
3461out_no_tl:
3462 drbd_bm_cleanup(mdev);
3463out_no_bitmap:
3464 __free_page(mdev->md_io_page);
3465out_no_io_page:
3466 put_disk(disk);
3467out_no_disk:
3468 blk_cleanup_queue(q);
3469out_no_q:
3470 free_cpumask_var(mdev->cpu_mask);
3471out_no_cpumask:
2111438b
PR
3472 drbd_free_tconn(mdev->tconn);
3473out_no_tconn:
b411b363
PR
3474 kfree(mdev);
3475 return NULL;
3476}
3477
3478/* counterpart of drbd_new_device.
3479 * last part of drbd_delete_device. */
3480void drbd_free_mdev(struct drbd_conf *mdev)
3481{
3482 kfree(mdev->current_epoch);
b411b363
PR
3483 tl_cleanup(mdev);
3484 if (mdev->bitmap) /* should no longer be there. */
3485 drbd_bm_cleanup(mdev);
3486 __free_page(mdev->md_io_page);
3487 put_disk(mdev->vdisk);
3488 blk_cleanup_queue(mdev->rq_queue);
3489 free_cpumask_var(mdev->cpu_mask);
3490 kfree(mdev);
3491}
3492
3493
3494int __init drbd_init(void)
3495{
3496 int err;
3497
fd340c12
PR
3498 BUILD_BUG_ON(sizeof(struct p_header80) != sizeof(struct p_header95));
3499 BUILD_BUG_ON(sizeof(struct p_handshake) != 80);
b411b363 3500
2b8a90b5 3501 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
b411b363
PR
3502 printk(KERN_ERR
3503 "drbd: invalid minor_count (%d)\n", minor_count);
3504#ifdef MODULE
3505 return -EINVAL;
3506#else
3507 minor_count = 8;
3508#endif
3509 }
3510
3511 err = drbd_nl_init();
3512 if (err)
3513 return err;
3514
3515 err = register_blkdev(DRBD_MAJOR, "drbd");
3516 if (err) {
3517 printk(KERN_ERR
3518 "drbd: unable to register block device major %d\n",
3519 DRBD_MAJOR);
3520 return err;
3521 }
3522
3523 register_reboot_notifier(&drbd_notifier);
3524
3525 /*
3526 * allocate all necessary structs
3527 */
3528 err = -ENOMEM;
3529
3530 init_waitqueue_head(&drbd_pp_wait);
3531
3532 drbd_proc = NULL; /* play safe for drbd_cleanup */
3533 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3534 GFP_KERNEL);
3535 if (!minor_table)
3536 goto Enomem;
3537
3538 err = drbd_create_mempools();
3539 if (err)
3540 goto Enomem;
3541
8c484ee4 3542 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
b411b363
PR
3543 if (!drbd_proc) {
3544 printk(KERN_ERR "drbd: unable to register proc file\n");
3545 goto Enomem;
3546 }
3547
3548 rwlock_init(&global_state_lock);
2111438b 3549 INIT_LIST_HEAD(&drbd_tconns);
b411b363
PR
3550
3551 printk(KERN_INFO "drbd: initialized. "
3552 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3553 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3554 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3555 printk(KERN_INFO "drbd: registered as block device major %d\n",
3556 DRBD_MAJOR);
3557 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3558
3559 return 0; /* Success! */
3560
3561Enomem:
3562 drbd_cleanup();
3563 if (err == -ENOMEM)
3564 /* currently always the case */
3565 printk(KERN_ERR "drbd: ran out of memory\n");
3566 else
3567 printk(KERN_ERR "drbd: initialization failure\n");
3568 return err;
3569}
3570
3571void drbd_free_bc(struct drbd_backing_dev *ldev)
3572{
3573 if (ldev == NULL)
3574 return;
3575
e525fd89
TH
3576 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3577 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
b411b363
PR
3578
3579 kfree(ldev);
3580}
3581
3582void drbd_free_sock(struct drbd_conf *mdev)
3583{
e42325a5
PR
3584 if (mdev->tconn->data.socket) {
3585 mutex_lock(&mdev->tconn->data.mutex);
3586 kernel_sock_shutdown(mdev->tconn->data.socket, SHUT_RDWR);
3587 sock_release(mdev->tconn->data.socket);
3588 mdev->tconn->data.socket = NULL;
3589 mutex_unlock(&mdev->tconn->data.mutex);
b411b363 3590 }
e42325a5
PR
3591 if (mdev->tconn->meta.socket) {
3592 mutex_lock(&mdev->tconn->meta.mutex);
3593 kernel_sock_shutdown(mdev->tconn->meta.socket, SHUT_RDWR);
3594 sock_release(mdev->tconn->meta.socket);
3595 mdev->tconn->meta.socket = NULL;
3596 mutex_unlock(&mdev->tconn->meta.mutex);
b411b363
PR
3597 }
3598}
3599
3600
3601void drbd_free_resources(struct drbd_conf *mdev)
3602{
3603 crypto_free_hash(mdev->csums_tfm);
3604 mdev->csums_tfm = NULL;
3605 crypto_free_hash(mdev->verify_tfm);
3606 mdev->verify_tfm = NULL;
a0638456
PR
3607 crypto_free_hash(mdev->tconn->cram_hmac_tfm);
3608 mdev->tconn->cram_hmac_tfm = NULL;
3609 crypto_free_hash(mdev->tconn->integrity_w_tfm);
3610 mdev->tconn->integrity_w_tfm = NULL;
3611 crypto_free_hash(mdev->tconn->integrity_r_tfm);
3612 mdev->tconn->integrity_r_tfm = NULL;
b411b363
PR
3613
3614 drbd_free_sock(mdev);
3615
3616 __no_warn(local,
3617 drbd_free_bc(mdev->ldev);
3618 mdev->ldev = NULL;);
3619}
3620
3621/* meta data management */
3622
3623struct meta_data_on_disk {
3624 u64 la_size; /* last agreed size. */
3625 u64 uuid[UI_SIZE]; /* UUIDs. */
3626 u64 device_uuid;
3627 u64 reserved_u64_1;
3628 u32 flags; /* MDF */
3629 u32 magic;
3630 u32 md_size_sect;
3631 u32 al_offset; /* offset to this block */
3632 u32 al_nr_extents; /* important for restoring the AL */
3633 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3634 u32 bm_offset; /* offset to the bitmap, from here */
3635 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
99432fcc
PR
3636 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3637 u32 reserved_u32[3];
b411b363
PR
3638
3639} __packed;
3640
3641/**
3642 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3643 * @mdev: DRBD device.
3644 */
3645void drbd_md_sync(struct drbd_conf *mdev)
3646{
3647 struct meta_data_on_disk *buffer;
3648 sector_t sector;
3649 int i;
3650
ee15b038
LE
3651 del_timer(&mdev->md_sync_timer);
3652 /* timer may be rearmed by drbd_md_mark_dirty() now. */
b411b363
PR
3653 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3654 return;
b411b363
PR
3655
3656 /* We use D_FAILED here, and not D_ATTACHING, because we try to write
3657 * metadata even if we detach due to a disk failure! */
3658 if (!get_ldev_if_state(mdev, D_FAILED))
3659 return;
3660
b411b363
PR
3661 mutex_lock(&mdev->md_io_mutex);
3662 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3663 memset(buffer, 0, 512);
3664
3665 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3666 for (i = UI_CURRENT; i < UI_SIZE; i++)
3667 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3668 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3669 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3670
3671 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3672 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3673 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3674 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3675 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3676
3677 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
99432fcc 3678 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
b411b363
PR
3679
3680 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3681 sector = mdev->ldev->md.md_offset;
3682
3f3a9b84 3683 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
b411b363
PR
3684 /* this was a try anyways ... */
3685 dev_err(DEV, "meta data update failed!\n");
81e84650 3686 drbd_chk_io_error(mdev, 1, true);
b411b363
PR
3687 }
3688
3689 /* Update mdev->ldev->md.la_size_sect,
3690 * since we just updated it in the on-disk metadata. */
3691 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3692
3693 mutex_unlock(&mdev->md_io_mutex);
3694 put_ldev(mdev);
3695}
3696
3697/**
3698 * drbd_md_read() - Reads in the meta data super block
3699 * @mdev: DRBD device.
3700 * @bdev: Device from which the meta data should be read in.
3701 *
116676ca 3702 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
b411b363
PR
3703 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3704 */
3705int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3706{
3707 struct meta_data_on_disk *buffer;
3708 int i, rv = NO_ERROR;
3709
3710 if (!get_ldev_if_state(mdev, D_ATTACHING))
3711 return ERR_IO_MD_DISK;
3712
b411b363
PR
3713 mutex_lock(&mdev->md_io_mutex);
3714 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3715
3716 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
25985edc 3717 /* NOTE: can't do normal error processing here as this is
b411b363
PR
3718 called BEFORE disk is attached */
3719 dev_err(DEV, "Error while reading metadata.\n");
3720 rv = ERR_IO_MD_DISK;
3721 goto err;
3722 }
3723
e7fad8af 3724 if (buffer->magic != cpu_to_be32(DRBD_MD_MAGIC)) {
b411b363
PR
3725 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3726 rv = ERR_MD_INVALID;
3727 goto err;
3728 }
3729 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3730 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3731 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3732 rv = ERR_MD_INVALID;
3733 goto err;
3734 }
3735 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3736 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3737 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3738 rv = ERR_MD_INVALID;
3739 goto err;
3740 }
3741 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3742 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3743 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3744 rv = ERR_MD_INVALID;
3745 goto err;
3746 }
3747
3748 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3749 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3750 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3751 rv = ERR_MD_INVALID;
3752 goto err;
3753 }
3754
3755 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3756 for (i = UI_CURRENT; i < UI_SIZE; i++)
3757 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3758 bdev->md.flags = be32_to_cpu(buffer->flags);
3759 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3760 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3761
87eeee41 3762 spin_lock_irq(&mdev->tconn->req_lock);
99432fcc
PR
3763 if (mdev->state.conn < C_CONNECTED) {
3764 int peer;
3765 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3766 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3767 mdev->peer_max_bio_size = peer;
3768 }
87eeee41 3769 spin_unlock_irq(&mdev->tconn->req_lock);
99432fcc 3770
b411b363
PR
3771 if (mdev->sync_conf.al_extents < 7)
3772 mdev->sync_conf.al_extents = 127;
3773
3774 err:
3775 mutex_unlock(&mdev->md_io_mutex);
3776 put_ldev(mdev);
3777
3778 return rv;
3779}
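
A hedged sketch of the attach-time call to drbd_md_read(); the backing-device variable name is invented, only the function and its return codes come from this file:

        int rv = drbd_md_read(mdev, nbc);       /* nbc: the backing dev being attached */

        if (rv != NO_ERROR)
                return rv;                      /* ERR_IO_MD_DISK or ERR_MD_INVALID */
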
3780
3781/**
3782 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3783 * @mdev: DRBD device.
3784 *
3785 * Call this function if you change anything that should be written to
3786 * the meta-data super block. This function sets MD_DIRTY, and starts a
3787 * timer that ensures that within five seconds you have to call drbd_md_sync().
3788 */
ca0e6098 3789#ifdef DEBUG
ee15b038
LE
3790void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3791{
3792 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3793 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3794 mdev->last_md_mark_dirty.line = line;
3795 mdev->last_md_mark_dirty.func = func;
3796 }
3797}
3798#else
b411b363
PR
3799void drbd_md_mark_dirty(struct drbd_conf *mdev)
3800{
ee15b038 3801 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
ca0e6098 3802 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
b411b363 3803}
ee15b038 3804#endif
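
A hedged sketch of the intended call pattern, assuming the caller already holds a local-disk reference (get_ldev()): change the metadata through a helper, which marks it dirty, then either flush right away or let the md_sync_timer do it within five seconds:

        drbd_md_set_flag(mdev, MDF_FULL_SYNC);  /* calls drbd_md_mark_dirty() internally */
        drbd_md_sync(mdev);                     /* write out now; otherwise the timer fires */
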
b411b363
PR
3805
3806static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3807{
3808 int i;
3809
62b0da3a 3810 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
b411b363 3811 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
b411b363
PR
3812}
3813
3814void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3815{
3816 if (idx == UI_CURRENT) {
3817 if (mdev->state.role == R_PRIMARY)
3818 val |= 1;
3819 else
3820 val &= ~((u64)1);
3821
3822 drbd_set_ed_uuid(mdev, val);
3823 }
3824
3825 mdev->ldev->md.uuid[idx] = val;
b411b363
PR
3826 drbd_md_mark_dirty(mdev);
3827}
3828
3829
3830void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3831{
3832 if (mdev->ldev->md.uuid[idx]) {
3833 drbd_uuid_move_history(mdev);
3834 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
b411b363
PR
3835 }
3836 _drbd_uuid_set(mdev, idx, val);
3837}
3838
3839/**
3840 * drbd_uuid_new_current() - Creates a new current UUID
3841 * @mdev: DRBD device.
3842 *
3843 * Creates a new current UUID, and rotates the old current UUID into
3844 * the bitmap slot. Causes an incremental resync upon next connect.
3845 */
3846void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3847{
3848 u64 val;
62b0da3a
LE
3849 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3850
3851 if (bm_uuid)
3852 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 3853
b411b363 3854 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
b411b363
PR
3855
3856 get_random_bytes(&val, sizeof(u64));
3857 _drbd_uuid_set(mdev, UI_CURRENT, val);
62b0da3a 3858 drbd_print_uuids(mdev, "new current UUID");
aaa8e2b3
LE
3859 /* get it to stable storage _now_ */
3860 drbd_md_sync(mdev);
b411b363
PR
3861}
3862
3863void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3864{
3865 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3866 return;
3867
3868 if (val == 0) {
3869 drbd_uuid_move_history(mdev);
3870 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3871 mdev->ldev->md.uuid[UI_BITMAP] = 0;
b411b363 3872 } else {
62b0da3a
LE
3873 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3874 if (bm_uuid)
3875 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 3876
62b0da3a 3877 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
b411b363
PR
3878 }
3879 drbd_md_mark_dirty(mdev);
3880}
3881
3882/**
3883 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3884 * @mdev: DRBD device.
3885 *
3886 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3887 */
3888int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3889{
3890 int rv = -EIO;
3891
3892 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3893 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3894 drbd_md_sync(mdev);
3895 drbd_bm_set_all(mdev);
3896
3897 rv = drbd_bm_write(mdev);
3898
3899 if (!rv) {
3900 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3901 drbd_md_sync(mdev);
3902 }
3903
3904 put_ldev(mdev);
3905 }
3906
3907 return rv;
3908}
3909
3910/**
3911 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3912 * @mdev: DRBD device.
3913 *
3914 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3915 */
3916int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3917{
3918 int rv = -EIO;
3919
0778286a 3920 drbd_resume_al(mdev);
b411b363
PR
3921 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3922 drbd_bm_clear_all(mdev);
3923 rv = drbd_bm_write(mdev);
3924 put_ldev(mdev);
3925 }
3926
3927 return rv;
3928}
3929
3930static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3931{
3932 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
02851e9f 3933 int rv = -EIO;
b411b363
PR
3934
3935 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3936
02851e9f 3937 if (get_ldev(mdev)) {
20ceb2b2 3938 drbd_bm_lock(mdev, work->why, work->flags);
02851e9f
LE
3939 rv = work->io_fn(mdev);
3940 drbd_bm_unlock(mdev);
3941 put_ldev(mdev);
3942 }
b411b363
PR
3943
3944 clear_bit(BITMAP_IO, &mdev->flags);
127b3178 3945 smp_mb__after_clear_bit();
b411b363
PR
3946 wake_up(&mdev->misc_wait);
3947
3948 if (work->done)
3949 work->done(mdev, rv);
3950
3951 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3952 work->why = NULL;
20ceb2b2 3953 work->flags = 0;
b411b363
PR
3954
3955 return 1;
3956}
3957
82f59cc6
LE
3958void drbd_ldev_destroy(struct drbd_conf *mdev)
3959{
3960 lc_destroy(mdev->resync);
3961 mdev->resync = NULL;
3962 lc_destroy(mdev->act_log);
3963 mdev->act_log = NULL;
3964 __no_warn(local,
3965 drbd_free_bc(mdev->ldev);
3966 mdev->ldev = NULL;);
3967
3968 if (mdev->md_io_tmpp) {
3969 __free_page(mdev->md_io_tmpp);
3970 mdev->md_io_tmpp = NULL;
3971 }
3972 clear_bit(GO_DISKLESS, &mdev->flags);
3973}
3974
e9e6f3ec
LE
3975static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3976{
3977 D_ASSERT(mdev->state.disk == D_FAILED);
9d282875
LE
3978 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3979 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
82f59cc6
LE
3980 * the protected members anymore, though, so once put_ldev reaches zero
3981 * again, it will be safe to free them. */
e9e6f3ec 3982 drbd_force_state(mdev, NS(disk, D_DISKLESS));
e9e6f3ec
LE
3983 return 1;
3984}
3985
3986void drbd_go_diskless(struct drbd_conf *mdev)
3987{
3988 D_ASSERT(mdev->state.disk == D_FAILED);
3989 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
e42325a5 3990 drbd_queue_work(&mdev->tconn->data.work, &mdev->go_diskless);
e9e6f3ec
LE
3991}
3992
b411b363
PR
3993/**
3994 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3995 * @mdev: DRBD device.
3996 * @io_fn: IO callback to be called when bitmap IO is possible
3997 * @done: callback to be called after the bitmap IO was performed
3998 * @why: Descriptive text of the reason for doing the IO
3999 *
4000 * While IO on the bitmap happens we freeze application IO, thus ensuring
4001 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
4002 * called from worker context. It MUST NOT be used while a previous such
4003 * work is still pending!
4004 */
4005void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4006 int (*io_fn)(struct drbd_conf *),
4007 void (*done)(struct drbd_conf *, int),
20ceb2b2 4008 char *why, enum bm_flag flags)
b411b363 4009{
e6b3ea83 4010 D_ASSERT(current == mdev->tconn->worker.task);
b411b363
PR
4011
4012 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4013 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4014 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4015 if (mdev->bm_io_work.why)
4016 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4017 why, mdev->bm_io_work.why);
4018
4019 mdev->bm_io_work.io_fn = io_fn;
4020 mdev->bm_io_work.done = done;
4021 mdev->bm_io_work.why = why;
20ceb2b2 4022 mdev->bm_io_work.flags = flags;
b411b363 4023
87eeee41 4024 spin_lock_irq(&mdev->tconn->req_lock);
b411b363
PR
4025 set_bit(BITMAP_IO, &mdev->flags);
4026 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
127b3178 4027 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
e42325a5 4028 drbd_queue_work(&mdev->tconn->data.work, &mdev->bm_io_work.w);
b411b363 4029 }
87eeee41 4030 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
4031}
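
A hedged usage sketch for drbd_queue_bitmap_io(); the callback and wrapper names are invented, while drbd_bmio_set_n_write() and BM_LOCKED_SET_ALLOWED are taken from elsewhere in this file:

static void example_bm_write_done(struct drbd_conf *mdev, int rv)
{
        if (rv)
                dev_err(DEV, "example bitmap write failed\n");
}

static void example_queue_full_bm_write(struct drbd_conf *mdev)
{
        /* worker context only, and no other bitmap work may be pending */
        drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, example_bm_write_done,
                             "example: set all bits", BM_LOCKED_SET_ALLOWED);
}
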
4032
4033/**
4034 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4035 * @mdev: DRBD device.
4036 * @io_fn: IO callback to be called when bitmap IO is possible
4037 * @why: Descriptive text of the reason for doing the IO
4038 *
4039 * Freezes application IO while the actual IO operation runs. This
4040 * function MAY NOT be called from worker context.
4041 */
20ceb2b2
LE
4042int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4043 char *why, enum bm_flag flags)
b411b363
PR
4044{
4045 int rv;
4046
e6b3ea83 4047 D_ASSERT(current != mdev->tconn->worker.task);
b411b363 4048
20ceb2b2
LE
4049 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4050 drbd_suspend_io(mdev);
b411b363 4051
20ceb2b2 4052 drbd_bm_lock(mdev, why, flags);
b411b363
PR
4053 rv = io_fn(mdev);
4054 drbd_bm_unlock(mdev);
4055
20ceb2b2
LE
4056 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4057 drbd_resume_io(mdev);
b411b363
PR
4058
4059 return rv;
4060}
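
The synchronous counterpart, again as a hedged sketch with an invented wrapper name; unlike the queued variant above it must not run in the worker:

static int example_clear_whole_bitmap(struct drbd_conf *mdev)
{
        return drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
                              "example: clear all bits", BM_LOCKED_SET_ALLOWED);
}
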
4061
4062void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4063{
4064 if ((mdev->ldev->md.flags & flag) != flag) {
4065 drbd_md_mark_dirty(mdev);
4066 mdev->ldev->md.flags |= flag;
4067 }
4068}
4069
4070void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4071{
4072 if ((mdev->ldev->md.flags & flag) != 0) {
4073 drbd_md_mark_dirty(mdev);
4074 mdev->ldev->md.flags &= ~flag;
4075 }
4076}
4077int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4078{
4079 return (bdev->md.flags & flag) != 0;
4080}
4081
4082static void md_sync_timer_fn(unsigned long data)
4083{
4084 struct drbd_conf *mdev = (struct drbd_conf *) data;
4085
e42325a5 4086 drbd_queue_work_front(&mdev->tconn->data.work, &mdev->md_sync_work);
b411b363
PR
4087}
4088
4089static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4090{
4091 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
ee15b038
LE
4092#ifdef DEBUG
4093 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4094 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4095#endif
b411b363 4096 drbd_md_sync(mdev);
b411b363
PR
4097 return 1;
4098}
4099
d8763023 4100const char *cmdname(enum drbd_packet cmd)
f2ad9063
AG
4101{
4102 /* THINK may need to become several global tables
4103 * when we want to support more than
4104 * one PRO_VERSION */
4105 static const char *cmdnames[] = {
4106 [P_DATA] = "Data",
4107 [P_DATA_REPLY] = "DataReply",
4108 [P_RS_DATA_REPLY] = "RSDataReply",
4109 [P_BARRIER] = "Barrier",
4110 [P_BITMAP] = "ReportBitMap",
4111 [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
4112 [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
4113 [P_UNPLUG_REMOTE] = "UnplugRemote",
4114 [P_DATA_REQUEST] = "DataRequest",
4115 [P_RS_DATA_REQUEST] = "RSDataRequest",
4116 [P_SYNC_PARAM] = "SyncParam",
4117 [P_SYNC_PARAM89] = "SyncParam89",
4118 [P_PROTOCOL] = "ReportProtocol",
4119 [P_UUIDS] = "ReportUUIDs",
4120 [P_SIZES] = "ReportSizes",
4121 [P_STATE] = "ReportState",
4122 [P_SYNC_UUID] = "ReportSyncUUID",
4123 [P_AUTH_CHALLENGE] = "AuthChallenge",
4124 [P_AUTH_RESPONSE] = "AuthResponse",
4125 [P_PING] = "Ping",
4126 [P_PING_ACK] = "PingAck",
4127 [P_RECV_ACK] = "RecvAck",
4128 [P_WRITE_ACK] = "WriteAck",
4129 [P_RS_WRITE_ACK] = "RSWriteAck",
4130 [P_DISCARD_ACK] = "DiscardAck",
4131 [P_NEG_ACK] = "NegAck",
4132 [P_NEG_DREPLY] = "NegDReply",
4133 [P_NEG_RS_DREPLY] = "NegRSDReply",
4134 [P_BARRIER_ACK] = "BarrierAck",
4135 [P_STATE_CHG_REQ] = "StateChgRequest",
4136 [P_STATE_CHG_REPLY] = "StateChgReply",
4137 [P_OV_REQUEST] = "OVRequest",
4138 [P_OV_REPLY] = "OVReply",
4139 [P_OV_RESULT] = "OVResult",
4140 [P_CSUM_RS_REQUEST] = "CsumRSRequest",
4141 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
4142 [P_COMPRESSED_BITMAP] = "CBitmap",
4143 [P_DELAY_PROBE] = "DelayProbe",
4144 [P_OUT_OF_SYNC] = "OutOfSync",
4145 [P_MAX_CMD] = NULL,
4146 };
4147
4148 if (cmd == P_HAND_SHAKE_M)
4149 return "HandShakeM";
4150 if (cmd == P_HAND_SHAKE_S)
4151 return "HandShakeS";
4152 if (cmd == P_HAND_SHAKE)
4153 return "HandShake";
4154 if (cmd >= P_MAX_CMD)
4155 return "Unknown";
4156 return cmdnames[cmd];
4157}
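
A one-line usage sketch, assuming a packet command value is being logged somewhere in the receiver code:

        dev_info(DEV, "received %s packet\n", cmdname(cmd));
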
4158
b411b363
PR
4159#ifdef CONFIG_DRBD_FAULT_INJECTION
4160/* Fault insertion support including random number generator shamelessly
4161 * stolen from kernel/rcutorture.c */
4162struct fault_random_state {
4163 unsigned long state;
4164 unsigned long count;
4165};
4166
4167#define FAULT_RANDOM_MULT 39916801 /* prime */
4168#define FAULT_RANDOM_ADD 479001701 /* prime */
4169#define FAULT_RANDOM_REFRESH 10000
4170
4171/*
4172 * Crude but fast random-number generator. Uses a linear congruential
4173 * generator, with occasional help from get_random_bytes().
4174 */
4175static unsigned long
4176_drbd_fault_random(struct fault_random_state *rsp)
4177{
4178 long refresh;
4179
49829ea7 4180 if (!rsp->count--) {
b411b363
PR
4181 get_random_bytes(&refresh, sizeof(refresh));
4182 rsp->state += refresh;
4183 rsp->count = FAULT_RANDOM_REFRESH;
4184 }
4185 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4186 return swahw32(rsp->state);
4187}
4188
4189static char *
4190_drbd_fault_str(unsigned int type) {
4191 static char *_faults[] = {
4192 [DRBD_FAULT_MD_WR] = "Meta-data write",
4193 [DRBD_FAULT_MD_RD] = "Meta-data read",
4194 [DRBD_FAULT_RS_WR] = "Resync write",
4195 [DRBD_FAULT_RS_RD] = "Resync read",
4196 [DRBD_FAULT_DT_WR] = "Data write",
4197 [DRBD_FAULT_DT_RD] = "Data read",
4198 [DRBD_FAULT_DT_RA] = "Data read ahead",
4199 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
6b4388ac
PR
4200 [DRBD_FAULT_AL_EE] = "EE allocation",
4201 [DRBD_FAULT_RECEIVE] = "receive data corruption",
b411b363
PR
4202 };
4203
4204 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4205}
4206
4207unsigned int
4208_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4209{
4210 static struct fault_random_state rrs = {0, 0};
4211
4212 unsigned int ret = (
4213 (fault_devs == 0 ||
4214 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4215 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4216
4217 if (ret) {
4218 fault_count++;
4219
7383506c 4220 if (__ratelimit(&drbd_ratelimit_state))
b411b363
PR
4221 dev_warn(DEV, "***Simulating %s failure\n",
4222 _drbd_fault_str(type));
4223 }
4224
4225 return ret;
4226}
4227#endif
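
A hedged sketch of how an IO path might consult the fault injector above; the wrapper function is invented, only _drbd_insert_fault() and the DRBD_FAULT_* types come from this file:

static int example_submit_md_write(struct drbd_conf *mdev)
{
#ifdef CONFIG_DRBD_FAULT_INJECTION
        if (_drbd_insert_fault(mdev, DRBD_FAULT_MD_WR))
                return -EIO;    /* simulate a meta-data write failure */
#endif
        /* ... submit the real bio here ... */
        return 0;
}
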
4228
4229const char *drbd_buildtag(void)
4230{
4231 /* DRBD built from external sources carries a reference to the
4232 git hash of the source code here. */
4233
4234 static char buildtag[38] = "\0uilt-in";
4235
4236 if (buildtag[0] == 0) {
4237#ifdef CONFIG_MODULES
4238 if (THIS_MODULE != NULL)
4239 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4240 else
4241#endif
4242 buildtag[0] = 'b';
4243 }
4244
4245 return buildtag;
4246}
4247
4248module_init(drbd_init)
4249module_exit(drbd_cleanup)
4250
b411b363
PR
4251EXPORT_SYMBOL(drbd_conn_str);
4252EXPORT_SYMBOL(drbd_role_str);
4253EXPORT_SYMBOL(drbd_disk_str);
4254EXPORT_SYMBOL(drbd_set_st_err_str);