drbd: Four new configuration settings for resync speed control
[deliverable/linux.git] / drivers / block / drbd / drbd_main.c
CommitLineData
b411b363
PR
1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
b411b363 29#include <linux/module.h>
b411b363
PR
30#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
35#include <linux/smp_lock.h>
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
b411b363
PR
55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
59struct after_state_chg_work {
60 struct drbd_work w;
61 union drbd_state os;
62 union drbd_state ns;
63 enum chg_state_flags flags;
64 struct completion *done;
65};
66
67int drbdd_init(struct drbd_thread *);
68int drbd_worker(struct drbd_thread *);
69int drbd_asender(struct drbd_thread *);
70
71int drbd_init(void);
72static int drbd_open(struct block_device *bdev, fmode_t mode);
73static int drbd_release(struct gendisk *gd, fmode_t mode);
74static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
75static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
76 union drbd_state ns, enum chg_state_flags flags);
77static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
78static void md_sync_timer_fn(unsigned long data);
79static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
80
b411b363
PR
81MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
82 "Lars Ellenberg <lars@linbit.com>");
83MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
84MODULE_VERSION(REL_VERSION);
85MODULE_LICENSE("GPL");
86MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
87MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
88
89#include <linux/moduleparam.h>
90/* allow_open_on_secondary */
91MODULE_PARM_DESC(allow_oos, "DONT USE!");
92/* thanks to these macros, if compiled into the kernel (not-module),
93 * this becomes the boot parameter drbd.minor_count */
94module_param(minor_count, uint, 0444);
95module_param(disable_sendpage, bool, 0644);
96module_param(allow_oos, bool, 0);
97module_param(cn_idx, uint, 0444);
98module_param(proc_details, int, 0644);
99
100#ifdef CONFIG_DRBD_FAULT_INJECTION
101int enable_faults;
102int fault_rate;
103static int fault_count;
104int fault_devs;
105/* bitmap of enabled faults */
106module_param(enable_faults, int, 0664);
107/* fault rate % value - applies to all enabled faults */
108module_param(fault_rate, int, 0664);
109/* count of faults inserted */
110module_param(fault_count, int, 0664);
111/* bitmap of devices to insert faults on */
112module_param(fault_devs, int, 0644);
113#endif
114
115/* module parameter, defined */
116unsigned int minor_count = 32;
117int disable_sendpage;
118int allow_oos;
119unsigned int cn_idx = CN_IDX_DRBD;
120int proc_details; /* Detail level in proc drbd*/
121
122/* Module parameter for setting the user mode helper program
123 * to run. Default is /sbin/drbdadm */
124char usermode_helper[80] = "/sbin/drbdadm";
125
126module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
127
128/* in 2.6.x, our device mapping and config info contains our virtual gendisks
129 * as member "struct gendisk *vdisk;"
130 */
131struct drbd_conf **minor_table;
132
133struct kmem_cache *drbd_request_cache;
134struct kmem_cache *drbd_ee_cache; /* epoch entries */
135struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
136struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
137mempool_t *drbd_request_mempool;
138mempool_t *drbd_ee_mempool;
139
140/* I do not use a standard mempool, because:
141 1) I want to hand out the pre-allocated objects first.
142 2) I want to be able to interrupt sleeping allocation with a signal.
143 Note: This is a single linked list, the next pointer is the private
144 member of struct page.
145 */
146struct page *drbd_pp_pool;
147spinlock_t drbd_pp_lock;
148int drbd_pp_vacant;
149wait_queue_head_t drbd_pp_wait;
150
151DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
152
7d4e9d09 153static const struct block_device_operations drbd_ops = {
b411b363
PR
154 .owner = THIS_MODULE,
155 .open = drbd_open,
156 .release = drbd_release,
157};
158
159#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
160
161#ifdef __CHECKER__
162/* When checking with sparse, and this is an inline function, sparse will
163 give tons of false positives. When this is a real functions sparse works.
164 */
165int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
166{
167 int io_allowed;
168
169 atomic_inc(&mdev->local_cnt);
170 io_allowed = (mdev->state.disk >= mins);
171 if (!io_allowed) {
172 if (atomic_dec_and_test(&mdev->local_cnt))
173 wake_up(&mdev->misc_wait);
174 }
175 return io_allowed;
176}
177
178#endif
179
180/**
181 * DOC: The transfer log
182 *
183 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
184 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
185 * of the list. There is always at least one &struct drbd_tl_epoch object.
186 *
187 * Each &struct drbd_tl_epoch has a circular double linked list of requests
188 * attached.
189 */
190static int tl_init(struct drbd_conf *mdev)
191{
192 struct drbd_tl_epoch *b;
193
194 /* during device minor initialization, we may well use GFP_KERNEL */
195 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
196 if (!b)
197 return 0;
198 INIT_LIST_HEAD(&b->requests);
199 INIT_LIST_HEAD(&b->w.list);
200 b->next = NULL;
201 b->br_number = 4711;
202 b->n_req = 0;
203 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
204
205 mdev->oldest_tle = b;
206 mdev->newest_tle = b;
207 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
208
209 mdev->tl_hash = NULL;
210 mdev->tl_hash_s = 0;
211
212 return 1;
213}
214
215static void tl_cleanup(struct drbd_conf *mdev)
216{
217 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
218 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
219 kfree(mdev->oldest_tle);
220 mdev->oldest_tle = NULL;
221 kfree(mdev->unused_spare_tle);
222 mdev->unused_spare_tle = NULL;
223 kfree(mdev->tl_hash);
224 mdev->tl_hash = NULL;
225 mdev->tl_hash_s = 0;
226}
227
228/**
229 * _tl_add_barrier() - Adds a barrier to the transfer log
230 * @mdev: DRBD device.
231 * @new: Barrier to be added before the current head of the TL.
232 *
233 * The caller must hold the req_lock.
234 */
235void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
236{
237 struct drbd_tl_epoch *newest_before;
238
239 INIT_LIST_HEAD(&new->requests);
240 INIT_LIST_HEAD(&new->w.list);
241 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
242 new->next = NULL;
243 new->n_req = 0;
244
245 newest_before = mdev->newest_tle;
246 /* never send a barrier number == 0, because that is special-cased
247 * when using TCQ for our write ordering code */
248 new->br_number = (newest_before->br_number+1) ?: 1;
249 if (mdev->newest_tle != new) {
250 mdev->newest_tle->next = new;
251 mdev->newest_tle = new;
252 }
253}
254
255/**
256 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
257 * @mdev: DRBD device.
258 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
259 * @set_size: Expected number of requests before that barrier.
260 *
261 * In case the passed barrier_nr or set_size does not match the oldest
262 * &struct drbd_tl_epoch objects this function will cause a termination
263 * of the connection.
264 */
265void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
266 unsigned int set_size)
267{
268 struct drbd_tl_epoch *b, *nob; /* next old barrier */
269 struct list_head *le, *tle;
270 struct drbd_request *r;
271
272 spin_lock_irq(&mdev->req_lock);
273
274 b = mdev->oldest_tle;
275
276 /* first some paranoia code */
277 if (b == NULL) {
278 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
279 barrier_nr);
280 goto bail;
281 }
282 if (b->br_number != barrier_nr) {
283 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
284 barrier_nr, b->br_number);
285 goto bail;
286 }
287 if (b->n_req != set_size) {
288 dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
289 barrier_nr, set_size, b->n_req);
290 goto bail;
291 }
292
293 /* Clean up list of requests processed during current epoch */
294 list_for_each_safe(le, tle, &b->requests) {
295 r = list_entry(le, struct drbd_request, tl_requests);
296 _req_mod(r, barrier_acked);
297 }
298 /* There could be requests on the list waiting for completion
299 of the write to the local disk. To avoid corruptions of
300 slab's data structures we have to remove the lists head.
301
302 Also there could have been a barrier ack out of sequence, overtaking
303 the write acks - which would be a bug and violating write ordering.
304 To not deadlock in case we lose connection while such requests are
305 still pending, we need some way to find them for the
306 _req_mode(connection_lost_while_pending).
307
308 These have been list_move'd to the out_of_sequence_requests list in
309 _req_mod(, barrier_acked) above.
310 */
311 list_del_init(&b->requests);
312
313 nob = b->next;
314 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
315 _tl_add_barrier(mdev, b);
316 if (nob)
317 mdev->oldest_tle = nob;
318 /* if nob == NULL b was the only barrier, and becomes the new
319 barrier. Therefore mdev->oldest_tle points already to b */
320 } else {
321 D_ASSERT(nob != NULL);
322 mdev->oldest_tle = nob;
323 kfree(b);
324 }
325
326 spin_unlock_irq(&mdev->req_lock);
327 dec_ap_pending(mdev);
328
329 return;
330
331bail:
332 spin_unlock_irq(&mdev->req_lock);
333 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
334}
335
336
337/**
338 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
339 * @mdev: DRBD device.
340 *
341 * This is called after the connection to the peer was lost. The storage covered
342 * by the requests on the transfer gets marked as our of sync. Called from the
343 * receiver thread and the worker thread.
344 */
345void tl_clear(struct drbd_conf *mdev)
346{
347 struct drbd_tl_epoch *b, *tmp;
348 struct list_head *le, *tle;
349 struct drbd_request *r;
350 int new_initial_bnr = net_random();
351
352 spin_lock_irq(&mdev->req_lock);
353
354 b = mdev->oldest_tle;
355 while (b) {
356 list_for_each_safe(le, tle, &b->requests) {
357 r = list_entry(le, struct drbd_request, tl_requests);
358 /* It would be nice to complete outside of spinlock.
359 * But this is easier for now. */
360 _req_mod(r, connection_lost_while_pending);
361 }
362 tmp = b->next;
363
364 /* there could still be requests on that ring list,
365 * in case local io is still pending */
366 list_del(&b->requests);
367
368 /* dec_ap_pending corresponding to queue_barrier.
369 * the newest barrier may not have been queued yet,
370 * in which case w.cb is still NULL. */
371 if (b->w.cb != NULL)
372 dec_ap_pending(mdev);
373
374 if (b == mdev->newest_tle) {
375 /* recycle, but reinit! */
376 D_ASSERT(tmp == NULL);
377 INIT_LIST_HEAD(&b->requests);
378 INIT_LIST_HEAD(&b->w.list);
379 b->w.cb = NULL;
380 b->br_number = new_initial_bnr;
381 b->n_req = 0;
382
383 mdev->oldest_tle = b;
384 break;
385 }
386 kfree(b);
387 b = tmp;
388 }
389
390 /* we expect this list to be empty. */
391 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
392
393 /* but just in case, clean it up anyways! */
394 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
395 r = list_entry(le, struct drbd_request, tl_requests);
396 /* It would be nice to complete outside of spinlock.
397 * But this is easier for now. */
398 _req_mod(r, connection_lost_while_pending);
399 }
400
401 /* ensure bit indicating barrier is required is clear */
402 clear_bit(CREATE_BARRIER, &mdev->flags);
403
404 spin_unlock_irq(&mdev->req_lock);
405}
406
407/**
408 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
409 * @mdev: DRBD device.
410 * @os: old (current) state.
411 * @ns: new (wanted) state.
412 */
413static int cl_wide_st_chg(struct drbd_conf *mdev,
414 union drbd_state os, union drbd_state ns)
415{
416 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
417 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
418 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
419 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
420 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
421 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
422 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
423}
424
425int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
426 union drbd_state mask, union drbd_state val)
427{
428 unsigned long flags;
429 union drbd_state os, ns;
430 int rv;
431
432 spin_lock_irqsave(&mdev->req_lock, flags);
433 os = mdev->state;
434 ns.i = (os.i & ~mask.i) | val.i;
435 rv = _drbd_set_state(mdev, ns, f, NULL);
436 ns = mdev->state;
437 spin_unlock_irqrestore(&mdev->req_lock, flags);
438
439 return rv;
440}
441
442/**
443 * drbd_force_state() - Impose a change which happens outside our control on our state
444 * @mdev: DRBD device.
445 * @mask: mask of state bits to change.
446 * @val: value of new state bits.
447 */
448void drbd_force_state(struct drbd_conf *mdev,
449 union drbd_state mask, union drbd_state val)
450{
451 drbd_change_state(mdev, CS_HARD, mask, val);
452}
453
454static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
455static int is_valid_state_transition(struct drbd_conf *,
456 union drbd_state, union drbd_state);
457static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
458 union drbd_state ns, int *warn_sync_abort);
459int drbd_send_state_req(struct drbd_conf *,
460 union drbd_state, union drbd_state);
461
462static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
463 union drbd_state mask, union drbd_state val)
464{
465 union drbd_state os, ns;
466 unsigned long flags;
467 int rv;
468
469 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
470 return SS_CW_SUCCESS;
471
472 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
473 return SS_CW_FAILED_BY_PEER;
474
475 rv = 0;
476 spin_lock_irqsave(&mdev->req_lock, flags);
477 os = mdev->state;
478 ns.i = (os.i & ~mask.i) | val.i;
479 ns = sanitize_state(mdev, os, ns, NULL);
480
481 if (!cl_wide_st_chg(mdev, os, ns))
482 rv = SS_CW_NO_NEED;
483 if (!rv) {
484 rv = is_valid_state(mdev, ns);
485 if (rv == SS_SUCCESS) {
486 rv = is_valid_state_transition(mdev, ns, os);
487 if (rv == SS_SUCCESS)
488 rv = 0; /* cont waiting, otherwise fail. */
489 }
490 }
491 spin_unlock_irqrestore(&mdev->req_lock, flags);
492
493 return rv;
494}
495
496/**
497 * drbd_req_state() - Perform an eventually cluster wide state change
498 * @mdev: DRBD device.
499 * @mask: mask of state bits to change.
500 * @val: value of new state bits.
501 * @f: flags
502 *
503 * Should not be called directly, use drbd_request_state() or
504 * _drbd_request_state().
505 */
506static int drbd_req_state(struct drbd_conf *mdev,
507 union drbd_state mask, union drbd_state val,
508 enum chg_state_flags f)
509{
510 struct completion done;
511 unsigned long flags;
512 union drbd_state os, ns;
513 int rv;
514
515 init_completion(&done);
516
517 if (f & CS_SERIALIZE)
518 mutex_lock(&mdev->state_mutex);
519
520 spin_lock_irqsave(&mdev->req_lock, flags);
521 os = mdev->state;
522 ns.i = (os.i & ~mask.i) | val.i;
523 ns = sanitize_state(mdev, os, ns, NULL);
524
525 if (cl_wide_st_chg(mdev, os, ns)) {
526 rv = is_valid_state(mdev, ns);
527 if (rv == SS_SUCCESS)
528 rv = is_valid_state_transition(mdev, ns, os);
529 spin_unlock_irqrestore(&mdev->req_lock, flags);
530
531 if (rv < SS_SUCCESS) {
532 if (f & CS_VERBOSE)
533 print_st_err(mdev, os, ns, rv);
534 goto abort;
535 }
536
537 drbd_state_lock(mdev);
538 if (!drbd_send_state_req(mdev, mask, val)) {
539 drbd_state_unlock(mdev);
540 rv = SS_CW_FAILED_BY_PEER;
541 if (f & CS_VERBOSE)
542 print_st_err(mdev, os, ns, rv);
543 goto abort;
544 }
545
546 wait_event(mdev->state_wait,
547 (rv = _req_st_cond(mdev, mask, val)));
548
549 if (rv < SS_SUCCESS) {
550 drbd_state_unlock(mdev);
551 if (f & CS_VERBOSE)
552 print_st_err(mdev, os, ns, rv);
553 goto abort;
554 }
555 spin_lock_irqsave(&mdev->req_lock, flags);
556 os = mdev->state;
557 ns.i = (os.i & ~mask.i) | val.i;
558 rv = _drbd_set_state(mdev, ns, f, &done);
559 drbd_state_unlock(mdev);
560 } else {
561 rv = _drbd_set_state(mdev, ns, f, &done);
562 }
563
564 spin_unlock_irqrestore(&mdev->req_lock, flags);
565
566 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
567 D_ASSERT(current != mdev->worker.task);
568 wait_for_completion(&done);
569 }
570
571abort:
572 if (f & CS_SERIALIZE)
573 mutex_unlock(&mdev->state_mutex);
574
575 return rv;
576}
577
578/**
579 * _drbd_request_state() - Request a state change (with flags)
580 * @mdev: DRBD device.
581 * @mask: mask of state bits to change.
582 * @val: value of new state bits.
583 * @f: flags
584 *
585 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
586 * flag, or when logging of failed state change requests is not desired.
587 */
588int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
589 union drbd_state val, enum chg_state_flags f)
590{
591 int rv;
592
593 wait_event(mdev->state_wait,
594 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
595
596 return rv;
597}
598
599static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
600{
601 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
602 name,
603 drbd_conn_str(ns.conn),
604 drbd_role_str(ns.role),
605 drbd_role_str(ns.peer),
606 drbd_disk_str(ns.disk),
607 drbd_disk_str(ns.pdsk),
608 ns.susp ? 's' : 'r',
609 ns.aftr_isp ? 'a' : '-',
610 ns.peer_isp ? 'p' : '-',
611 ns.user_isp ? 'u' : '-'
612 );
613}
614
615void print_st_err(struct drbd_conf *mdev,
616 union drbd_state os, union drbd_state ns, int err)
617{
618 if (err == SS_IN_TRANSIENT_STATE)
619 return;
620 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
621 print_st(mdev, " state", os);
622 print_st(mdev, "wanted", ns);
623}
624
625
626#define drbd_peer_str drbd_role_str
627#define drbd_pdsk_str drbd_disk_str
628
629#define drbd_susp_str(A) ((A) ? "1" : "0")
630#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
631#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
632#define drbd_user_isp_str(A) ((A) ? "1" : "0")
633
634#define PSC(A) \
635 ({ if (ns.A != os.A) { \
636 pbp += sprintf(pbp, #A "( %s -> %s ) ", \
637 drbd_##A##_str(os.A), \
638 drbd_##A##_str(ns.A)); \
639 } })
640
641/**
642 * is_valid_state() - Returns an SS_ error code if ns is not valid
643 * @mdev: DRBD device.
644 * @ns: State to consider.
645 */
646static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
647{
648 /* See drbd_state_sw_errors in drbd_strings.c */
649
650 enum drbd_fencing_p fp;
651 int rv = SS_SUCCESS;
652
653 fp = FP_DONT_CARE;
654 if (get_ldev(mdev)) {
655 fp = mdev->ldev->dc.fencing;
656 put_ldev(mdev);
657 }
658
659 if (get_net_conf(mdev)) {
660 if (!mdev->net_conf->two_primaries &&
661 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
662 rv = SS_TWO_PRIMARIES;
663 put_net_conf(mdev);
664 }
665
666 if (rv <= 0)
667 /* already found a reason to abort */;
668 else if (ns.role == R_SECONDARY && mdev->open_cnt)
669 rv = SS_DEVICE_IN_USE;
670
671 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
672 rv = SS_NO_UP_TO_DATE_DISK;
673
674 else if (fp >= FP_RESOURCE &&
675 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
676 rv = SS_PRIMARY_NOP;
677
678 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
679 rv = SS_NO_UP_TO_DATE_DISK;
680
681 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
682 rv = SS_NO_LOCAL_DISK;
683
684 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
685 rv = SS_NO_REMOTE_DISK;
686
8d4ce82b
LE
687 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
688 rv = SS_NO_UP_TO_DATE_DISK;
689
b411b363
PR
690 else if ((ns.conn == C_CONNECTED ||
691 ns.conn == C_WF_BITMAP_S ||
692 ns.conn == C_SYNC_SOURCE ||
693 ns.conn == C_PAUSED_SYNC_S) &&
694 ns.disk == D_OUTDATED)
695 rv = SS_CONNECTED_OUTDATES;
696
697 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
698 (mdev->sync_conf.verify_alg[0] == 0))
699 rv = SS_NO_VERIFY_ALG;
700
701 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
702 mdev->agreed_pro_version < 88)
703 rv = SS_NOT_SUPPORTED;
704
705 return rv;
706}
707
708/**
709 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
710 * @mdev: DRBD device.
711 * @ns: new state.
712 * @os: old state.
713 */
714static int is_valid_state_transition(struct drbd_conf *mdev,
715 union drbd_state ns, union drbd_state os)
716{
717 int rv = SS_SUCCESS;
718
719 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
720 os.conn > C_CONNECTED)
721 rv = SS_RESYNC_RUNNING;
722
723 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
724 rv = SS_ALREADY_STANDALONE;
725
726 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
727 rv = SS_IS_DISKLESS;
728
729 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
730 rv = SS_NO_NET_CONFIG;
731
732 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
733 rv = SS_LOWER_THAN_OUTDATED;
734
735 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
736 rv = SS_IN_TRANSIENT_STATE;
737
738 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
739 rv = SS_IN_TRANSIENT_STATE;
740
741 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
742 rv = SS_NEED_CONNECTION;
743
744 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
745 ns.conn != os.conn && os.conn > C_CONNECTED)
746 rv = SS_RESYNC_RUNNING;
747
748 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
749 os.conn < C_CONNECTED)
750 rv = SS_NEED_CONNECTION;
751
752 return rv;
753}
754
755/**
756 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
757 * @mdev: DRBD device.
758 * @os: old state.
759 * @ns: new state.
760 * @warn_sync_abort:
761 *
762 * When we loose connection, we have to set the state of the peers disk (pdsk)
763 * to D_UNKNOWN. This rule and many more along those lines are in this function.
764 */
765static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
766 union drbd_state ns, int *warn_sync_abort)
767{
768 enum drbd_fencing_p fp;
769
770 fp = FP_DONT_CARE;
771 if (get_ldev(mdev)) {
772 fp = mdev->ldev->dc.fencing;
773 put_ldev(mdev);
774 }
775
776 /* Disallow Network errors to configure a device's network part */
777 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
778 os.conn <= C_DISCONNECTING)
779 ns.conn = os.conn;
780
781 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
782 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
783 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
784 ns.conn = os.conn;
785
786 /* After C_DISCONNECTING only C_STANDALONE may follow */
787 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
788 ns.conn = os.conn;
789
790 if (ns.conn < C_CONNECTED) {
791 ns.peer_isp = 0;
792 ns.peer = R_UNKNOWN;
793 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
794 ns.pdsk = D_UNKNOWN;
795 }
796
797 /* Clear the aftr_isp when becoming unconfigured */
798 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
799 ns.aftr_isp = 0;
800
801 if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
802 ns.pdsk = D_UNKNOWN;
803
804 /* Abort resync if a disk fails/detaches */
805 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
806 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
807 if (warn_sync_abort)
808 *warn_sync_abort = 1;
809 ns.conn = C_CONNECTED;
810 }
811
812 if (ns.conn >= C_CONNECTED &&
813 ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
814 (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
815 switch (ns.conn) {
816 case C_WF_BITMAP_T:
817 case C_PAUSED_SYNC_T:
818 ns.disk = D_OUTDATED;
819 break;
820 case C_CONNECTED:
821 case C_WF_BITMAP_S:
822 case C_SYNC_SOURCE:
823 case C_PAUSED_SYNC_S:
824 ns.disk = D_UP_TO_DATE;
825 break;
826 case C_SYNC_TARGET:
827 ns.disk = D_INCONSISTENT;
828 dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
829 break;
830 }
831 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
832 dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
833 }
834
835 if (ns.conn >= C_CONNECTED &&
836 (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
837 switch (ns.conn) {
838 case C_CONNECTED:
839 case C_WF_BITMAP_T:
840 case C_PAUSED_SYNC_T:
841 case C_SYNC_TARGET:
842 ns.pdsk = D_UP_TO_DATE;
843 break;
844 case C_WF_BITMAP_S:
845 case C_PAUSED_SYNC_S:
e0f83012
LE
846 /* remap any consistent state to D_OUTDATED,
847 * but disallow "upgrade" of not even consistent states.
848 */
849 ns.pdsk =
850 (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
851 ? os.pdsk : D_OUTDATED;
b411b363
PR
852 break;
853 case C_SYNC_SOURCE:
854 ns.pdsk = D_INCONSISTENT;
855 dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
856 break;
857 }
858 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
859 dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
860 }
861
862 /* Connection breaks down before we finished "Negotiating" */
863 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
864 get_ldev_if_state(mdev, D_NEGOTIATING)) {
865 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
866 ns.disk = mdev->new_state_tmp.disk;
867 ns.pdsk = mdev->new_state_tmp.pdsk;
868 } else {
869 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
870 ns.disk = D_DISKLESS;
871 ns.pdsk = D_UNKNOWN;
872 }
873 put_ldev(mdev);
874 }
875
876 if (fp == FP_STONITH &&
0a492166
PR
877 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
878 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
879 ns.susp = 1;
b411b363
PR
880
881 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
882 if (ns.conn == C_SYNC_SOURCE)
883 ns.conn = C_PAUSED_SYNC_S;
884 if (ns.conn == C_SYNC_TARGET)
885 ns.conn = C_PAUSED_SYNC_T;
886 } else {
887 if (ns.conn == C_PAUSED_SYNC_S)
888 ns.conn = C_SYNC_SOURCE;
889 if (ns.conn == C_PAUSED_SYNC_T)
890 ns.conn = C_SYNC_TARGET;
891 }
892
893 return ns;
894}
895
896/* helper for __drbd_set_state */
897static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
898{
899 if (cs == C_VERIFY_T) {
900 /* starting online verify from an arbitrary position
901 * does not fit well into the existing protocol.
902 * on C_VERIFY_T, we initialize ov_left and friends
903 * implicitly in receive_DataRequest once the
904 * first P_OV_REQUEST is received */
905 mdev->ov_start_sector = ~(sector_t)0;
906 } else {
907 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
908 if (bit >= mdev->rs_total)
909 mdev->ov_start_sector =
910 BM_BIT_TO_SECT(mdev->rs_total - 1);
911 mdev->ov_position = mdev->ov_start_sector;
912 }
913}
914
915/**
916 * __drbd_set_state() - Set a new DRBD state
917 * @mdev: DRBD device.
918 * @ns: new state.
919 * @flags: Flags
920 * @done: Optional completion, that will get completed after the after_state_ch() finished
921 *
922 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
923 */
924int __drbd_set_state(struct drbd_conf *mdev,
925 union drbd_state ns, enum chg_state_flags flags,
926 struct completion *done)
927{
928 union drbd_state os;
929 int rv = SS_SUCCESS;
930 int warn_sync_abort = 0;
931 struct after_state_chg_work *ascw;
932
933 os = mdev->state;
934
935 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
936
937 if (ns.i == os.i)
938 return SS_NOTHING_TO_DO;
939
940 if (!(flags & CS_HARD)) {
941 /* pre-state-change checks ; only look at ns */
942 /* See drbd_state_sw_errors in drbd_strings.c */
943
944 rv = is_valid_state(mdev, ns);
945 if (rv < SS_SUCCESS) {
946 /* If the old state was illegal as well, then let
947 this happen...*/
948
949 if (is_valid_state(mdev, os) == rv) {
950 dev_err(DEV, "Considering state change from bad state. "
951 "Error would be: '%s'\n",
952 drbd_set_st_err_str(rv));
953 print_st(mdev, "old", os);
954 print_st(mdev, "new", ns);
955 rv = is_valid_state_transition(mdev, ns, os);
956 }
957 } else
958 rv = is_valid_state_transition(mdev, ns, os);
959 }
960
961 if (rv < SS_SUCCESS) {
962 if (flags & CS_VERBOSE)
963 print_st_err(mdev, os, ns, rv);
964 return rv;
965 }
966
967 if (warn_sync_abort)
968 dev_warn(DEV, "Resync aborted.\n");
969
970 {
971 char *pbp, pb[300];
972 pbp = pb;
973 *pbp = 0;
974 PSC(role);
975 PSC(peer);
976 PSC(conn);
977 PSC(disk);
978 PSC(pdsk);
979 PSC(susp);
980 PSC(aftr_isp);
981 PSC(peer_isp);
982 PSC(user_isp);
983 dev_info(DEV, "%s\n", pb);
984 }
985
986 /* solve the race between becoming unconfigured,
987 * worker doing the cleanup, and
988 * admin reconfiguring us:
989 * on (re)configure, first set CONFIG_PENDING,
990 * then wait for a potentially exiting worker,
991 * start the worker, and schedule one no_op.
992 * then proceed with configuration.
993 */
994 if (ns.disk == D_DISKLESS &&
995 ns.conn == C_STANDALONE &&
996 ns.role == R_SECONDARY &&
997 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
998 set_bit(DEVICE_DYING, &mdev->flags);
999
1000 mdev->state.i = ns.i;
1001 wake_up(&mdev->misc_wait);
1002 wake_up(&mdev->state_wait);
1003
1004 /* post-state-change actions */
1005 if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) {
1006 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1007 mod_timer(&mdev->resync_timer, jiffies);
1008 }
1009
1010 /* aborted verify run. log the last position */
1011 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1012 ns.conn < C_CONNECTED) {
1013 mdev->ov_start_sector =
1014 BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
1015 dev_info(DEV, "Online Verify reached sector %llu\n",
1016 (unsigned long long)mdev->ov_start_sector);
1017 }
1018
1019 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1020 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1021 dev_info(DEV, "Syncer continues.\n");
1022 mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
1023 if (ns.conn == C_SYNC_TARGET) {
1024 if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
1025 mod_timer(&mdev->resync_timer, jiffies);
1026 /* This if (!test_bit) is only needed for the case
1027 that a device that has ceased to used its timer,
1028 i.e. it is already in drbd_resync_finished() gets
1029 paused and resumed. */
1030 }
1031 }
1032
1033 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1034 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1035 dev_info(DEV, "Resync suspended\n");
1036 mdev->rs_mark_time = jiffies;
1037 if (ns.conn == C_PAUSED_SYNC_T)
1038 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1039 }
1040
1041 if (os.conn == C_CONNECTED &&
1042 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1043 mdev->ov_position = 0;
1044 mdev->rs_total =
1045 mdev->rs_mark_left = drbd_bm_bits(mdev);
1046 if (mdev->agreed_pro_version >= 90)
1047 set_ov_position(mdev, ns.conn);
1048 else
1049 mdev->ov_start_sector = 0;
1050 mdev->ov_left = mdev->rs_total
1051 - BM_SECT_TO_BIT(mdev->ov_position);
1052 mdev->rs_start =
1053 mdev->rs_mark_time = jiffies;
1054 mdev->ov_last_oos_size = 0;
1055 mdev->ov_last_oos_start = 0;
1056
1057 if (ns.conn == C_VERIFY_S) {
1058 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1059 (unsigned long long)mdev->ov_position);
1060 mod_timer(&mdev->resync_timer, jiffies);
1061 }
1062 }
1063
1064 if (get_ldev(mdev)) {
1065 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1066 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1067 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1068
1069 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1070 mdf |= MDF_CRASHED_PRIMARY;
1071 if (mdev->state.role == R_PRIMARY ||
1072 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1073 mdf |= MDF_PRIMARY_IND;
1074 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1075 mdf |= MDF_CONNECTED_IND;
1076 if (mdev->state.disk > D_INCONSISTENT)
1077 mdf |= MDF_CONSISTENT;
1078 if (mdev->state.disk > D_OUTDATED)
1079 mdf |= MDF_WAS_UP_TO_DATE;
1080 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1081 mdf |= MDF_PEER_OUT_DATED;
1082 if (mdf != mdev->ldev->md.flags) {
1083 mdev->ldev->md.flags = mdf;
1084 drbd_md_mark_dirty(mdev);
1085 }
1086 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1087 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1088 put_ldev(mdev);
1089 }
1090
1091 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1092 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1093 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1094 set_bit(CONSIDER_RESYNC, &mdev->flags);
1095
1096 /* Receiver should clean up itself */
1097 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1098 drbd_thread_stop_nowait(&mdev->receiver);
1099
1100 /* Now the receiver finished cleaning up itself, it should die */
1101 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1102 drbd_thread_stop_nowait(&mdev->receiver);
1103
1104 /* Upon network failure, we need to restart the receiver. */
1105 if (os.conn > C_TEAR_DOWN &&
1106 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1107 drbd_thread_restart_nowait(&mdev->receiver);
1108
1109 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1110 if (ascw) {
1111 ascw->os = os;
1112 ascw->ns = ns;
1113 ascw->flags = flags;
1114 ascw->w.cb = w_after_state_ch;
1115 ascw->done = done;
1116 drbd_queue_work(&mdev->data.work, &ascw->w);
1117 } else {
1118 dev_warn(DEV, "Could not kmalloc an ascw\n");
1119 }
1120
1121 return rv;
1122}
1123
1124static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1125{
1126 struct after_state_chg_work *ascw =
1127 container_of(w, struct after_state_chg_work, w);
1128 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1129 if (ascw->flags & CS_WAIT_COMPLETE) {
1130 D_ASSERT(ascw->done != NULL);
1131 complete(ascw->done);
1132 }
1133 kfree(ascw);
1134
1135 return 1;
1136}
1137
1138static void abw_start_sync(struct drbd_conf *mdev, int rv)
1139{
1140 if (rv) {
1141 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1142 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1143 return;
1144 }
1145
1146 switch (mdev->state.conn) {
1147 case C_STARTING_SYNC_T:
1148 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1149 break;
1150 case C_STARTING_SYNC_S:
1151 drbd_start_resync(mdev, C_SYNC_SOURCE);
1152 break;
1153 }
1154}
1155
1156/**
1157 * after_state_ch() - Perform after state change actions that may sleep
1158 * @mdev: DRBD device.
1159 * @os: old state.
1160 * @ns: new state.
1161 * @flags: Flags
1162 */
1163static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1164 union drbd_state ns, enum chg_state_flags flags)
1165{
1166 enum drbd_fencing_p fp;
1167
1168 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1169 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1170 if (mdev->p_uuid)
1171 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1172 }
1173
1174 fp = FP_DONT_CARE;
1175 if (get_ldev(mdev)) {
1176 fp = mdev->ldev->dc.fencing;
1177 put_ldev(mdev);
1178 }
1179
1180 /* Inform userspace about the change... */
1181 drbd_bcast_state(mdev, ns);
1182
1183 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1184 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1185 drbd_khelper(mdev, "pri-on-incon-degr");
1186
1187 /* Here we have the actions that are performed after a
1188 state change. This function might sleep */
1189
1190 if (fp == FP_STONITH && ns.susp) {
1191 /* case1: The outdate peer handler is successful:
1192 * case2: The connection was established again: */
1193 if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) ||
1194 (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
1195 tl_clear(mdev);
1196 spin_lock_irq(&mdev->req_lock);
1197 _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
1198 spin_unlock_irq(&mdev->req_lock);
1199 }
1200 }
1201 /* Do not change the order of the if above and the two below... */
1202 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1203 drbd_send_uuids(mdev);
1204 drbd_send_state(mdev);
1205 }
1206 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1207 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1208
1209 /* Lost contact to peer's copy of the data */
1210 if ((os.pdsk >= D_INCONSISTENT &&
1211 os.pdsk != D_UNKNOWN &&
1212 os.pdsk != D_OUTDATED)
1213 && (ns.pdsk < D_INCONSISTENT ||
1214 ns.pdsk == D_UNKNOWN ||
1215 ns.pdsk == D_OUTDATED)) {
1216 kfree(mdev->p_uuid);
1217 mdev->p_uuid = NULL;
1218 if (get_ldev(mdev)) {
1219 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1220 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1221 drbd_uuid_new_current(mdev);
1222 drbd_send_uuids(mdev);
1223 }
1224 put_ldev(mdev);
1225 }
1226 }
1227
1228 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1229 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
1230 drbd_uuid_new_current(mdev);
1231
1232 /* D_DISKLESS Peer becomes secondary */
1233 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1234 drbd_al_to_on_disk_bm(mdev);
1235 put_ldev(mdev);
1236 }
1237
1238 /* Last part of the attaching process ... */
1239 if (ns.conn >= C_CONNECTED &&
1240 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1241 kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
1242 mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
e89b591c 1243 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
b411b363
PR
1244 drbd_send_uuids(mdev);
1245 drbd_send_state(mdev);
1246 }
1247
1248 /* We want to pause/continue resync, tell peer. */
1249 if (ns.conn >= C_CONNECTED &&
1250 ((os.aftr_isp != ns.aftr_isp) ||
1251 (os.user_isp != ns.user_isp)))
1252 drbd_send_state(mdev);
1253
1254 /* In case one of the isp bits got set, suspend other devices. */
1255 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1256 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1257 suspend_other_sg(mdev);
1258
1259 /* Make sure the peer gets informed about eventual state
1260 changes (ISP bits) while we were in WFReportParams. */
1261 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1262 drbd_send_state(mdev);
1263
1264 /* We are in the progress to start a full sync... */
1265 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1266 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1267 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1268
1269 /* We are invalidating our self... */
1270 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1271 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1272 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1273
1274 if (os.disk > D_FAILED && ns.disk == D_FAILED) {
1275 enum drbd_io_error_p eh;
1276
1277 eh = EP_PASS_ON;
1278 if (get_ldev_if_state(mdev, D_FAILED)) {
1279 eh = mdev->ldev->dc.on_io_error;
1280 put_ldev(mdev);
1281 }
1282
1283 drbd_rs_cancel_all(mdev);
1284 /* since get_ldev() only works as long as disk>=D_INCONSISTENT,
1285 and it is D_DISKLESS here, local_cnt can only go down, it can
1286 not increase... It will reach zero */
1287 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1288 mdev->rs_total = 0;
1289 mdev->rs_failed = 0;
1290 atomic_set(&mdev->rs_pending_cnt, 0);
1291
1292 spin_lock_irq(&mdev->req_lock);
1293 _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
1294 spin_unlock_irq(&mdev->req_lock);
1295
1296 if (eh == EP_CALL_HELPER)
1297 drbd_khelper(mdev, "local-io-error");
1298 }
1299
1300 if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1301
1302 if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
1303 if (drbd_send_state(mdev))
1304 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1305 else
1306 dev_err(DEV, "Sending state in drbd_io_error() failed\n");
1307 }
1308
0a6dbf2b 1309 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
b411b363
PR
1310 lc_destroy(mdev->resync);
1311 mdev->resync = NULL;
1312 lc_destroy(mdev->act_log);
1313 mdev->act_log = NULL;
1314 __no_warn(local,
1315 drbd_free_bc(mdev->ldev);
1316 mdev->ldev = NULL;);
1317
1318 if (mdev->md_io_tmpp)
1319 __free_page(mdev->md_io_tmpp);
1320 }
1321
1322 /* Disks got bigger while they were detached */
1323 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1324 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1325 if (ns.conn == C_CONNECTED)
1326 resync_after_online_grow(mdev);
1327 }
1328
1329 /* A resync finished or aborted, wake paused devices... */
1330 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1331 (os.peer_isp && !ns.peer_isp) ||
1332 (os.user_isp && !ns.user_isp))
1333 resume_next_sg(mdev);
1334
1335 /* Upon network connection, we need to start the receiver */
1336 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1337 drbd_thread_start(&mdev->receiver);
1338
1339 /* Terminate worker thread if we are unconfigured - it will be
1340 restarted as needed... */
1341 if (ns.disk == D_DISKLESS &&
1342 ns.conn == C_STANDALONE &&
1343 ns.role == R_SECONDARY) {
1344 if (os.aftr_isp != ns.aftr_isp)
1345 resume_next_sg(mdev);
1346 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1347 if (test_bit(DEVICE_DYING, &mdev->flags))
1348 drbd_thread_stop_nowait(&mdev->worker);
1349 }
1350
1351 drbd_md_sync(mdev);
1352}
1353
1354
1355static int drbd_thread_setup(void *arg)
1356{
1357 struct drbd_thread *thi = (struct drbd_thread *) arg;
1358 struct drbd_conf *mdev = thi->mdev;
1359 unsigned long flags;
1360 int retval;
1361
1362restart:
1363 retval = thi->function(thi);
1364
1365 spin_lock_irqsave(&thi->t_lock, flags);
1366
1367 /* if the receiver has been "Exiting", the last thing it did
1368 * was set the conn state to "StandAlone",
1369 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1370 * and receiver thread will be "started".
1371 * drbd_thread_start needs to set "Restarting" in that case.
1372 * t_state check and assignment needs to be within the same spinlock,
1373 * so either thread_start sees Exiting, and can remap to Restarting,
1374 * or thread_start see None, and can proceed as normal.
1375 */
1376
1377 if (thi->t_state == Restarting) {
1378 dev_info(DEV, "Restarting %s\n", current->comm);
1379 thi->t_state = Running;
1380 spin_unlock_irqrestore(&thi->t_lock, flags);
1381 goto restart;
1382 }
1383
1384 thi->task = NULL;
1385 thi->t_state = None;
1386 smp_mb();
1387 complete(&thi->stop);
1388 spin_unlock_irqrestore(&thi->t_lock, flags);
1389
1390 dev_info(DEV, "Terminating %s\n", current->comm);
1391
1392 /* Release mod reference taken when thread was started */
1393 module_put(THIS_MODULE);
1394 return retval;
1395}
1396
1397static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1398 int (*func) (struct drbd_thread *))
1399{
1400 spin_lock_init(&thi->t_lock);
1401 thi->task = NULL;
1402 thi->t_state = None;
1403 thi->function = func;
1404 thi->mdev = mdev;
1405}
1406
1407int drbd_thread_start(struct drbd_thread *thi)
1408{
1409 struct drbd_conf *mdev = thi->mdev;
1410 struct task_struct *nt;
1411 unsigned long flags;
1412
1413 const char *me =
1414 thi == &mdev->receiver ? "receiver" :
1415 thi == &mdev->asender ? "asender" :
1416 thi == &mdev->worker ? "worker" : "NONSENSE";
1417
1418 /* is used from state engine doing drbd_thread_stop_nowait,
1419 * while holding the req lock irqsave */
1420 spin_lock_irqsave(&thi->t_lock, flags);
1421
1422 switch (thi->t_state) {
1423 case None:
1424 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1425 me, current->comm, current->pid);
1426
1427 /* Get ref on module for thread - this is released when thread exits */
1428 if (!try_module_get(THIS_MODULE)) {
1429 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1430 spin_unlock_irqrestore(&thi->t_lock, flags);
1431 return FALSE;
1432 }
1433
1434 init_completion(&thi->stop);
1435 D_ASSERT(thi->task == NULL);
1436 thi->reset_cpu_mask = 1;
1437 thi->t_state = Running;
1438 spin_unlock_irqrestore(&thi->t_lock, flags);
1439 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1440
1441 nt = kthread_create(drbd_thread_setup, (void *) thi,
1442 "drbd%d_%s", mdev_to_minor(mdev), me);
1443
1444 if (IS_ERR(nt)) {
1445 dev_err(DEV, "Couldn't start thread\n");
1446
1447 module_put(THIS_MODULE);
1448 return FALSE;
1449 }
1450 spin_lock_irqsave(&thi->t_lock, flags);
1451 thi->task = nt;
1452 thi->t_state = Running;
1453 spin_unlock_irqrestore(&thi->t_lock, flags);
1454 wake_up_process(nt);
1455 break;
1456 case Exiting:
1457 thi->t_state = Restarting;
1458 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1459 me, current->comm, current->pid);
1460 /* fall through */
1461 case Running:
1462 case Restarting:
1463 default:
1464 spin_unlock_irqrestore(&thi->t_lock, flags);
1465 break;
1466 }
1467
1468 return TRUE;
1469}
1470
1471
1472void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1473{
1474 unsigned long flags;
1475
1476 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1477
1478 /* may be called from state engine, holding the req lock irqsave */
1479 spin_lock_irqsave(&thi->t_lock, flags);
1480
1481 if (thi->t_state == None) {
1482 spin_unlock_irqrestore(&thi->t_lock, flags);
1483 if (restart)
1484 drbd_thread_start(thi);
1485 return;
1486 }
1487
1488 if (thi->t_state != ns) {
1489 if (thi->task == NULL) {
1490 spin_unlock_irqrestore(&thi->t_lock, flags);
1491 return;
1492 }
1493
1494 thi->t_state = ns;
1495 smp_mb();
1496 init_completion(&thi->stop);
1497 if (thi->task != current)
1498 force_sig(DRBD_SIGKILL, thi->task);
1499
1500 }
1501
1502 spin_unlock_irqrestore(&thi->t_lock, flags);
1503
1504 if (wait)
1505 wait_for_completion(&thi->stop);
1506}
1507
1508#ifdef CONFIG_SMP
1509/**
1510 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1511 * @mdev: DRBD device.
1512 *
1513 * Forces all threads of a device onto the same CPU. This is beneficial for
1514 * DRBD's performance. May be overwritten by user's configuration.
1515 */
1516void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1517{
1518 int ord, cpu;
1519
1520 /* user override. */
1521 if (cpumask_weight(mdev->cpu_mask))
1522 return;
1523
1524 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1525 for_each_online_cpu(cpu) {
1526 if (ord-- == 0) {
1527 cpumask_set_cpu(cpu, mdev->cpu_mask);
1528 return;
1529 }
1530 }
1531 /* should not be reached */
1532 cpumask_setall(mdev->cpu_mask);
1533}
1534
1535/**
1536 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1537 * @mdev: DRBD device.
1538 *
1539 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1540 * prematurely.
1541 */
1542void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1543{
1544 struct task_struct *p = current;
1545 struct drbd_thread *thi =
1546 p == mdev->asender.task ? &mdev->asender :
1547 p == mdev->receiver.task ? &mdev->receiver :
1548 p == mdev->worker.task ? &mdev->worker :
1549 NULL;
1550 ERR_IF(thi == NULL)
1551 return;
1552 if (!thi->reset_cpu_mask)
1553 return;
1554 thi->reset_cpu_mask = 0;
1555 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1556}
1557#endif
1558
1559/* the appropriate socket mutex must be held already */
1560int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1561 enum drbd_packets cmd, struct p_header *h,
1562 size_t size, unsigned msg_flags)
1563{
1564 int sent, ok;
1565
1566 ERR_IF(!h) return FALSE;
1567 ERR_IF(!size) return FALSE;
1568
1569 h->magic = BE_DRBD_MAGIC;
1570 h->command = cpu_to_be16(cmd);
1571 h->length = cpu_to_be16(size-sizeof(struct p_header));
1572
b411b363
PR
1573 sent = drbd_send(mdev, sock, h, size, msg_flags);
1574
1575 ok = (sent == size);
1576 if (!ok)
1577 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1578 cmdname(cmd), (int)size, sent);
1579 return ok;
1580}
1581
1582/* don't pass the socket. we may only look at it
1583 * when we hold the appropriate socket mutex.
1584 */
1585int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1586 enum drbd_packets cmd, struct p_header *h, size_t size)
1587{
1588 int ok = 0;
1589 struct socket *sock;
1590
1591 if (use_data_socket) {
1592 mutex_lock(&mdev->data.mutex);
1593 sock = mdev->data.socket;
1594 } else {
1595 mutex_lock(&mdev->meta.mutex);
1596 sock = mdev->meta.socket;
1597 }
1598
1599 /* drbd_disconnect() could have called drbd_free_sock()
1600 * while we were waiting in down()... */
1601 if (likely(sock != NULL))
1602 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1603
1604 if (use_data_socket)
1605 mutex_unlock(&mdev->data.mutex);
1606 else
1607 mutex_unlock(&mdev->meta.mutex);
1608 return ok;
1609}
1610
1611int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1612 size_t size)
1613{
1614 struct p_header h;
1615 int ok;
1616
1617 h.magic = BE_DRBD_MAGIC;
1618 h.command = cpu_to_be16(cmd);
1619 h.length = cpu_to_be16(size);
1620
1621 if (!drbd_get_data_sock(mdev))
1622 return 0;
1623
b411b363
PR
1624 ok = (sizeof(h) ==
1625 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1626 ok = ok && (size ==
1627 drbd_send(mdev, mdev->data.socket, data, size, 0));
1628
1629 drbd_put_data_sock(mdev);
1630
1631 return ok;
1632}
1633
1634int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1635{
1636 struct p_rs_param_89 *p;
1637 struct socket *sock;
1638 int size, rv;
1639 const int apv = mdev->agreed_pro_version;
1640
1641 size = apv <= 87 ? sizeof(struct p_rs_param)
1642 : apv == 88 ? sizeof(struct p_rs_param)
1643 + strlen(mdev->sync_conf.verify_alg) + 1
1644 : /* 89 */ sizeof(struct p_rs_param_89);
1645
1646 /* used from admin command context and receiver/worker context.
1647 * to avoid kmalloc, grab the socket right here,
1648 * then use the pre-allocated sbuf there */
1649 mutex_lock(&mdev->data.mutex);
1650 sock = mdev->data.socket;
1651
1652 if (likely(sock != NULL)) {
1653 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1654
1655 p = &mdev->data.sbuf.rs_param_89;
1656
1657 /* initialize verify_alg and csums_alg */
1658 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1659
1660 p->rate = cpu_to_be32(sc->rate);
1661
1662 if (apv >= 88)
1663 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1664 if (apv >= 89)
1665 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1666
1667 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1668 } else
1669 rv = 0; /* not ok */
1670
1671 mutex_unlock(&mdev->data.mutex);
1672
1673 return rv;
1674}
1675
1676int drbd_send_protocol(struct drbd_conf *mdev)
1677{
1678 struct p_protocol *p;
cf14c2e9 1679 int size, cf, rv;
b411b363
PR
1680
1681 size = sizeof(struct p_protocol);
1682
1683 if (mdev->agreed_pro_version >= 87)
1684 size += strlen(mdev->net_conf->integrity_alg) + 1;
1685
1686 /* we must not recurse into our own queue,
1687 * as that is blocked during handshake */
1688 p = kmalloc(size, GFP_NOIO);
1689 if (p == NULL)
1690 return 0;
1691
1692 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1693 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1694 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1695 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
b411b363
PR
1696 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1697
cf14c2e9
PR
1698 cf = 0;
1699 if (mdev->net_conf->want_lose)
1700 cf |= CF_WANT_LOSE;
1701 if (mdev->net_conf->dry_run) {
1702 if (mdev->agreed_pro_version >= 92)
1703 cf |= CF_DRY_RUN;
1704 else {
1705 dev_err(DEV, "--dry-run is not supported by peer");
7ac314c8 1706 kfree(p);
cf14c2e9
PR
1707 return 0;
1708 }
1709 }
1710 p->conn_flags = cpu_to_be32(cf);
1711
b411b363
PR
1712 if (mdev->agreed_pro_version >= 87)
1713 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1714
1715 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1716 (struct p_header *)p, size);
1717 kfree(p);
1718 return rv;
1719}
1720
1721int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1722{
1723 struct p_uuids p;
1724 int i;
1725
1726 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1727 return 1;
1728
1729 for (i = UI_CURRENT; i < UI_SIZE; i++)
1730 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1731
1732 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1733 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1734 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1735 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1736 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1737 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1738
1739 put_ldev(mdev);
1740
1741 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1742 (struct p_header *)&p, sizeof(p));
1743}
1744
1745int drbd_send_uuids(struct drbd_conf *mdev)
1746{
1747 return _drbd_send_uuids(mdev, 0);
1748}
1749
1750int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1751{
1752 return _drbd_send_uuids(mdev, 8);
1753}
1754
1755
1756int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1757{
1758 struct p_rs_uuid p;
1759
1760 p.uuid = cpu_to_be64(val);
1761
1762 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1763 (struct p_header *)&p, sizeof(p));
1764}
1765
e89b591c 1766int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
b411b363
PR
1767{
1768 struct p_sizes p;
1769 sector_t d_size, u_size;
1770 int q_order_type;
1771 int ok;
1772
1773 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1774 D_ASSERT(mdev->ldev->backing_bdev);
1775 d_size = drbd_get_max_capacity(mdev->ldev);
1776 u_size = mdev->ldev->dc.disk_size;
1777 q_order_type = drbd_queue_order_type(mdev);
b411b363
PR
1778 put_ldev(mdev);
1779 } else {
1780 d_size = 0;
1781 u_size = 0;
1782 q_order_type = QUEUE_ORDERED_NONE;
1783 }
1784
1785 p.d_size = cpu_to_be64(d_size);
1786 p.u_size = cpu_to_be64(u_size);
1787 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1788 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
e89b591c
PR
1789 p.queue_order_type = cpu_to_be16(q_order_type);
1790 p.dds_flags = cpu_to_be16(flags);
b411b363
PR
1791
1792 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1793 (struct p_header *)&p, sizeof(p));
1794 return ok;
1795}
1796
1797/**
1798 * drbd_send_state() - Sends the drbd state to the peer
1799 * @mdev: DRBD device.
1800 */
1801int drbd_send_state(struct drbd_conf *mdev)
1802{
1803 struct socket *sock;
1804 struct p_state p;
1805 int ok = 0;
1806
1807 /* Grab state lock so we wont send state if we're in the middle
1808 * of a cluster wide state change on another thread */
1809 drbd_state_lock(mdev);
1810
1811 mutex_lock(&mdev->data.mutex);
1812
1813 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1814 sock = mdev->data.socket;
1815
1816 if (likely(sock != NULL)) {
1817 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1818 (struct p_header *)&p, sizeof(p), 0);
1819 }
1820
1821 mutex_unlock(&mdev->data.mutex);
1822
1823 drbd_state_unlock(mdev);
1824 return ok;
1825}
1826
1827int drbd_send_state_req(struct drbd_conf *mdev,
1828 union drbd_state mask, union drbd_state val)
1829{
1830 struct p_req_state p;
1831
1832 p.mask = cpu_to_be32(mask.i);
1833 p.val = cpu_to_be32(val.i);
1834
1835 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
1836 (struct p_header *)&p, sizeof(p));
1837}
1838
1839int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1840{
1841 struct p_req_state_reply p;
1842
1843 p.retcode = cpu_to_be32(retcode);
1844
1845 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
1846 (struct p_header *)&p, sizeof(p));
1847}
1848
1849int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1850 struct p_compressed_bm *p,
1851 struct bm_xfer_ctx *c)
1852{
1853 struct bitstream bs;
1854 unsigned long plain_bits;
1855 unsigned long tmp;
1856 unsigned long rl;
1857 unsigned len;
1858 unsigned toggle;
1859 int bits;
1860
1861 /* may we use this feature? */
1862 if ((mdev->sync_conf.use_rle == 0) ||
1863 (mdev->agreed_pro_version < 90))
1864 return 0;
1865
1866 if (c->bit_offset >= c->bm_bits)
1867 return 0; /* nothing to do. */
1868
1869 /* use at most thus many bytes */
1870 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1871 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1872 /* plain bits covered in this code string */
1873 plain_bits = 0;
1874
1875 /* p->encoding & 0x80 stores whether the first run length is set.
1876 * bit offset is implicit.
1877 * start with toggle == 2 to be able to tell the first iteration */
1878 toggle = 2;
1879
1880 /* see how much plain bits we can stuff into one packet
1881 * using RLE and VLI. */
1882 do {
1883 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1884 : _drbd_bm_find_next(mdev, c->bit_offset);
1885 if (tmp == -1UL)
1886 tmp = c->bm_bits;
1887 rl = tmp - c->bit_offset;
1888
1889 if (toggle == 2) { /* first iteration */
1890 if (rl == 0) {
1891 /* the first checked bit was set,
1892 * store start value, */
1893 DCBP_set_start(p, 1);
1894 /* but skip encoding of zero run length */
1895 toggle = !toggle;
1896 continue;
1897 }
1898 DCBP_set_start(p, 0);
1899 }
1900
1901 /* paranoia: catch zero runlength.
1902 * can only happen if bitmap is modified while we scan it. */
1903 if (rl == 0) {
1904 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
1905 "t:%u bo:%lu\n", toggle, c->bit_offset);
1906 return -1;
1907 }
1908
1909 bits = vli_encode_bits(&bs, rl);
1910 if (bits == -ENOBUFS) /* buffer full */
1911 break;
1912 if (bits <= 0) {
1913 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
1914 return 0;
1915 }
1916
1917 toggle = !toggle;
1918 plain_bits += rl;
1919 c->bit_offset = tmp;
1920 } while (c->bit_offset < c->bm_bits);
1921
1922 len = bs.cur.b - p->code + !!bs.cur.bit;
1923
1924 if (plain_bits < (len << 3)) {
1925 /* incompressible with this method.
1926 * we need to rewind both word and bit position. */
1927 c->bit_offset -= plain_bits;
1928 bm_xfer_ctx_bit_to_word_offset(c);
1929 c->bit_offset = c->word_offset * BITS_PER_LONG;
1930 return 0;
1931 }
1932
1933 /* RLE + VLI was able to compress it just fine.
1934 * update c->word_offset. */
1935 bm_xfer_ctx_bit_to_word_offset(c);
1936
1937 /* store pad_bits */
1938 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1939
1940 return len;
1941}
1942
1943enum { OK, FAILED, DONE }
1944send_bitmap_rle_or_plain(struct drbd_conf *mdev,
1945 struct p_header *h, struct bm_xfer_ctx *c)
1946{
1947 struct p_compressed_bm *p = (void*)h;
1948 unsigned long num_words;
1949 int len;
1950 int ok;
1951
1952 len = fill_bitmap_rle_bits(mdev, p, c);
1953
1954 if (len < 0)
1955 return FAILED;
1956
1957 if (len) {
1958 DCBP_set_code(p, RLE_VLI_Bits);
1959 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
1960 sizeof(*p) + len, 0);
1961
1962 c->packets[0]++;
1963 c->bytes[0] += sizeof(*p) + len;
1964
1965 if (c->bit_offset >= c->bm_bits)
1966 len = 0; /* DONE */
1967 } else {
1968 /* was not compressible.
1969 * send a buffer full of plain text bits instead. */
1970 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
1971 len = num_words * sizeof(long);
1972 if (len)
1973 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
1974 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
1975 h, sizeof(struct p_header) + len, 0);
1976 c->word_offset += num_words;
1977 c->bit_offset = c->word_offset * BITS_PER_LONG;
1978
1979 c->packets[1]++;
1980 c->bytes[1] += sizeof(struct p_header) + len;
1981
1982 if (c->bit_offset > c->bm_bits)
1983 c->bit_offset = c->bm_bits;
1984 }
1985 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
1986
1987 if (ok == DONE)
1988 INFO_bm_xfer_stats(mdev, "send", c);
1989 return ok;
1990}
1991
1992/* See the comment at receive_bitmap() */
1993int _drbd_send_bitmap(struct drbd_conf *mdev)
1994{
1995 struct bm_xfer_ctx c;
1996 struct p_header *p;
1997 int ret;
1998
1999 ERR_IF(!mdev->bitmap) return FALSE;
2000
2001 /* maybe we should use some per thread scratch page,
2002 * and allocate that during initial device creation? */
2003 p = (struct p_header *) __get_free_page(GFP_NOIO);
2004 if (!p) {
2005 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2006 return FALSE;
2007 }
2008
2009 if (get_ldev(mdev)) {
2010 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2011 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2012 drbd_bm_set_all(mdev);
2013 if (drbd_bm_write(mdev)) {
2014 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2015 * but otherwise process as per normal - need to tell other
2016 * side that a full resync is required! */
2017 dev_err(DEV, "Failed to write bitmap to disk!\n");
2018 } else {
2019 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2020 drbd_md_sync(mdev);
2021 }
2022 }
2023 put_ldev(mdev);
2024 }
2025
2026 c = (struct bm_xfer_ctx) {
2027 .bm_bits = drbd_bm_bits(mdev),
2028 .bm_words = drbd_bm_words(mdev),
2029 };
2030
2031 do {
2032 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2033 } while (ret == OK);
2034
2035 free_page((unsigned long) p);
2036 return (ret == DONE);
2037}
2038
2039int drbd_send_bitmap(struct drbd_conf *mdev)
2040{
2041 int err;
2042
2043 if (!drbd_get_data_sock(mdev))
2044 return -1;
2045 err = !_drbd_send_bitmap(mdev);
2046 drbd_put_data_sock(mdev);
2047 return err;
2048}
2049
2050int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2051{
2052 int ok;
2053 struct p_barrier_ack p;
2054
2055 p.barrier = barrier_nr;
2056 p.set_size = cpu_to_be32(set_size);
2057
2058 if (mdev->state.conn < C_CONNECTED)
2059 return FALSE;
2060 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2061 (struct p_header *)&p, sizeof(p));
2062 return ok;
2063}
2064
2065/**
2066 * _drbd_send_ack() - Sends an ack packet
2067 * @mdev: DRBD device.
2068 * @cmd: Packet command code.
2069 * @sector: sector, needs to be in big endian byte order
2070 * @blksize: size in byte, needs to be in big endian byte order
2071 * @block_id: Id, big endian byte order
2072 */
2073static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2074 u64 sector,
2075 u32 blksize,
2076 u64 block_id)
2077{
2078 int ok;
2079 struct p_block_ack p;
2080
2081 p.sector = sector;
2082 p.block_id = block_id;
2083 p.blksize = blksize;
2084 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2085
2086 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2087 return FALSE;
2088 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2089 (struct p_header *)&p, sizeof(p));
2090 return ok;
2091}
2092
2093int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2094 struct p_data *dp)
2095{
2096 const int header_size = sizeof(struct p_data)
2097 - sizeof(struct p_header);
2098 int data_size = ((struct p_header *)dp)->length - header_size;
2099
2100 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2101 dp->block_id);
2102}
2103
2104int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2105 struct p_block_req *rp)
2106{
2107 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2108}
2109
2110/**
2111 * drbd_send_ack() - Sends an ack packet
2112 * @mdev: DRBD device.
2113 * @cmd: Packet command code.
2114 * @e: Epoch entry.
2115 */
2116int drbd_send_ack(struct drbd_conf *mdev,
2117 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2118{
2119 return _drbd_send_ack(mdev, cmd,
2120 cpu_to_be64(e->sector),
2121 cpu_to_be32(e->size),
2122 e->block_id);
2123}
2124
2125/* This function misuses the block_id field to signal if the blocks
2126 * are is sync or not. */
2127int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2128 sector_t sector, int blksize, u64 block_id)
2129{
2130 return _drbd_send_ack(mdev, cmd,
2131 cpu_to_be64(sector),
2132 cpu_to_be32(blksize),
2133 cpu_to_be64(block_id));
2134}
2135
2136int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2137 sector_t sector, int size, u64 block_id)
2138{
2139 int ok;
2140 struct p_block_req p;
2141
2142 p.sector = cpu_to_be64(sector);
2143 p.block_id = block_id;
2144 p.blksize = cpu_to_be32(size);
2145
2146 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2147 (struct p_header *)&p, sizeof(p));
2148 return ok;
2149}
2150
2151int drbd_send_drequest_csum(struct drbd_conf *mdev,
2152 sector_t sector, int size,
2153 void *digest, int digest_size,
2154 enum drbd_packets cmd)
2155{
2156 int ok;
2157 struct p_block_req p;
2158
2159 p.sector = cpu_to_be64(sector);
2160 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2161 p.blksize = cpu_to_be32(size);
2162
2163 p.head.magic = BE_DRBD_MAGIC;
2164 p.head.command = cpu_to_be16(cmd);
2165 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
2166
2167 mutex_lock(&mdev->data.mutex);
2168
2169 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2170 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2171
2172 mutex_unlock(&mdev->data.mutex);
2173
2174 return ok;
2175}
2176
2177int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2178{
2179 int ok;
2180 struct p_block_req p;
2181
2182 p.sector = cpu_to_be64(sector);
2183 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2184 p.blksize = cpu_to_be32(size);
2185
2186 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2187 (struct p_header *)&p, sizeof(p));
2188 return ok;
2189}
2190
7237bc43
PR
2191static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds)
2192{
2193 struct p_delay_probe dp;
2194 int offset, ok = 0;
2195 struct timeval now;
2196
2197 mutex_lock(&ds->mutex);
2198 if (likely(ds->socket)) {
2199 do_gettimeofday(&now);
2200 offset = now.tv_usec - mdev->dps_time.tv_usec +
2201 (now.tv_sec - mdev->dps_time.tv_sec) * 1000000;
2202 dp.seq_num = cpu_to_be32(atomic_read(&mdev->delay_seq));
2203 dp.offset = cpu_to_be32(offset);
2204
2205 ok = _drbd_send_cmd(mdev, ds->socket, P_DELAY_PROBE,
2206 (struct p_header *)&dp, sizeof(dp), 0);
2207 }
2208 mutex_unlock(&ds->mutex);
2209
2210 return ok;
2211}
2212
2213static int drbd_send_dalay_probes(struct drbd_conf *mdev)
2214{
2215 int ok;
2216 atomic_inc(&mdev->delay_seq);
2217 do_gettimeofday(&mdev->dps_time);
2218 ok = drbd_send_delay_probe(mdev, &mdev->meta);
2219 ok = ok && drbd_send_delay_probe(mdev, &mdev->data);
2220
2221 return ok;
2222}
2223
b411b363
PR
2224/* called on sndtimeo
2225 * returns FALSE if we should retry,
2226 * TRUE if we think connection is dead
2227 */
2228static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2229{
2230 int drop_it;
2231 /* long elapsed = (long)(jiffies - mdev->last_received); */
2232
2233 drop_it = mdev->meta.socket == sock
2234 || !mdev->asender.task
2235 || get_t_state(&mdev->asender) != Running
2236 || mdev->state.conn < C_CONNECTED;
2237
2238 if (drop_it)
2239 return TRUE;
2240
2241 drop_it = !--mdev->ko_count;
2242 if (!drop_it) {
2243 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2244 current->comm, current->pid, mdev->ko_count);
2245 request_ping(mdev);
2246 }
2247
2248 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2249}
2250
2251/* The idea of sendpage seems to be to put some kind of reference
2252 * to the page into the skb, and to hand it over to the NIC. In
2253 * this process get_page() gets called.
2254 *
2255 * As soon as the page was really sent over the network put_page()
2256 * gets called by some part of the network layer. [ NIC driver? ]
2257 *
2258 * [ get_page() / put_page() increment/decrement the count. If count
2259 * reaches 0 the page will be freed. ]
2260 *
2261 * This works nicely with pages from FSs.
2262 * But this means that in protocol A we might signal IO completion too early!
2263 *
2264 * In order not to corrupt data during a resync we must make sure
2265 * that we do not reuse our own buffer pages (EEs) to early, therefore
2266 * we have the net_ee list.
2267 *
2268 * XFS seems to have problems, still, it submits pages with page_count == 0!
2269 * As a workaround, we disable sendpage on pages
2270 * with page_count == 0 or PageSlab.
2271 */
2272static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2273 int offset, size_t size)
2274{
2275 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
2276 kunmap(page);
2277 if (sent == size)
2278 mdev->send_cnt += size>>9;
2279 return sent == size;
2280}
2281
2282static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2283 int offset, size_t size)
2284{
2285 mm_segment_t oldfs = get_fs();
2286 int sent, ok;
2287 int len = size;
2288
2289 /* e.g. XFS meta- & log-data is in slab pages, which have a
2290 * page_count of 0 and/or have PageSlab() set.
2291 * we cannot use send_page for those, as that does get_page();
2292 * put_page(); and would cause either a VM_BUG directly, or
2293 * __page_cache_release a page that would actually still be referenced
2294 * by someone, leading to some obscure delayed Oops somewhere else. */
2295 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2296 return _drbd_no_send_page(mdev, page, offset, size);
2297
2298 drbd_update_congested(mdev);
2299 set_fs(KERNEL_DS);
2300 do {
2301 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2302 offset, len,
2303 MSG_NOSIGNAL);
2304 if (sent == -EAGAIN) {
2305 if (we_should_drop_the_connection(mdev,
2306 mdev->data.socket))
2307 break;
2308 else
2309 continue;
2310 }
2311 if (sent <= 0) {
2312 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2313 __func__, (int)size, len, sent);
2314 break;
2315 }
2316 len -= sent;
2317 offset += sent;
2318 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2319 set_fs(oldfs);
2320 clear_bit(NET_CONGESTED, &mdev->flags);
2321
2322 ok = (len == 0);
2323 if (likely(ok))
2324 mdev->send_cnt += size>>9;
2325 return ok;
2326}
2327
2328static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2329{
2330 struct bio_vec *bvec;
2331 int i;
2332 __bio_for_each_segment(bvec, bio, i, 0) {
2333 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2334 bvec->bv_offset, bvec->bv_len))
2335 return 0;
2336 }
2337 return 1;
2338}
2339
2340static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2341{
2342 struct bio_vec *bvec;
2343 int i;
2344 __bio_for_each_segment(bvec, bio, i, 0) {
2345 if (!_drbd_send_page(mdev, bvec->bv_page,
2346 bvec->bv_offset, bvec->bv_len))
2347 return 0;
2348 }
2349
2350 return 1;
2351}
2352
2353/* Used to send write requests
2354 * R_PRIMARY -> Peer (P_DATA)
2355 */
2356int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2357{
2358 int ok = 1;
2359 struct p_data p;
2360 unsigned int dp_flags = 0;
2361 void *dgb;
2362 int dgs;
2363
2364 if (!drbd_get_data_sock(mdev))
2365 return 0;
2366
2367 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2368 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2369
2370 p.head.magic = BE_DRBD_MAGIC;
2371 p.head.command = cpu_to_be16(P_DATA);
2372 p.head.length =
2373 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
2374
2375 p.sector = cpu_to_be64(req->sector);
2376 p.block_id = (unsigned long)req;
2377 p.seq_num = cpu_to_be32(req->seq_num =
2378 atomic_add_return(1, &mdev->packet_seq));
2379 dp_flags = 0;
2380
2381 /* NOTE: no need to check if barriers supported here as we would
2382 * not pass the test in make_request_common in that case
2383 */
2384 if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) {
2385 dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
2386 /* dp_flags |= DP_HARDBARRIER; */
2387 }
2388 if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO))
2389 dp_flags |= DP_RW_SYNC;
2390 /* for now handle SYNCIO and UNPLUG
2391 * as if they still were one and the same flag */
2392 if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG))
2393 dp_flags |= DP_RW_SYNC;
2394 if (mdev->state.conn >= C_SYNC_SOURCE &&
2395 mdev->state.conn <= C_PAUSED_SYNC_T)
2396 dp_flags |= DP_MAY_SET_IN_SYNC;
2397
2398 p.dp_flags = cpu_to_be32(dp_flags);
b411b363
PR
2399 set_bit(UNPLUG_REMOTE, &mdev->flags);
2400 ok = (sizeof(p) ==
2401 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
2402 if (ok && dgs) {
2403 dgb = mdev->int_dig_out;
2404 drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2405 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2406 }
2407 if (ok) {
2408 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2409 ok = _drbd_send_bio(mdev, req->master_bio);
2410 else
2411 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2412 }
2413
2414 drbd_put_data_sock(mdev);
2415 return ok;
2416}
2417
2418/* answer packet, used to send data back for read requests:
2419 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2420 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2421 */
2422int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2423 struct drbd_epoch_entry *e)
2424{
2425 int ok;
2426 struct p_data p;
2427 void *dgb;
2428 int dgs;
2429
2430 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2431 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2432
2433 p.head.magic = BE_DRBD_MAGIC;
2434 p.head.command = cpu_to_be16(cmd);
2435 p.head.length =
2436 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
2437
2438 p.sector = cpu_to_be64(e->sector);
2439 p.block_id = e->block_id;
2440 /* p.seq_num = 0; No sequence numbers here.. */
2441
2442 /* Only called by our kernel thread.
2443 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2444 * in response to admin command or module unload.
2445 */
2446 if (!drbd_get_data_sock(mdev))
2447 return 0;
2448
b411b363
PR
2449 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
2450 sizeof(p), MSG_MORE);
2451 if (ok && dgs) {
2452 dgb = mdev->int_dig_out;
2453 drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
2454 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
2455 }
2456 if (ok)
2457 ok = _drbd_send_zc_bio(mdev, e->private_bio);
2458
2459 drbd_put_data_sock(mdev);
2460 return ok;
2461}
2462
2463/*
2464 drbd_send distinguishes two cases:
2465
2466 Packets sent via the data socket "sock"
2467 and packets sent via the meta data socket "msock"
2468
2469 sock msock
2470 -----------------+-------------------------+------------------------------
2471 timeout conf.timeout / 2 conf.timeout / 2
2472 timeout action send a ping via msock Abort communication
2473 and close all sockets
2474*/
2475
2476/*
2477 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2478 */
2479int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2480 void *buf, size_t size, unsigned msg_flags)
2481{
2482 struct kvec iov;
2483 struct msghdr msg;
2484 int rv, sent = 0;
2485
2486 if (!sock)
2487 return -1000;
2488
2489 /* THINK if (signal_pending) return ... ? */
2490
2491 iov.iov_base = buf;
2492 iov.iov_len = size;
2493
2494 msg.msg_name = NULL;
2495 msg.msg_namelen = 0;
2496 msg.msg_control = NULL;
2497 msg.msg_controllen = 0;
2498 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2499
2500 if (sock == mdev->data.socket) {
2501 mdev->ko_count = mdev->net_conf->ko_count;
2502 drbd_update_congested(mdev);
2503 }
2504 do {
2505 /* STRANGE
2506 * tcp_sendmsg does _not_ use its size parameter at all ?
2507 *
2508 * -EAGAIN on timeout, -EINTR on signal.
2509 */
2510/* THINK
2511 * do we need to block DRBD_SIG if sock == &meta.socket ??
2512 * otherwise wake_asender() might interrupt some send_*Ack !
2513 */
2514 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2515 if (rv == -EAGAIN) {
2516 if (we_should_drop_the_connection(mdev, sock))
2517 break;
2518 else
2519 continue;
2520 }
2521 D_ASSERT(rv != 0);
2522 if (rv == -EINTR) {
2523 flush_signals(current);
2524 rv = 0;
2525 }
2526 if (rv < 0)
2527 break;
2528 sent += rv;
2529 iov.iov_base += rv;
2530 iov.iov_len -= rv;
2531 } while (sent < size);
2532
2533 if (sock == mdev->data.socket)
2534 clear_bit(NET_CONGESTED, &mdev->flags);
2535
2536 if (rv <= 0) {
2537 if (rv != -EAGAIN) {
2538 dev_err(DEV, "%s_sendmsg returned %d\n",
2539 sock == mdev->meta.socket ? "msock" : "sock",
2540 rv);
2541 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2542 } else
2543 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2544 }
2545
2546 return sent;
2547}
2548
2549static int drbd_open(struct block_device *bdev, fmode_t mode)
2550{
2551 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2552 unsigned long flags;
2553 int rv = 0;
2554
2555 spin_lock_irqsave(&mdev->req_lock, flags);
2556 /* to have a stable mdev->state.role
2557 * and no race with updating open_cnt */
2558
2559 if (mdev->state.role != R_PRIMARY) {
2560 if (mode & FMODE_WRITE)
2561 rv = -EROFS;
2562 else if (!allow_oos)
2563 rv = -EMEDIUMTYPE;
2564 }
2565
2566 if (!rv)
2567 mdev->open_cnt++;
2568 spin_unlock_irqrestore(&mdev->req_lock, flags);
2569
2570 return rv;
2571}
2572
2573static int drbd_release(struct gendisk *gd, fmode_t mode)
2574{
2575 struct drbd_conf *mdev = gd->private_data;
2576 mdev->open_cnt--;
2577 return 0;
2578}
2579
2580static void drbd_unplug_fn(struct request_queue *q)
2581{
2582 struct drbd_conf *mdev = q->queuedata;
2583
b411b363
PR
2584 /* unplug FIRST */
2585 spin_lock_irq(q->queue_lock);
2586 blk_remove_plug(q);
2587 spin_unlock_irq(q->queue_lock);
2588
2589 /* only if connected */
2590 spin_lock_irq(&mdev->req_lock);
2591 if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2592 D_ASSERT(mdev->state.role == R_PRIMARY);
2593 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2594 /* add to the data.work queue,
2595 * unless already queued.
2596 * XXX this might be a good addition to drbd_queue_work
2597 * anyways, to detect "double queuing" ... */
2598 if (list_empty(&mdev->unplug_work.list))
2599 drbd_queue_work(&mdev->data.work,
2600 &mdev->unplug_work);
2601 }
2602 }
2603 spin_unlock_irq(&mdev->req_lock);
2604
2605 if (mdev->state.disk >= D_INCONSISTENT)
2606 drbd_kick_lo(mdev);
2607}
2608
2609static void drbd_set_defaults(struct drbd_conf *mdev)
2610{
2611 mdev->sync_conf.after = DRBD_AFTER_DEF;
2612 mdev->sync_conf.rate = DRBD_RATE_DEF;
2613 mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
2614 mdev->state = (union drbd_state) {
2615 { .role = R_SECONDARY,
2616 .peer = R_UNKNOWN,
2617 .conn = C_STANDALONE,
2618 .disk = D_DISKLESS,
2619 .pdsk = D_UNKNOWN,
2620 .susp = 0
2621 } };
2622}
2623
2624void drbd_init_set_defaults(struct drbd_conf *mdev)
2625{
2626 /* the memset(,0,) did most of this.
2627 * note: only assignments, no allocation in here */
2628
2629 drbd_set_defaults(mdev);
2630
2631 /* for now, we do NOT yet support it,
2632 * even though we start some framework
2633 * to eventually support barriers */
2634 set_bit(NO_BARRIER_SUPP, &mdev->flags);
2635
2636 atomic_set(&mdev->ap_bio_cnt, 0);
2637 atomic_set(&mdev->ap_pending_cnt, 0);
2638 atomic_set(&mdev->rs_pending_cnt, 0);
2639 atomic_set(&mdev->unacked_cnt, 0);
2640 atomic_set(&mdev->local_cnt, 0);
2641 atomic_set(&mdev->net_cnt, 0);
2642 atomic_set(&mdev->packet_seq, 0);
2643 atomic_set(&mdev->pp_in_use, 0);
0ced55a3 2644 atomic_set(&mdev->delay_seq, 0);
b411b363
PR
2645
2646 mutex_init(&mdev->md_io_mutex);
2647 mutex_init(&mdev->data.mutex);
2648 mutex_init(&mdev->meta.mutex);
2649 sema_init(&mdev->data.work.s, 0);
2650 sema_init(&mdev->meta.work.s, 0);
2651 mutex_init(&mdev->state_mutex);
2652
2653 spin_lock_init(&mdev->data.work.q_lock);
2654 spin_lock_init(&mdev->meta.work.q_lock);
2655
2656 spin_lock_init(&mdev->al_lock);
2657 spin_lock_init(&mdev->req_lock);
2658 spin_lock_init(&mdev->peer_seq_lock);
2659 spin_lock_init(&mdev->epoch_lock);
2660
2661 INIT_LIST_HEAD(&mdev->active_ee);
2662 INIT_LIST_HEAD(&mdev->sync_ee);
2663 INIT_LIST_HEAD(&mdev->done_ee);
2664 INIT_LIST_HEAD(&mdev->read_ee);
2665 INIT_LIST_HEAD(&mdev->net_ee);
2666 INIT_LIST_HEAD(&mdev->resync_reads);
2667 INIT_LIST_HEAD(&mdev->data.work.q);
2668 INIT_LIST_HEAD(&mdev->meta.work.q);
2669 INIT_LIST_HEAD(&mdev->resync_work.list);
2670 INIT_LIST_HEAD(&mdev->unplug_work.list);
2671 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2672 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
0ced55a3
PR
2673 INIT_LIST_HEAD(&mdev->delay_probes);
2674
b411b363
PR
2675 mdev->resync_work.cb = w_resync_inactive;
2676 mdev->unplug_work.cb = w_send_write_hint;
2677 mdev->md_sync_work.cb = w_md_sync;
2678 mdev->bm_io_work.w.cb = w_bitmap_io;
2679 init_timer(&mdev->resync_timer);
2680 init_timer(&mdev->md_sync_timer);
2681 mdev->resync_timer.function = resync_timer_fn;
2682 mdev->resync_timer.data = (unsigned long) mdev;
2683 mdev->md_sync_timer.function = md_sync_timer_fn;
2684 mdev->md_sync_timer.data = (unsigned long) mdev;
2685
2686 init_waitqueue_head(&mdev->misc_wait);
2687 init_waitqueue_head(&mdev->state_wait);
2688 init_waitqueue_head(&mdev->ee_wait);
2689 init_waitqueue_head(&mdev->al_wait);
2690 init_waitqueue_head(&mdev->seq_wait);
2691
2692 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2693 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2694 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2695
2696 mdev->agreed_pro_version = PRO_VERSION_MAX;
2697 mdev->write_ordering = WO_bio_barrier;
2698 mdev->resync_wenr = LC_FREE;
2699}
2700
2701void drbd_mdev_cleanup(struct drbd_conf *mdev)
2702{
2703 if (mdev->receiver.t_state != None)
2704 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2705 mdev->receiver.t_state);
2706
2707 /* no need to lock it, I'm the only thread alive */
2708 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2709 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2710 mdev->al_writ_cnt =
2711 mdev->bm_writ_cnt =
2712 mdev->read_cnt =
2713 mdev->recv_cnt =
2714 mdev->send_cnt =
2715 mdev->writ_cnt =
2716 mdev->p_size =
2717 mdev->rs_start =
2718 mdev->rs_total =
2719 mdev->rs_failed =
2720 mdev->rs_mark_left =
2721 mdev->rs_mark_time = 0;
2722 D_ASSERT(mdev->net_conf == NULL);
2723
2724 drbd_set_my_capacity(mdev, 0);
2725 if (mdev->bitmap) {
2726 /* maybe never allocated. */
02d9a94b 2727 drbd_bm_resize(mdev, 0, 1);
b411b363
PR
2728 drbd_bm_cleanup(mdev);
2729 }
2730
2731 drbd_free_resources(mdev);
2732
2733 /*
2734 * currently we drbd_init_ee only on module load, so
2735 * we may do drbd_release_ee only on module unload!
2736 */
2737 D_ASSERT(list_empty(&mdev->active_ee));
2738 D_ASSERT(list_empty(&mdev->sync_ee));
2739 D_ASSERT(list_empty(&mdev->done_ee));
2740 D_ASSERT(list_empty(&mdev->read_ee));
2741 D_ASSERT(list_empty(&mdev->net_ee));
2742 D_ASSERT(list_empty(&mdev->resync_reads));
2743 D_ASSERT(list_empty(&mdev->data.work.q));
2744 D_ASSERT(list_empty(&mdev->meta.work.q));
2745 D_ASSERT(list_empty(&mdev->resync_work.list));
2746 D_ASSERT(list_empty(&mdev->unplug_work.list));
2747
2748}
2749
2750
2751static void drbd_destroy_mempools(void)
2752{
2753 struct page *page;
2754
2755 while (drbd_pp_pool) {
2756 page = drbd_pp_pool;
2757 drbd_pp_pool = (struct page *)page_private(page);
2758 __free_page(page);
2759 drbd_pp_vacant--;
2760 }
2761
2762 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2763
2764 if (drbd_ee_mempool)
2765 mempool_destroy(drbd_ee_mempool);
2766 if (drbd_request_mempool)
2767 mempool_destroy(drbd_request_mempool);
2768 if (drbd_ee_cache)
2769 kmem_cache_destroy(drbd_ee_cache);
2770 if (drbd_request_cache)
2771 kmem_cache_destroy(drbd_request_cache);
2772 if (drbd_bm_ext_cache)
2773 kmem_cache_destroy(drbd_bm_ext_cache);
2774 if (drbd_al_ext_cache)
2775 kmem_cache_destroy(drbd_al_ext_cache);
2776
2777 drbd_ee_mempool = NULL;
2778 drbd_request_mempool = NULL;
2779 drbd_ee_cache = NULL;
2780 drbd_request_cache = NULL;
2781 drbd_bm_ext_cache = NULL;
2782 drbd_al_ext_cache = NULL;
2783
2784 return;
2785}
2786
2787static int drbd_create_mempools(void)
2788{
2789 struct page *page;
2790 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2791 int i;
2792
2793 /* prepare our caches and mempools */
2794 drbd_request_mempool = NULL;
2795 drbd_ee_cache = NULL;
2796 drbd_request_cache = NULL;
2797 drbd_bm_ext_cache = NULL;
2798 drbd_al_ext_cache = NULL;
2799 drbd_pp_pool = NULL;
2800
2801 /* caches */
2802 drbd_request_cache = kmem_cache_create(
2803 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2804 if (drbd_request_cache == NULL)
2805 goto Enomem;
2806
2807 drbd_ee_cache = kmem_cache_create(
2808 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2809 if (drbd_ee_cache == NULL)
2810 goto Enomem;
2811
2812 drbd_bm_ext_cache = kmem_cache_create(
2813 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2814 if (drbd_bm_ext_cache == NULL)
2815 goto Enomem;
2816
2817 drbd_al_ext_cache = kmem_cache_create(
2818 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2819 if (drbd_al_ext_cache == NULL)
2820 goto Enomem;
2821
2822 /* mempools */
2823 drbd_request_mempool = mempool_create(number,
2824 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2825 if (drbd_request_mempool == NULL)
2826 goto Enomem;
2827
2828 drbd_ee_mempool = mempool_create(number,
2829 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2830 if (drbd_request_mempool == NULL)
2831 goto Enomem;
2832
2833 /* drbd's page pool */
2834 spin_lock_init(&drbd_pp_lock);
2835
2836 for (i = 0; i < number; i++) {
2837 page = alloc_page(GFP_HIGHUSER);
2838 if (!page)
2839 goto Enomem;
2840 set_page_private(page, (unsigned long)drbd_pp_pool);
2841 drbd_pp_pool = page;
2842 }
2843 drbd_pp_vacant = number;
2844
2845 return 0;
2846
2847Enomem:
2848 drbd_destroy_mempools(); /* in case we allocated some */
2849 return -ENOMEM;
2850}
2851
2852static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2853 void *unused)
2854{
2855 /* just so we have it. you never know what interesting things we
2856 * might want to do here some day...
2857 */
2858
2859 return NOTIFY_DONE;
2860}
2861
2862static struct notifier_block drbd_notifier = {
2863 .notifier_call = drbd_notify_sys,
2864};
2865
2866static void drbd_release_ee_lists(struct drbd_conf *mdev)
2867{
2868 int rr;
2869
2870 rr = drbd_release_ee(mdev, &mdev->active_ee);
2871 if (rr)
2872 dev_err(DEV, "%d EEs in active list found!\n", rr);
2873
2874 rr = drbd_release_ee(mdev, &mdev->sync_ee);
2875 if (rr)
2876 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2877
2878 rr = drbd_release_ee(mdev, &mdev->read_ee);
2879 if (rr)
2880 dev_err(DEV, "%d EEs in read list found!\n", rr);
2881
2882 rr = drbd_release_ee(mdev, &mdev->done_ee);
2883 if (rr)
2884 dev_err(DEV, "%d EEs in done list found!\n", rr);
2885
2886 rr = drbd_release_ee(mdev, &mdev->net_ee);
2887 if (rr)
2888 dev_err(DEV, "%d EEs in net list found!\n", rr);
2889}
2890
2891/* caution. no locking.
2892 * currently only used from module cleanup code. */
2893static void drbd_delete_device(unsigned int minor)
2894{
2895 struct drbd_conf *mdev = minor_to_mdev(minor);
2896
2897 if (!mdev)
2898 return;
2899
2900 /* paranoia asserts */
2901 if (mdev->open_cnt != 0)
2902 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
2903 __FILE__ , __LINE__);
2904
2905 ERR_IF (!list_empty(&mdev->data.work.q)) {
2906 struct list_head *lp;
2907 list_for_each(lp, &mdev->data.work.q) {
2908 dev_err(DEV, "lp = %p\n", lp);
2909 }
2910 };
2911 /* end paranoia asserts */
2912
2913 del_gendisk(mdev->vdisk);
2914
2915 /* cleanup stuff that may have been allocated during
2916 * device (re-)configuration or state changes */
2917
2918 if (mdev->this_bdev)
2919 bdput(mdev->this_bdev);
2920
2921 drbd_free_resources(mdev);
2922
2923 drbd_release_ee_lists(mdev);
2924
2925 /* should be free'd on disconnect? */
2926 kfree(mdev->ee_hash);
2927 /*
2928 mdev->ee_hash_s = 0;
2929 mdev->ee_hash = NULL;
2930 */
2931
2932 lc_destroy(mdev->act_log);
2933 lc_destroy(mdev->resync);
2934
2935 kfree(mdev->p_uuid);
2936 /* mdev->p_uuid = NULL; */
2937
2938 kfree(mdev->int_dig_out);
2939 kfree(mdev->int_dig_in);
2940 kfree(mdev->int_dig_vv);
2941
2942 /* cleanup the rest that has been
2943 * allocated from drbd_new_device
2944 * and actually free the mdev itself */
2945 drbd_free_mdev(mdev);
2946}
2947
2948static void drbd_cleanup(void)
2949{
2950 unsigned int i;
2951
2952 unregister_reboot_notifier(&drbd_notifier);
2953
2954 drbd_nl_cleanup();
2955
2956 if (minor_table) {
2957 if (drbd_proc)
2958 remove_proc_entry("drbd", NULL);
2959 i = minor_count;
2960 while (i--)
2961 drbd_delete_device(i);
2962 drbd_destroy_mempools();
2963 }
2964
2965 kfree(minor_table);
2966
2967 unregister_blkdev(DRBD_MAJOR, "drbd");
2968
2969 printk(KERN_INFO "drbd: module cleanup done.\n");
2970}
2971
2972/**
2973 * drbd_congested() - Callback for pdflush
2974 * @congested_data: User data
2975 * @bdi_bits: Bits pdflush is currently interested in
2976 *
2977 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
2978 */
2979static int drbd_congested(void *congested_data, int bdi_bits)
2980{
2981 struct drbd_conf *mdev = congested_data;
2982 struct request_queue *q;
2983 char reason = '-';
2984 int r = 0;
2985
2986 if (!__inc_ap_bio_cond(mdev)) {
2987 /* DRBD has frozen IO */
2988 r = bdi_bits;
2989 reason = 'd';
2990 goto out;
2991 }
2992
2993 if (get_ldev(mdev)) {
2994 q = bdev_get_queue(mdev->ldev->backing_bdev);
2995 r = bdi_congested(&q->backing_dev_info, bdi_bits);
2996 put_ldev(mdev);
2997 if (r)
2998 reason = 'b';
2999 }
3000
3001 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3002 r |= (1 << BDI_async_congested);
3003 reason = reason == 'b' ? 'a' : 'n';
3004 }
3005
3006out:
3007 mdev->congestion_reason = reason;
3008 return r;
3009}
3010
3011struct drbd_conf *drbd_new_device(unsigned int minor)
3012{
3013 struct drbd_conf *mdev;
3014 struct gendisk *disk;
3015 struct request_queue *q;
3016
3017 /* GFP_KERNEL, we are outside of all write-out paths */
3018 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3019 if (!mdev)
3020 return NULL;
3021 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3022 goto out_no_cpumask;
3023
3024 mdev->minor = minor;
3025
3026 drbd_init_set_defaults(mdev);
3027
3028 q = blk_alloc_queue(GFP_KERNEL);
3029 if (!q)
3030 goto out_no_q;
3031 mdev->rq_queue = q;
3032 q->queuedata = mdev;
b411b363
PR
3033
3034 disk = alloc_disk(1);
3035 if (!disk)
3036 goto out_no_disk;
3037 mdev->vdisk = disk;
3038
3039 set_disk_ro(disk, TRUE);
3040
3041 disk->queue = q;
3042 disk->major = DRBD_MAJOR;
3043 disk->first_minor = minor;
3044 disk->fops = &drbd_ops;
3045 sprintf(disk->disk_name, "drbd%d", minor);
3046 disk->private_data = mdev;
3047
3048 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3049 /* we have no partitions. we contain only ourselves. */
3050 mdev->this_bdev->bd_contains = mdev->this_bdev;
3051
3052 q->backing_dev_info.congested_fn = drbd_congested;
3053 q->backing_dev_info.congested_data = mdev;
3054
3055 blk_queue_make_request(q, drbd_make_request_26);
98ec286e 3056 blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
b411b363
PR
3057 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3058 blk_queue_merge_bvec(q, drbd_merge_bvec);
3059 q->queue_lock = &mdev->req_lock; /* needed since we use */
3060 /* plugging on a queue, that actually has no requests! */
3061 q->unplug_fn = drbd_unplug_fn;
3062
3063 mdev->md_io_page = alloc_page(GFP_KERNEL);
3064 if (!mdev->md_io_page)
3065 goto out_no_io_page;
3066
3067 if (drbd_bm_init(mdev))
3068 goto out_no_bitmap;
3069 /* no need to lock access, we are still initializing this minor device. */
3070 if (!tl_init(mdev))
3071 goto out_no_tl;
3072
3073 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3074 if (!mdev->app_reads_hash)
3075 goto out_no_app_reads;
3076
3077 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3078 if (!mdev->current_epoch)
3079 goto out_no_epoch;
3080
3081 INIT_LIST_HEAD(&mdev->current_epoch->list);
3082 mdev->epochs = 1;
3083
3084 return mdev;
3085
3086/* out_whatever_else:
3087 kfree(mdev->current_epoch); */
3088out_no_epoch:
3089 kfree(mdev->app_reads_hash);
3090out_no_app_reads:
3091 tl_cleanup(mdev);
3092out_no_tl:
3093 drbd_bm_cleanup(mdev);
3094out_no_bitmap:
3095 __free_page(mdev->md_io_page);
3096out_no_io_page:
3097 put_disk(disk);
3098out_no_disk:
3099 blk_cleanup_queue(q);
3100out_no_q:
3101 free_cpumask_var(mdev->cpu_mask);
3102out_no_cpumask:
3103 kfree(mdev);
3104 return NULL;
3105}
3106
3107/* counterpart of drbd_new_device.
3108 * last part of drbd_delete_device. */
3109void drbd_free_mdev(struct drbd_conf *mdev)
3110{
3111 kfree(mdev->current_epoch);
3112 kfree(mdev->app_reads_hash);
3113 tl_cleanup(mdev);
3114 if (mdev->bitmap) /* should no longer be there. */
3115 drbd_bm_cleanup(mdev);
3116 __free_page(mdev->md_io_page);
3117 put_disk(mdev->vdisk);
3118 blk_cleanup_queue(mdev->rq_queue);
3119 free_cpumask_var(mdev->cpu_mask);
3120 kfree(mdev);
3121}
3122
3123
3124int __init drbd_init(void)
3125{
3126 int err;
3127
3128 if (sizeof(struct p_handshake) != 80) {
3129 printk(KERN_ERR
3130 "drbd: never change the size or layout "
3131 "of the HandShake packet.\n");
3132 return -EINVAL;
3133 }
3134
3135 if (1 > minor_count || minor_count > 255) {
3136 printk(KERN_ERR
3137 "drbd: invalid minor_count (%d)\n", minor_count);
3138#ifdef MODULE
3139 return -EINVAL;
3140#else
3141 minor_count = 8;
3142#endif
3143 }
3144
3145 err = drbd_nl_init();
3146 if (err)
3147 return err;
3148
3149 err = register_blkdev(DRBD_MAJOR, "drbd");
3150 if (err) {
3151 printk(KERN_ERR
3152 "drbd: unable to register block device major %d\n",
3153 DRBD_MAJOR);
3154 return err;
3155 }
3156
3157 register_reboot_notifier(&drbd_notifier);
3158
3159 /*
3160 * allocate all necessary structs
3161 */
3162 err = -ENOMEM;
3163
3164 init_waitqueue_head(&drbd_pp_wait);
3165
3166 drbd_proc = NULL; /* play safe for drbd_cleanup */
3167 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3168 GFP_KERNEL);
3169 if (!minor_table)
3170 goto Enomem;
3171
3172 err = drbd_create_mempools();
3173 if (err)
3174 goto Enomem;
3175
8c484ee4 3176 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
b411b363
PR
3177 if (!drbd_proc) {
3178 printk(KERN_ERR "drbd: unable to register proc file\n");
3179 goto Enomem;
3180 }
3181
3182 rwlock_init(&global_state_lock);
3183
3184 printk(KERN_INFO "drbd: initialized. "
3185 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3186 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3187 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3188 printk(KERN_INFO "drbd: registered as block device major %d\n",
3189 DRBD_MAJOR);
3190 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3191
3192 return 0; /* Success! */
3193
3194Enomem:
3195 drbd_cleanup();
3196 if (err == -ENOMEM)
3197 /* currently always the case */
3198 printk(KERN_ERR "drbd: ran out of memory\n");
3199 else
3200 printk(KERN_ERR "drbd: initialization failure\n");
3201 return err;
3202}
3203
3204void drbd_free_bc(struct drbd_backing_dev *ldev)
3205{
3206 if (ldev == NULL)
3207 return;
3208
3209 bd_release(ldev->backing_bdev);
3210 bd_release(ldev->md_bdev);
3211
3212 fput(ldev->lo_file);
3213 fput(ldev->md_file);
3214
3215 kfree(ldev);
3216}
3217
3218void drbd_free_sock(struct drbd_conf *mdev)
3219{
3220 if (mdev->data.socket) {
4589d7f8 3221 mutex_lock(&mdev->data.mutex);
b411b363
PR
3222 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3223 sock_release(mdev->data.socket);
3224 mdev->data.socket = NULL;
4589d7f8 3225 mutex_unlock(&mdev->data.mutex);
b411b363
PR
3226 }
3227 if (mdev->meta.socket) {
4589d7f8 3228 mutex_lock(&mdev->meta.mutex);
b411b363
PR
3229 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3230 sock_release(mdev->meta.socket);
3231 mdev->meta.socket = NULL;
4589d7f8 3232 mutex_unlock(&mdev->meta.mutex);
b411b363
PR
3233 }
3234}
3235
3236
3237void drbd_free_resources(struct drbd_conf *mdev)
3238{
3239 crypto_free_hash(mdev->csums_tfm);
3240 mdev->csums_tfm = NULL;
3241 crypto_free_hash(mdev->verify_tfm);
3242 mdev->verify_tfm = NULL;
3243 crypto_free_hash(mdev->cram_hmac_tfm);
3244 mdev->cram_hmac_tfm = NULL;
3245 crypto_free_hash(mdev->integrity_w_tfm);
3246 mdev->integrity_w_tfm = NULL;
3247 crypto_free_hash(mdev->integrity_r_tfm);
3248 mdev->integrity_r_tfm = NULL;
3249
3250 drbd_free_sock(mdev);
3251
3252 __no_warn(local,
3253 drbd_free_bc(mdev->ldev);
3254 mdev->ldev = NULL;);
3255}
3256
3257/* meta data management */
3258
3259struct meta_data_on_disk {
3260 u64 la_size; /* last agreed size. */
3261 u64 uuid[UI_SIZE]; /* UUIDs. */
3262 u64 device_uuid;
3263 u64 reserved_u64_1;
3264 u32 flags; /* MDF */
3265 u32 magic;
3266 u32 md_size_sect;
3267 u32 al_offset; /* offset to this block */
3268 u32 al_nr_extents; /* important for restoring the AL */
3269 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3270 u32 bm_offset; /* offset to the bitmap, from here */
3271 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3272 u32 reserved_u32[4];
3273
3274} __packed;
3275
3276/**
3277 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3278 * @mdev: DRBD device.
3279 */
3280void drbd_md_sync(struct drbd_conf *mdev)
3281{
3282 struct meta_data_on_disk *buffer;
3283 sector_t sector;
3284 int i;
3285
3286 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3287 return;
3288 del_timer(&mdev->md_sync_timer);
3289
3290 /* We use here D_FAILED and not D_ATTACHING because we try to write
3291 * metadata even if we detach due to a disk failure! */
3292 if (!get_ldev_if_state(mdev, D_FAILED))
3293 return;
3294
b411b363
PR
3295 mutex_lock(&mdev->md_io_mutex);
3296 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3297 memset(buffer, 0, 512);
3298
3299 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3300 for (i = UI_CURRENT; i < UI_SIZE; i++)
3301 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3302 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3303 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3304
3305 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3306 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3307 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3308 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3309 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3310
3311 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3312
3313 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3314 sector = mdev->ldev->md.md_offset;
3315
3316 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3317 clear_bit(MD_DIRTY, &mdev->flags);
3318 } else {
3319 /* this was a try anyways ... */
3320 dev_err(DEV, "meta data update failed!\n");
3321
3322 drbd_chk_io_error(mdev, 1, TRUE);
3323 }
3324
3325 /* Update mdev->ldev->md.la_size_sect,
3326 * since we updated it on metadata. */
3327 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3328
3329 mutex_unlock(&mdev->md_io_mutex);
3330 put_ldev(mdev);
3331}
3332
3333/**
3334 * drbd_md_read() - Reads in the meta data super block
3335 * @mdev: DRBD device.
3336 * @bdev: Device from which the meta data should be read in.
3337 *
3338 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3339 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3340 */
3341int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3342{
3343 struct meta_data_on_disk *buffer;
3344 int i, rv = NO_ERROR;
3345
3346 if (!get_ldev_if_state(mdev, D_ATTACHING))
3347 return ERR_IO_MD_DISK;
3348
b411b363
PR
3349 mutex_lock(&mdev->md_io_mutex);
3350 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3351
3352 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3353 /* NOTE: cant do normal error processing here as this is
3354 called BEFORE disk is attached */
3355 dev_err(DEV, "Error while reading metadata.\n");
3356 rv = ERR_IO_MD_DISK;
3357 goto err;
3358 }
3359
3360 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3361 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3362 rv = ERR_MD_INVALID;
3363 goto err;
3364 }
3365 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3366 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3367 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3368 rv = ERR_MD_INVALID;
3369 goto err;
3370 }
3371 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3372 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3373 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3374 rv = ERR_MD_INVALID;
3375 goto err;
3376 }
3377 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3378 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3379 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3380 rv = ERR_MD_INVALID;
3381 goto err;
3382 }
3383
3384 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3385 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3386 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3387 rv = ERR_MD_INVALID;
3388 goto err;
3389 }
3390
3391 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3392 for (i = UI_CURRENT; i < UI_SIZE; i++)
3393 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3394 bdev->md.flags = be32_to_cpu(buffer->flags);
3395 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3396 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3397
3398 if (mdev->sync_conf.al_extents < 7)
3399 mdev->sync_conf.al_extents = 127;
3400
3401 err:
3402 mutex_unlock(&mdev->md_io_mutex);
3403 put_ldev(mdev);
3404
3405 return rv;
3406}
3407
3408/**
3409 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3410 * @mdev: DRBD device.
3411 *
3412 * Call this function if you change anything that should be written to
3413 * the meta-data super block. This function sets MD_DIRTY, and starts a
3414 * timer that ensures that within five seconds you have to call drbd_md_sync().
3415 */
3416void drbd_md_mark_dirty(struct drbd_conf *mdev)
3417{
3418 set_bit(MD_DIRTY, &mdev->flags);
3419 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3420}
3421
3422
3423static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3424{
3425 int i;
3426
6a0afdf5 3427 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
b411b363 3428 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
b411b363
PR
3429}
3430
3431void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3432{
3433 if (idx == UI_CURRENT) {
3434 if (mdev->state.role == R_PRIMARY)
3435 val |= 1;
3436 else
3437 val &= ~((u64)1);
3438
3439 drbd_set_ed_uuid(mdev, val);
3440 }
3441
3442 mdev->ldev->md.uuid[idx] = val;
b411b363
PR
3443 drbd_md_mark_dirty(mdev);
3444}
3445
3446
3447void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3448{
3449 if (mdev->ldev->md.uuid[idx]) {
3450 drbd_uuid_move_history(mdev);
3451 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
b411b363
PR
3452 }
3453 _drbd_uuid_set(mdev, idx, val);
3454}
3455
3456/**
3457 * drbd_uuid_new_current() - Creates a new current UUID
3458 * @mdev: DRBD device.
3459 *
3460 * Creates a new current UUID, and rotates the old current UUID into
3461 * the bitmap slot. Causes an incremental resync upon next connect.
3462 */
3463void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3464{
3465 u64 val;
3466
3467 dev_info(DEV, "Creating new current UUID\n");
3468 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3469 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
b411b363
PR
3470
3471 get_random_bytes(&val, sizeof(u64));
3472 _drbd_uuid_set(mdev, UI_CURRENT, val);
3473}
3474
3475void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3476{
3477 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3478 return;
3479
3480 if (val == 0) {
3481 drbd_uuid_move_history(mdev);
3482 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3483 mdev->ldev->md.uuid[UI_BITMAP] = 0;
b411b363
PR
3484 } else {
3485 if (mdev->ldev->md.uuid[UI_BITMAP])
3486 dev_warn(DEV, "bm UUID already set");
3487
3488 mdev->ldev->md.uuid[UI_BITMAP] = val;
3489 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3490
b411b363
PR
3491 }
3492 drbd_md_mark_dirty(mdev);
3493}
3494
3495/**
3496 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3497 * @mdev: DRBD device.
3498 *
3499 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3500 */
3501int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3502{
3503 int rv = -EIO;
3504
3505 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3506 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3507 drbd_md_sync(mdev);
3508 drbd_bm_set_all(mdev);
3509
3510 rv = drbd_bm_write(mdev);
3511
3512 if (!rv) {
3513 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3514 drbd_md_sync(mdev);
3515 }
3516
3517 put_ldev(mdev);
3518 }
3519
3520 return rv;
3521}
3522
3523/**
3524 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3525 * @mdev: DRBD device.
3526 *
3527 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3528 */
3529int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3530{
3531 int rv = -EIO;
3532
3533 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3534 drbd_bm_clear_all(mdev);
3535 rv = drbd_bm_write(mdev);
3536 put_ldev(mdev);
3537 }
3538
3539 return rv;
3540}
3541
3542static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3543{
3544 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3545 int rv;
3546
3547 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3548
3549 drbd_bm_lock(mdev, work->why);
3550 rv = work->io_fn(mdev);
3551 drbd_bm_unlock(mdev);
3552
3553 clear_bit(BITMAP_IO, &mdev->flags);
3554 wake_up(&mdev->misc_wait);
3555
3556 if (work->done)
3557 work->done(mdev, rv);
3558
3559 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3560 work->why = NULL;
3561
3562 return 1;
3563}
3564
3565/**
3566 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3567 * @mdev: DRBD device.
3568 * @io_fn: IO callback to be called when bitmap IO is possible
3569 * @done: callback to be called after the bitmap IO was performed
3570 * @why: Descriptive text of the reason for doing the IO
3571 *
3572 * While IO on the bitmap happens we freeze application IO thus we ensure
3573 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3574 * called from worker context. It MUST NOT be used while a previous such
3575 * work is still pending!
3576 */
3577void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3578 int (*io_fn)(struct drbd_conf *),
3579 void (*done)(struct drbd_conf *, int),
3580 char *why)
3581{
3582 D_ASSERT(current == mdev->worker.task);
3583
3584 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3585 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3586 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3587 if (mdev->bm_io_work.why)
3588 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3589 why, mdev->bm_io_work.why);
3590
3591 mdev->bm_io_work.io_fn = io_fn;
3592 mdev->bm_io_work.done = done;
3593 mdev->bm_io_work.why = why;
3594
3595 set_bit(BITMAP_IO, &mdev->flags);
3596 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3597 if (list_empty(&mdev->bm_io_work.w.list)) {
3598 set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3599 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3600 } else
3601 dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3602 }
3603}
3604
3605/**
3606 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3607 * @mdev: DRBD device.
3608 * @io_fn: IO callback to be called when bitmap IO is possible
3609 * @why: Descriptive text of the reason for doing the IO
3610 *
3611 * freezes application IO while that the actual IO operations runs. This
3612 * functions MAY NOT be called from worker context.
3613 */
3614int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3615{
3616 int rv;
3617
3618 D_ASSERT(current != mdev->worker.task);
3619
3620 drbd_suspend_io(mdev);
3621
3622 drbd_bm_lock(mdev, why);
3623 rv = io_fn(mdev);
3624 drbd_bm_unlock(mdev);
3625
3626 drbd_resume_io(mdev);
3627
3628 return rv;
3629}
3630
3631void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3632{
3633 if ((mdev->ldev->md.flags & flag) != flag) {
3634 drbd_md_mark_dirty(mdev);
3635 mdev->ldev->md.flags |= flag;
3636 }
3637}
3638
3639void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3640{
3641 if ((mdev->ldev->md.flags & flag) != 0) {
3642 drbd_md_mark_dirty(mdev);
3643 mdev->ldev->md.flags &= ~flag;
3644 }
3645}
3646int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3647{
3648 return (bdev->md.flags & flag) != 0;
3649}
3650
3651static void md_sync_timer_fn(unsigned long data)
3652{
3653 struct drbd_conf *mdev = (struct drbd_conf *) data;
3654
3655 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3656}
3657
3658static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3659{
3660 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3661 drbd_md_sync(mdev);
3662
3663 return 1;
3664}
3665
3666#ifdef CONFIG_DRBD_FAULT_INJECTION
3667/* Fault insertion support including random number generator shamelessly
3668 * stolen from kernel/rcutorture.c */
3669struct fault_random_state {
3670 unsigned long state;
3671 unsigned long count;
3672};
3673
3674#define FAULT_RANDOM_MULT 39916801 /* prime */
3675#define FAULT_RANDOM_ADD 479001701 /* prime */
3676#define FAULT_RANDOM_REFRESH 10000
3677
3678/*
3679 * Crude but fast random-number generator. Uses a linear congruential
3680 * generator, with occasional help from get_random_bytes().
3681 */
3682static unsigned long
3683_drbd_fault_random(struct fault_random_state *rsp)
3684{
3685 long refresh;
3686
49829ea7 3687 if (!rsp->count--) {
b411b363
PR
3688 get_random_bytes(&refresh, sizeof(refresh));
3689 rsp->state += refresh;
3690 rsp->count = FAULT_RANDOM_REFRESH;
3691 }
3692 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3693 return swahw32(rsp->state);
3694}
3695
3696static char *
3697_drbd_fault_str(unsigned int type) {
3698 static char *_faults[] = {
3699 [DRBD_FAULT_MD_WR] = "Meta-data write",
3700 [DRBD_FAULT_MD_RD] = "Meta-data read",
3701 [DRBD_FAULT_RS_WR] = "Resync write",
3702 [DRBD_FAULT_RS_RD] = "Resync read",
3703 [DRBD_FAULT_DT_WR] = "Data write",
3704 [DRBD_FAULT_DT_RD] = "Data read",
3705 [DRBD_FAULT_DT_RA] = "Data read ahead",
3706 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
6b4388ac
PR
3707 [DRBD_FAULT_AL_EE] = "EE allocation",
3708 [DRBD_FAULT_RECEIVE] = "receive data corruption",
b411b363
PR
3709 };
3710
3711 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3712}
3713
3714unsigned int
3715_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3716{
3717 static struct fault_random_state rrs = {0, 0};
3718
3719 unsigned int ret = (
3720 (fault_devs == 0 ||
3721 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3722 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3723
3724 if (ret) {
3725 fault_count++;
3726
3727 if (printk_ratelimit())
3728 dev_warn(DEV, "***Simulating %s failure\n",
3729 _drbd_fault_str(type));
3730 }
3731
3732 return ret;
3733}
3734#endif
3735
3736const char *drbd_buildtag(void)
3737{
3738 /* DRBD built from external sources has here a reference to the
3739 git hash of the source code. */
3740
3741 static char buildtag[38] = "\0uilt-in";
3742
3743 if (buildtag[0] == 0) {
3744#ifdef CONFIG_MODULES
3745 if (THIS_MODULE != NULL)
3746 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3747 else
3748#endif
3749 buildtag[0] = 'b';
3750 }
3751
3752 return buildtag;
3753}
3754
3755module_init(drbd_init)
3756module_exit(drbd_cleanup)
3757
b411b363
PR
3758EXPORT_SYMBOL(drbd_conn_str);
3759EXPORT_SYMBOL(drbd_role_str);
3760EXPORT_SYMBOL(drbd_disk_str);
3761EXPORT_SYMBOL(drbd_set_st_err_str);
This page took 0.181346 seconds and 5 git commands to generate.