drbd: Moved the state functions into its own source file
[deliverable/linux.git] / drivers / block / drbd / drbd_main.c
CommitLineData
b411b363
PR
1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
b411b363 29#include <linux/module.h>
b411b363
PR
30#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
2a48fc0a 35#include <linux/mutex.h>
b411b363
PR
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
b411b363
PR
55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
2a48fc0a 59static DEFINE_MUTEX(drbd_main_mutex);
b411b363
PR
60int drbdd_init(struct drbd_thread *);
61int drbd_worker(struct drbd_thread *);
62int drbd_asender(struct drbd_thread *);
63
64int drbd_init(void);
65static int drbd_open(struct block_device *bdev, fmode_t mode);
66static int drbd_release(struct gendisk *gd, fmode_t mode);
b411b363
PR
67static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
68static void md_sync_timer_fn(unsigned long data);
69static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
e9e6f3ec 70static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
b411b363 71
b411b363
PR
72MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
73 "Lars Ellenberg <lars@linbit.com>");
74MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
75MODULE_VERSION(REL_VERSION);
76MODULE_LICENSE("GPL");
2b8a90b5
PR
77MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
78 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
b411b363
PR
79MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
80
81#include <linux/moduleparam.h>
82/* allow_open_on_secondary */
83MODULE_PARM_DESC(allow_oos, "DONT USE!");
84/* thanks to these macros, if compiled into the kernel (not-module),
85 * this becomes the boot parameter drbd.minor_count */
86module_param(minor_count, uint, 0444);
87module_param(disable_sendpage, bool, 0644);
88module_param(allow_oos, bool, 0);
89module_param(cn_idx, uint, 0444);
90module_param(proc_details, int, 0644);
91
92#ifdef CONFIG_DRBD_FAULT_INJECTION
93int enable_faults;
94int fault_rate;
95static int fault_count;
96int fault_devs;
97/* bitmap of enabled faults */
98module_param(enable_faults, int, 0664);
99/* fault rate % value - applies to all enabled faults */
100module_param(fault_rate, int, 0664);
101/* count of faults inserted */
102module_param(fault_count, int, 0664);
103/* bitmap of devices to insert faults on */
104module_param(fault_devs, int, 0644);
105#endif
106
107/* module parameter, defined */
2b8a90b5 108unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
b411b363
PR
109int disable_sendpage;
110int allow_oos;
111unsigned int cn_idx = CN_IDX_DRBD;
112int proc_details; /* Detail level in proc drbd*/
113
114/* Module parameter for setting the user mode helper program
115 * to run. Default is /sbin/drbdadm */
116char usermode_helper[80] = "/sbin/drbdadm";
117
118module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
119
120/* in 2.6.x, our device mapping and config info contains our virtual gendisks
121 * as member "struct gendisk *vdisk;"
122 */
123struct drbd_conf **minor_table;
2111438b 124struct list_head drbd_tconns; /* list of struct drbd_tconn */
b411b363
PR
125
126struct kmem_cache *drbd_request_cache;
6c852bec 127struct kmem_cache *drbd_ee_cache; /* peer requests */
b411b363
PR
128struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
129struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
130mempool_t *drbd_request_mempool;
131mempool_t *drbd_ee_mempool;
132
133/* I do not use a standard mempool, because:
134 1) I want to hand out the pre-allocated objects first.
135 2) I want to be able to interrupt sleeping allocation with a signal.
136 Note: This is a single linked list, the next pointer is the private
137 member of struct page.
138 */
139struct page *drbd_pp_pool;
140spinlock_t drbd_pp_lock;
141int drbd_pp_vacant;
142wait_queue_head_t drbd_pp_wait;
143
144DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
145
7d4e9d09 146static const struct block_device_operations drbd_ops = {
b411b363
PR
147 .owner = THIS_MODULE,
148 .open = drbd_open,
149 .release = drbd_release,
150};
151
152#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
153
154#ifdef __CHECKER__
155/* When checking with sparse, and this is an inline function, sparse will
156 give tons of false positives. When this is a real functions sparse works.
157 */
158int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
159{
160 int io_allowed;
161
162 atomic_inc(&mdev->local_cnt);
163 io_allowed = (mdev->state.disk >= mins);
164 if (!io_allowed) {
165 if (atomic_dec_and_test(&mdev->local_cnt))
166 wake_up(&mdev->misc_wait);
167 }
168 return io_allowed;
169}
170
171#endif
172
173/**
174 * DOC: The transfer log
175 *
176 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
87eeee41 177 * mdev->tconn->newest_tle points to the head, mdev->tconn->oldest_tle points to the tail
b411b363
PR
178 * of the list. There is always at least one &struct drbd_tl_epoch object.
179 *
180 * Each &struct drbd_tl_epoch has a circular double linked list of requests
181 * attached.
182 */
183static int tl_init(struct drbd_conf *mdev)
184{
185 struct drbd_tl_epoch *b;
186
187 /* during device minor initialization, we may well use GFP_KERNEL */
188 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
189 if (!b)
190 return 0;
191 INIT_LIST_HEAD(&b->requests);
192 INIT_LIST_HEAD(&b->w.list);
193 b->next = NULL;
194 b->br_number = 4711;
7e602c0a 195 b->n_writes = 0;
b411b363
PR
196 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
197
87eeee41
PR
198 mdev->tconn->oldest_tle = b;
199 mdev->tconn->newest_tle = b;
200 INIT_LIST_HEAD(&mdev->tconn->out_of_sequence_requests);
b411b363 201
b411b363
PR
202 return 1;
203}
204
205static void tl_cleanup(struct drbd_conf *mdev)
206{
87eeee41
PR
207 D_ASSERT(mdev->tconn->oldest_tle == mdev->tconn->newest_tle);
208 D_ASSERT(list_empty(&mdev->tconn->out_of_sequence_requests));
209 kfree(mdev->tconn->oldest_tle);
210 mdev->tconn->oldest_tle = NULL;
211 kfree(mdev->tconn->unused_spare_tle);
212 mdev->tconn->unused_spare_tle = NULL;
d628769b
AG
213}
214
b411b363
PR
215/**
216 * _tl_add_barrier() - Adds a barrier to the transfer log
217 * @mdev: DRBD device.
218 * @new: Barrier to be added before the current head of the TL.
219 *
220 * The caller must hold the req_lock.
221 */
222void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
223{
224 struct drbd_tl_epoch *newest_before;
225
226 INIT_LIST_HEAD(&new->requests);
227 INIT_LIST_HEAD(&new->w.list);
228 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
229 new->next = NULL;
7e602c0a 230 new->n_writes = 0;
b411b363 231
87eeee41 232 newest_before = mdev->tconn->newest_tle;
b411b363
PR
233 /* never send a barrier number == 0, because that is special-cased
234 * when using TCQ for our write ordering code */
235 new->br_number = (newest_before->br_number+1) ?: 1;
87eeee41
PR
236 if (mdev->tconn->newest_tle != new) {
237 mdev->tconn->newest_tle->next = new;
238 mdev->tconn->newest_tle = new;
b411b363
PR
239 }
240}
241
242/**
243 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
244 * @mdev: DRBD device.
245 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
246 * @set_size: Expected number of requests before that barrier.
247 *
248 * In case the passed barrier_nr or set_size does not match the oldest
249 * &struct drbd_tl_epoch objects this function will cause a termination
250 * of the connection.
251 */
252void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
253 unsigned int set_size)
254{
255 struct drbd_tl_epoch *b, *nob; /* next old barrier */
256 struct list_head *le, *tle;
257 struct drbd_request *r;
258
87eeee41 259 spin_lock_irq(&mdev->tconn->req_lock);
b411b363 260
87eeee41 261 b = mdev->tconn->oldest_tle;
b411b363
PR
262
263 /* first some paranoia code */
264 if (b == NULL) {
265 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
266 barrier_nr);
267 goto bail;
268 }
269 if (b->br_number != barrier_nr) {
270 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
271 barrier_nr, b->br_number);
272 goto bail;
273 }
7e602c0a
PR
274 if (b->n_writes != set_size) {
275 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
276 barrier_nr, set_size, b->n_writes);
b411b363
PR
277 goto bail;
278 }
279
280 /* Clean up list of requests processed during current epoch */
281 list_for_each_safe(le, tle, &b->requests) {
282 r = list_entry(le, struct drbd_request, tl_requests);
8554df1c 283 _req_mod(r, BARRIER_ACKED);
b411b363
PR
284 }
285 /* There could be requests on the list waiting for completion
286 of the write to the local disk. To avoid corruptions of
287 slab's data structures we have to remove the lists head.
288
289 Also there could have been a barrier ack out of sequence, overtaking
290 the write acks - which would be a bug and violating write ordering.
291 To not deadlock in case we lose connection while such requests are
292 still pending, we need some way to find them for the
8554df1c 293 _req_mode(CONNECTION_LOST_WHILE_PENDING).
b411b363
PR
294
295 These have been list_move'd to the out_of_sequence_requests list in
8554df1c 296 _req_mod(, BARRIER_ACKED) above.
b411b363
PR
297 */
298 list_del_init(&b->requests);
299
300 nob = b->next;
301 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
302 _tl_add_barrier(mdev, b);
303 if (nob)
87eeee41 304 mdev->tconn->oldest_tle = nob;
b411b363 305 /* if nob == NULL b was the only barrier, and becomes the new
87eeee41 306 barrier. Therefore mdev->tconn->oldest_tle points already to b */
b411b363
PR
307 } else {
308 D_ASSERT(nob != NULL);
87eeee41 309 mdev->tconn->oldest_tle = nob;
b411b363
PR
310 kfree(b);
311 }
312
87eeee41 313 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
314 dec_ap_pending(mdev);
315
316 return;
317
318bail:
87eeee41 319 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
320 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
321}
322
617049aa 323
b411b363 324/**
11b58e73 325 * _tl_restart() - Walks the transfer log, and applies an action to all requests
b411b363 326 * @mdev: DRBD device.
11b58e73 327 * @what: The action/event to perform with all request objects
b411b363 328 *
8554df1c
AG
329 * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
330 * RESTART_FROZEN_DISK_IO.
b411b363 331 */
b8907339 332void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
b411b363 333{
11b58e73 334 struct drbd_tl_epoch *b, *tmp, **pn;
b9b98716 335 struct list_head *le, *tle, carry_reads;
11b58e73
PR
336 struct drbd_request *req;
337 int rv, n_writes, n_reads;
b411b363 338
87eeee41
PR
339 b = mdev->tconn->oldest_tle;
340 pn = &mdev->tconn->oldest_tle;
b411b363 341 while (b) {
11b58e73
PR
342 n_writes = 0;
343 n_reads = 0;
b9b98716 344 INIT_LIST_HEAD(&carry_reads);
b411b363 345 list_for_each_safe(le, tle, &b->requests) {
11b58e73
PR
346 req = list_entry(le, struct drbd_request, tl_requests);
347 rv = _req_mod(req, what);
348
349 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
350 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
b411b363
PR
351 }
352 tmp = b->next;
353
b9b98716 354 if (n_writes) {
8554df1c 355 if (what == RESEND) {
11b58e73
PR
356 b->n_writes = n_writes;
357 if (b->w.cb == NULL) {
358 b->w.cb = w_send_barrier;
359 inc_ap_pending(mdev);
360 set_bit(CREATE_BARRIER, &mdev->flags);
361 }
362
e42325a5 363 drbd_queue_work(&mdev->tconn->data.work, &b->w);
11b58e73
PR
364 }
365 pn = &b->next;
366 } else {
b9b98716
PR
367 if (n_reads)
368 list_add(&carry_reads, &b->requests);
11b58e73
PR
369 /* there could still be requests on that ring list,
370 * in case local io is still pending */
371 list_del(&b->requests);
372
373 /* dec_ap_pending corresponding to queue_barrier.
374 * the newest barrier may not have been queued yet,
375 * in which case w.cb is still NULL. */
376 if (b->w.cb != NULL)
377 dec_ap_pending(mdev);
378
87eeee41 379 if (b == mdev->tconn->newest_tle) {
11b58e73
PR
380 /* recycle, but reinit! */
381 D_ASSERT(tmp == NULL);
382 INIT_LIST_HEAD(&b->requests);
b9b98716 383 list_splice(&carry_reads, &b->requests);
11b58e73
PR
384 INIT_LIST_HEAD(&b->w.list);
385 b->w.cb = NULL;
386 b->br_number = net_random();
387 b->n_writes = 0;
388
389 *pn = b;
390 break;
391 }
392 *pn = tmp;
393 kfree(b);
b411b363 394 }
b411b363 395 b = tmp;
b9b98716 396 list_splice(&carry_reads, &b->requests);
b411b363 397 }
11b58e73
PR
398}
399
b411b363
PR
400
401/**
402 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
403 * @mdev: DRBD device.
404 *
405 * This is called after the connection to the peer was lost. The storage covered
406 * by the requests on the transfer gets marked as our of sync. Called from the
407 * receiver thread and the worker thread.
408 */
409void tl_clear(struct drbd_conf *mdev)
410{
b411b363
PR
411 struct list_head *le, *tle;
412 struct drbd_request *r;
b411b363 413
87eeee41 414 spin_lock_irq(&mdev->tconn->req_lock);
b411b363 415
8554df1c 416 _tl_restart(mdev, CONNECTION_LOST_WHILE_PENDING);
b411b363
PR
417
418 /* we expect this list to be empty. */
87eeee41 419 D_ASSERT(list_empty(&mdev->tconn->out_of_sequence_requests));
b411b363
PR
420
421 /* but just in case, clean it up anyways! */
87eeee41 422 list_for_each_safe(le, tle, &mdev->tconn->out_of_sequence_requests) {
b411b363
PR
423 r = list_entry(le, struct drbd_request, tl_requests);
424 /* It would be nice to complete outside of spinlock.
425 * But this is easier for now. */
8554df1c 426 _req_mod(r, CONNECTION_LOST_WHILE_PENDING);
b411b363
PR
427 }
428
429 /* ensure bit indicating barrier is required is clear */
430 clear_bit(CREATE_BARRIER, &mdev->flags);
431
87eeee41 432 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
433}
434
11b58e73
PR
435void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
436{
87eeee41 437 spin_lock_irq(&mdev->tconn->req_lock);
11b58e73 438 _tl_restart(mdev, what);
87eeee41 439 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
440}
441
b411b363
PR
442static int drbd_thread_setup(void *arg)
443{
444 struct drbd_thread *thi = (struct drbd_thread *) arg;
445 struct drbd_conf *mdev = thi->mdev;
446 unsigned long flags;
447 int retval;
448
449restart:
450 retval = thi->function(thi);
451
452 spin_lock_irqsave(&thi->t_lock, flags);
453
e77a0a5c 454 /* if the receiver has been "EXITING", the last thing it did
b411b363
PR
455 * was set the conn state to "StandAlone",
456 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
457 * and receiver thread will be "started".
e77a0a5c 458 * drbd_thread_start needs to set "RESTARTING" in that case.
b411b363 459 * t_state check and assignment needs to be within the same spinlock,
e77a0a5c
AG
460 * so either thread_start sees EXITING, and can remap to RESTARTING,
461 * or thread_start see NONE, and can proceed as normal.
b411b363
PR
462 */
463
e77a0a5c 464 if (thi->t_state == RESTARTING) {
b411b363 465 dev_info(DEV, "Restarting %s\n", current->comm);
e77a0a5c 466 thi->t_state = RUNNING;
b411b363
PR
467 spin_unlock_irqrestore(&thi->t_lock, flags);
468 goto restart;
469 }
470
471 thi->task = NULL;
e77a0a5c 472 thi->t_state = NONE;
b411b363
PR
473 smp_mb();
474 complete(&thi->stop);
475 spin_unlock_irqrestore(&thi->t_lock, flags);
476
477 dev_info(DEV, "Terminating %s\n", current->comm);
478
479 /* Release mod reference taken when thread was started */
480 module_put(THIS_MODULE);
481 return retval;
482}
483
484static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
485 int (*func) (struct drbd_thread *))
486{
487 spin_lock_init(&thi->t_lock);
488 thi->task = NULL;
e77a0a5c 489 thi->t_state = NONE;
b411b363
PR
490 thi->function = func;
491 thi->mdev = mdev;
492}
493
494int drbd_thread_start(struct drbd_thread *thi)
495{
496 struct drbd_conf *mdev = thi->mdev;
497 struct task_struct *nt;
498 unsigned long flags;
499
500 const char *me =
e6b3ea83
PR
501 thi == &mdev->tconn->receiver ? "receiver" :
502 thi == &mdev->tconn->asender ? "asender" :
503 thi == &mdev->tconn->worker ? "worker" : "NONSENSE";
b411b363
PR
504
505 /* is used from state engine doing drbd_thread_stop_nowait,
506 * while holding the req lock irqsave */
507 spin_lock_irqsave(&thi->t_lock, flags);
508
509 switch (thi->t_state) {
e77a0a5c 510 case NONE:
b411b363
PR
511 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
512 me, current->comm, current->pid);
513
514 /* Get ref on module for thread - this is released when thread exits */
515 if (!try_module_get(THIS_MODULE)) {
516 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
517 spin_unlock_irqrestore(&thi->t_lock, flags);
81e84650 518 return false;
b411b363
PR
519 }
520
521 init_completion(&thi->stop);
522 D_ASSERT(thi->task == NULL);
523 thi->reset_cpu_mask = 1;
e77a0a5c 524 thi->t_state = RUNNING;
b411b363
PR
525 spin_unlock_irqrestore(&thi->t_lock, flags);
526 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
527
528 nt = kthread_create(drbd_thread_setup, (void *) thi,
529 "drbd%d_%s", mdev_to_minor(mdev), me);
530
531 if (IS_ERR(nt)) {
532 dev_err(DEV, "Couldn't start thread\n");
533
534 module_put(THIS_MODULE);
81e84650 535 return false;
b411b363
PR
536 }
537 spin_lock_irqsave(&thi->t_lock, flags);
538 thi->task = nt;
e77a0a5c 539 thi->t_state = RUNNING;
b411b363
PR
540 spin_unlock_irqrestore(&thi->t_lock, flags);
541 wake_up_process(nt);
542 break;
e77a0a5c
AG
543 case EXITING:
544 thi->t_state = RESTARTING;
b411b363
PR
545 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
546 me, current->comm, current->pid);
547 /* fall through */
e77a0a5c
AG
548 case RUNNING:
549 case RESTARTING:
b411b363
PR
550 default:
551 spin_unlock_irqrestore(&thi->t_lock, flags);
552 break;
553 }
554
81e84650 555 return true;
b411b363
PR
556}
557
558
559void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
560{
561 unsigned long flags;
562
e77a0a5c 563 enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
b411b363
PR
564
565 /* may be called from state engine, holding the req lock irqsave */
566 spin_lock_irqsave(&thi->t_lock, flags);
567
e77a0a5c 568 if (thi->t_state == NONE) {
b411b363
PR
569 spin_unlock_irqrestore(&thi->t_lock, flags);
570 if (restart)
571 drbd_thread_start(thi);
572 return;
573 }
574
575 if (thi->t_state != ns) {
576 if (thi->task == NULL) {
577 spin_unlock_irqrestore(&thi->t_lock, flags);
578 return;
579 }
580
581 thi->t_state = ns;
582 smp_mb();
583 init_completion(&thi->stop);
584 if (thi->task != current)
585 force_sig(DRBD_SIGKILL, thi->task);
586
587 }
588
589 spin_unlock_irqrestore(&thi->t_lock, flags);
590
591 if (wait)
592 wait_for_completion(&thi->stop);
593}
594
595#ifdef CONFIG_SMP
596/**
597 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
598 * @mdev: DRBD device.
599 *
600 * Forces all threads of a device onto the same CPU. This is beneficial for
601 * DRBD's performance. May be overwritten by user's configuration.
602 */
603void drbd_calc_cpu_mask(struct drbd_conf *mdev)
604{
605 int ord, cpu;
606
607 /* user override. */
608 if (cpumask_weight(mdev->cpu_mask))
609 return;
610
611 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
612 for_each_online_cpu(cpu) {
613 if (ord-- == 0) {
614 cpumask_set_cpu(cpu, mdev->cpu_mask);
615 return;
616 }
617 }
618 /* should not be reached */
619 cpumask_setall(mdev->cpu_mask);
620}
621
622/**
623 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
624 * @mdev: DRBD device.
625 *
626 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
627 * prematurely.
628 */
629void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
630{
631 struct task_struct *p = current;
632 struct drbd_thread *thi =
e6b3ea83
PR
633 p == mdev->tconn->asender.task ? &mdev->tconn->asender :
634 p == mdev->tconn->receiver.task ? &mdev->tconn->receiver :
635 p == mdev->tconn->worker.task ? &mdev->tconn->worker :
b411b363 636 NULL;
841ce241 637 if (!expect(thi != NULL))
b411b363
PR
638 return;
639 if (!thi->reset_cpu_mask)
640 return;
641 thi->reset_cpu_mask = 0;
642 set_cpus_allowed_ptr(p, mdev->cpu_mask);
643}
644#endif
645
fd340c12 646static void prepare_header80(struct drbd_conf *mdev, struct p_header80 *h,
d8763023 647 enum drbd_packet cmd, int size)
fd340c12
PR
648{
649 h->magic = cpu_to_be32(DRBD_MAGIC);
650 h->command = cpu_to_be16(cmd);
651 h->length = cpu_to_be16(size);
652}
653
654static void prepare_header95(struct drbd_conf *mdev, struct p_header95 *h,
d8763023 655 enum drbd_packet cmd, int size)
fd340c12
PR
656{
657 h->magic = cpu_to_be16(DRBD_MAGIC_BIG);
658 h->command = cpu_to_be16(cmd);
659 h->length = cpu_to_be32(size);
660}
661
662static void prepare_header(struct drbd_conf *mdev, struct p_header *h,
d8763023 663 enum drbd_packet cmd, int size)
fd340c12
PR
664{
665 if (mdev->tconn->agreed_pro_version >= 100 || size > DRBD_MAX_SIZE_H80_PACKET)
666 prepare_header95(mdev, &h->h95, cmd, size);
667 else
668 prepare_header80(mdev, &h->h80, cmd, size);
669}
670
b411b363
PR
671/* the appropriate socket mutex must be held already */
672int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
d8763023
AG
673 enum drbd_packet cmd, struct p_header *h, size_t size,
674 unsigned msg_flags)
b411b363
PR
675{
676 int sent, ok;
677
841ce241
AG
678 if (!expect(h))
679 return false;
680 if (!expect(size))
681 return false;
b411b363 682
fd340c12 683 prepare_header(mdev, h, cmd, size - sizeof(struct p_header));
b411b363 684
b411b363
PR
685 sent = drbd_send(mdev, sock, h, size, msg_flags);
686
687 ok = (sent == size);
0ddc5549
LE
688 if (!ok && !signal_pending(current))
689 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
b411b363
PR
690 cmdname(cmd), (int)size, sent);
691 return ok;
692}
693
694/* don't pass the socket. we may only look at it
695 * when we hold the appropriate socket mutex.
696 */
697int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
d8763023 698 enum drbd_packet cmd, struct p_header *h, size_t size)
b411b363
PR
699{
700 int ok = 0;
701 struct socket *sock;
702
703 if (use_data_socket) {
e42325a5
PR
704 mutex_lock(&mdev->tconn->data.mutex);
705 sock = mdev->tconn->data.socket;
b411b363 706 } else {
e42325a5
PR
707 mutex_lock(&mdev->tconn->meta.mutex);
708 sock = mdev->tconn->meta.socket;
b411b363
PR
709 }
710
711 /* drbd_disconnect() could have called drbd_free_sock()
712 * while we were waiting in down()... */
713 if (likely(sock != NULL))
714 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
715
716 if (use_data_socket)
e42325a5 717 mutex_unlock(&mdev->tconn->data.mutex);
b411b363 718 else
e42325a5 719 mutex_unlock(&mdev->tconn->meta.mutex);
b411b363
PR
720 return ok;
721}
722
d8763023 723int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packet cmd, char *data,
b411b363
PR
724 size_t size)
725{
fd340c12 726 struct p_header h;
b411b363
PR
727 int ok;
728
fd340c12 729 prepare_header(mdev, &h, cmd, size);
b411b363
PR
730
731 if (!drbd_get_data_sock(mdev))
732 return 0;
733
b411b363 734 ok = (sizeof(h) ==
e42325a5 735 drbd_send(mdev, mdev->tconn->data.socket, &h, sizeof(h), 0));
b411b363 736 ok = ok && (size ==
e42325a5 737 drbd_send(mdev, mdev->tconn->data.socket, data, size, 0));
b411b363
PR
738
739 drbd_put_data_sock(mdev);
740
741 return ok;
742}
743
744int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
745{
8e26f9cc 746 struct p_rs_param_95 *p;
b411b363
PR
747 struct socket *sock;
748 int size, rv;
31890f4a 749 const int apv = mdev->tconn->agreed_pro_version;
b411b363
PR
750
751 size = apv <= 87 ? sizeof(struct p_rs_param)
752 : apv == 88 ? sizeof(struct p_rs_param)
753 + strlen(mdev->sync_conf.verify_alg) + 1
8e26f9cc
PR
754 : apv <= 94 ? sizeof(struct p_rs_param_89)
755 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
b411b363
PR
756
757 /* used from admin command context and receiver/worker context.
758 * to avoid kmalloc, grab the socket right here,
759 * then use the pre-allocated sbuf there */
e42325a5
PR
760 mutex_lock(&mdev->tconn->data.mutex);
761 sock = mdev->tconn->data.socket;
b411b363
PR
762
763 if (likely(sock != NULL)) {
d8763023
AG
764 enum drbd_packet cmd =
765 apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
b411b363 766
e42325a5 767 p = &mdev->tconn->data.sbuf.rs_param_95;
b411b363
PR
768
769 /* initialize verify_alg and csums_alg */
770 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
771
772 p->rate = cpu_to_be32(sc->rate);
8e26f9cc
PR
773 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
774 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
775 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
776 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
b411b363
PR
777
778 if (apv >= 88)
779 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
780 if (apv >= 89)
781 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
782
783 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
784 } else
785 rv = 0; /* not ok */
786
e42325a5 787 mutex_unlock(&mdev->tconn->data.mutex);
b411b363
PR
788
789 return rv;
790}
791
792int drbd_send_protocol(struct drbd_conf *mdev)
793{
794 struct p_protocol *p;
cf14c2e9 795 int size, cf, rv;
b411b363
PR
796
797 size = sizeof(struct p_protocol);
798
31890f4a 799 if (mdev->tconn->agreed_pro_version >= 87)
89e58e75 800 size += strlen(mdev->tconn->net_conf->integrity_alg) + 1;
b411b363
PR
801
802 /* we must not recurse into our own queue,
803 * as that is blocked during handshake */
804 p = kmalloc(size, GFP_NOIO);
805 if (p == NULL)
806 return 0;
807
89e58e75
PR
808 p->protocol = cpu_to_be32(mdev->tconn->net_conf->wire_protocol);
809 p->after_sb_0p = cpu_to_be32(mdev->tconn->net_conf->after_sb_0p);
810 p->after_sb_1p = cpu_to_be32(mdev->tconn->net_conf->after_sb_1p);
811 p->after_sb_2p = cpu_to_be32(mdev->tconn->net_conf->after_sb_2p);
812 p->two_primaries = cpu_to_be32(mdev->tconn->net_conf->two_primaries);
b411b363 813
cf14c2e9 814 cf = 0;
89e58e75 815 if (mdev->tconn->net_conf->want_lose)
cf14c2e9 816 cf |= CF_WANT_LOSE;
89e58e75 817 if (mdev->tconn->net_conf->dry_run) {
31890f4a 818 if (mdev->tconn->agreed_pro_version >= 92)
cf14c2e9
PR
819 cf |= CF_DRY_RUN;
820 else {
821 dev_err(DEV, "--dry-run is not supported by peer");
7ac314c8 822 kfree(p);
148efa16 823 return -1;
cf14c2e9
PR
824 }
825 }
826 p->conn_flags = cpu_to_be32(cf);
827
31890f4a 828 if (mdev->tconn->agreed_pro_version >= 87)
89e58e75 829 strcpy(p->integrity_alg, mdev->tconn->net_conf->integrity_alg);
b411b363 830
c012949a 831 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, &p->head, size);
b411b363
PR
832 kfree(p);
833 return rv;
834}
835
836int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
837{
838 struct p_uuids p;
839 int i;
840
841 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
842 return 1;
843
844 for (i = UI_CURRENT; i < UI_SIZE; i++)
845 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
846
847 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
848 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
89e58e75 849 uuid_flags |= mdev->tconn->net_conf->want_lose ? 1 : 0;
b411b363
PR
850 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
851 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
852 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
853
854 put_ldev(mdev);
855
c012949a 856 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, &p.head, sizeof(p));
b411b363
PR
857}
858
859int drbd_send_uuids(struct drbd_conf *mdev)
860{
861 return _drbd_send_uuids(mdev, 0);
862}
863
864int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
865{
866 return _drbd_send_uuids(mdev, 8);
867}
868
62b0da3a
LE
869void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
870{
871 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
872 u64 *uuid = mdev->ldev->md.uuid;
873 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
874 text,
875 (unsigned long long)uuid[UI_CURRENT],
876 (unsigned long long)uuid[UI_BITMAP],
877 (unsigned long long)uuid[UI_HISTORY_START],
878 (unsigned long long)uuid[UI_HISTORY_END]);
879 put_ldev(mdev);
880 } else {
881 dev_info(DEV, "%s effective data uuid: %016llX\n",
882 text,
883 (unsigned long long)mdev->ed_uuid);
884 }
885}
886
5a22db89 887int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
b411b363
PR
888{
889 struct p_rs_uuid p;
5a22db89
LE
890 u64 uuid;
891
892 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
b411b363 893
4a23f264 894 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
5a22db89 895 drbd_uuid_set(mdev, UI_BITMAP, uuid);
62b0da3a 896 drbd_print_uuids(mdev, "updated sync UUID");
5a22db89
LE
897 drbd_md_sync(mdev);
898 p.uuid = cpu_to_be64(uuid);
b411b363 899
c012949a 900 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, &p.head, sizeof(p));
b411b363
PR
901}
902
e89b591c 903int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
b411b363
PR
904{
905 struct p_sizes p;
906 sector_t d_size, u_size;
99432fcc 907 int q_order_type, max_bio_size;
b411b363
PR
908 int ok;
909
910 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
911 D_ASSERT(mdev->ldev->backing_bdev);
912 d_size = drbd_get_max_capacity(mdev->ldev);
913 u_size = mdev->ldev->dc.disk_size;
914 q_order_type = drbd_queue_order_type(mdev);
99432fcc
PR
915 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
916 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
b411b363
PR
917 put_ldev(mdev);
918 } else {
919 d_size = 0;
920 u_size = 0;
921 q_order_type = QUEUE_ORDERED_NONE;
99432fcc 922 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
b411b363
PR
923 }
924
925 p.d_size = cpu_to_be64(d_size);
926 p.u_size = cpu_to_be64(u_size);
927 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
99432fcc 928 p.max_bio_size = cpu_to_be32(max_bio_size);
e89b591c
PR
929 p.queue_order_type = cpu_to_be16(q_order_type);
930 p.dds_flags = cpu_to_be16(flags);
b411b363 931
c012949a 932 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, &p.head, sizeof(p));
b411b363
PR
933 return ok;
934}
935
936/**
937 * drbd_send_state() - Sends the drbd state to the peer
938 * @mdev: DRBD device.
939 */
940int drbd_send_state(struct drbd_conf *mdev)
941{
942 struct socket *sock;
943 struct p_state p;
944 int ok = 0;
945
946 /* Grab state lock so we wont send state if we're in the middle
947 * of a cluster wide state change on another thread */
948 drbd_state_lock(mdev);
949
e42325a5 950 mutex_lock(&mdev->tconn->data.mutex);
b411b363
PR
951
952 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
e42325a5 953 sock = mdev->tconn->data.socket;
b411b363
PR
954
955 if (likely(sock != NULL)) {
c012949a 956 ok = _drbd_send_cmd(mdev, sock, P_STATE, &p.head, sizeof(p), 0);
b411b363
PR
957 }
958
e42325a5 959 mutex_unlock(&mdev->tconn->data.mutex);
b411b363
PR
960
961 drbd_state_unlock(mdev);
962 return ok;
963}
964
965int drbd_send_state_req(struct drbd_conf *mdev,
966 union drbd_state mask, union drbd_state val)
967{
968 struct p_req_state p;
969
970 p.mask = cpu_to_be32(mask.i);
971 p.val = cpu_to_be32(val.i);
972
c012949a 973 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, &p.head, sizeof(p));
b411b363
PR
974}
975
bf885f8a 976int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
b411b363
PR
977{
978 struct p_req_state_reply p;
979
980 p.retcode = cpu_to_be32(retcode);
981
c012949a 982 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, &p.head, sizeof(p));
b411b363
PR
983}
984
985int fill_bitmap_rle_bits(struct drbd_conf *mdev,
986 struct p_compressed_bm *p,
987 struct bm_xfer_ctx *c)
988{
989 struct bitstream bs;
990 unsigned long plain_bits;
991 unsigned long tmp;
992 unsigned long rl;
993 unsigned len;
994 unsigned toggle;
995 int bits;
996
997 /* may we use this feature? */
998 if ((mdev->sync_conf.use_rle == 0) ||
31890f4a 999 (mdev->tconn->agreed_pro_version < 90))
b411b363
PR
1000 return 0;
1001
1002 if (c->bit_offset >= c->bm_bits)
1003 return 0; /* nothing to do. */
1004
1005 /* use at most thus many bytes */
1006 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1007 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1008 /* plain bits covered in this code string */
1009 plain_bits = 0;
1010
1011 /* p->encoding & 0x80 stores whether the first run length is set.
1012 * bit offset is implicit.
1013 * start with toggle == 2 to be able to tell the first iteration */
1014 toggle = 2;
1015
1016 /* see how much plain bits we can stuff into one packet
1017 * using RLE and VLI. */
1018 do {
1019 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1020 : _drbd_bm_find_next(mdev, c->bit_offset);
1021 if (tmp == -1UL)
1022 tmp = c->bm_bits;
1023 rl = tmp - c->bit_offset;
1024
1025 if (toggle == 2) { /* first iteration */
1026 if (rl == 0) {
1027 /* the first checked bit was set,
1028 * store start value, */
1029 DCBP_set_start(p, 1);
1030 /* but skip encoding of zero run length */
1031 toggle = !toggle;
1032 continue;
1033 }
1034 DCBP_set_start(p, 0);
1035 }
1036
1037 /* paranoia: catch zero runlength.
1038 * can only happen if bitmap is modified while we scan it. */
1039 if (rl == 0) {
1040 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
1041 "t:%u bo:%lu\n", toggle, c->bit_offset);
1042 return -1;
1043 }
1044
1045 bits = vli_encode_bits(&bs, rl);
1046 if (bits == -ENOBUFS) /* buffer full */
1047 break;
1048 if (bits <= 0) {
1049 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
1050 return 0;
1051 }
1052
1053 toggle = !toggle;
1054 plain_bits += rl;
1055 c->bit_offset = tmp;
1056 } while (c->bit_offset < c->bm_bits);
1057
1058 len = bs.cur.b - p->code + !!bs.cur.bit;
1059
1060 if (plain_bits < (len << 3)) {
1061 /* incompressible with this method.
1062 * we need to rewind both word and bit position. */
1063 c->bit_offset -= plain_bits;
1064 bm_xfer_ctx_bit_to_word_offset(c);
1065 c->bit_offset = c->word_offset * BITS_PER_LONG;
1066 return 0;
1067 }
1068
1069 /* RLE + VLI was able to compress it just fine.
1070 * update c->word_offset. */
1071 bm_xfer_ctx_bit_to_word_offset(c);
1072
1073 /* store pad_bits */
1074 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1075
1076 return len;
1077}
1078
f70af118
AG
1079/**
1080 * send_bitmap_rle_or_plain
1081 *
1082 * Return 0 when done, 1 when another iteration is needed, and a negative error
1083 * code upon failure.
1084 */
1085static int
b411b363 1086send_bitmap_rle_or_plain(struct drbd_conf *mdev,
c012949a 1087 struct p_header *h, struct bm_xfer_ctx *c)
b411b363
PR
1088{
1089 struct p_compressed_bm *p = (void*)h;
1090 unsigned long num_words;
1091 int len;
1092 int ok;
1093
1094 len = fill_bitmap_rle_bits(mdev, p, c);
1095
1096 if (len < 0)
f70af118 1097 return -EIO;
b411b363
PR
1098
1099 if (len) {
1100 DCBP_set_code(p, RLE_VLI_Bits);
e42325a5 1101 ok = _drbd_send_cmd(mdev, mdev->tconn->data.socket, P_COMPRESSED_BITMAP, h,
b411b363
PR
1102 sizeof(*p) + len, 0);
1103
1104 c->packets[0]++;
1105 c->bytes[0] += sizeof(*p) + len;
1106
1107 if (c->bit_offset >= c->bm_bits)
1108 len = 0; /* DONE */
1109 } else {
1110 /* was not compressible.
1111 * send a buffer full of plain text bits instead. */
1112 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
1113 len = num_words * sizeof(long);
1114 if (len)
1115 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
e42325a5 1116 ok = _drbd_send_cmd(mdev, mdev->tconn->data.socket, P_BITMAP,
0b70a13d 1117 h, sizeof(struct p_header80) + len, 0);
b411b363
PR
1118 c->word_offset += num_words;
1119 c->bit_offset = c->word_offset * BITS_PER_LONG;
1120
1121 c->packets[1]++;
0b70a13d 1122 c->bytes[1] += sizeof(struct p_header80) + len;
b411b363
PR
1123
1124 if (c->bit_offset > c->bm_bits)
1125 c->bit_offset = c->bm_bits;
1126 }
f70af118
AG
1127 if (ok) {
1128 if (len == 0) {
1129 INFO_bm_xfer_stats(mdev, "send", c);
1130 return 0;
1131 } else
1132 return 1;
1133 }
1134 return -EIO;
b411b363
PR
1135}
1136
1137/* See the comment at receive_bitmap() */
1138int _drbd_send_bitmap(struct drbd_conf *mdev)
1139{
1140 struct bm_xfer_ctx c;
c012949a 1141 struct p_header *p;
f70af118 1142 int err;
b411b363 1143
841ce241
AG
1144 if (!expect(mdev->bitmap))
1145 return false;
b411b363
PR
1146
1147 /* maybe we should use some per thread scratch page,
1148 * and allocate that during initial device creation? */
c012949a 1149 p = (struct p_header *) __get_free_page(GFP_NOIO);
b411b363
PR
1150 if (!p) {
1151 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
81e84650 1152 return false;
b411b363
PR
1153 }
1154
1155 if (get_ldev(mdev)) {
1156 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
1157 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
1158 drbd_bm_set_all(mdev);
1159 if (drbd_bm_write(mdev)) {
1160 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
1161 * but otherwise process as per normal - need to tell other
1162 * side that a full resync is required! */
1163 dev_err(DEV, "Failed to write bitmap to disk!\n");
1164 } else {
1165 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
1166 drbd_md_sync(mdev);
1167 }
1168 }
1169 put_ldev(mdev);
1170 }
1171
1172 c = (struct bm_xfer_ctx) {
1173 .bm_bits = drbd_bm_bits(mdev),
1174 .bm_words = drbd_bm_words(mdev),
1175 };
1176
1177 do {
f70af118
AG
1178 err = send_bitmap_rle_or_plain(mdev, p, &c);
1179 } while (err > 0);
b411b363
PR
1180
1181 free_page((unsigned long) p);
f70af118 1182 return err == 0;
b411b363
PR
1183}
1184
1185int drbd_send_bitmap(struct drbd_conf *mdev)
1186{
1187 int err;
1188
1189 if (!drbd_get_data_sock(mdev))
1190 return -1;
1191 err = !_drbd_send_bitmap(mdev);
1192 drbd_put_data_sock(mdev);
1193 return err;
1194}
1195
1196int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
1197{
1198 int ok;
1199 struct p_barrier_ack p;
1200
1201 p.barrier = barrier_nr;
1202 p.set_size = cpu_to_be32(set_size);
1203
1204 if (mdev->state.conn < C_CONNECTED)
81e84650 1205 return false;
c012949a 1206 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, &p.head, sizeof(p));
b411b363
PR
1207 return ok;
1208}
1209
1210/**
1211 * _drbd_send_ack() - Sends an ack packet
1212 * @mdev: DRBD device.
1213 * @cmd: Packet command code.
1214 * @sector: sector, needs to be in big endian byte order
1215 * @blksize: size in byte, needs to be in big endian byte order
1216 * @block_id: Id, big endian byte order
1217 */
d8763023
AG
1218static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
1219 u64 sector, u32 blksize, u64 block_id)
b411b363
PR
1220{
1221 int ok;
1222 struct p_block_ack p;
1223
1224 p.sector = sector;
1225 p.block_id = block_id;
1226 p.blksize = blksize;
1227 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
1228
e42325a5 1229 if (!mdev->tconn->meta.socket || mdev->state.conn < C_CONNECTED)
81e84650 1230 return false;
c012949a 1231 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, &p.head, sizeof(p));
b411b363
PR
1232 return ok;
1233}
1234
2b2bf214
LE
1235/* dp->sector and dp->block_id already/still in network byte order,
1236 * data_size is payload size according to dp->head,
1237 * and may need to be corrected for digest size. */
d8763023 1238int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
2b2bf214 1239 struct p_data *dp, int data_size)
b411b363 1240{
a0638456
PR
1241 data_size -= (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_r_tfm) ?
1242 crypto_hash_digestsize(mdev->tconn->integrity_r_tfm) : 0;
b411b363
PR
1243 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
1244 dp->block_id);
1245}
1246
d8763023 1247int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
b411b363
PR
1248 struct p_block_req *rp)
1249{
1250 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
1251}
1252
1253/**
1254 * drbd_send_ack() - Sends an ack packet
db830c46
AG
1255 * @mdev: DRBD device
1256 * @cmd: packet command code
1257 * @peer_req: peer request
b411b363 1258 */
d8763023 1259int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
db830c46 1260 struct drbd_peer_request *peer_req)
b411b363
PR
1261{
1262 return _drbd_send_ack(mdev, cmd,
db830c46
AG
1263 cpu_to_be64(peer_req->i.sector),
1264 cpu_to_be32(peer_req->i.size),
1265 peer_req->block_id);
b411b363
PR
1266}
1267
1268/* This function misuses the block_id field to signal if the blocks
1269 * are is sync or not. */
d8763023 1270int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
b411b363
PR
1271 sector_t sector, int blksize, u64 block_id)
1272{
1273 return _drbd_send_ack(mdev, cmd,
1274 cpu_to_be64(sector),
1275 cpu_to_be32(blksize),
1276 cpu_to_be64(block_id));
1277}
1278
1279int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
1280 sector_t sector, int size, u64 block_id)
1281{
1282 int ok;
1283 struct p_block_req p;
1284
1285 p.sector = cpu_to_be64(sector);
1286 p.block_id = block_id;
1287 p.blksize = cpu_to_be32(size);
1288
c012949a 1289 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &p.head, sizeof(p));
b411b363
PR
1290 return ok;
1291}
1292
d8763023
AG
1293int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size,
1294 void *digest, int digest_size, enum drbd_packet cmd)
b411b363
PR
1295{
1296 int ok;
1297 struct p_block_req p;
1298
fd340c12 1299 prepare_header(mdev, &p.head, cmd, sizeof(p) - sizeof(struct p_header) + digest_size);
b411b363 1300 p.sector = cpu_to_be64(sector);
9a8e7753 1301 p.block_id = ID_SYNCER /* unused */;
b411b363
PR
1302 p.blksize = cpu_to_be32(size);
1303
e42325a5 1304 mutex_lock(&mdev->tconn->data.mutex);
b411b363 1305
e42325a5
PR
1306 ok = (sizeof(p) == drbd_send(mdev, mdev->tconn->data.socket, &p, sizeof(p), 0));
1307 ok = ok && (digest_size == drbd_send(mdev, mdev->tconn->data.socket, digest, digest_size, 0));
b411b363 1308
e42325a5 1309 mutex_unlock(&mdev->tconn->data.mutex);
b411b363
PR
1310
1311 return ok;
1312}
1313
1314int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
1315{
1316 int ok;
1317 struct p_block_req p;
1318
1319 p.sector = cpu_to_be64(sector);
9a8e7753 1320 p.block_id = ID_SYNCER /* unused */;
b411b363
PR
1321 p.blksize = cpu_to_be32(size);
1322
c012949a 1323 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, &p.head, sizeof(p));
b411b363
PR
1324 return ok;
1325}
1326
1327/* called on sndtimeo
81e84650
AG
1328 * returns false if we should retry,
1329 * true if we think connection is dead
b411b363
PR
1330 */
1331static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
1332{
1333 int drop_it;
1334 /* long elapsed = (long)(jiffies - mdev->last_received); */
1335
e42325a5 1336 drop_it = mdev->tconn->meta.socket == sock
e6b3ea83
PR
1337 || !mdev->tconn->asender.task
1338 || get_t_state(&mdev->tconn->asender) != RUNNING
b411b363
PR
1339 || mdev->state.conn < C_CONNECTED;
1340
1341 if (drop_it)
81e84650 1342 return true;
b411b363 1343
31890f4a 1344 drop_it = !--mdev->tconn->ko_count;
b411b363
PR
1345 if (!drop_it) {
1346 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
31890f4a 1347 current->comm, current->pid, mdev->tconn->ko_count);
b411b363
PR
1348 request_ping(mdev);
1349 }
1350
1351 return drop_it; /* && (mdev->state == R_PRIMARY) */;
1352}
1353
9e204cdd
AG
1354static void drbd_update_congested(struct drbd_conf *mdev)
1355{
1356 struct sock *sk = mdev->tconn->data.socket->sk;
1357 if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
1358 set_bit(NET_CONGESTED, &mdev->flags);
1359}
1360
b411b363
PR
1361/* The idea of sendpage seems to be to put some kind of reference
1362 * to the page into the skb, and to hand it over to the NIC. In
1363 * this process get_page() gets called.
1364 *
1365 * As soon as the page was really sent over the network put_page()
1366 * gets called by some part of the network layer. [ NIC driver? ]
1367 *
1368 * [ get_page() / put_page() increment/decrement the count. If count
1369 * reaches 0 the page will be freed. ]
1370 *
1371 * This works nicely with pages from FSs.
1372 * But this means that in protocol A we might signal IO completion too early!
1373 *
1374 * In order not to corrupt data during a resync we must make sure
1375 * that we do not reuse our own buffer pages (EEs) to early, therefore
1376 * we have the net_ee list.
1377 *
1378 * XFS seems to have problems, still, it submits pages with page_count == 0!
1379 * As a workaround, we disable sendpage on pages
1380 * with page_count == 0 or PageSlab.
1381 */
1382static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 1383 int offset, size_t size, unsigned msg_flags)
b411b363 1384{
e42325a5 1385 int sent = drbd_send(mdev, mdev->tconn->data.socket, kmap(page) + offset, size, msg_flags);
b411b363
PR
1386 kunmap(page);
1387 if (sent == size)
1388 mdev->send_cnt += size>>9;
1389 return sent == size;
1390}
1391
1392static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 1393 int offset, size_t size, unsigned msg_flags)
b411b363
PR
1394{
1395 mm_segment_t oldfs = get_fs();
1396 int sent, ok;
1397 int len = size;
1398
1399 /* e.g. XFS meta- & log-data is in slab pages, which have a
1400 * page_count of 0 and/or have PageSlab() set.
1401 * we cannot use send_page for those, as that does get_page();
1402 * put_page(); and would cause either a VM_BUG directly, or
1403 * __page_cache_release a page that would actually still be referenced
1404 * by someone, leading to some obscure delayed Oops somewhere else. */
1405 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
ba11ad9a 1406 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
b411b363 1407
ba11ad9a 1408 msg_flags |= MSG_NOSIGNAL;
b411b363
PR
1409 drbd_update_congested(mdev);
1410 set_fs(KERNEL_DS);
1411 do {
e42325a5 1412 sent = mdev->tconn->data.socket->ops->sendpage(mdev->tconn->data.socket, page,
b411b363 1413 offset, len,
ba11ad9a 1414 msg_flags);
b411b363
PR
1415 if (sent == -EAGAIN) {
1416 if (we_should_drop_the_connection(mdev,
e42325a5 1417 mdev->tconn->data.socket))
b411b363
PR
1418 break;
1419 else
1420 continue;
1421 }
1422 if (sent <= 0) {
1423 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
1424 __func__, (int)size, len, sent);
1425 break;
1426 }
1427 len -= sent;
1428 offset += sent;
1429 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
1430 set_fs(oldfs);
1431 clear_bit(NET_CONGESTED, &mdev->flags);
1432
1433 ok = (len == 0);
1434 if (likely(ok))
1435 mdev->send_cnt += size>>9;
1436 return ok;
1437}
1438
1439static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
1440{
1441 struct bio_vec *bvec;
1442 int i;
ba11ad9a 1443 /* hint all but last page with MSG_MORE */
b411b363
PR
1444 __bio_for_each_segment(bvec, bio, i, 0) {
1445 if (!_drbd_no_send_page(mdev, bvec->bv_page,
ba11ad9a
LE
1446 bvec->bv_offset, bvec->bv_len,
1447 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
b411b363
PR
1448 return 0;
1449 }
1450 return 1;
1451}
1452
1453static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
1454{
1455 struct bio_vec *bvec;
1456 int i;
ba11ad9a 1457 /* hint all but last page with MSG_MORE */
b411b363
PR
1458 __bio_for_each_segment(bvec, bio, i, 0) {
1459 if (!_drbd_send_page(mdev, bvec->bv_page,
ba11ad9a
LE
1460 bvec->bv_offset, bvec->bv_len,
1461 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
b411b363
PR
1462 return 0;
1463 }
b411b363
PR
1464 return 1;
1465}
1466
db830c46
AG
1467static int _drbd_send_zc_ee(struct drbd_conf *mdev,
1468 struct drbd_peer_request *peer_req)
45bb912b 1469{
db830c46
AG
1470 struct page *page = peer_req->pages;
1471 unsigned len = peer_req->i.size;
1472
ba11ad9a 1473 /* hint all but last page with MSG_MORE */
45bb912b
LE
1474 page_chain_for_each(page) {
1475 unsigned l = min_t(unsigned, len, PAGE_SIZE);
ba11ad9a
LE
1476 if (!_drbd_send_page(mdev, page, 0, l,
1477 page_chain_next(page) ? MSG_MORE : 0))
45bb912b
LE
1478 return 0;
1479 len -= l;
1480 }
1481 return 1;
1482}
1483
76d2e7ec
PR
1484static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
1485{
31890f4a 1486 if (mdev->tconn->agreed_pro_version >= 95)
76d2e7ec 1487 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
76d2e7ec
PR
1488 (bi_rw & REQ_FUA ? DP_FUA : 0) |
1489 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
1490 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
1491 else
721a9602 1492 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
76d2e7ec
PR
1493}
1494
b411b363
PR
1495/* Used to send write requests
1496 * R_PRIMARY -> Peer (P_DATA)
1497 */
1498int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
1499{
1500 int ok = 1;
1501 struct p_data p;
1502 unsigned int dp_flags = 0;
1503 void *dgb;
1504 int dgs;
1505
1506 if (!drbd_get_data_sock(mdev))
1507 return 0;
1508
a0638456
PR
1509 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_w_tfm) ?
1510 crypto_hash_digestsize(mdev->tconn->integrity_w_tfm) : 0;
b411b363 1511
fd340c12 1512 prepare_header(mdev, &p.head, P_DATA, sizeof(p) - sizeof(struct p_header) + dgs + req->i.size);
ace652ac 1513 p.sector = cpu_to_be64(req->i.sector);
b411b363 1514 p.block_id = (unsigned long)req;
fd340c12 1515 p.seq_num = cpu_to_be32(req->seq_num = atomic_add_return(1, &mdev->packet_seq));
b411b363 1516
76d2e7ec
PR
1517 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
1518
b411b363
PR
1519 if (mdev->state.conn >= C_SYNC_SOURCE &&
1520 mdev->state.conn <= C_PAUSED_SYNC_T)
1521 dp_flags |= DP_MAY_SET_IN_SYNC;
1522
1523 p.dp_flags = cpu_to_be32(dp_flags);
b411b363
PR
1524 set_bit(UNPLUG_REMOTE, &mdev->flags);
1525 ok = (sizeof(p) ==
e42325a5 1526 drbd_send(mdev, mdev->tconn->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
b411b363 1527 if (ok && dgs) {
a0638456
PR
1528 dgb = mdev->tconn->int_dig_out;
1529 drbd_csum_bio(mdev, mdev->tconn->integrity_w_tfm, req->master_bio, dgb);
e42325a5 1530 ok = dgs == drbd_send(mdev, mdev->tconn->data.socket, dgb, dgs, 0);
b411b363
PR
1531 }
1532 if (ok) {
470be44a
LE
1533 /* For protocol A, we have to memcpy the payload into
1534 * socket buffers, as we may complete right away
1535 * as soon as we handed it over to tcp, at which point the data
1536 * pages may become invalid.
1537 *
1538 * For data-integrity enabled, we copy it as well, so we can be
1539 * sure that even if the bio pages may still be modified, it
1540 * won't change the data on the wire, thus if the digest checks
1541 * out ok after sending on this side, but does not fit on the
1542 * receiving side, we sure have detected corruption elsewhere.
1543 */
89e58e75 1544 if (mdev->tconn->net_conf->wire_protocol == DRBD_PROT_A || dgs)
b411b363
PR
1545 ok = _drbd_send_bio(mdev, req->master_bio);
1546 else
1547 ok = _drbd_send_zc_bio(mdev, req->master_bio);
470be44a
LE
1548
1549 /* double check digest, sometimes buffers have been modified in flight. */
1550 if (dgs > 0 && dgs <= 64) {
24c4830c 1551 /* 64 byte, 512 bit, is the largest digest size
470be44a
LE
1552 * currently supported in kernel crypto. */
1553 unsigned char digest[64];
a0638456
PR
1554 drbd_csum_bio(mdev, mdev->tconn->integrity_w_tfm, req->master_bio, digest);
1555 if (memcmp(mdev->tconn->int_dig_out, digest, dgs)) {
470be44a
LE
1556 dev_warn(DEV,
1557 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
ace652ac 1558 (unsigned long long)req->i.sector, req->i.size);
470be44a
LE
1559 }
1560 } /* else if (dgs > 64) {
1561 ... Be noisy about digest too large ...
1562 } */
b411b363
PR
1563 }
1564
1565 drbd_put_data_sock(mdev);
bd26bfc5 1566
b411b363
PR
1567 return ok;
1568}
1569
1570/* answer packet, used to send data back for read requests:
1571 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
1572 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
1573 */
d8763023 1574int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd,
db830c46 1575 struct drbd_peer_request *peer_req)
b411b363
PR
1576{
1577 int ok;
1578 struct p_data p;
1579 void *dgb;
1580 int dgs;
1581
a0638456
PR
1582 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_w_tfm) ?
1583 crypto_hash_digestsize(mdev->tconn->integrity_w_tfm) : 0;
b411b363 1584
db830c46
AG
1585 prepare_header(mdev, &p.head, cmd, sizeof(p) -
1586 sizeof(struct p_header80) +
1587 dgs + peer_req->i.size);
1588 p.sector = cpu_to_be64(peer_req->i.sector);
1589 p.block_id = peer_req->block_id;
cc378270 1590 p.seq_num = 0; /* unused */
b411b363
PR
1591
1592 /* Only called by our kernel thread.
1593 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
1594 * in response to admin command or module unload.
1595 */
1596 if (!drbd_get_data_sock(mdev))
1597 return 0;
1598
e42325a5 1599 ok = sizeof(p) == drbd_send(mdev, mdev->tconn->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
b411b363 1600 if (ok && dgs) {
a0638456 1601 dgb = mdev->tconn->int_dig_out;
db830c46 1602 drbd_csum_ee(mdev, mdev->tconn->integrity_w_tfm, peer_req, dgb);
e42325a5 1603 ok = dgs == drbd_send(mdev, mdev->tconn->data.socket, dgb, dgs, 0);
b411b363
PR
1604 }
1605 if (ok)
db830c46 1606 ok = _drbd_send_zc_ee(mdev, peer_req);
b411b363
PR
1607
1608 drbd_put_data_sock(mdev);
bd26bfc5 1609
b411b363
PR
1610 return ok;
1611}
1612
73a01a18
PR
1613int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
1614{
1615 struct p_block_desc p;
1616
ace652ac
AG
1617 p.sector = cpu_to_be64(req->i.sector);
1618 p.blksize = cpu_to_be32(req->i.size);
73a01a18
PR
1619
1620 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
1621}
1622
b411b363
PR
1623/*
1624 drbd_send distinguishes two cases:
1625
1626 Packets sent via the data socket "sock"
1627 and packets sent via the meta data socket "msock"
1628
1629 sock msock
1630 -----------------+-------------------------+------------------------------
1631 timeout conf.timeout / 2 conf.timeout / 2
1632 timeout action send a ping via msock Abort communication
1633 and close all sockets
1634*/
1635
1636/*
1637 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
1638 */
1639int drbd_send(struct drbd_conf *mdev, struct socket *sock,
1640 void *buf, size_t size, unsigned msg_flags)
1641{
1642 struct kvec iov;
1643 struct msghdr msg;
1644 int rv, sent = 0;
1645
1646 if (!sock)
1647 return -1000;
1648
1649 /* THINK if (signal_pending) return ... ? */
1650
1651 iov.iov_base = buf;
1652 iov.iov_len = size;
1653
1654 msg.msg_name = NULL;
1655 msg.msg_namelen = 0;
1656 msg.msg_control = NULL;
1657 msg.msg_controllen = 0;
1658 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
1659
e42325a5 1660 if (sock == mdev->tconn->data.socket) {
31890f4a 1661 mdev->tconn->ko_count = mdev->tconn->net_conf->ko_count;
b411b363
PR
1662 drbd_update_congested(mdev);
1663 }
1664 do {
1665 /* STRANGE
1666 * tcp_sendmsg does _not_ use its size parameter at all ?
1667 *
1668 * -EAGAIN on timeout, -EINTR on signal.
1669 */
1670/* THINK
1671 * do we need to block DRBD_SIG if sock == &meta.socket ??
1672 * otherwise wake_asender() might interrupt some send_*Ack !
1673 */
1674 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
1675 if (rv == -EAGAIN) {
1676 if (we_should_drop_the_connection(mdev, sock))
1677 break;
1678 else
1679 continue;
1680 }
1681 D_ASSERT(rv != 0);
1682 if (rv == -EINTR) {
1683 flush_signals(current);
1684 rv = 0;
1685 }
1686 if (rv < 0)
1687 break;
1688 sent += rv;
1689 iov.iov_base += rv;
1690 iov.iov_len -= rv;
1691 } while (sent < size);
1692
e42325a5 1693 if (sock == mdev->tconn->data.socket)
b411b363
PR
1694 clear_bit(NET_CONGESTED, &mdev->flags);
1695
1696 if (rv <= 0) {
1697 if (rv != -EAGAIN) {
1698 dev_err(DEV, "%s_sendmsg returned %d\n",
e42325a5 1699 sock == mdev->tconn->meta.socket ? "msock" : "sock",
b411b363
PR
1700 rv);
1701 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
1702 } else
1703 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
1704 }
1705
1706 return sent;
1707}
1708
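/*
 * The block below is an illustrative user-space rendition of the
 * "keep sending until the whole buffer is out" loop implemented by
 * drbd_send() above.  It is not DRBD code; the plain send(2) API and all
 * names are assumptions for the sketch, and it is compiled out via #if 0.
 */
#if 0
#include <sys/socket.h>
#include <errno.h>

static ssize_t send_all(int fd, const void *buf, size_t size)
{
	size_t sent = 0;

	while (sent < size) {
		ssize_t rv = send(fd, (const char *)buf + sent,
				  size - sent, MSG_NOSIGNAL);
		if (rv < 0) {
			if (errno == EINTR)
				continue;	/* interrupted, retry */
			return -1;		/* hard error: give up */
		}
		sent += rv;
	}
	return (ssize_t)sent;
}
#endif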
1709static int drbd_open(struct block_device *bdev, fmode_t mode)
1710{
1711 struct drbd_conf *mdev = bdev->bd_disk->private_data;
1712 unsigned long flags;
1713 int rv = 0;
1714
2a48fc0a 1715 mutex_lock(&drbd_main_mutex);
87eeee41 1716 spin_lock_irqsave(&mdev->tconn->req_lock, flags);
b411b363
PR
1717 /* to have a stable mdev->state.role
1718 * and no race with updating open_cnt */
1719
1720 if (mdev->state.role != R_PRIMARY) {
1721 if (mode & FMODE_WRITE)
1722 rv = -EROFS;
1723 else if (!allow_oos)
1724 rv = -EMEDIUMTYPE;
1725 }
1726
1727 if (!rv)
1728 mdev->open_cnt++;
87eeee41 1729 spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
2a48fc0a 1730 mutex_unlock(&drbd_main_mutex);
b411b363
PR
1731
1732 return rv;
1733}
1734
1735static int drbd_release(struct gendisk *gd, fmode_t mode)
1736{
1737 struct drbd_conf *mdev = gd->private_data;
2a48fc0a 1738 mutex_lock(&drbd_main_mutex);
b411b363 1739 mdev->open_cnt--;
2a48fc0a 1740 mutex_unlock(&drbd_main_mutex);
b411b363
PR
1741 return 0;
1742}
1743
b411b363
PR
1744static void drbd_set_defaults(struct drbd_conf *mdev)
1745{
85f4cc17
PR
1746	/* This way we get a compile error if sync_conf grows
1747	   and we forget to initialize it here */
1748 mdev->sync_conf = (struct syncer_conf) {
1749 /* .rate = */ DRBD_RATE_DEF,
1750 /* .after = */ DRBD_AFTER_DEF,
1751 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
85f4cc17
PR
1752 /* .verify_alg = */ {}, 0,
1753 /* .cpu_mask = */ {}, 0,
1754 /* .csums_alg = */ {}, 0,
e756414f 1755 /* .use_rle = */ 0,
9a31d716
PR
1756 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
1757 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
1758 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
1759 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
0f0601f4
LE
1760 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
1761 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
85f4cc17
PR
1762 };
1763
1764	/* Have to initialize it this way, because the bitfield layout differs between
1765	   big endian and little endian */
b411b363
PR
1766 mdev->state = (union drbd_state) {
1767 { .role = R_SECONDARY,
1768 .peer = R_UNKNOWN,
1769 .conn = C_STANDALONE,
1770 .disk = D_DISKLESS,
1771 .pdsk = D_UNKNOWN,
fb22c402
PR
1772 .susp = 0,
1773 .susp_nod = 0,
1774 .susp_fen = 0
b411b363
PR
1775 } };
1776}
1777
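/*
 * Illustration (assumption: the ".i" member name is taken from the drbd
 * headers): the designated-initializer form used above is what keeps the
 * code endian-safe -- each bitfield member is named explicitly, e.g.
 *
 *	union drbd_state s = { { .role = R_SECONDARY, .conn = C_STANDALONE } };
 *
 * whereas assigning a precomputed integer to the union's raw word
 * (s.i = 0x...) would encode the bitfield layout of one endianness only.
 */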
1778void drbd_init_set_defaults(struct drbd_conf *mdev)
1779{
1780 /* the memset(,0,) did most of this.
1781 * note: only assignments, no allocation in here */
1782
1783 drbd_set_defaults(mdev);
1784
b411b363
PR
1785 atomic_set(&mdev->ap_bio_cnt, 0);
1786 atomic_set(&mdev->ap_pending_cnt, 0);
1787 atomic_set(&mdev->rs_pending_cnt, 0);
1788 atomic_set(&mdev->unacked_cnt, 0);
1789 atomic_set(&mdev->local_cnt, 0);
b411b363 1790 atomic_set(&mdev->pp_in_use, 0);
435f0740 1791 atomic_set(&mdev->pp_in_use_by_net, 0);
778f271d 1792 atomic_set(&mdev->rs_sect_in, 0);
0f0601f4 1793 atomic_set(&mdev->rs_sect_ev, 0);
759fbdfb 1794 atomic_set(&mdev->ap_in_flight, 0);
b411b363
PR
1795
1796 mutex_init(&mdev->md_io_mutex);
e42325a5
PR
1797 mutex_init(&mdev->tconn->data.mutex);
1798 mutex_init(&mdev->tconn->meta.mutex);
1799 sema_init(&mdev->tconn->data.work.s, 0);
1800 sema_init(&mdev->tconn->meta.work.s, 0);
b411b363
PR
1801 mutex_init(&mdev->state_mutex);
1802
e42325a5
PR
1803 spin_lock_init(&mdev->tconn->data.work.q_lock);
1804 spin_lock_init(&mdev->tconn->meta.work.q_lock);
b411b363
PR
1805
1806 spin_lock_init(&mdev->al_lock);
87eeee41 1807 spin_lock_init(&mdev->tconn->req_lock);
b411b363
PR
1808 spin_lock_init(&mdev->peer_seq_lock);
1809 spin_lock_init(&mdev->epoch_lock);
1810
1811 INIT_LIST_HEAD(&mdev->active_ee);
1812 INIT_LIST_HEAD(&mdev->sync_ee);
1813 INIT_LIST_HEAD(&mdev->done_ee);
1814 INIT_LIST_HEAD(&mdev->read_ee);
1815 INIT_LIST_HEAD(&mdev->net_ee);
1816 INIT_LIST_HEAD(&mdev->resync_reads);
e42325a5
PR
1817 INIT_LIST_HEAD(&mdev->tconn->data.work.q);
1818 INIT_LIST_HEAD(&mdev->tconn->meta.work.q);
b411b363
PR
1819 INIT_LIST_HEAD(&mdev->resync_work.list);
1820 INIT_LIST_HEAD(&mdev->unplug_work.list);
e9e6f3ec 1821 INIT_LIST_HEAD(&mdev->go_diskless.list);
b411b363 1822 INIT_LIST_HEAD(&mdev->md_sync_work.list);
c4752ef1 1823 INIT_LIST_HEAD(&mdev->start_resync_work.list);
b411b363 1824 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
0ced55a3 1825
794abb75 1826 mdev->resync_work.cb = w_resync_timer;
b411b363 1827 mdev->unplug_work.cb = w_send_write_hint;
e9e6f3ec 1828 mdev->go_diskless.cb = w_go_diskless;
b411b363
PR
1829 mdev->md_sync_work.cb = w_md_sync;
1830 mdev->bm_io_work.w.cb = w_bitmap_io;
370a43e7 1831 mdev->start_resync_work.cb = w_start_resync;
b411b363
PR
1832 init_timer(&mdev->resync_timer);
1833 init_timer(&mdev->md_sync_timer);
370a43e7 1834 init_timer(&mdev->start_resync_timer);
7fde2be9 1835 init_timer(&mdev->request_timer);
b411b363
PR
1836 mdev->resync_timer.function = resync_timer_fn;
1837 mdev->resync_timer.data = (unsigned long) mdev;
1838 mdev->md_sync_timer.function = md_sync_timer_fn;
1839 mdev->md_sync_timer.data = (unsigned long) mdev;
370a43e7
PR
1840 mdev->start_resync_timer.function = start_resync_timer_fn;
1841 mdev->start_resync_timer.data = (unsigned long) mdev;
7fde2be9
PR
1842 mdev->request_timer.function = request_timer_fn;
1843 mdev->request_timer.data = (unsigned long) mdev;
b411b363
PR
1844
1845 init_waitqueue_head(&mdev->misc_wait);
1846 init_waitqueue_head(&mdev->state_wait);
1847 init_waitqueue_head(&mdev->ee_wait);
1848 init_waitqueue_head(&mdev->al_wait);
1849 init_waitqueue_head(&mdev->seq_wait);
1850
e6b3ea83
PR
1851 drbd_thread_init(mdev, &mdev->tconn->receiver, drbdd_init);
1852 drbd_thread_init(mdev, &mdev->tconn->worker, drbd_worker);
1853 drbd_thread_init(mdev, &mdev->tconn->asender, drbd_asender);
b411b363 1854
fd340c12 1855 /* mdev->tconn->agreed_pro_version gets initialized in drbd_connect() */
2451fc3b 1856 mdev->write_ordering = WO_bdev_flush;
b411b363 1857 mdev->resync_wenr = LC_FREE;
99432fcc
PR
1858 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
1859 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
b411b363
PR
1860}
1861
1862void drbd_mdev_cleanup(struct drbd_conf *mdev)
1863{
1d7734a0 1864 int i;
e6b3ea83 1865 if (mdev->tconn->receiver.t_state != NONE)
b411b363 1866 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
e6b3ea83 1867 mdev->tconn->receiver.t_state);
b411b363
PR
1868
1869 /* no need to lock it, I'm the only thread alive */
1870 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
1871 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
1872 mdev->al_writ_cnt =
1873 mdev->bm_writ_cnt =
1874 mdev->read_cnt =
1875 mdev->recv_cnt =
1876 mdev->send_cnt =
1877 mdev->writ_cnt =
1878 mdev->p_size =
1879 mdev->rs_start =
1880 mdev->rs_total =
1d7734a0
LE
1881 mdev->rs_failed = 0;
1882 mdev->rs_last_events = 0;
0f0601f4 1883 mdev->rs_last_sect_ev = 0;
1d7734a0
LE
1884 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1885 mdev->rs_mark_left[i] = 0;
1886 mdev->rs_mark_time[i] = 0;
1887 }
89e58e75 1888 D_ASSERT(mdev->tconn->net_conf == NULL);
b411b363
PR
1889
1890 drbd_set_my_capacity(mdev, 0);
1891 if (mdev->bitmap) {
1892 /* maybe never allocated. */
02d9a94b 1893 drbd_bm_resize(mdev, 0, 1);
b411b363
PR
1894 drbd_bm_cleanup(mdev);
1895 }
1896
1897 drbd_free_resources(mdev);
0778286a 1898 clear_bit(AL_SUSPENDED, &mdev->flags);
b411b363
PR
1899
1900 /*
1901	 * currently we call drbd_init_ee only on module load, so
1902	 * we may call drbd_release_ee only on module unload!
1903 */
1904 D_ASSERT(list_empty(&mdev->active_ee));
1905 D_ASSERT(list_empty(&mdev->sync_ee));
1906 D_ASSERT(list_empty(&mdev->done_ee));
1907 D_ASSERT(list_empty(&mdev->read_ee));
1908 D_ASSERT(list_empty(&mdev->net_ee));
1909 D_ASSERT(list_empty(&mdev->resync_reads));
e42325a5
PR
1910 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
1911 D_ASSERT(list_empty(&mdev->tconn->meta.work.q));
b411b363
PR
1912 D_ASSERT(list_empty(&mdev->resync_work.list));
1913 D_ASSERT(list_empty(&mdev->unplug_work.list));
e9e6f3ec 1914 D_ASSERT(list_empty(&mdev->go_diskless.list));
2265b473
LE
1915
1916 drbd_set_defaults(mdev);
b411b363
PR
1917}
1918
1919
1920static void drbd_destroy_mempools(void)
1921{
1922 struct page *page;
1923
1924 while (drbd_pp_pool) {
1925 page = drbd_pp_pool;
1926 drbd_pp_pool = (struct page *)page_private(page);
1927 __free_page(page);
1928 drbd_pp_vacant--;
1929 }
1930
1931 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
1932
1933 if (drbd_ee_mempool)
1934 mempool_destroy(drbd_ee_mempool);
1935 if (drbd_request_mempool)
1936 mempool_destroy(drbd_request_mempool);
1937 if (drbd_ee_cache)
1938 kmem_cache_destroy(drbd_ee_cache);
1939 if (drbd_request_cache)
1940 kmem_cache_destroy(drbd_request_cache);
1941 if (drbd_bm_ext_cache)
1942 kmem_cache_destroy(drbd_bm_ext_cache);
1943 if (drbd_al_ext_cache)
1944 kmem_cache_destroy(drbd_al_ext_cache);
1945
1946 drbd_ee_mempool = NULL;
1947 drbd_request_mempool = NULL;
1948 drbd_ee_cache = NULL;
1949 drbd_request_cache = NULL;
1950 drbd_bm_ext_cache = NULL;
1951 drbd_al_ext_cache = NULL;
1952
1953 return;
1954}
1955
1956static int drbd_create_mempools(void)
1957{
1958 struct page *page;
1816a2b4 1959 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
b411b363
PR
1960 int i;
1961
1962 /* prepare our caches and mempools */
1963 drbd_request_mempool = NULL;
1964 drbd_ee_cache = NULL;
1965 drbd_request_cache = NULL;
1966 drbd_bm_ext_cache = NULL;
1967 drbd_al_ext_cache = NULL;
1968 drbd_pp_pool = NULL;
1969
1970 /* caches */
1971 drbd_request_cache = kmem_cache_create(
1972 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
1973 if (drbd_request_cache == NULL)
1974 goto Enomem;
1975
1976 drbd_ee_cache = kmem_cache_create(
f6ffca9f 1977 "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
b411b363
PR
1978 if (drbd_ee_cache == NULL)
1979 goto Enomem;
1980
1981 drbd_bm_ext_cache = kmem_cache_create(
1982 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
1983 if (drbd_bm_ext_cache == NULL)
1984 goto Enomem;
1985
1986 drbd_al_ext_cache = kmem_cache_create(
1987 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
1988 if (drbd_al_ext_cache == NULL)
1989 goto Enomem;
1990
1991 /* mempools */
1992 drbd_request_mempool = mempool_create(number,
1993 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
1994 if (drbd_request_mempool == NULL)
1995 goto Enomem;
1996
1997 drbd_ee_mempool = mempool_create(number,
1998 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2027ae1f 1999 if (drbd_ee_mempool == NULL)
b411b363
PR
2000 goto Enomem;
2001
2002 /* drbd's page pool */
2003 spin_lock_init(&drbd_pp_lock);
2004
2005 for (i = 0; i < number; i++) {
2006 page = alloc_page(GFP_HIGHUSER);
2007 if (!page)
2008 goto Enomem;
2009 set_page_private(page, (unsigned long)drbd_pp_pool);
2010 drbd_pp_pool = page;
2011 }
2012 drbd_pp_vacant = number;
2013
2014 return 0;
2015
2016Enomem:
2017 drbd_destroy_mempools(); /* in case we allocated some */
2018 return -ENOMEM;
2019}
2020
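/*
 * Illustrative sketch, not original DRBD code (compiled out via #if 0):
 * how one page would be popped off the drbd_pp_pool chain built above,
 * where each page links to the next through its page_private field.
 * The real allocator lives elsewhere in drbd and additionally handles
 * waiting and per-device accounting.
 */
#if 0
static struct page *pp_pop_one_page(void)
{
	struct page *page;

	spin_lock(&drbd_pp_lock);
	page = drbd_pp_pool;
	if (page) {
		drbd_pp_pool = (struct page *)page_private(page);
		set_page_private(page, 0);
		drbd_pp_vacant--;
	}
	spin_unlock(&drbd_pp_lock);
	return page;
}
#endif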
2021static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2022 void *unused)
2023{
2024 /* just so we have it. you never know what interesting things we
2025 * might want to do here some day...
2026 */
2027
2028 return NOTIFY_DONE;
2029}
2030
2031static struct notifier_block drbd_notifier = {
2032 .notifier_call = drbd_notify_sys,
2033};
2034
2035static void drbd_release_ee_lists(struct drbd_conf *mdev)
2036{
2037 int rr;
2038
2039 rr = drbd_release_ee(mdev, &mdev->active_ee);
2040 if (rr)
2041 dev_err(DEV, "%d EEs in active list found!\n", rr);
2042
2043 rr = drbd_release_ee(mdev, &mdev->sync_ee);
2044 if (rr)
2045 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2046
2047 rr = drbd_release_ee(mdev, &mdev->read_ee);
2048 if (rr)
2049 dev_err(DEV, "%d EEs in read list found!\n", rr);
2050
2051 rr = drbd_release_ee(mdev, &mdev->done_ee);
2052 if (rr)
2053 dev_err(DEV, "%d EEs in done list found!\n", rr);
2054
2055 rr = drbd_release_ee(mdev, &mdev->net_ee);
2056 if (rr)
2057 dev_err(DEV, "%d EEs in net list found!\n", rr);
2058}
2059
2060/* caution. no locking.
2061 * currently only used from module cleanup code. */
2062static void drbd_delete_device(unsigned int minor)
2063{
2064 struct drbd_conf *mdev = minor_to_mdev(minor);
2065
2066 if (!mdev)
2067 return;
2068
2069 /* paranoia asserts */
70dc65e1 2070 D_ASSERT(mdev->open_cnt == 0);
e42325a5 2071 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
b411b363
PR
2072 /* end paranoia asserts */
2073
2074 del_gendisk(mdev->vdisk);
2075
2076 /* cleanup stuff that may have been allocated during
2077 * device (re-)configuration or state changes */
2078
2079 if (mdev->this_bdev)
2080 bdput(mdev->this_bdev);
2081
2082 drbd_free_resources(mdev);
2111438b 2083 drbd_free_tconn(mdev->tconn);
b411b363
PR
2084
2085 drbd_release_ee_lists(mdev);
2086
b411b363
PR
2087 lc_destroy(mdev->act_log);
2088 lc_destroy(mdev->resync);
2089
2090 kfree(mdev->p_uuid);
2091 /* mdev->p_uuid = NULL; */
2092
b411b363
PR
2093 /* cleanup the rest that has been
2094 * allocated from drbd_new_device
2095 * and actually free the mdev itself */
2096 drbd_free_mdev(mdev);
2097}
2098
2099static void drbd_cleanup(void)
2100{
2101 unsigned int i;
2102
2103 unregister_reboot_notifier(&drbd_notifier);
2104
17a93f30
LE
2105 /* first remove proc,
2106	 * drbdsetup uses its presence to detect
2107 * whether DRBD is loaded.
2108	 * If we got stuck in proc removal,
2109 * but have netlink already deregistered,
2110 * some drbdsetup commands may wait forever
2111 * for an answer.
2112 */
2113 if (drbd_proc)
2114 remove_proc_entry("drbd", NULL);
2115
b411b363
PR
2116 drbd_nl_cleanup();
2117
2118 if (minor_table) {
b411b363
PR
2119 i = minor_count;
2120 while (i--)
2121 drbd_delete_device(i);
2122 drbd_destroy_mempools();
2123 }
2124
2125 kfree(minor_table);
2126
2127 unregister_blkdev(DRBD_MAJOR, "drbd");
2128
2129 printk(KERN_INFO "drbd: module cleanup done.\n");
2130}
2131
2132/**
2133 * drbd_congested() - Callback for pdflush
2134 * @congested_data: User data
2135 * @bdi_bits: Bits pdflush is currently interested in
2136 *
2137 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
2138 */
2139static int drbd_congested(void *congested_data, int bdi_bits)
2140{
2141 struct drbd_conf *mdev = congested_data;
2142 struct request_queue *q;
2143 char reason = '-';
2144 int r = 0;
2145
1b881ef7 2146 if (!may_inc_ap_bio(mdev)) {
b411b363
PR
2147 /* DRBD has frozen IO */
2148 r = bdi_bits;
2149 reason = 'd';
2150 goto out;
2151 }
2152
2153 if (get_ldev(mdev)) {
2154 q = bdev_get_queue(mdev->ldev->backing_bdev);
2155 r = bdi_congested(&q->backing_dev_info, bdi_bits);
2156 put_ldev(mdev);
2157 if (r)
2158 reason = 'b';
2159 }
2160
2161 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
2162 r |= (1 << BDI_async_congested);
2163 reason = reason == 'b' ? 'a' : 'n';
2164 }
2165
2166out:
2167 mdev->congestion_reason = reason;
2168 return r;
2169}
2170
2111438b
PR
2171struct drbd_tconn *drbd_new_tconn(char *name)
2172{
2173 struct drbd_tconn *tconn;
2174
2175 tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL);
2176 if (!tconn)
2177 return NULL;
2178
2179 tconn->name = kstrdup(name, GFP_KERNEL);
2180 if (!tconn->name)
2181 goto fail;
2182
b2fb6dbe
PR
2183 atomic_set(&tconn->net_cnt, 0);
2184 init_waitqueue_head(&tconn->net_cnt_wait);
2185
2111438b
PR
2186 write_lock_irq(&global_state_lock);
2187 list_add(&tconn->all_tconn, &drbd_tconns);
2188 write_unlock_irq(&global_state_lock);
2189
2190 return tconn;
2191
2192fail:
2193 kfree(tconn->name);
2194 kfree(tconn);
2195
2196 return NULL;
2197}
2198
2199void drbd_free_tconn(struct drbd_tconn *tconn)
2200{
2201 write_lock_irq(&global_state_lock);
2202 list_del(&tconn->all_tconn);
2203 write_unlock_irq(&global_state_lock);
2204
2205 kfree(tconn->name);
b42a70ad
PR
2206 kfree(tconn->int_dig_out);
2207 kfree(tconn->int_dig_in);
2208 kfree(tconn->int_dig_vv);
2111438b
PR
2209 kfree(tconn);
2210}
2211
b411b363
PR
2212struct drbd_conf *drbd_new_device(unsigned int minor)
2213{
2214 struct drbd_conf *mdev;
2215 struct gendisk *disk;
2216 struct request_queue *q;
2217
2218 /* GFP_KERNEL, we are outside of all write-out paths */
2219 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
2220 if (!mdev)
2221 return NULL;
2111438b
PR
2222 mdev->tconn = drbd_new_tconn("dummy");
2223 if (!mdev->tconn)
2224 goto out_no_tconn;
2225
b411b363
PR
2226 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
2227 goto out_no_cpumask;
2228
2111438b 2229 mdev->tconn->volume0 = mdev;
b411b363
PR
2230 mdev->minor = minor;
2231
2232 drbd_init_set_defaults(mdev);
2233
2234 q = blk_alloc_queue(GFP_KERNEL);
2235 if (!q)
2236 goto out_no_q;
2237 mdev->rq_queue = q;
2238 q->queuedata = mdev;
b411b363
PR
2239
2240 disk = alloc_disk(1);
2241 if (!disk)
2242 goto out_no_disk;
2243 mdev->vdisk = disk;
2244
81e84650 2245 set_disk_ro(disk, true);
b411b363
PR
2246
2247 disk->queue = q;
2248 disk->major = DRBD_MAJOR;
2249 disk->first_minor = minor;
2250 disk->fops = &drbd_ops;
2251 sprintf(disk->disk_name, "drbd%d", minor);
2252 disk->private_data = mdev;
2253
2254 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
2255 /* we have no partitions. we contain only ourselves. */
2256 mdev->this_bdev->bd_contains = mdev->this_bdev;
2257
2258 q->backing_dev_info.congested_fn = drbd_congested;
2259 q->backing_dev_info.congested_data = mdev;
2260
2f58dcfc 2261 blk_queue_make_request(q, drbd_make_request);
99432fcc
PR
2262	/* Setting the max_hw_sectors to an odd value of 8 KiB here.
2263	   This triggers a max_bio_size message upon first attach or connect. */
2264 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
b411b363
PR
2265 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
2266 blk_queue_merge_bvec(q, drbd_merge_bvec);
87eeee41 2267 q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */
b411b363
PR
2268
2269 mdev->md_io_page = alloc_page(GFP_KERNEL);
2270 if (!mdev->md_io_page)
2271 goto out_no_io_page;
2272
2273 if (drbd_bm_init(mdev))
2274 goto out_no_bitmap;
2275 /* no need to lock access, we are still initializing this minor device. */
2276 if (!tl_init(mdev))
2277 goto out_no_tl;
dac1389c 2278 mdev->read_requests = RB_ROOT;
de696716 2279 mdev->write_requests = RB_ROOT;
b411b363 2280
b411b363
PR
2281 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
2282 if (!mdev->current_epoch)
2283 goto out_no_epoch;
2284
2285 INIT_LIST_HEAD(&mdev->current_epoch->list);
2286 mdev->epochs = 1;
2287
2288 return mdev;
2289
2290/* out_whatever_else:
2291 kfree(mdev->current_epoch); */
2292out_no_epoch:
b411b363
PR
2293 tl_cleanup(mdev);
2294out_no_tl:
2295 drbd_bm_cleanup(mdev);
2296out_no_bitmap:
2297 __free_page(mdev->md_io_page);
2298out_no_io_page:
2299 put_disk(disk);
2300out_no_disk:
2301 blk_cleanup_queue(q);
2302out_no_q:
2303 free_cpumask_var(mdev->cpu_mask);
2304out_no_cpumask:
2111438b
PR
2305 drbd_free_tconn(mdev->tconn);
2306out_no_tconn:
b411b363
PR
2307 kfree(mdev);
2308 return NULL;
2309}
2310
2311/* counterpart of drbd_new_device.
2312 * last part of drbd_delete_device. */
2313void drbd_free_mdev(struct drbd_conf *mdev)
2314{
2315 kfree(mdev->current_epoch);
b411b363
PR
2316 tl_cleanup(mdev);
2317 if (mdev->bitmap) /* should no longer be there. */
2318 drbd_bm_cleanup(mdev);
2319 __free_page(mdev->md_io_page);
2320 put_disk(mdev->vdisk);
2321 blk_cleanup_queue(mdev->rq_queue);
2322 free_cpumask_var(mdev->cpu_mask);
2323 kfree(mdev);
2324}
2325
2326
2327int __init drbd_init(void)
2328{
2329 int err;
2330
fd340c12
PR
2331 BUILD_BUG_ON(sizeof(struct p_header80) != sizeof(struct p_header95));
2332 BUILD_BUG_ON(sizeof(struct p_handshake) != 80);
b411b363 2333
2b8a90b5 2334 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
b411b363
PR
2335 printk(KERN_ERR
2336 "drbd: invalid minor_count (%d)\n", minor_count);
2337#ifdef MODULE
2338 return -EINVAL;
2339#else
2340 minor_count = 8;
2341#endif
2342 }
2343
2344 err = drbd_nl_init();
2345 if (err)
2346 return err;
2347
2348 err = register_blkdev(DRBD_MAJOR, "drbd");
2349 if (err) {
2350 printk(KERN_ERR
2351 "drbd: unable to register block device major %d\n",
2352 DRBD_MAJOR);
2353 return err;
2354 }
2355
2356 register_reboot_notifier(&drbd_notifier);
2357
2358 /*
2359 * allocate all necessary structs
2360 */
2361 err = -ENOMEM;
2362
2363 init_waitqueue_head(&drbd_pp_wait);
2364
2365 drbd_proc = NULL; /* play safe for drbd_cleanup */
2366 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
2367 GFP_KERNEL);
2368 if (!minor_table)
2369 goto Enomem;
2370
2371 err = drbd_create_mempools();
2372 if (err)
2373 goto Enomem;
2374
8c484ee4 2375 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
b411b363
PR
2376 if (!drbd_proc) {
2377 printk(KERN_ERR "drbd: unable to register proc file\n");
2378 goto Enomem;
2379 }
2380
2381 rwlock_init(&global_state_lock);
2111438b 2382 INIT_LIST_HEAD(&drbd_tconns);
b411b363
PR
2383
2384 printk(KERN_INFO "drbd: initialized. "
2385 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
2386 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
2387 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
2388 printk(KERN_INFO "drbd: registered as block device major %d\n",
2389 DRBD_MAJOR);
2390 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
2391
2392 return 0; /* Success! */
2393
2394Enomem:
2395 drbd_cleanup();
2396 if (err == -ENOMEM)
2397 /* currently always the case */
2398 printk(KERN_ERR "drbd: ran out of memory\n");
2399 else
2400 printk(KERN_ERR "drbd: initialization failure\n");
2401 return err;
2402}
2403
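/*
 * Usage hint (assumption, for illustration): when built as a module, the
 * minor_count bound checked above is what a load such as
 * "modprobe drbd minor_count=8" has to respect; an out-of-range value makes
 * module load fail with -EINVAL, while the built-in case falls back to 8.
 */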
2404void drbd_free_bc(struct drbd_backing_dev *ldev)
2405{
2406 if (ldev == NULL)
2407 return;
2408
e525fd89
TH
2409 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2410 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
b411b363
PR
2411
2412 kfree(ldev);
2413}
2414
2415void drbd_free_sock(struct drbd_conf *mdev)
2416{
e42325a5
PR
2417 if (mdev->tconn->data.socket) {
2418 mutex_lock(&mdev->tconn->data.mutex);
2419 kernel_sock_shutdown(mdev->tconn->data.socket, SHUT_RDWR);
2420 sock_release(mdev->tconn->data.socket);
2421 mdev->tconn->data.socket = NULL;
2422 mutex_unlock(&mdev->tconn->data.mutex);
b411b363 2423 }
e42325a5
PR
2424 if (mdev->tconn->meta.socket) {
2425 mutex_lock(&mdev->tconn->meta.mutex);
2426 kernel_sock_shutdown(mdev->tconn->meta.socket, SHUT_RDWR);
2427 sock_release(mdev->tconn->meta.socket);
2428 mdev->tconn->meta.socket = NULL;
2429 mutex_unlock(&mdev->tconn->meta.mutex);
b411b363
PR
2430 }
2431}
2432
2433
2434void drbd_free_resources(struct drbd_conf *mdev)
2435{
2436 crypto_free_hash(mdev->csums_tfm);
2437 mdev->csums_tfm = NULL;
2438 crypto_free_hash(mdev->verify_tfm);
2439 mdev->verify_tfm = NULL;
a0638456
PR
2440 crypto_free_hash(mdev->tconn->cram_hmac_tfm);
2441 mdev->tconn->cram_hmac_tfm = NULL;
2442 crypto_free_hash(mdev->tconn->integrity_w_tfm);
2443 mdev->tconn->integrity_w_tfm = NULL;
2444 crypto_free_hash(mdev->tconn->integrity_r_tfm);
2445 mdev->tconn->integrity_r_tfm = NULL;
b411b363
PR
2446
2447 drbd_free_sock(mdev);
2448
2449 __no_warn(local,
2450 drbd_free_bc(mdev->ldev);
2451 mdev->ldev = NULL;);
2452}
2453
2454/* meta data management */
2455
2456struct meta_data_on_disk {
2457 u64 la_size; /* last agreed size. */
2458 u64 uuid[UI_SIZE]; /* UUIDs. */
2459 u64 device_uuid;
2460 u64 reserved_u64_1;
2461 u32 flags; /* MDF */
2462 u32 magic;
2463 u32 md_size_sect;
2464 u32 al_offset; /* offset to this block */
2465 u32 al_nr_extents; /* important for restoring the AL */
2466 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
2467 u32 bm_offset; /* offset to the bitmap, from here */
2468 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
99432fcc
PR
2469 u32 la_peer_max_bio_size; /* last peer max_bio_size */
2470 u32 reserved_u32[3];
b411b363
PR
2471
2472} __packed;
2473
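/*
 * Illustration (derived from the writer/reader below): every multi-byte
 * field of this on-disk layout is stored big-endian, so drbd_md_sync() and
 * drbd_md_read() always pair the conversions, e.g.
 *
 *	buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
 *	...
 *	if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect)
 *		reject the meta data as invalid;
 *
 * which keeps the format identical no matter which architecture wrote it.
 */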
2474/**
2475 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
2476 * @mdev: DRBD device.
2477 */
2478void drbd_md_sync(struct drbd_conf *mdev)
2479{
2480 struct meta_data_on_disk *buffer;
2481 sector_t sector;
2482 int i;
2483
ee15b038
LE
2484 del_timer(&mdev->md_sync_timer);
2485 /* timer may be rearmed by drbd_md_mark_dirty() now. */
b411b363
PR
2486 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
2487 return;
b411b363
PR
2488
2489	/* We use D_FAILED here and not D_ATTACHING because we try to write
2490 * metadata even if we detach due to a disk failure! */
2491 if (!get_ldev_if_state(mdev, D_FAILED))
2492 return;
2493
b411b363
PR
2494 mutex_lock(&mdev->md_io_mutex);
2495 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
2496 memset(buffer, 0, 512);
2497
2498 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
2499 for (i = UI_CURRENT; i < UI_SIZE; i++)
2500 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
2501 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
2502 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
2503
2504 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
2505 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
2506 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
2507 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
2508 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
2509
2510 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
99432fcc 2511 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
b411b363
PR
2512
2513 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
2514 sector = mdev->ldev->md.md_offset;
2515
3f3a9b84 2516 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
b411b363
PR
2517		/* this was a try anyway ... */
2518 dev_err(DEV, "meta data update failed!\n");
81e84650 2519 drbd_chk_io_error(mdev, 1, true);
b411b363
PR
2520 }
2521
2522 /* Update mdev->ldev->md.la_size_sect,
2523 * since we updated it on metadata. */
2524 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
2525
2526 mutex_unlock(&mdev->md_io_mutex);
2527 put_ldev(mdev);
2528}
2529
2530/**
2531 * drbd_md_read() - Reads in the meta data super block
2532 * @mdev: DRBD device.
2533 * @bdev: Device from which the meta data should be read in.
2534 *
116676ca 2535 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
b411b363
PR
2536 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
2537 */
2538int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2539{
2540 struct meta_data_on_disk *buffer;
2541 int i, rv = NO_ERROR;
2542
2543 if (!get_ldev_if_state(mdev, D_ATTACHING))
2544 return ERR_IO_MD_DISK;
2545
b411b363
PR
2546 mutex_lock(&mdev->md_io_mutex);
2547 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
2548
2549 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
25985edc 2550 /* NOTE: can't do normal error processing here as this is
b411b363
PR
2551 called BEFORE disk is attached */
2552 dev_err(DEV, "Error while reading metadata.\n");
2553 rv = ERR_IO_MD_DISK;
2554 goto err;
2555 }
2556
e7fad8af 2557 if (buffer->magic != cpu_to_be32(DRBD_MD_MAGIC)) {
b411b363
PR
2558 dev_err(DEV, "Error while reading metadata, magic not found.\n");
2559 rv = ERR_MD_INVALID;
2560 goto err;
2561 }
2562 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
2563 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
2564 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
2565 rv = ERR_MD_INVALID;
2566 goto err;
2567 }
2568 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
2569 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
2570 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
2571 rv = ERR_MD_INVALID;
2572 goto err;
2573 }
2574 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
2575 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
2576 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
2577 rv = ERR_MD_INVALID;
2578 goto err;
2579 }
2580
2581 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
2582 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
2583 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
2584 rv = ERR_MD_INVALID;
2585 goto err;
2586 }
2587
2588 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
2589 for (i = UI_CURRENT; i < UI_SIZE; i++)
2590 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
2591 bdev->md.flags = be32_to_cpu(buffer->flags);
2592 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
2593 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
2594
87eeee41 2595 spin_lock_irq(&mdev->tconn->req_lock);
99432fcc
PR
2596 if (mdev->state.conn < C_CONNECTED) {
2597 int peer;
2598 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
2599 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
2600 mdev->peer_max_bio_size = peer;
2601 }
87eeee41 2602 spin_unlock_irq(&mdev->tconn->req_lock);
99432fcc 2603
b411b363
PR
2604 if (mdev->sync_conf.al_extents < 7)
2605 mdev->sync_conf.al_extents = 127;
2606
2607 err:
2608 mutex_unlock(&mdev->md_io_mutex);
2609 put_ldev(mdev);
2610
2611 return rv;
2612}
2613
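/*
 * Sketch of an assumed caller (placeholders only; the real call site lives
 * in the netlink/config code): the attach path is expected to consume the
 * return value roughly as
 *
 *	retcode = drbd_md_read(mdev, nbc);
 *	if (retcode != NO_ERROR)
 *		goto fail;
 *
 * where "nbc" stands for the not-yet-attached backing device.
 */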
2614/**
2615 * drbd_md_mark_dirty() - Mark meta data super block as dirty
2616 * @mdev: DRBD device.
2617 *
2618 * Call this function if you change anything that should be written to
2619 * the meta-data super block. This function sets MD_DIRTY, and starts a
2620 * timer that ensures drbd_md_sync() gets called within five seconds at the latest.
2621 */
ca0e6098 2622#ifdef DEBUG
ee15b038
LE
2623void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
2624{
2625 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
2626 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
2627 mdev->last_md_mark_dirty.line = line;
2628 mdev->last_md_mark_dirty.func = func;
2629 }
2630}
2631#else
b411b363
PR
2632void drbd_md_mark_dirty(struct drbd_conf *mdev)
2633{
ee15b038 2634 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
ca0e6098 2635 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
b411b363 2636}
ee15b038 2637#endif
b411b363
PR
2638
2639static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
2640{
2641 int i;
2642
62b0da3a 2643 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
b411b363 2644 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
b411b363
PR
2645}
2646
2647void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
2648{
2649 if (idx == UI_CURRENT) {
2650 if (mdev->state.role == R_PRIMARY)
2651 val |= 1;
2652 else
2653 val &= ~((u64)1);
2654
2655 drbd_set_ed_uuid(mdev, val);
2656 }
2657
2658 mdev->ldev->md.uuid[idx] = val;
b411b363
PR
2659 drbd_md_mark_dirty(mdev);
2660}
2661
2662
2663void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
2664{
2665 if (mdev->ldev->md.uuid[idx]) {
2666 drbd_uuid_move_history(mdev);
2667 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
b411b363
PR
2668 }
2669 _drbd_uuid_set(mdev, idx, val);
2670}
2671
2672/**
2673 * drbd_uuid_new_current() - Creates a new current UUID
2674 * @mdev: DRBD device.
2675 *
2676 * Creates a new current UUID, and rotates the old current UUID into
2677 * the bitmap slot. Causes an incremental resync upon next connect.
2678 */
2679void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
2680{
2681 u64 val;
62b0da3a
LE
2682 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
2683
2684 if (bm_uuid)
2685 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 2686
b411b363 2687 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
b411b363
PR
2688
2689 get_random_bytes(&val, sizeof(u64));
2690 _drbd_uuid_set(mdev, UI_CURRENT, val);
62b0da3a 2691 drbd_print_uuids(mdev, "new current UUID");
aaa8e2b3
LE
2692 /* get it to stable storage _now_ */
2693 drbd_md_sync(mdev);
b411b363
PR
2694}
2695
2696void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
2697{
2698 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
2699 return;
2700
2701 if (val == 0) {
2702 drbd_uuid_move_history(mdev);
2703 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
2704 mdev->ldev->md.uuid[UI_BITMAP] = 0;
b411b363 2705 } else {
62b0da3a
LE
2706 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
2707 if (bm_uuid)
2708 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 2709
62b0da3a 2710 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
b411b363
PR
2711 }
2712 drbd_md_mark_dirty(mdev);
2713}
2714
2715/**
2716 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
2717 * @mdev: DRBD device.
2718 *
2719 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
2720 */
2721int drbd_bmio_set_n_write(struct drbd_conf *mdev)
2722{
2723 int rv = -EIO;
2724
2725 if (get_ldev_if_state(mdev, D_ATTACHING)) {
2726 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
2727 drbd_md_sync(mdev);
2728 drbd_bm_set_all(mdev);
2729
2730 rv = drbd_bm_write(mdev);
2731
2732 if (!rv) {
2733 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2734 drbd_md_sync(mdev);
2735 }
2736
2737 put_ldev(mdev);
2738 }
2739
2740 return rv;
2741}
2742
2743/**
2744 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
2745 * @mdev: DRBD device.
2746 *
2747 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
2748 */
2749int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
2750{
2751 int rv = -EIO;
2752
0778286a 2753 drbd_resume_al(mdev);
b411b363
PR
2754 if (get_ldev_if_state(mdev, D_ATTACHING)) {
2755 drbd_bm_clear_all(mdev);
2756 rv = drbd_bm_write(mdev);
2757 put_ldev(mdev);
2758 }
2759
2760 return rv;
2761}
2762
2763static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
2764{
2765 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
02851e9f 2766 int rv = -EIO;
b411b363
PR
2767
2768 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
2769
02851e9f 2770 if (get_ldev(mdev)) {
20ceb2b2 2771 drbd_bm_lock(mdev, work->why, work->flags);
02851e9f
LE
2772 rv = work->io_fn(mdev);
2773 drbd_bm_unlock(mdev);
2774 put_ldev(mdev);
2775 }
b411b363
PR
2776
2777 clear_bit(BITMAP_IO, &mdev->flags);
127b3178 2778 smp_mb__after_clear_bit();
b411b363
PR
2779 wake_up(&mdev->misc_wait);
2780
2781 if (work->done)
2782 work->done(mdev, rv);
2783
2784 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
2785 work->why = NULL;
20ceb2b2 2786 work->flags = 0;
b411b363
PR
2787
2788 return 1;
2789}
2790
82f59cc6
LE
2791void drbd_ldev_destroy(struct drbd_conf *mdev)
2792{
2793 lc_destroy(mdev->resync);
2794 mdev->resync = NULL;
2795 lc_destroy(mdev->act_log);
2796 mdev->act_log = NULL;
2797 __no_warn(local,
2798 drbd_free_bc(mdev->ldev);
2799 mdev->ldev = NULL;);
2800
2801 if (mdev->md_io_tmpp) {
2802 __free_page(mdev->md_io_tmpp);
2803 mdev->md_io_tmpp = NULL;
2804 }
2805 clear_bit(GO_DISKLESS, &mdev->flags);
2806}
2807
e9e6f3ec
LE
2808static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
2809{
2810 D_ASSERT(mdev->state.disk == D_FAILED);
9d282875
LE
2811 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
2812 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
82f59cc6
LE
2813 * the protected members anymore, though, so once put_ldev reaches zero
2814 * again, it will be safe to free them. */
e9e6f3ec 2815 drbd_force_state(mdev, NS(disk, D_DISKLESS));
e9e6f3ec
LE
2816 return 1;
2817}
2818
2819void drbd_go_diskless(struct drbd_conf *mdev)
2820{
2821 D_ASSERT(mdev->state.disk == D_FAILED);
2822 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
e42325a5 2823 drbd_queue_work(&mdev->tconn->data.work, &mdev->go_diskless);
e9e6f3ec
LE
2824}
2825
b411b363
PR
2826/**
2827 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
2828 * @mdev: DRBD device.
2829 * @io_fn: IO callback to be called when bitmap IO is possible
2830 * @done: callback to be called after the bitmap IO was performed
2831 * @why: Descriptive text of the reason for doing the IO
 * @flags: Bitmap locking flags (enum bm_flag), passed through to drbd_bm_lock()
2832 *
2833 * While IO on the bitmap happens we freeze application IO, thus ensuring
2834 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
2835 * called from worker context. It MUST NOT be used while a previous such
2836 * work is still pending!
2837 */
2838void drbd_queue_bitmap_io(struct drbd_conf *mdev,
2839 int (*io_fn)(struct drbd_conf *),
2840 void (*done)(struct drbd_conf *, int),
20ceb2b2 2841 char *why, enum bm_flag flags)
b411b363 2842{
e6b3ea83 2843 D_ASSERT(current == mdev->tconn->worker.task);
b411b363
PR
2844
2845 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
2846 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
2847 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
2848 if (mdev->bm_io_work.why)
2849 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
2850 why, mdev->bm_io_work.why);
2851
2852 mdev->bm_io_work.io_fn = io_fn;
2853 mdev->bm_io_work.done = done;
2854 mdev->bm_io_work.why = why;
20ceb2b2 2855 mdev->bm_io_work.flags = flags;
b411b363 2856
87eeee41 2857 spin_lock_irq(&mdev->tconn->req_lock);
b411b363
PR
2858 set_bit(BITMAP_IO, &mdev->flags);
2859 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
127b3178 2860 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
e42325a5 2861 drbd_queue_work(&mdev->tconn->data.work, &mdev->bm_io_work.w);
b411b363 2862 }
87eeee41 2863 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
2864}
2865
2866/**
2867 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
2868 * @mdev: DRBD device.
2869 * @io_fn: IO callback to be called when bitmap IO is possible
2870 * @why: Descriptive text of the reason for doing the IO
 * @flags: Bitmap locking flags (enum bm_flag), passed through to drbd_bm_lock()
2871 *
2872 * Freezes application IO while the actual IO operation runs. This
2873 * function MAY NOT be called from worker context.
2874 */
20ceb2b2
LE
2875int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
2876 char *why, enum bm_flag flags)
b411b363
PR
2877{
2878 int rv;
2879
e6b3ea83 2880 D_ASSERT(current != mdev->tconn->worker.task);
b411b363 2881
20ceb2b2
LE
2882 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
2883 drbd_suspend_io(mdev);
b411b363 2884
20ceb2b2 2885 drbd_bm_lock(mdev, why, flags);
b411b363
PR
2886 rv = io_fn(mdev);
2887 drbd_bm_unlock(mdev);
2888
20ceb2b2
LE
2889 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
2890 drbd_resume_io(mdev);
b411b363
PR
2891
2892 return rv;
2893}
2894
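/*
 * Illustration only (the "why" string and flag choice are assumptions): a
 * caller that wants to force a full resync could combine this with the
 * helper defined above, e.g.
 *
 *	rv = drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			    "set_n_write", BM_LOCKED_SET_ALLOWED);
 *
 * BM_LOCKED_SET_ALLOWED is the flag this function itself tests above.
 */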
2895void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
2896{
2897 if ((mdev->ldev->md.flags & flag) != flag) {
2898 drbd_md_mark_dirty(mdev);
2899 mdev->ldev->md.flags |= flag;
2900 }
2901}
2902
2903void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
2904{
2905 if ((mdev->ldev->md.flags & flag) != 0) {
2906 drbd_md_mark_dirty(mdev);
2907 mdev->ldev->md.flags &= ~flag;
2908 }
2909}
2910int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
2911{
2912 return (bdev->md.flags & flag) != 0;
2913}
2914
2915static void md_sync_timer_fn(unsigned long data)
2916{
2917 struct drbd_conf *mdev = (struct drbd_conf *) data;
2918
e42325a5 2919 drbd_queue_work_front(&mdev->tconn->data.work, &mdev->md_sync_work);
b411b363
PR
2920}
2921
2922static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
2923{
2924 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
ee15b038
LE
2925#ifdef DEBUG
2926 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
2927 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
2928#endif
b411b363 2929 drbd_md_sync(mdev);
b411b363
PR
2930 return 1;
2931}
2932
d8763023 2933const char *cmdname(enum drbd_packet cmd)
f2ad9063
AG
2934{
2935 /* THINK may need to become several global tables
2936 * when we want to support more than
2937 * one PRO_VERSION */
2938 static const char *cmdnames[] = {
2939 [P_DATA] = "Data",
2940 [P_DATA_REPLY] = "DataReply",
2941 [P_RS_DATA_REPLY] = "RSDataReply",
2942 [P_BARRIER] = "Barrier",
2943 [P_BITMAP] = "ReportBitMap",
2944 [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
2945 [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
2946 [P_UNPLUG_REMOTE] = "UnplugRemote",
2947 [P_DATA_REQUEST] = "DataRequest",
2948 [P_RS_DATA_REQUEST] = "RSDataRequest",
2949 [P_SYNC_PARAM] = "SyncParam",
2950 [P_SYNC_PARAM89] = "SyncParam89",
2951 [P_PROTOCOL] = "ReportProtocol",
2952 [P_UUIDS] = "ReportUUIDs",
2953 [P_SIZES] = "ReportSizes",
2954 [P_STATE] = "ReportState",
2955 [P_SYNC_UUID] = "ReportSyncUUID",
2956 [P_AUTH_CHALLENGE] = "AuthChallenge",
2957 [P_AUTH_RESPONSE] = "AuthResponse",
2958 [P_PING] = "Ping",
2959 [P_PING_ACK] = "PingAck",
2960 [P_RECV_ACK] = "RecvAck",
2961 [P_WRITE_ACK] = "WriteAck",
2962 [P_RS_WRITE_ACK] = "RSWriteAck",
2963 [P_DISCARD_ACK] = "DiscardAck",
2964 [P_NEG_ACK] = "NegAck",
2965 [P_NEG_DREPLY] = "NegDReply",
2966 [P_NEG_RS_DREPLY] = "NegRSDReply",
2967 [P_BARRIER_ACK] = "BarrierAck",
2968 [P_STATE_CHG_REQ] = "StateChgRequest",
2969 [P_STATE_CHG_REPLY] = "StateChgReply",
2970 [P_OV_REQUEST] = "OVRequest",
2971 [P_OV_REPLY] = "OVReply",
2972 [P_OV_RESULT] = "OVResult",
2973 [P_CSUM_RS_REQUEST] = "CsumRSRequest",
2974 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
2975 [P_COMPRESSED_BITMAP] = "CBitmap",
2976 [P_DELAY_PROBE] = "DelayProbe",
2977 [P_OUT_OF_SYNC] = "OutOfSync",
2978 [P_MAX_CMD] = NULL,
2979 };
2980
2981 if (cmd == P_HAND_SHAKE_M)
2982 return "HandShakeM";
2983 if (cmd == P_HAND_SHAKE_S)
2984 return "HandShakeS";
2985 if (cmd == P_HAND_SHAKE)
2986 return "HandShake";
2987 if (cmd >= P_MAX_CMD)
2988 return "Unknown";
2989 return cmdnames[cmd];
2990}
2991
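/*
 * Illustration (assumed call site): cmdname() is meant for log messages on
 * the receive/ack paths, along the lines of
 *
 *	dev_warn(DEV, "unexpected packet %s\n", cmdname(cmd));
 *
 * dev_warn(DEV, ...) being the logging style used throughout this file.
 */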
b411b363
PR
2992#ifdef CONFIG_DRBD_FAULT_INJECTION
2993/* Fault insertion support including random number generator shamelessly
2994 * stolen from kernel/rcutorture.c */
2995struct fault_random_state {
2996 unsigned long state;
2997 unsigned long count;
2998};
2999
3000#define FAULT_RANDOM_MULT 39916801 /* prime */
3001#define FAULT_RANDOM_ADD 479001701 /* prime */
3002#define FAULT_RANDOM_REFRESH 10000
3003
3004/*
3005 * Crude but fast random-number generator. Uses a linear congruential
3006 * generator, with occasional help from get_random_bytes().
3007 */
3008static unsigned long
3009_drbd_fault_random(struct fault_random_state *rsp)
3010{
3011 long refresh;
3012
49829ea7 3013 if (!rsp->count--) {
b411b363
PR
3014 get_random_bytes(&refresh, sizeof(refresh));
3015 rsp->state += refresh;
3016 rsp->count = FAULT_RANDOM_REFRESH;
3017 }
3018 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3019 return swahw32(rsp->state);
3020}
3021
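/*
 * The block below is a stand-alone user-space rendition of the generator
 * above (minus the swahw32() halfword swap and the periodic
 * get_random_bytes() reseed); illustration only, compiled out via #if 0
 * and not part of DRBD.
 */
#if 0
#include <stdio.h>

static unsigned long fault_random_next(unsigned long *state)
{
	/* same multiplier/addend as FAULT_RANDOM_MULT / FAULT_RANDOM_ADD */
	*state = *state * 39916801UL + 479001701UL;
	return *state;
}

int main(void)
{
	unsigned long s = 1;
	int i;

	/* print five samples mapped to 1..100, as in the fault_rate check */
	for (i = 0; i < 5; i++)
		printf("%lu\n", fault_random_next(&s) % 100 + 1);
	return 0;
}
#endif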
3022static char *
3023_drbd_fault_str(unsigned int type) {
3024 static char *_faults[] = {
3025 [DRBD_FAULT_MD_WR] = "Meta-data write",
3026 [DRBD_FAULT_MD_RD] = "Meta-data read",
3027 [DRBD_FAULT_RS_WR] = "Resync write",
3028 [DRBD_FAULT_RS_RD] = "Resync read",
3029 [DRBD_FAULT_DT_WR] = "Data write",
3030 [DRBD_FAULT_DT_RD] = "Data read",
3031 [DRBD_FAULT_DT_RA] = "Data read ahead",
3032 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
6b4388ac
PR
3033 [DRBD_FAULT_AL_EE] = "EE allocation",
3034 [DRBD_FAULT_RECEIVE] = "receive data corruption",
b411b363
PR
3035 };
3036
3037 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3038}
3039
3040unsigned int
3041_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3042{
3043 static struct fault_random_state rrs = {0, 0};
3044
3045 unsigned int ret = (
3046 (fault_devs == 0 ||
3047 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3048 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3049
3050 if (ret) {
3051 fault_count++;
3052
7383506c 3053 if (__ratelimit(&drbd_ratelimit_state))
b411b363
PR
3054 dev_warn(DEV, "***Simulating %s failure\n",
3055 _drbd_fault_str(type));
3056 }
3057
3058 return ret;
3059}
3060#endif
3061
3062const char *drbd_buildtag(void)
3063{
3064	/* DRBD built from external sources has a reference here to the
3065 git hash of the source code. */
3066
3067 static char buildtag[38] = "\0uilt-in";
3068
3069 if (buildtag[0] == 0) {
3070#ifdef CONFIG_MODULES
3071 if (THIS_MODULE != NULL)
3072 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3073 else
3074#endif
3075 buildtag[0] = 'b';
3076 }
3077
3078 return buildtag;
3079}
3080
3081module_init(drbd_init)
3082module_exit(drbd_cleanup)
3083
b411b363
PR
3084EXPORT_SYMBOL(drbd_conn_str);
3085EXPORT_SYMBOL(drbd_role_str);
3086EXPORT_SYMBOL(drbd_disk_str);
3087EXPORT_SYMBOL(drbd_set_st_err_str);