drbd: Improve the "unexpected packet" error messages
[deliverable/linux.git] / drivers / block / drbd / drbd_main.c
1/*
2 drbd.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
12
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
16 any later version.
17
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27 */
28
29#include <linux/module.h>
30#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
35#include <linux/mutex.h>
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
b411b363
PR
55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
59static DEFINE_MUTEX(drbd_main_mutex);
60int drbdd_init(struct drbd_thread *);
61int drbd_worker(struct drbd_thread *);
62int drbd_asender(struct drbd_thread *);
63
64int drbd_init(void);
65static int drbd_open(struct block_device *bdev, fmode_t mode);
66static int drbd_release(struct gendisk *gd, fmode_t mode);
67static int w_md_sync(struct drbd_work *w, int unused);
68static void md_sync_timer_fn(unsigned long data);
69static int w_bitmap_io(struct drbd_work *w, int unused);
70static int w_go_diskless(struct drbd_work *w, int unused);
71
72MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
73 "Lars Ellenberg <lars@linbit.com>");
74MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
75MODULE_VERSION(REL_VERSION);
76MODULE_LICENSE("GPL");
77MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
78 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
79MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
80
81#include <linux/moduleparam.h>
82/* allow_open_on_secondary */
83MODULE_PARM_DESC(allow_oos, "DONT USE!");
84/* thanks to these macros, if compiled into the kernel (not-module),
85 * this becomes the boot parameter drbd.minor_count */
86module_param(minor_count, uint, 0444);
87module_param(disable_sendpage, bool, 0644);
88module_param(allow_oos, bool, 0);
89module_param(proc_details, int, 0644);
90
91#ifdef CONFIG_DRBD_FAULT_INJECTION
92int enable_faults;
93int fault_rate;
94static int fault_count;
95int fault_devs;
96/* bitmap of enabled faults */
97module_param(enable_faults, int, 0664);
98/* fault rate % value - applies to all enabled faults */
99module_param(fault_rate, int, 0664);
100/* count of faults inserted */
101module_param(fault_count, int, 0664);
102/* bitmap of devices to insert faults on */
103module_param(fault_devs, int, 0644);
104#endif
105
106/* module parameter, defined */
107unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
108int disable_sendpage;
109int allow_oos;
110int proc_details; /* Detail level in /proc/drbd */
111
112/* Module parameter for setting the user mode helper program
113 * to run. Default is /sbin/drbdadm */
114char usermode_helper[80] = "/sbin/drbdadm";
115
116module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
117
118/* in 2.6.x, our device mapping and config info contains our virtual gendisks
119 * as member "struct gendisk *vdisk;"
120 */
121struct idr minors;
122struct list_head drbd_tconns; /* list of struct drbd_tconn */
123
124struct kmem_cache *drbd_request_cache;
125struct kmem_cache *drbd_ee_cache; /* peer requests */
126struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
127struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
128mempool_t *drbd_request_mempool;
129mempool_t *drbd_ee_mempool;
130mempool_t *drbd_md_io_page_pool;
131struct bio_set *drbd_md_io_bio_set;
132
133/* I do not use a standard mempool, because:
134 1) I want to hand out the pre-allocated objects first.
135 2) I want to be able to interrupt sleeping allocation with a signal.
136 Note: This is a singly linked list; the next pointer is the private
137 member of struct page.
138 */
139struct page *drbd_pp_pool;
140spinlock_t drbd_pp_lock;
141int drbd_pp_vacant;
142wait_queue_head_t drbd_pp_wait;
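/*
 * Illustration only (added; not part of the original source): because the
 * chain runs through page_private(), popping one page from the pool looks
 * roughly like this (the real allocation helpers live elsewhere in the driver):
 *
 *	spin_lock(&drbd_pp_lock);
 *	page = drbd_pp_pool;
 *	if (page) {
 *		drbd_pp_pool = (struct page *)page_private(page);
 *		set_page_private(page, 0);
 *		drbd_pp_vacant--;
 *	}
 *	spin_unlock(&drbd_pp_lock);
 */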
143
144DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
145
146static const struct block_device_operations drbd_ops = {
147 .owner = THIS_MODULE,
148 .open = drbd_open,
149 .release = drbd_release,
150};
151
152static void bio_destructor_drbd(struct bio *bio)
153{
154 bio_free(bio, drbd_md_io_bio_set);
155}
156
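/*
 * Added note (not in the original source): bio_alloc_drbd() returns a
 * single-vector bio for meta data I/O.  Early during module init, before
 * drbd_md_io_bio_set exists, it falls back to plain bio_alloc(); afterwards
 * it allocates from the dedicated bio_set so meta data I/O keeps making
 * progress under memory pressure.
 */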
157struct bio *bio_alloc_drbd(gfp_t gfp_mask)
158{
159 struct bio *bio;
160
161 if (!drbd_md_io_bio_set)
162 return bio_alloc(gfp_mask, 1);
163
164 bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
165 if (!bio)
166 return NULL;
167 bio->bi_destructor = bio_destructor_drbd;
168 return bio;
169}
170
171#ifdef __CHECKER__
172/* When checking with sparse, and this is an inline function, sparse will
173 give tons of false positives. When this is a real function, sparse works.
174 */
175int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
176{
177 int io_allowed;
178
179 atomic_inc(&mdev->local_cnt);
180 io_allowed = (mdev->state.disk >= mins);
181 if (!io_allowed) {
182 if (atomic_dec_and_test(&mdev->local_cnt))
183 wake_up(&mdev->misc_wait);
184 }
185 return io_allowed;
186}
187
188#endif
189
190/**
191 * DOC: The transfer log
192 *
193 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
194 * mdev->tconn->newest_tle points to the head, mdev->tconn->oldest_tle points to the tail
195 * of the list. There is always at least one &struct drbd_tl_epoch object.
196 *
197 * Each &struct drbd_tl_epoch has a circular, doubly linked list of requests
198 * attached.
199 */
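/*
 * Illustration (added; not from the original source):
 *
 *	oldest_tle -> [epoch] -> [epoch] -> ... -> [epoch] <- newest_tle
 *	                 |          |                 |
 *	             requests    requests          requests
 *
 * A barrier ack from the peer retires the oldest epoch (tl_release());
 * new write requests are queued into the newest one.
 */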
200static int tl_init(struct drbd_tconn *tconn)
201{
202 struct drbd_tl_epoch *b;
203
204 /* during device minor initialization, we may well use GFP_KERNEL */
205 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
206 if (!b)
207 return 0;
208 INIT_LIST_HEAD(&b->requests);
209 INIT_LIST_HEAD(&b->w.list);
210 b->next = NULL;
211 b->br_number = 4711;
212 b->n_writes = 0;
213 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
214
215 tconn->oldest_tle = b;
216 tconn->newest_tle = b;
217 INIT_LIST_HEAD(&tconn->out_of_sequence_requests);
218
219 return 1;
220}
221
222static void tl_cleanup(struct drbd_tconn *tconn)
223{
224 if (tconn->oldest_tle != tconn->newest_tle)
225 conn_err(tconn, "ASSERT FAILED: oldest_tle == newest_tle\n");
226 if (!list_empty(&tconn->out_of_sequence_requests))
227 conn_err(tconn, "ASSERT FAILED: list_empty(out_of_sequence_requests)\n");
228 kfree(tconn->oldest_tle);
229 tconn->oldest_tle = NULL;
230 kfree(tconn->unused_spare_tle);
231 tconn->unused_spare_tle = NULL;
232}
233
234/**
235 * _tl_add_barrier() - Adds a barrier to the transfer log
236 * @tconn: DRBD connection.
237 * @new: Barrier to be added before the current head of the TL.
238 *
239 * The caller must hold the req_lock.
240 */
241void _tl_add_barrier(struct drbd_tconn *tconn, struct drbd_tl_epoch *new)
242{
243 struct drbd_tl_epoch *newest_before;
244
245 INIT_LIST_HEAD(&new->requests);
246 INIT_LIST_HEAD(&new->w.list);
247 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
248 new->next = NULL;
249 new->n_writes = 0;
250
251 newest_before = tconn->newest_tle;
252 /* never send a barrier number == 0, because that is special-cased
253 * when using TCQ for our write ordering code */
254 new->br_number = (newest_before->br_number+1) ?: 1;
255 if (tconn->newest_tle != new) {
256 tconn->newest_tle->next = new;
257 tconn->newest_tle = new;
258 }
259}
260
261/**
262 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
263 * @tconn: DRBD connection.
264 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
265 * @set_size: Expected number of requests before that barrier.
266 *
267 * In case the passed barrier_nr or set_size does not match the oldest
268 * &struct drbd_tl_epoch objects this function will cause a termination
269 * of the connection.
270 */
271void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr,
272 unsigned int set_size)
273{
274 struct drbd_conf *mdev;
275 struct drbd_tl_epoch *b, *nob; /* next old barrier */
276 struct list_head *le, *tle;
277 struct drbd_request *r;
278
279 spin_lock_irq(&tconn->req_lock);
280
281 b = tconn->oldest_tle;
282
283 /* first some paranoia code */
284 if (b == NULL) {
285 conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
286 barrier_nr);
287 goto bail;
288 }
289 if (b->br_number != barrier_nr) {
290 conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n",
291 barrier_nr, b->br_number);
292 goto bail;
293 }
294 if (b->n_writes != set_size) {
295 conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
296 barrier_nr, set_size, b->n_writes);
297 goto bail;
298 }
299
300 /* Clean up list of requests processed during current epoch */
301 list_for_each_safe(le, tle, &b->requests) {
302 r = list_entry(le, struct drbd_request, tl_requests);
303 _req_mod(r, BARRIER_ACKED);
304 }
305 /* There could be requests on the list waiting for completion
306 of the write to the local disk. To avoid corruption of
307 slab's data structures we have to remove the list's head.
308
309 Also there could have been a barrier ack out of sequence, overtaking
310 the write acks - which would be a bug and violating write ordering.
311 To not deadlock in case we lose connection while such requests are
312 still pending, we need some way to find them for the
313 _req_mod(CONNECTION_LOST_WHILE_PENDING).
314
315 These have been list_move'd to the out_of_sequence_requests list in
316 _req_mod(, BARRIER_ACKED) above.
317 */
318 list_del_init(&b->requests);
319 mdev = b->w.mdev;
320
321 nob = b->next;
322 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
323 _tl_add_barrier(tconn, b);
324 if (nob)
325 tconn->oldest_tle = nob;
326 /* if nob == NULL b was the only barrier, and becomes the new
327 barrier. Therefore tconn->oldest_tle already points to b */
328 } else {
329 D_ASSERT(nob != NULL);
330 tconn->oldest_tle = nob;
331 kfree(b);
332 }
333
334 spin_unlock_irq(&tconn->req_lock);
335 dec_ap_pending(mdev);
336
337 return;
338
339bail:
340 spin_unlock_irq(&tconn->req_lock);
341 conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
342}
343
345/**
346 * _tl_restart() - Walks the transfer log, and applies an action to all requests
347 * @tconn: DRBD connection.
348 * @what: The action/event to perform with all request objects
349 *
350 * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
351 * RESTART_FROZEN_DISK_IO.
352 */
353void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
354{
355 struct drbd_tl_epoch *b, *tmp, **pn;
356 struct list_head *le, *tle, carry_reads;
357 struct drbd_request *req;
358 int rv, n_writes, n_reads;
359
360 b = tconn->oldest_tle;
361 pn = &tconn->oldest_tle;
362 while (b) {
363 n_writes = 0;
364 n_reads = 0;
365 INIT_LIST_HEAD(&carry_reads);
366 list_for_each_safe(le, tle, &b->requests) {
367 req = list_entry(le, struct drbd_request, tl_requests);
368 rv = _req_mod(req, what);
369
370 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
371 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
372 }
373 tmp = b->next;
374
375 if (n_writes) {
376 if (what == RESEND) {
377 b->n_writes = n_writes;
378 if (b->w.cb == NULL) {
379 b->w.cb = w_send_barrier;
380 inc_ap_pending(b->w.mdev);
381 set_bit(CREATE_BARRIER, &b->w.mdev->flags);
382 }
383
384 drbd_queue_work(&tconn->data.work, &b->w);
385 }
386 pn = &b->next;
387 } else {
388 if (n_reads)
389 list_add(&carry_reads, &b->requests);
390 /* there could still be requests on that ring list,
391 * in case local io is still pending */
392 list_del(&b->requests);
393
394 /* dec_ap_pending corresponding to queue_barrier.
395 * the newest barrier may not have been queued yet,
396 * in which case w.cb is still NULL. */
397 if (b->w.cb != NULL)
398 dec_ap_pending(b->w.mdev);
399
400 if (b == tconn->newest_tle) {
401 /* recycle, but reinit! */
402 if (tmp != NULL)
403 conn_err(tconn, "ASSERT FAILED tmp == NULL");
404 INIT_LIST_HEAD(&b->requests);
405 list_splice(&carry_reads, &b->requests);
406 INIT_LIST_HEAD(&b->w.list);
407 b->w.cb = NULL;
408 b->br_number = net_random();
409 b->n_writes = 0;
410
411 *pn = b;
412 break;
413 }
414 *pn = tmp;
415 kfree(b);
416 }
417 b = tmp;
418 list_splice(&carry_reads, &b->requests);
419 }
420}
421
422
423/**
424 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
425 * @tconn: DRBD connection.
426 *
427 * This is called after the connection to the peer was lost. The storage covered
428 * by the requests on the transfer log gets marked as out of sync. Called from the
429 * receiver thread and the worker thread.
430 */
431void tl_clear(struct drbd_tconn *tconn)
432{
433 struct drbd_conf *mdev;
434 struct list_head *le, *tle;
435 struct drbd_request *r;
436 int vnr;
437
438 spin_lock_irq(&tconn->req_lock);
439
440 _tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING);
441
442 /* we expect this list to be empty. */
443 if (!list_empty(&tconn->out_of_sequence_requests))
444 conn_err(tconn, "ASSERT FAILED list_empty(&out_of_sequence_requests)\n");
445
446 /* but just in case, clean it up anyways! */
447 list_for_each_safe(le, tle, &tconn->out_of_sequence_requests) {
448 r = list_entry(le, struct drbd_request, tl_requests);
449 /* It would be nice to complete outside of spinlock.
450 * But this is easier for now. */
451 _req_mod(r, CONNECTION_LOST_WHILE_PENDING);
452 }
453
454 /* ensure bit indicating barrier is required is clear */
455 rcu_read_lock();
456 idr_for_each_entry(&tconn->volumes, mdev, vnr)
457 clear_bit(CREATE_BARRIER, &mdev->flags);
458 rcu_read_unlock();
459
460 spin_unlock_irq(&tconn->req_lock);
461}
462
463void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
464{
465 spin_lock_irq(&tconn->req_lock);
466 _tl_restart(tconn, what);
467 spin_unlock_irq(&tconn->req_lock);
468}
469
470static int drbd_thread_setup(void *arg)
471{
472 struct drbd_thread *thi = (struct drbd_thread *) arg;
473 struct drbd_tconn *tconn = thi->tconn;
474 unsigned long flags;
475 int retval;
476
477 snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
478 thi->name[0], thi->tconn->name);
479
480restart:
481 retval = thi->function(thi);
482
483 spin_lock_irqsave(&thi->t_lock, flags);
484
485 /* if the receiver has been "EXITING", the last thing it did
486 * was set the conn state to "StandAlone",
487 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
488 * and receiver thread will be "started".
489 * drbd_thread_start needs to set "RESTARTING" in that case.
490 * t_state check and assignment needs to be within the same spinlock,
491 * so either thread_start sees EXITING, and can remap to RESTARTING,
492 * or thread_start sees NONE, and can proceed as normal.
493 */
494
495 if (thi->t_state == RESTARTING) {
496 conn_info(tconn, "Restarting %s thread\n", thi->name);
497 thi->t_state = RUNNING;
498 spin_unlock_irqrestore(&thi->t_lock, flags);
499 goto restart;
500 }
501
502 thi->task = NULL;
503 thi->t_state = NONE;
504 smp_mb();
505 complete_all(&thi->stop);
506 spin_unlock_irqrestore(&thi->t_lock, flags);
507
508 conn_info(tconn, "Terminating %s\n", current->comm);
509
510 /* Release mod reference taken when thread was started */
511
512 kref_put(&tconn->kref, &conn_destroy);
513 module_put(THIS_MODULE);
514 return retval;
515}
516
517static void drbd_thread_init(struct drbd_tconn *tconn, struct drbd_thread *thi,
518 int (*func) (struct drbd_thread *), char *name)
519{
520 spin_lock_init(&thi->t_lock);
521 thi->task = NULL;
522 thi->t_state = NONE;
523 thi->function = func;
524 thi->tconn = tconn;
525 strncpy(thi->name, name, ARRAY_SIZE(thi->name));
526}
527
528int drbd_thread_start(struct drbd_thread *thi)
529{
530 struct drbd_tconn *tconn = thi->tconn;
531 struct task_struct *nt;
532 unsigned long flags;
533
534 /* is used from state engine doing drbd_thread_stop_nowait,
535 * while holding the req lock irqsave */
536 spin_lock_irqsave(&thi->t_lock, flags);
537
538 switch (thi->t_state) {
539 case NONE:
540 conn_info(tconn, "Starting %s thread (from %s [%d])\n",
541 thi->name, current->comm, current->pid);
542
543 /* Get ref on module for thread - this is released when thread exits */
544 if (!try_module_get(THIS_MODULE)) {
545 conn_err(tconn, "Failed to get module reference in drbd_thread_start\n");
546 spin_unlock_irqrestore(&thi->t_lock, flags);
547 return false;
548 }
549
550 kref_get(&thi->tconn->kref);
551
552 init_completion(&thi->stop);
553 thi->reset_cpu_mask = 1;
554 thi->t_state = RUNNING;
555 spin_unlock_irqrestore(&thi->t_lock, flags);
556 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
557
558 nt = kthread_create(drbd_thread_setup, (void *) thi,
559 "drbd_%c_%s", thi->name[0], thi->tconn->name);
560
561 if (IS_ERR(nt)) {
562 conn_err(tconn, "Couldn't start thread\n");
563
564 kref_put(&tconn->kref, &conn_destroy);
565 module_put(THIS_MODULE);
566 return false;
567 }
568 spin_lock_irqsave(&thi->t_lock, flags);
569 thi->task = nt;
570 thi->t_state = RUNNING;
571 spin_unlock_irqrestore(&thi->t_lock, flags);
572 wake_up_process(nt);
573 break;
574 case EXITING:
575 thi->t_state = RESTARTING;
576 conn_info(tconn, "Restarting %s thread (from %s [%d])\n",
577 thi->name, current->comm, current->pid);
578 /* fall through */
579 case RUNNING:
580 case RESTARTING:
581 default:
582 spin_unlock_irqrestore(&thi->t_lock, flags);
583 break;
584 }
585
586 return true;
587}
588
589
590void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
591{
592 unsigned long flags;
593
594 enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
595
596 /* may be called from state engine, holding the req lock irqsave */
597 spin_lock_irqsave(&thi->t_lock, flags);
598
599 if (thi->t_state == NONE) {
600 spin_unlock_irqrestore(&thi->t_lock, flags);
601 if (restart)
602 drbd_thread_start(thi);
603 return;
604 }
605
606 if (thi->t_state != ns) {
607 if (thi->task == NULL) {
608 spin_unlock_irqrestore(&thi->t_lock, flags);
609 return;
610 }
611
612 thi->t_state = ns;
613 smp_mb();
614 init_completion(&thi->stop);
615 if (thi->task != current)
616 force_sig(DRBD_SIGKILL, thi->task);
617 }
618
619 spin_unlock_irqrestore(&thi->t_lock, flags);
620
621 if (wait)
622 wait_for_completion(&thi->stop);
623}
624
625static struct drbd_thread *drbd_task_to_thread(struct drbd_tconn *tconn, struct task_struct *task)
626{
627 struct drbd_thread *thi =
628 task == tconn->receiver.task ? &tconn->receiver :
629 task == tconn->asender.task ? &tconn->asender :
630 task == tconn->worker.task ? &tconn->worker : NULL;
631
632 return thi;
633}
634
635char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task)
636{
637 struct drbd_thread *thi = drbd_task_to_thread(tconn, task);
638 return thi ? thi->name : task->comm;
639}
640
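/*
 * Added note (not in the original source): returns the minor number of this
 * connection's first volume (the lowest volume number in the idr), or -1 if
 * the connection currently has no volumes.  Used by drbd_calc_cpu_mask()
 * below to pick a CPU for the connection's threads.
 */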
641int conn_lowest_minor(struct drbd_tconn *tconn)
642{
643 struct drbd_conf *mdev;
644 int vnr = 0, m;
645
646 rcu_read_lock();
647 mdev = idr_get_next(&tconn->volumes, &vnr);
648 m = mdev ? mdev_to_minor(mdev) : -1;
649 rcu_read_unlock();
650
651 return m;
652}
653
654#ifdef CONFIG_SMP
655/**
656 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
657 * @tconn: DRBD connection.
658 *
659 * Forces all threads of a device onto the same CPU. This is beneficial for
660 * DRBD's performance. May be overwritten by user's configuration.
661 */
662void drbd_calc_cpu_mask(struct drbd_tconn *tconn)
663{
664 int ord, cpu;
665
666 /* user override. */
667 if (cpumask_weight(tconn->cpu_mask))
668 return;
669
670 ord = conn_lowest_minor(tconn) % cpumask_weight(cpu_online_mask);
671 for_each_online_cpu(cpu) {
672 if (ord-- == 0) {
673 cpumask_set_cpu(cpu, tconn->cpu_mask);
674 return;
675 }
676 }
677 /* should not be reached */
678 cpumask_setall(tconn->cpu_mask);
679}
680
681/**
682 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
684 * @thi: drbd_thread object
b411b363
PR
685 *
686 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
687 * prematurely.
688 */
689void drbd_thread_current_set_cpu(struct drbd_thread *thi)
690{
691 struct task_struct *p = current;
692
693 if (!thi->reset_cpu_mask)
694 return;
695 thi->reset_cpu_mask = 0;
696 set_cpus_allowed_ptr(p, thi->tconn->cpu_mask);
697}
698#endif
699
700/**
701 * drbd_header_size - size of a packet header
702 *
703 * The header size is a multiple of 8, so any payload following the header is
704 * word aligned on 64-bit architectures. (The bitmap send and receive code
705 * relies on this.)
706 */
707unsigned int drbd_header_size(struct drbd_tconn *tconn)
708{
709 if (tconn->agreed_pro_version >= 100) {
710 BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8));
711 return sizeof(struct p_header100);
712 } else {
713 BUILD_BUG_ON(sizeof(struct p_header80) !=
714 sizeof(struct p_header95));
715 BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8));
716 return sizeof(struct p_header80);
717 }
718}
719
720static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
721{
722 h->magic = cpu_to_be32(DRBD_MAGIC);
723 h->command = cpu_to_be16(cmd);
724 h->length = cpu_to_be16(size);
725 return sizeof(struct p_header80);
726}
727
728static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
729{
730 h->magic = cpu_to_be16(DRBD_MAGIC_BIG);
731 h->command = cpu_to_be16(cmd);
732 h->length = cpu_to_be32(size);
733 return sizeof(struct p_header95);
734}
735
736static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd,
737 int size, int vnr)
738{
739 h->magic = cpu_to_be32(DRBD_MAGIC_100);
740 h->volume = cpu_to_be16(vnr);
741 h->command = cpu_to_be16(cmd);
742 h->length = cpu_to_be32(size);
743 h->pad = 0;
744 return sizeof(struct p_header100);
745}
746
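/*
 * Added note (not in the original source): prepare_header() picks the header
 * layout matching the agreed protocol version.  The p_header95 layout is only
 * needed when the payload size does not fit the 16 bit length field of
 * p_header80; protocol 100 and later always use the p_header100 layout, which
 * also carries the volume number.
 */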
747static unsigned int prepare_header(struct drbd_tconn *tconn, int vnr,
748 void *buffer, enum drbd_packet cmd, int size)
749{
750 if (tconn->agreed_pro_version >= 100)
751 return prepare_header100(buffer, cmd, size, vnr);
752 else if (tconn->agreed_pro_version >= 95 &&
753 size > DRBD_MAX_SIZE_H80_PACKET)
754 return prepare_header95(buffer, cmd, size);
755 else
756 return prepare_header80(buffer, cmd, size);
757}
758
759static void *__conn_prepare_command(struct drbd_tconn *tconn,
760 struct drbd_socket *sock)
761{
762 if (!sock->socket)
763 return NULL;
764 return sock->sbuf + drbd_header_size(tconn);
765}
766
767void *conn_prepare_command(struct drbd_tconn *tconn, struct drbd_socket *sock)
768{
769 void *p;
770
dba58587 771 mutex_lock(&sock->mutex);
772 p = __conn_prepare_command(tconn, sock);
773 if (!p)
dba58587 774 mutex_unlock(&sock->mutex);
775
776 return p;
777}
778
779void *drbd_prepare_command(struct drbd_conf *mdev, struct drbd_socket *sock)
780{
781 return conn_prepare_command(mdev->tconn, sock);
782}
783
784static int __send_command(struct drbd_tconn *tconn, int vnr,
785 struct drbd_socket *sock, enum drbd_packet cmd,
786 unsigned int header_size, void *data,
787 unsigned int size)
788{
789 int msg_flags;
790 int err;
791
792 /*
793 * Called with @data == NULL and the size of the data blocks in @size
794 * for commands that send data blocks. For those commands, omit the
795 * MSG_MORE flag: this will increase the likelihood that data blocks
796 * which are page aligned on the sender will end up page aligned on the
797 * receiver.
798 */
799 msg_flags = data ? MSG_MORE : 0;
800
801 header_size += prepare_header(tconn, vnr, sock->sbuf, cmd,
802 header_size + size);
803 err = drbd_send_all(tconn, sock->socket, sock->sbuf, header_size,
804 msg_flags);
805 if (data && !err)
806 err = drbd_send_all(tconn, sock->socket, data, size, 0);
807 return err;
808}
809
810static int __conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock,
811 enum drbd_packet cmd, unsigned int header_size,
812 void *data, unsigned int size)
813{
814 return __send_command(tconn, 0, sock, cmd, header_size, data, size);
815}
816
817int conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock,
818 enum drbd_packet cmd, unsigned int header_size,
819 void *data, unsigned int size)
820{
821 int err;
822
823 err = __conn_send_command(tconn, sock, cmd, header_size, data, size);
824 mutex_unlock(&sock->mutex);
825 return err;
826}
827
828int drbd_send_command(struct drbd_conf *mdev, struct drbd_socket *sock,
829 enum drbd_packet cmd, unsigned int header_size,
830 void *data, unsigned int size)
831{
832 int err;
833
834 err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, header_size,
835 data, size);
836 mutex_unlock(&sock->mutex);
837 return err;
838}
839
840int drbd_send_ping(struct drbd_tconn *tconn)
841{
842 struct drbd_socket *sock;
843
844 sock = &tconn->meta;
845 if (!conn_prepare_command(tconn, sock))
846 return -EIO;
847 return conn_send_command(tconn, sock, P_PING, 0, NULL, 0);
848}
849
850int drbd_send_ping_ack(struct drbd_tconn *tconn)
851{
852 struct drbd_socket *sock;
853
854 sock = &tconn->meta;
855 if (!conn_prepare_command(tconn, sock))
856 return -EIO;
857 return conn_send_command(tconn, sock, P_PING_ACK, 0, NULL, 0);
858}
859
860int drbd_send_sync_param(struct drbd_conf *mdev)
861{
862 struct drbd_socket *sock;
863 struct p_rs_param_95 *p;
864 int size;
865 const int apv = mdev->tconn->agreed_pro_version;
866 enum drbd_packet cmd;
867 struct net_conf *nc;
868 struct disk_conf *dc;
869
870 sock = &mdev->tconn->data;
871 p = drbd_prepare_command(mdev, sock);
872 if (!p)
873 return -EIO;
b411b363 874
875 rcu_read_lock();
876 nc = rcu_dereference(mdev->tconn->net_conf);
877
878 size = apv <= 87 ? sizeof(struct p_rs_param)
879 : apv == 88 ? sizeof(struct p_rs_param)
880 + strlen(nc->verify_alg) + 1
881 : apv <= 94 ? sizeof(struct p_rs_param_89)
882 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
883
884 cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
885
886 /* initialize verify_alg and csums_alg */
887 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
b411b363 888
9f5bdc33 889 if (get_ldev(mdev)) {
daeda1cc 890 dc = rcu_dereference(mdev->ldev->disk_conf);
6394b935 891 p->resync_rate = cpu_to_be32(dc->resync_rate);
892 p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead);
893 p->c_delay_target = cpu_to_be32(dc->c_delay_target);
894 p->c_fill_target = cpu_to_be32(dc->c_fill_target);
895 p->c_max_rate = cpu_to_be32(dc->c_max_rate);
896 put_ldev(mdev);
897 } else {
6394b935 898 p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF);
899 p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
900 p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
901 p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
902 p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
903 }
b411b363 904
9f5bdc33 905 if (apv >= 88)
44ed167d 906 strcpy(p->verify_alg, nc->verify_alg);
9f5bdc33 907 if (apv >= 89)
908 strcpy(p->csums_alg, nc->csums_alg);
909 rcu_read_unlock();
b411b363 910
9f5bdc33 911 return drbd_send_command(mdev, sock, cmd, size, NULL, 0);
912}
913
d659f2aa 914int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd)
b411b363 915{
9f5bdc33 916 struct drbd_socket *sock;
b411b363 917 struct p_protocol *p;
44ed167d 918 struct net_conf *nc;
9f5bdc33 919 int size, cf;
b411b363 920
9f5bdc33 921 sock = &tconn->data;
a7eb7bdf 922 p = __conn_prepare_command(tconn, sock);
923 if (!p)
924 return -EIO;
925
926 rcu_read_lock();
927 nc = rcu_dereference(tconn->net_conf);
928
6dff2902 929 if (nc->tentative && tconn->agreed_pro_version < 92) {
930 rcu_read_unlock();
931 mutex_unlock(&sock->mutex);
932 conn_err(tconn, "--dry-run is not supported by peer");
933 return -EOPNOTSUPP;
934 }
935
9f5bdc33 936 size = sizeof(*p);
dc8228d1 937 if (tconn->agreed_pro_version >= 87)
44ed167d 938 size += strlen(nc->integrity_alg) + 1;
b411b363 939
940 p->protocol = cpu_to_be32(nc->wire_protocol);
941 p->after_sb_0p = cpu_to_be32(nc->after_sb_0p);
942 p->after_sb_1p = cpu_to_be32(nc->after_sb_1p);
943 p->after_sb_2p = cpu_to_be32(nc->after_sb_2p);
944 p->two_primaries = cpu_to_be32(nc->two_primaries);
cf14c2e9 945 cf = 0;
946 if (nc->discard_my_data)
947 cf |= CF_DISCARD_MY_DATA;
6dff2902 948 if (nc->tentative)
9f5bdc33 949 cf |= CF_DRY_RUN;
950 p->conn_flags = cpu_to_be32(cf);
951
dc8228d1 952 if (tconn->agreed_pro_version >= 87)
953 strcpy(p->integrity_alg, nc->integrity_alg);
954 rcu_read_unlock();
955
d659f2aa 956 return __conn_send_command(tconn, sock, cmd, size, NULL, 0);
957}
958
959int drbd_send_protocol(struct drbd_tconn *tconn)
960{
961 int err;
962
963 mutex_lock(&tconn->data.mutex);
d659f2aa 964 err = __drbd_send_protocol(tconn, P_PROTOCOL);
965 mutex_unlock(&tconn->data.mutex);
966
967 return err;
968}
969
970int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
971{
972 struct drbd_socket *sock;
973 struct p_uuids *p;
974 int i;
975
976 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2ae5f95b 977 return 0;
b411b363 978
979 sock = &mdev->tconn->data;
980 p = drbd_prepare_command(mdev, sock);
981 if (!p) {
982 put_ldev(mdev);
983 return -EIO;
984 }
b411b363 985 for (i = UI_CURRENT; i < UI_SIZE; i++)
9f5bdc33 986 p->uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
987
988 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
9f5bdc33 989 p->uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
44ed167d 990 rcu_read_lock();
6139f60d 991 uuid_flags |= rcu_dereference(mdev->tconn->net_conf)->discard_my_data ? 1 : 0;
44ed167d 992 rcu_read_unlock();
993 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
994 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
9f5bdc33 995 p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
996
997 put_ldev(mdev);
9f5bdc33 998 return drbd_send_command(mdev, sock, P_UUIDS, sizeof(*p), NULL, 0);
999}
1000
1001int drbd_send_uuids(struct drbd_conf *mdev)
1002{
1003 return _drbd_send_uuids(mdev, 0);
1004}
1005
1006int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1007{
1008 return _drbd_send_uuids(mdev, 8);
1009}
1010
1011void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
1012{
1013 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1014 u64 *uuid = mdev->ldev->md.uuid;
1015 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
1016 text,
1017 (unsigned long long)uuid[UI_CURRENT],
1018 (unsigned long long)uuid[UI_BITMAP],
1019 (unsigned long long)uuid[UI_HISTORY_START],
1020 (unsigned long long)uuid[UI_HISTORY_END]);
1021 put_ldev(mdev);
1022 } else {
1023 dev_info(DEV, "%s effective data uuid: %016llX\n",
1024 text,
1025 (unsigned long long)mdev->ed_uuid);
1026 }
1027}
1028
9c1b7f72 1029void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
b411b363 1030{
1031 struct drbd_socket *sock;
1032 struct p_rs_uuid *p;
1033 u64 uuid;
1034
1035 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
b411b363 1036
4a23f264 1037 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
5a22db89 1038 drbd_uuid_set(mdev, UI_BITMAP, uuid);
62b0da3a 1039 drbd_print_uuids(mdev, "updated sync UUID");
5a22db89 1040 drbd_md_sync(mdev);
b411b363 1041
1042 sock = &mdev->tconn->data;
1043 p = drbd_prepare_command(mdev, sock);
1044 if (p) {
1045 p->uuid = cpu_to_be64(uuid);
1046 drbd_send_command(mdev, sock, P_SYNC_UUID, sizeof(*p), NULL, 0);
1047 }
1048}
1049
e89b591c 1050int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
b411b363 1051{
1052 struct drbd_socket *sock;
1053 struct p_sizes *p;
b411b363 1054 sector_t d_size, u_size;
99432fcc 1055 int q_order_type, max_bio_size;
1056
1057 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1058 D_ASSERT(mdev->ldev->backing_bdev);
1059 d_size = drbd_get_max_capacity(mdev->ldev);
1060 rcu_read_lock();
1061 u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
1062 rcu_read_unlock();
b411b363 1063 q_order_type = drbd_queue_order_type(mdev);
1064 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
1065 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
1066 put_ldev(mdev);
1067 } else {
1068 d_size = 0;
1069 u_size = 0;
1070 q_order_type = QUEUE_ORDERED_NONE;
99432fcc 1071 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
1072 }
1073
1074 sock = &mdev->tconn->data;
1075 p = drbd_prepare_command(mdev, sock);
1076 if (!p)
1077 return -EIO;
1078
1079 if (mdev->tconn->agreed_pro_version <= 94)
1080 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
1081 else if (mdev->tconn->agreed_pro_version < 100)
1082 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE_P95);
1083
1084 p->d_size = cpu_to_be64(d_size);
1085 p->u_size = cpu_to_be64(u_size);
1086 p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1087 p->max_bio_size = cpu_to_be32(max_bio_size);
1088 p->queue_order_type = cpu_to_be16(q_order_type);
1089 p->dds_flags = cpu_to_be16(flags);
1090 return drbd_send_command(mdev, sock, P_SIZES, sizeof(*p), NULL, 0);
1091}
1092
1093/**
1094 * drbd_send_state() - Sends the drbd state to the peer
1095 * @mdev: DRBD device.
1096 */
1097int drbd_send_state(struct drbd_conf *mdev)
1098{
7c96715a 1099 struct drbd_socket *sock;
9f5bdc33 1100 struct p_state *p;
b411b363 1101
7c96715a 1102 sock = &mdev->tconn->data;
1103 p = drbd_prepare_command(mdev, sock);
1104 if (!p)
1105 return -EIO;
1106 p->state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1107 return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0);
1108}
b411b363 1109
1110int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val)
1111{
1112 struct drbd_socket *sock;
1113 struct p_req_state *p;
b411b363 1114
1115 sock = &mdev->tconn->data;
1116 p = drbd_prepare_command(mdev, sock);
1117 if (!p)
1118 return -EIO;
1119 p->mask = cpu_to_be32(mask.i);
1120 p->val = cpu_to_be32(val.i);
1121 return drbd_send_command(mdev, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0);
b411b363 1122
1123}
1124
9f5bdc33 1125int conn_send_state_req(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val)
b411b363 1126{
1127 enum drbd_packet cmd;
1128 struct drbd_socket *sock;
1129 struct p_req_state *p;
b411b363 1130
1131 cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ;
1132 sock = &tconn->data;
1133 p = conn_prepare_command(tconn, sock);
1134 if (!p)
1135 return -EIO;
1136 p->mask = cpu_to_be32(mask.i);
1137 p->val = cpu_to_be32(val.i);
1138 return conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0);
1139}
1140
2f4e7abe 1141void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
b411b363 1142{
1143 struct drbd_socket *sock;
1144 struct p_req_state_reply *p;
b411b363 1145
1146 sock = &mdev->tconn->meta;
1147 p = drbd_prepare_command(mdev, sock);
1148 if (p) {
1149 p->retcode = cpu_to_be32(retcode);
1150 drbd_send_command(mdev, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0);
1151 }
1152}
1153
9f5bdc33 1154void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode)
047cd4a6 1155{
1156 struct drbd_socket *sock;
1157 struct p_req_state_reply *p;
1158 enum drbd_packet cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;
1159
1160 sock = &tconn->meta;
1161 p = conn_prepare_command(tconn, sock);
1162 if (p) {
1163 p->retcode = cpu_to_be32(retcode);
1164 conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0);
1165 }
1166}
1167
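/*
 * Added note (not in the original source): the dcbp_* helpers below pack the
 * 'encoding' byte of a compressed bitmap packet: the low nibble holds the
 * encoding code, bits 4-6 the number of pad bits in the last byte, and bit 7
 * records whether the first run of the RLE stream describes set bits.
 */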
1168static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
1169{
1170 BUG_ON(code & ~0xf);
1171 p->encoding = (p->encoding & ~0xf) | code;
1172}
1173
1174static void dcbp_set_start(struct p_compressed_bm *p, int set)
1175{
1176 p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
1177}
1178
1179static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n)
1180{
1181 BUG_ON(n & ~0x7);
1182 p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
1183}
1184
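/*
 * Added note (not in the original source): fill_bitmap_rle_bits() encodes a
 * stretch of the bitmap as run lengths (distances between bit toggles),
 * VLI-compressed into p->code.  It returns the number of code bytes produced,
 * 0 if plain (uncompressed) transmission should be used instead, and -1 on
 * error.
 */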
1185int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1186 struct p_compressed_bm *p,
1187 unsigned int size,
1188 struct bm_xfer_ctx *c)
1189{
1190 struct bitstream bs;
1191 unsigned long plain_bits;
1192 unsigned long tmp;
1193 unsigned long rl;
1194 unsigned len;
1195 unsigned toggle;
1196 int bits, use_rle;
1197
1198 /* may we use this feature? */
1199 rcu_read_lock();
1200 use_rle = rcu_dereference(mdev->tconn->net_conf)->use_rle;
1201 rcu_read_unlock();
1202 if (!use_rle || mdev->tconn->agreed_pro_version < 90)
1203 return 0;
1204
1205 if (c->bit_offset >= c->bm_bits)
1206 return 0; /* nothing to do. */
1207
1208 /* use at most this many bytes */
1209 bitstream_init(&bs, p->code, size, 0);
1210 memset(p->code, 0, size);
1211 /* plain bits covered in this code string */
1212 plain_bits = 0;
1213
1214 /* p->encoding & 0x80 stores whether the first run length is set.
1215 * bit offset is implicit.
1216 * start with toggle == 2 to be able to tell the first iteration */
1217 toggle = 2;
1218
1219 /* see how much plain bits we can stuff into one packet
1220 * using RLE and VLI. */
1221 do {
1222 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1223 : _drbd_bm_find_next(mdev, c->bit_offset);
1224 if (tmp == -1UL)
1225 tmp = c->bm_bits;
1226 rl = tmp - c->bit_offset;
1227
1228 if (toggle == 2) { /* first iteration */
1229 if (rl == 0) {
1230 /* the first checked bit was set,
1231 * store start value, */
a02d1240 1232 dcbp_set_start(p, 1);
1233 /* but skip encoding of zero run length */
1234 toggle = !toggle;
1235 continue;
1236 }
a02d1240 1237 dcbp_set_start(p, 0);
1238 }
1239
1240 /* paranoia: catch zero runlength.
1241 * can only happen if bitmap is modified while we scan it. */
1242 if (rl == 0) {
1243 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
1244 "t:%u bo:%lu\n", toggle, c->bit_offset);
1245 return -1;
1246 }
1247
1248 bits = vli_encode_bits(&bs, rl);
1249 if (bits == -ENOBUFS) /* buffer full */
1250 break;
1251 if (bits <= 0) {
1252 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
1253 return 0;
1254 }
1255
1256 toggle = !toggle;
1257 plain_bits += rl;
1258 c->bit_offset = tmp;
1259 } while (c->bit_offset < c->bm_bits);
1260
1261 len = bs.cur.b - p->code + !!bs.cur.bit;
1262
1263 if (plain_bits < (len << 3)) {
1264 /* incompressible with this method.
1265 * we need to rewind both word and bit position. */
1266 c->bit_offset -= plain_bits;
1267 bm_xfer_ctx_bit_to_word_offset(c);
1268 c->bit_offset = c->word_offset * BITS_PER_LONG;
1269 return 0;
1270 }
1271
1272 /* RLE + VLI was able to compress it just fine.
1273 * update c->word_offset. */
1274 bm_xfer_ctx_bit_to_word_offset(c);
1275
1276 /* store pad_bits */
a02d1240 1277 dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1278
1279 return len;
1280}
1281
1282/**
1283 * send_bitmap_rle_or_plain
1284 *
1285 * Return 0 when done, 1 when another iteration is needed, and a negative error
1286 * code upon failure.
1287 */
1288static int
1289send_bitmap_rle_or_plain(struct drbd_conf *mdev, struct bm_xfer_ctx *c)
1290{
1291 struct drbd_socket *sock = &mdev->tconn->data;
1292 unsigned int header_size = drbd_header_size(mdev->tconn);
1293 struct p_compressed_bm *p = sock->sbuf + header_size;
1294 int len, err;
b411b363 1295
1296 len = fill_bitmap_rle_bits(mdev, p,
1297 DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c);
b411b363 1298 if (len < 0)
f70af118 1299 return -EIO;
1300
1301 if (len) {
a02d1240 1302 dcbp_set_code(p, RLE_VLI_Bits);
1303 err = __send_command(mdev->tconn, mdev->vnr, sock,
1304 P_COMPRESSED_BITMAP, sizeof(*p) + len,
1305 NULL, 0);
b411b363 1306 c->packets[0]++;
e658983a 1307 c->bytes[0] += header_size + sizeof(*p) + len;
1308
1309 if (c->bit_offset >= c->bm_bits)
1310 len = 0; /* DONE */
1311 } else {
1312 /* was not compressible.
1313 * send a buffer full of plain text bits instead. */
1314 unsigned int data_size;
1315 unsigned long num_words;
e658983a 1316 unsigned long *p = sock->sbuf + header_size;
1317
1318 data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
e658983a 1319 num_words = min_t(size_t, data_size / sizeof(*p),
50d0b1ad 1320 c->bm_words - c->word_offset);
e658983a 1321 len = num_words * sizeof(*p);
b411b363 1322 if (len)
1323 drbd_bm_get_lel(mdev, c->word_offset, num_words, p);
1324 err = __send_command(mdev->tconn, mdev->vnr, sock, P_BITMAP, len, NULL, 0);
1325 c->word_offset += num_words;
1326 c->bit_offset = c->word_offset * BITS_PER_LONG;
1327
1328 c->packets[1]++;
50d0b1ad 1329 c->bytes[1] += header_size + len;
1330
1331 if (c->bit_offset > c->bm_bits)
1332 c->bit_offset = c->bm_bits;
1333 }
a982dd57 1334 if (!err) {
1335 if (len == 0) {
1336 INFO_bm_xfer_stats(mdev, "send", c);
1337 return 0;
1338 } else
1339 return 1;
1340 }
1341 return -EIO;
1342}
1343
1344/* See the comment at receive_bitmap() */
058820cd 1345static int _drbd_send_bitmap(struct drbd_conf *mdev)
1346{
1347 struct bm_xfer_ctx c;
f70af118 1348 int err;
b411b363 1349
1350 if (!expect(mdev->bitmap))
1351 return false;
b411b363 1352
1353 if (get_ldev(mdev)) {
1354 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
1355 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
1356 drbd_bm_set_all(mdev);
1357 if (drbd_bm_write(mdev)) {
1358 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
1359 * but otherwise process as per normal - need to tell other
1360 * side that a full resync is required! */
1361 dev_err(DEV, "Failed to write bitmap to disk!\n");
1362 } else {
1363 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
1364 drbd_md_sync(mdev);
1365 }
1366 }
1367 put_ldev(mdev);
1368 }
1369
1370 c = (struct bm_xfer_ctx) {
1371 .bm_bits = drbd_bm_bits(mdev),
1372 .bm_words = drbd_bm_words(mdev),
1373 };
1374
1375 do {
79ed9bd0 1376 err = send_bitmap_rle_or_plain(mdev, &c);
f70af118 1377 } while (err > 0);
b411b363 1378
f70af118 1379 return err == 0;
1380}
1381
1382int drbd_send_bitmap(struct drbd_conf *mdev)
1383{
1384 struct drbd_socket *sock = &mdev->tconn->data;
1385 int err = -1;
b411b363 1386
1387 mutex_lock(&sock->mutex);
1388 if (sock->socket)
1389 err = !_drbd_send_bitmap(mdev);
1390 mutex_unlock(&sock->mutex);
1391 return err;
1392}
1393
1394void drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
b411b363 1395{
1396 struct drbd_socket *sock;
1397 struct p_barrier_ack *p;
b411b363 1398
1399 if (mdev->state.conn < C_CONNECTED)
1400 return;
b411b363 1401
1402 sock = &mdev->tconn->meta;
1403 p = drbd_prepare_command(mdev, sock);
1404 if (!p)
1405 return;
1406 p->barrier = barrier_nr;
1407 p->set_size = cpu_to_be32(set_size);
1408 drbd_send_command(mdev, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0);
1409}
1410
1411/**
1412 * _drbd_send_ack() - Sends an ack packet
1413 * @mdev: DRBD device.
1414 * @cmd: Packet command code.
1415 * @sector: sector, needs to be in big endian byte order
1416 * @blksize: size in byte, needs to be in big endian byte order
1417 * @block_id: Id, big endian byte order
1418 */
1419static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
1420 u64 sector, u32 blksize, u64 block_id)
b411b363 1421{
1422 struct drbd_socket *sock;
1423 struct p_block_ack *p;
b411b363 1424
1425 if (mdev->state.conn < C_CONNECTED)
1426 return -EIO;
b411b363 1427
1428 sock = &mdev->tconn->meta;
1429 p = drbd_prepare_command(mdev, sock);
1430 if (!p)
a8c32aa8 1431 return -EIO;
1432 p->sector = sector;
1433 p->block_id = block_id;
1434 p->blksize = blksize;
1435 p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq));
1436 return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0);
1437}
1438
1439/* dp->sector and dp->block_id already/still in network byte order,
1440 * data_size is payload size according to dp->head,
1441 * and may need to be corrected for digest size. */
1442void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
1443 struct p_data *dp, int data_size)
b411b363 1444{
1445 if (mdev->tconn->peer_integrity_tfm)
1446 data_size -= crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
1447 _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
1448 dp->block_id);
1449}
1450
1451void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
1452 struct p_block_req *rp)
b411b363 1453{
a9a9994d 1454 _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
1455}
1456
1457/**
1458 * drbd_send_ack() - Sends an ack packet
1459 * @mdev: DRBD device
1460 * @cmd: packet command code
1461 * @peer_req: peer request
b411b363 1462 */
d8763023 1463int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
db830c46 1464 struct drbd_peer_request *peer_req)
b411b363 1465{
1466 return _drbd_send_ack(mdev, cmd,
1467 cpu_to_be64(peer_req->i.sector),
1468 cpu_to_be32(peer_req->i.size),
1469 peer_req->block_id);
1470}
1471
1472/* This function misuses the block_id field to signal if the blocks
1473 * are is sync or not. */
d8763023 1474int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
1475 sector_t sector, int blksize, u64 block_id)
1476{
1477 return _drbd_send_ack(mdev, cmd,
1478 cpu_to_be64(sector),
1479 cpu_to_be32(blksize),
1480 cpu_to_be64(block_id));
1481}
1482
1483int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
1484 sector_t sector, int size, u64 block_id)
1485{
1486 struct drbd_socket *sock;
1487 struct p_block_req *p;
b411b363 1488
1489 sock = &mdev->tconn->data;
1490 p = drbd_prepare_command(mdev, sock);
1491 if (!p)
1492 return -EIO;
1493 p->sector = cpu_to_be64(sector);
1494 p->block_id = block_id;
1495 p->blksize = cpu_to_be32(size);
1496 return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0);
1497}
1498
1499int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size,
1500 void *digest, int digest_size, enum drbd_packet cmd)
b411b363 1501{
1502 struct drbd_socket *sock;
1503 struct p_block_req *p;
b411b363 1504
9f5bdc33 1505 /* FIXME: Put the digest into the preallocated socket buffer. */
b411b363 1506
1507 sock = &mdev->tconn->data;
1508 p = drbd_prepare_command(mdev, sock);
1509 if (!p)
1510 return -EIO;
1511 p->sector = cpu_to_be64(sector);
1512 p->block_id = ID_SYNCER /* unused */;
1513 p->blksize = cpu_to_be32(size);
1514 return drbd_send_command(mdev, sock, cmd, sizeof(*p),
1515 digest, digest_size);
1516}
1517
1518int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
1519{
1520 struct drbd_socket *sock;
1521 struct p_block_req *p;
b411b363 1522
1523 sock = &mdev->tconn->data;
1524 p = drbd_prepare_command(mdev, sock);
1525 if (!p)
1526 return -EIO;
1527 p->sector = cpu_to_be64(sector);
1528 p->block_id = ID_SYNCER /* unused */;
1529 p->blksize = cpu_to_be32(size);
1530 return drbd_send_command(mdev, sock, P_OV_REQUEST, sizeof(*p), NULL, 0);
1531}
1532
1533/* called on sndtimeo
1534 * returns false if we should retry,
1535 * true if we think connection is dead
b411b363 1536 */
1a7ba646 1537static int we_should_drop_the_connection(struct drbd_tconn *tconn, struct socket *sock)
1538{
1539 int drop_it;
1540 /* long elapsed = (long)(jiffies - mdev->last_received); */
1541
1542 drop_it = tconn->meta.socket == sock
1543 || !tconn->asender.task
1544 || get_t_state(&tconn->asender) != RUNNING
bbeb641c 1545 || tconn->cstate < C_WF_REPORT_PARAMS;
1546
1547 if (drop_it)
81e84650 1548 return true;
b411b363 1549
1a7ba646 1550 drop_it = !--tconn->ko_count;
b411b363 1551 if (!drop_it) {
1552 conn_err(tconn, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
1553 current->comm, current->pid, tconn->ko_count);
1554 request_ping(tconn);
1555 }
1556
1557 return drop_it; /* && (mdev->state == R_PRIMARY) */;
1558}
1559
1a7ba646 1560static void drbd_update_congested(struct drbd_tconn *tconn)
9e204cdd 1561{
1a7ba646 1562 struct sock *sk = tconn->data.socket->sk;
9e204cdd 1563 if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
1a7ba646 1564 set_bit(NET_CONGESTED, &tconn->flags);
9e204cdd
AG
1565}
1566
1567/* The idea of sendpage seems to be to put some kind of reference
1568 * to the page into the skb, and to hand it over to the NIC. In
1569 * this process get_page() gets called.
1570 *
1571 * As soon as the page was really sent over the network put_page()
1572 * gets called by some part of the network layer. [ NIC driver? ]
1573 *
1574 * [ get_page() / put_page() increment/decrement the count. If count
1575 * reaches 0 the page will be freed. ]
1576 *
1577 * This works nicely with pages from FSs.
1578 * But this means that in protocol A we might signal IO completion too early!
1579 *
1580 * In order not to corrupt data during a resync we must make sure
1581 * that we do not reuse our own buffer pages (EEs) too early, therefore
1582 * we have the net_ee list.
1583 *
1584 * XFS seems to have problems, still, it submits pages with page_count == 0!
1585 * As a workaround, we disable sendpage on pages
1586 * with page_count == 0 or PageSlab.
1587 */
1588static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
1589 int offset, size_t size, unsigned msg_flags)
1590{
1591 struct socket *socket;
1592 void *addr;
1593 int err;
1594
1595 socket = mdev->tconn->data.socket;
1596 addr = kmap(page) + offset;
1597 err = drbd_send_all(mdev->tconn, socket, addr, size, msg_flags);
b411b363 1598 kunmap(page);
1599 if (!err)
1600 mdev->send_cnt += size >> 9;
1601 return err;
1602}
1603
1604static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
ba11ad9a 1605 int offset, size_t size, unsigned msg_flags)
b411b363 1606{
88b390ff 1607 struct socket *socket = mdev->tconn->data.socket;
b411b363 1608 mm_segment_t oldfs = get_fs();
b411b363 1609 int len = size;
88b390ff 1610 int err = -EIO;
1611
1612 /* e.g. XFS meta- & log-data is in slab pages, which have a
1613 * page_count of 0 and/or have PageSlab() set.
1614 * we cannot use send_page for those, as that does get_page();
1615 * put_page(); and would cause either a VM_BUG directly, or
1616 * __page_cache_release a page that would actually still be referenced
1617 * by someone, leading to some obscure delayed Oops somewhere else. */
1618 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
88b390ff 1619 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
b411b363 1620
ba11ad9a 1621 msg_flags |= MSG_NOSIGNAL;
1a7ba646 1622 drbd_update_congested(mdev->tconn);
1623 set_fs(KERNEL_DS);
1624 do {
1625 int sent;
1626
1627 sent = socket->ops->sendpage(socket, page, offset, len, msg_flags);
b411b363 1628 if (sent <= 0) {
1629 if (sent == -EAGAIN) {
1630 if (we_should_drop_the_connection(mdev->tconn, socket))
1631 break;
1632 continue;
1633 }
1634 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
1635 __func__, (int)size, len, sent);
1636 if (sent < 0)
1637 err = sent;
1638 break;
1639 }
1640 len -= sent;
1641 offset += sent;
1642 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
1643 set_fs(oldfs);
01a311a5 1644 clear_bit(NET_CONGESTED, &mdev->tconn->flags);
b411b363 1645
88b390ff
AG
1646 if (len == 0) {
1647 err = 0;
1648 mdev->send_cnt += size >> 9;
1649 }
1650 return err;
b411b363
PR
1651}
1652
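/* Send every page of a bio over the data socket, copying each page through
 * kmap() via _drbd_no_send_page(); _drbd_send_zc_bio() below is the
 * zero-copy variant that goes through _drbd_send_page()/sendpage where possible. */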
1653static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
1654{
1655 struct bio_vec *bvec;
1656 int i;
ba11ad9a 1657 /* hint all but last page with MSG_MORE */
b411b363 1658 __bio_for_each_segment(bvec, bio, i, 0) {
7fae55da
AG
1659 int err;
1660
1661 err = _drbd_no_send_page(mdev, bvec->bv_page,
1662 bvec->bv_offset, bvec->bv_len,
1663 i == bio->bi_vcnt - 1 ? 0 : MSG_MORE);
1664 if (err)
1665 return err;
b411b363 1666 }
7fae55da 1667 return 0;
b411b363
PR
1668}
1669
1670static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
1671{
1672 struct bio_vec *bvec;
1673 int i;
ba11ad9a 1674 /* hint all but last page with MSG_MORE */
b411b363 1675 __bio_for_each_segment(bvec, bio, i, 0) {
7fae55da
AG
1676 int err;
1677
1678 err = _drbd_send_page(mdev, bvec->bv_page,
1679 bvec->bv_offset, bvec->bv_len,
1680 i == bio->bi_vcnt - 1 ? 0 : MSG_MORE);
1681 if (err)
1682 return err;
b411b363 1683 }
7fae55da 1684 return 0;
b411b363
PR
1685}
1686
db830c46
AG
1687static int _drbd_send_zc_ee(struct drbd_conf *mdev,
1688 struct drbd_peer_request *peer_req)
45bb912b 1689{
db830c46
AG
1690 struct page *page = peer_req->pages;
1691 unsigned len = peer_req->i.size;
9f69230c 1692 int err;
db830c46 1693
ba11ad9a 1694 /* hint all but last page with MSG_MORE */
45bb912b
LE
1695 page_chain_for_each(page) {
1696 unsigned l = min_t(unsigned, len, PAGE_SIZE);
9f69230c
AG
1697
1698 err = _drbd_send_page(mdev, page, 0, l,
1699 page_chain_next(page) ? MSG_MORE : 0);
1700 if (err)
1701 return err;
45bb912b
LE
1702 len -= l;
1703 }
9f69230c 1704 return 0;
45bb912b
LE
1705}
1706
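/* Map the bio flags that need to travel with a write (REQ_SYNC, REQ_FUA,
 * REQ_FLUSH, REQ_DISCARD) onto the DP_* wire flags; peers with an agreed
 * protocol version older than 95 only understand DP_RW_SYNC. */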
76d2e7ec
PR
1707static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
1708{
31890f4a 1709 if (mdev->tconn->agreed_pro_version >= 95)
76d2e7ec 1710 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
76d2e7ec
PR
1711 (bi_rw & REQ_FUA ? DP_FUA : 0) |
1712 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
1713 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
1714 else
721a9602 1715 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
76d2e7ec
PR
1716}
1717
b411b363
PR
1718/* Used to send write requests
1719 * R_PRIMARY -> Peer (P_DATA)
1720 */
1721int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
1722{
9f5bdc33
AG
1723 struct drbd_socket *sock;
1724 struct p_data *p;
b411b363 1725 unsigned int dp_flags = 0;
b411b363 1726 int dgs;
9f5bdc33 1727 int err;
b411b363 1728
46e1ce41
PR
1729 sock = &mdev->tconn->data;
1730 p = drbd_prepare_command(mdev, sock);
8d412fc6
AG
1731 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_tfm) ?
1732 crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0;
b411b363 1733
9f5bdc33
AG
1734 if (!p)
1735 return -EIO;
1736 p->sector = cpu_to_be64(req->i.sector);
1737 p->block_id = (unsigned long)req;
1738 p->seq_num = cpu_to_be32(req->seq_num = atomic_inc_return(&mdev->packet_seq));
76d2e7ec 1739 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
b411b363
PR
1740 if (mdev->state.conn >= C_SYNC_SOURCE &&
1741 mdev->state.conn <= C_PAUSED_SYNC_T)
1742 dp_flags |= DP_MAY_SET_IN_SYNC;
303d1448
PR
1743 if (mdev->tconn->agreed_pro_version >= 100) {
1744 if (req->rq_state & RQ_EXP_RECEIVE_ACK)
1745 dp_flags |= DP_SEND_RECEIVE_ACK;
1746 if (req->rq_state & RQ_EXP_WRITE_ACK)
1747 dp_flags |= DP_SEND_WRITE_ACK;
1748 }
9f5bdc33
AG
1749 p->dp_flags = cpu_to_be32(dp_flags);
1750 if (dgs)
8d412fc6 1751 drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, p + 1);
9f5bdc33 1752 err = __send_command(mdev->tconn, mdev->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size);
6bdb9b0e 1753 if (!err) {
470be44a
LE
1754 /* For protocol A, we have to memcpy the payload into
1755 * socket buffers, as we may complete right away
1756 * as soon as we handed it over to tcp, at which point the data
1757 * pages may become invalid.
1758 *
1759 * For data-integrity enabled, we copy it as well, so we can be
1760 * sure that even if the bio pages may still be modified, it
1761 * won't change the data on the wire, thus if the digest checks
1762 * out ok after sending on this side, but does not fit on the
1763 * receiving side, we sure have detected corruption elsewhere.
1764 */
303d1448 1765 if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs)
6bdb9b0e 1766 err = _drbd_send_bio(mdev, req->master_bio);
b411b363 1767 else
6bdb9b0e 1768 err = _drbd_send_zc_bio(mdev, req->master_bio);
470be44a
LE
1769
1770 /* double check digest, sometimes buffers have been modified in flight. */
1771 if (dgs > 0 && dgs <= 64) {
24c4830c 1772 /* 64 byte, 512 bit, is the largest digest size
470be44a
LE
1773 * currently supported in kernel crypto. */
1774 unsigned char digest[64];
8d412fc6 1775 drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, digest);
9f5bdc33 1776 if (memcmp(p + 1, digest, dgs)) {
470be44a
LE
1777 dev_warn(DEV,
1778 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
ace652ac 1779 (unsigned long long)req->i.sector, req->i.size);
470be44a
LE
1780 }
1781 } /* else if (dgs > 64) {
1782 ... Be noisy about digest too large ...
1783 } */
b411b363 1784 }
9f5bdc33 1785 mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
bd26bfc5 1786
6bdb9b0e 1787 return err;
b411b363
PR
1788}
1789
1790/* answer packet, used to send data back for read requests:
1791 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
1792 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
1793 */
d8763023 1794int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd,
db830c46 1795 struct drbd_peer_request *peer_req)
b411b363 1796{
9f5bdc33
AG
1797 struct drbd_socket *sock;
1798 struct p_data *p;
7b57b89d 1799 int err;
b411b363
PR
1800 int dgs;
1801
46e1ce41
PR
1802 sock = &mdev->tconn->data;
1803 p = drbd_prepare_command(mdev, sock);
1804
8d412fc6
AG
1805 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_tfm) ?
1806 crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0;
b411b363 1807
9f5bdc33
AG
1808 if (!p)
1809 return -EIO;
1810 p->sector = cpu_to_be64(peer_req->i.sector);
1811 p->block_id = peer_req->block_id;
1812 p->seq_num = 0; /* unused */
1813 if (dgs)
8d412fc6 1814 drbd_csum_ee(mdev, mdev->tconn->integrity_tfm, peer_req, p + 1);
9f5bdc33 1815 err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, sizeof(*p) + dgs, NULL, peer_req->i.size);
7b57b89d
AG
1816 if (!err)
1817 err = _drbd_send_zc_ee(mdev, peer_req);
9f5bdc33 1818 mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
bd26bfc5 1819
7b57b89d 1820 return err;
b411b363
PR
1821}
1822
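/* Tell the peer that the range covered by this request is out of sync (P_OUT_OF_SYNC). */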
8f7bed77 1823int drbd_send_out_of_sync(struct drbd_conf *mdev, struct drbd_request *req)
73a01a18 1824{
9f5bdc33
AG
1825 struct drbd_socket *sock;
1826 struct p_block_desc *p;
73a01a18 1827
9f5bdc33
AG
1828 sock = &mdev->tconn->data;
1829 p = drbd_prepare_command(mdev, sock);
1830 if (!p)
1831 return -EIO;
1832 p->sector = cpu_to_be64(req->i.sector);
1833 p->blksize = cpu_to_be32(req->i.size);
1834 return drbd_send_command(mdev, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0);
73a01a18
PR
1835}
1836
b411b363
PR
1837/*
1838 drbd_send distinguishes two cases:
1839
1840 Packets sent via the data socket "sock"
1841 and packets sent via the meta data socket "msock"
1842
1843 sock msock
1844 -----------------+-------------------------+------------------------------
1845 timeout conf.timeout / 2 conf.timeout / 2
1846 timeout action send a ping via msock Abort communication
1847 and close all sockets
1848*/
1849
1850/*
1851 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
1852 */
bedbd2a5 1853int drbd_send(struct drbd_tconn *tconn, struct socket *sock,
b411b363
PR
1854 void *buf, size_t size, unsigned msg_flags)
1855{
1856 struct kvec iov;
1857 struct msghdr msg;
1858 int rv, sent = 0;
1859
1860 if (!sock)
c0d42c8e 1861 return -EBADR;
b411b363
PR
1862
1863 /* THINK if (signal_pending) return ... ? */
1864
1865 iov.iov_base = buf;
1866 iov.iov_len = size;
1867
1868 msg.msg_name = NULL;
1869 msg.msg_namelen = 0;
1870 msg.msg_control = NULL;
1871 msg.msg_controllen = 0;
1872 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
1873
bedbd2a5 1874 if (sock == tconn->data.socket) {
44ed167d
PR
1875 rcu_read_lock();
1876 tconn->ko_count = rcu_dereference(tconn->net_conf)->ko_count;
1877 rcu_read_unlock();
bedbd2a5 1878 drbd_update_congested(tconn);
b411b363
PR
1879 }
1880 do {
1881 /* STRANGE
1882 * tcp_sendmsg does _not_ use its size parameter at all ?
1883 *
1884 * -EAGAIN on timeout, -EINTR on signal.
1885 */
1886/* THINK
1887 * do we need to block DRBD_SIG if sock == &meta.socket ??
1888 * otherwise wake_asender() might interrupt some send_*Ack !
1889 */
1890 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
1891 if (rv == -EAGAIN) {
bedbd2a5 1892 if (we_should_drop_the_connection(tconn, sock))
b411b363
PR
1893 break;
1894 else
1895 continue;
1896 }
b411b363
PR
1897 if (rv == -EINTR) {
1898 flush_signals(current);
1899 rv = 0;
1900 }
1901 if (rv < 0)
1902 break;
1903 sent += rv;
1904 iov.iov_base += rv;
1905 iov.iov_len -= rv;
1906 } while (sent < size);
1907
bedbd2a5
PR
1908 if (sock == tconn->data.socket)
1909 clear_bit(NET_CONGESTED, &tconn->flags);
b411b363
PR
1910
1911 if (rv <= 0) {
1912 if (rv != -EAGAIN) {
bedbd2a5
PR
1913 conn_err(tconn, "%s_sendmsg returned %d\n",
1914 sock == tconn->meta.socket ? "msock" : "sock",
1915 rv);
bbeb641c 1916 conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
b411b363 1917 } else
bbeb641c 1918 conn_request_state(tconn, NS(conn, C_TIMEOUT), CS_HARD);
b411b363
PR
1919 }
1920
1921 return sent;
1922}
1923
fb708e40
AG
1924/**
1925 * drbd_send_all - Send an entire buffer
1926 *
1927 * Returns 0 upon success and a negative error value otherwise.
1928 */
1929int drbd_send_all(struct drbd_tconn *tconn, struct socket *sock, void *buffer,
1930 size_t size, unsigned msg_flags)
1931{
1932 int err;
1933
1934 err = drbd_send(tconn, sock, buffer, size, msg_flags);
1935 if (err < 0)
1936 return err;
1937 if (err != size)
1938 return -EIO;
1939 return 0;
1940}
1941
b411b363
PR
1942static int drbd_open(struct block_device *bdev, fmode_t mode)
1943{
1944 struct drbd_conf *mdev = bdev->bd_disk->private_data;
1945 unsigned long flags;
1946 int rv = 0;
1947
2a48fc0a 1948 mutex_lock(&drbd_main_mutex);
87eeee41 1949 spin_lock_irqsave(&mdev->tconn->req_lock, flags);
b411b363
PR
1950 /* to have a stable mdev->state.role
1951 * and no race with updating open_cnt */
1952
1953 if (mdev->state.role != R_PRIMARY) {
1954 if (mode & FMODE_WRITE)
1955 rv = -EROFS;
1956 else if (!allow_oos)
1957 rv = -EMEDIUMTYPE;
1958 }
1959
1960 if (!rv)
1961 mdev->open_cnt++;
87eeee41 1962 spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
2a48fc0a 1963 mutex_unlock(&drbd_main_mutex);
b411b363
PR
1964
1965 return rv;
1966}
1967
1968static int drbd_release(struct gendisk *gd, fmode_t mode)
1969{
1970 struct drbd_conf *mdev = gd->private_data;
2a48fc0a 1971 mutex_lock(&drbd_main_mutex);
b411b363 1972 mdev->open_cnt--;
2a48fc0a 1973 mutex_unlock(&drbd_main_mutex);
b411b363
PR
1974 return 0;
1975}
1976
b411b363
PR
1977static void drbd_set_defaults(struct drbd_conf *mdev)
1978{
f399002e
LE
1979 /* Beware! The actual layout differs
1980 * between big endian and little endian */
da9fbc27 1981 mdev->state = (union drbd_dev_state) {
b411b363
PR
1982 { .role = R_SECONDARY,
1983 .peer = R_UNKNOWN,
1984 .conn = C_STANDALONE,
1985 .disk = D_DISKLESS,
1986 .pdsk = D_UNKNOWN,
b411b363
PR
1987 } };
1988}
1989
1990void drbd_init_set_defaults(struct drbd_conf *mdev)
1991{
1992 /* the memset(,0,) did most of this.
1993 * note: only assignments, no allocation in here */
1994
1995 drbd_set_defaults(mdev);
1996
b411b363
PR
1997 atomic_set(&mdev->ap_bio_cnt, 0);
1998 atomic_set(&mdev->ap_pending_cnt, 0);
1999 atomic_set(&mdev->rs_pending_cnt, 0);
2000 atomic_set(&mdev->unacked_cnt, 0);
2001 atomic_set(&mdev->local_cnt, 0);
435f0740 2002 atomic_set(&mdev->pp_in_use_by_net, 0);
778f271d 2003 atomic_set(&mdev->rs_sect_in, 0);
0f0601f4 2004 atomic_set(&mdev->rs_sect_ev, 0);
759fbdfb 2005 atomic_set(&mdev->ap_in_flight, 0);
b411b363
PR
2006
2007 mutex_init(&mdev->md_io_mutex);
8410da8f
PR
2008 mutex_init(&mdev->own_state_mutex);
2009 mdev->state_mutex = &mdev->own_state_mutex;
b411b363 2010
b411b363 2011 spin_lock_init(&mdev->al_lock);
b411b363
PR
2012 spin_lock_init(&mdev->peer_seq_lock);
2013 spin_lock_init(&mdev->epoch_lock);
2014
2015 INIT_LIST_HEAD(&mdev->active_ee);
2016 INIT_LIST_HEAD(&mdev->sync_ee);
2017 INIT_LIST_HEAD(&mdev->done_ee);
2018 INIT_LIST_HEAD(&mdev->read_ee);
2019 INIT_LIST_HEAD(&mdev->net_ee);
2020 INIT_LIST_HEAD(&mdev->resync_reads);
b411b363
PR
2021 INIT_LIST_HEAD(&mdev->resync_work.list);
2022 INIT_LIST_HEAD(&mdev->unplug_work.list);
e9e6f3ec 2023 INIT_LIST_HEAD(&mdev->go_diskless.list);
b411b363 2024 INIT_LIST_HEAD(&mdev->md_sync_work.list);
c4752ef1 2025 INIT_LIST_HEAD(&mdev->start_resync_work.list);
b411b363 2026 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
0ced55a3 2027
794abb75 2028 mdev->resync_work.cb = w_resync_timer;
b411b363 2029 mdev->unplug_work.cb = w_send_write_hint;
e9e6f3ec 2030 mdev->go_diskless.cb = w_go_diskless;
b411b363
PR
2031 mdev->md_sync_work.cb = w_md_sync;
2032 mdev->bm_io_work.w.cb = w_bitmap_io;
370a43e7 2033 mdev->start_resync_work.cb = w_start_resync;
a21e9298
PR
2034
2035 mdev->resync_work.mdev = mdev;
2036 mdev->unplug_work.mdev = mdev;
2037 mdev->go_diskless.mdev = mdev;
2038 mdev->md_sync_work.mdev = mdev;
2039 mdev->bm_io_work.w.mdev = mdev;
2040 mdev->start_resync_work.mdev = mdev;
2041
b411b363
PR
2042 init_timer(&mdev->resync_timer);
2043 init_timer(&mdev->md_sync_timer);
370a43e7 2044 init_timer(&mdev->start_resync_timer);
7fde2be9 2045 init_timer(&mdev->request_timer);
b411b363
PR
2046 mdev->resync_timer.function = resync_timer_fn;
2047 mdev->resync_timer.data = (unsigned long) mdev;
2048 mdev->md_sync_timer.function = md_sync_timer_fn;
2049 mdev->md_sync_timer.data = (unsigned long) mdev;
370a43e7
PR
2050 mdev->start_resync_timer.function = start_resync_timer_fn;
2051 mdev->start_resync_timer.data = (unsigned long) mdev;
7fde2be9
PR
2052 mdev->request_timer.function = request_timer_fn;
2053 mdev->request_timer.data = (unsigned long) mdev;
b411b363
PR
2054
2055 init_waitqueue_head(&mdev->misc_wait);
2056 init_waitqueue_head(&mdev->state_wait);
2057 init_waitqueue_head(&mdev->ee_wait);
2058 init_waitqueue_head(&mdev->al_wait);
2059 init_waitqueue_head(&mdev->seq_wait);
2060
2451fc3b 2061 mdev->write_ordering = WO_bdev_flush;
b411b363 2062 mdev->resync_wenr = LC_FREE;
99432fcc
PR
2063 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
2064 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
b411b363
PR
2065}
2066
2067void drbd_mdev_cleanup(struct drbd_conf *mdev)
2068{
1d7734a0 2069 int i;
e6b3ea83 2070 if (mdev->tconn->receiver.t_state != NONE)
b411b363 2071 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
e6b3ea83 2072 mdev->tconn->receiver.t_state);
b411b363
PR
2073
2074 /* no need to lock it, I'm the only thread alive */
2075 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2076 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2077 mdev->al_writ_cnt =
2078 mdev->bm_writ_cnt =
2079 mdev->read_cnt =
2080 mdev->recv_cnt =
2081 mdev->send_cnt =
2082 mdev->writ_cnt =
2083 mdev->p_size =
2084 mdev->rs_start =
2085 mdev->rs_total =
1d7734a0
LE
2086 mdev->rs_failed = 0;
2087 mdev->rs_last_events = 0;
0f0601f4 2088 mdev->rs_last_sect_ev = 0;
1d7734a0
LE
2089 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2090 mdev->rs_mark_left[i] = 0;
2091 mdev->rs_mark_time[i] = 0;
2092 }
89e58e75 2093 D_ASSERT(mdev->tconn->net_conf == NULL);
b411b363
PR
2094
2095 drbd_set_my_capacity(mdev, 0);
2096 if (mdev->bitmap) {
2097 /* maybe never allocated. */
02d9a94b 2098 drbd_bm_resize(mdev, 0, 1);
b411b363
PR
2099 drbd_bm_cleanup(mdev);
2100 }
2101
1d041225
PR
2102 drbd_free_bc(mdev->ldev);
2103 mdev->ldev = NULL;
2104
0778286a 2105 clear_bit(AL_SUSPENDED, &mdev->flags);
b411b363 2106
b411b363
PR
2107 D_ASSERT(list_empty(&mdev->active_ee));
2108 D_ASSERT(list_empty(&mdev->sync_ee));
2109 D_ASSERT(list_empty(&mdev->done_ee));
2110 D_ASSERT(list_empty(&mdev->read_ee));
2111 D_ASSERT(list_empty(&mdev->net_ee));
2112 D_ASSERT(list_empty(&mdev->resync_reads));
e42325a5
PR
2113 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
2114 D_ASSERT(list_empty(&mdev->tconn->meta.work.q));
b411b363
PR
2115 D_ASSERT(list_empty(&mdev->resync_work.list));
2116 D_ASSERT(list_empty(&mdev->unplug_work.list));
e9e6f3ec 2117 D_ASSERT(list_empty(&mdev->go_diskless.list));
2265b473
LE
2118
2119 drbd_set_defaults(mdev);
b411b363
PR
2120}
2121
2122
2123static void drbd_destroy_mempools(void)
2124{
2125 struct page *page;
2126
2127 while (drbd_pp_pool) {
2128 page = drbd_pp_pool;
2129 drbd_pp_pool = (struct page *)page_private(page);
2130 __free_page(page);
2131 drbd_pp_vacant--;
2132 }
2133
2134 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2135
da4a75d2
LE
2136 if (drbd_md_io_bio_set)
2137 bioset_free(drbd_md_io_bio_set);
35abf594
LE
2138 if (drbd_md_io_page_pool)
2139 mempool_destroy(drbd_md_io_page_pool);
b411b363
PR
2140 if (drbd_ee_mempool)
2141 mempool_destroy(drbd_ee_mempool);
2142 if (drbd_request_mempool)
2143 mempool_destroy(drbd_request_mempool);
2144 if (drbd_ee_cache)
2145 kmem_cache_destroy(drbd_ee_cache);
2146 if (drbd_request_cache)
2147 kmem_cache_destroy(drbd_request_cache);
2148 if (drbd_bm_ext_cache)
2149 kmem_cache_destroy(drbd_bm_ext_cache);
2150 if (drbd_al_ext_cache)
2151 kmem_cache_destroy(drbd_al_ext_cache);
2152
da4a75d2 2153 drbd_md_io_bio_set = NULL;
35abf594 2154 drbd_md_io_page_pool = NULL;
b411b363
PR
2155 drbd_ee_mempool = NULL;
2156 drbd_request_mempool = NULL;
2157 drbd_ee_cache = NULL;
2158 drbd_request_cache = NULL;
2159 drbd_bm_ext_cache = NULL;
2160 drbd_al_ext_cache = NULL;
2161
2162 return;
2163}
2164
2165static int drbd_create_mempools(void)
2166{
2167 struct page *page;
1816a2b4 2168 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
b411b363
PR
2169 int i;
2170
2171 /* prepare our caches and mempools */
2172 drbd_request_mempool = NULL;
2173 drbd_ee_cache = NULL;
2174 drbd_request_cache = NULL;
2175 drbd_bm_ext_cache = NULL;
2176 drbd_al_ext_cache = NULL;
2177 drbd_pp_pool = NULL;
35abf594 2178 drbd_md_io_page_pool = NULL;
da4a75d2 2179 drbd_md_io_bio_set = NULL;
b411b363
PR
2180
2181 /* caches */
2182 drbd_request_cache = kmem_cache_create(
2183 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2184 if (drbd_request_cache == NULL)
2185 goto Enomem;
2186
2187 drbd_ee_cache = kmem_cache_create(
f6ffca9f 2188 "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
b411b363
PR
2189 if (drbd_ee_cache == NULL)
2190 goto Enomem;
2191
2192 drbd_bm_ext_cache = kmem_cache_create(
2193 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2194 if (drbd_bm_ext_cache == NULL)
2195 goto Enomem;
2196
2197 drbd_al_ext_cache = kmem_cache_create(
2198 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2199 if (drbd_al_ext_cache == NULL)
2200 goto Enomem;
2201
2202 /* mempools */
da4a75d2
LE
2203 drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
2204 if (drbd_md_io_bio_set == NULL)
2205 goto Enomem;
2206
35abf594
LE
2207 drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
2208 if (drbd_md_io_page_pool == NULL)
2209 goto Enomem;
2210
b411b363
PR
2211 drbd_request_mempool = mempool_create(number,
2212 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2213 if (drbd_request_mempool == NULL)
2214 goto Enomem;
2215
2216 drbd_ee_mempool = mempool_create(number,
2217 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2027ae1f 2218 if (drbd_ee_mempool == NULL)
b411b363
PR
2219 goto Enomem;
2220
2221 /* drbd's page pool */
2222 spin_lock_init(&drbd_pp_lock);
2223
2224 for (i = 0; i < number; i++) {
2225 page = alloc_page(GFP_HIGHUSER);
2226 if (!page)
2227 goto Enomem;
2228 set_page_private(page, (unsigned long)drbd_pp_pool);
2229 drbd_pp_pool = page;
2230 }
2231 drbd_pp_vacant = number;
2232
2233 return 0;
2234
2235Enomem:
2236 drbd_destroy_mempools(); /* in case we allocated some */
2237 return -ENOMEM;
2238}
2239
2240static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2241 void *unused)
2242{
2243 /* just so we have it. you never know what interesting things we
2244 * might want to do here some day...
2245 */
2246
2247 return NOTIFY_DONE;
2248}
2249
2250static struct notifier_block drbd_notifier = {
2251 .notifier_call = drbd_notify_sys,
2252};
2253
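/* Free any peer requests still hanging off the various EE lists; they should
 * all be empty by now, so complain about leftovers. */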
7721f567 2254static void drbd_release_all_peer_reqs(struct drbd_conf *mdev)
b411b363
PR
2255{
2256 int rr;
2257
7721f567 2258 rr = drbd_free_peer_reqs(mdev, &mdev->active_ee);
b411b363
PR
2259 if (rr)
2260 dev_err(DEV, "%d EEs in active list found!\n", rr);
2261
7721f567 2262 rr = drbd_free_peer_reqs(mdev, &mdev->sync_ee);
b411b363
PR
2263 if (rr)
2264 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2265
7721f567 2266 rr = drbd_free_peer_reqs(mdev, &mdev->read_ee);
b411b363
PR
2267 if (rr)
2268 dev_err(DEV, "%d EEs in read list found!\n", rr);
2269
7721f567 2270 rr = drbd_free_peer_reqs(mdev, &mdev->done_ee);
b411b363
PR
2271 if (rr)
2272 dev_err(DEV, "%d EEs in done list found!\n", rr);
2273
7721f567 2274 rr = drbd_free_peer_reqs(mdev, &mdev->net_ee);
b411b363
PR
2275 if (rr)
2276 dev_err(DEV, "%d EEs in net list found!\n", rr);
2277}
2278
774b3055 2279/* caution. no locking. */
81fa2e67 2280void drbd_minor_destroy(struct kref *kref)
b411b363 2281{
81fa2e67 2282 struct drbd_conf *mdev = container_of(kref, struct drbd_conf, kref);
9dc9fbb3
PR
2283 struct drbd_tconn *tconn = mdev->tconn;
2284
b411b363 2285 /* paranoia asserts */
70dc65e1 2286 D_ASSERT(mdev->open_cnt == 0);
e42325a5 2287 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
b411b363
PR
2288 /* end paranoia asserts */
2289
b411b363
PR
2290 /* cleanup stuff that may have been allocated during
2291 * device (re-)configuration or state changes */
2292
2293 if (mdev->this_bdev)
2294 bdput(mdev->this_bdev);
2295
1d041225
PR
2296 drbd_free_bc(mdev->ldev);
2297 mdev->ldev = NULL;
b411b363 2298
7721f567 2299 drbd_release_all_peer_reqs(mdev);
b411b363 2300
b411b363
PR
2301 lc_destroy(mdev->act_log);
2302 lc_destroy(mdev->resync);
2303
2304 kfree(mdev->p_uuid);
2305 /* mdev->p_uuid = NULL; */
2306
cd1d9950
PR
2307 kfree(mdev->current_epoch);
2308 if (mdev->bitmap) /* should no longer be there. */
2309 drbd_bm_cleanup(mdev);
2310 __free_page(mdev->md_io_page);
2311 put_disk(mdev->vdisk);
2312 blk_cleanup_queue(mdev->rq_queue);
9958c857 2313 kfree(mdev->rs_plan_s);
cd1d9950 2314 kfree(mdev);
9dc9fbb3
PR
2315
2316 kref_put(&tconn->kref, &conn_destroy);
b411b363
PR
2317}
2318
2319static void drbd_cleanup(void)
2320{
2321 unsigned int i;
81a5d60e 2322 struct drbd_conf *mdev;
81fa2e67 2323 struct drbd_tconn *tconn, *tmp;
b411b363
PR
2324
2325 unregister_reboot_notifier(&drbd_notifier);
2326
17a93f30
LE
2327 /* first remove proc,
 2328	 * drbdsetup uses its presence to detect
2329 * whether DRBD is loaded.
2330 * If we would get stuck in proc removal,
2331 * but have netlink already deregistered,
2332 * some drbdsetup commands may wait forever
2333 * for an answer.
2334 */
2335 if (drbd_proc)
2336 remove_proc_entry("drbd", NULL);
2337
3b98c0c2 2338 drbd_genl_unregister();
b411b363 2339
81fa2e67
PR
2340 idr_for_each_entry(&minors, mdev, i) {
2341 idr_remove(&minors, mdev_to_minor(mdev));
2342 idr_remove(&mdev->tconn->volumes, mdev->vnr);
2343 del_gendisk(mdev->vdisk);
c141ebda 2344 /* synchronize_rcu(); No other threads running at this point */
81fa2e67
PR
2345 kref_put(&mdev->kref, &drbd_minor_destroy);
2346 }
2347
c141ebda 2348	/* not _rcu, since there is no other updater anymore. Genl already unregistered */
81fa2e67 2349 list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) {
c141ebda
PR
 2350		list_del(&tconn->all_tconn); /* not _rcu: no proc, no other threads */
2351 /* synchronize_rcu(); */
81fa2e67
PR
2352 kref_put(&tconn->kref, &conn_destroy);
2353 }
ff370e5a 2354
81a5d60e 2355 drbd_destroy_mempools();
b411b363
PR
2356 unregister_blkdev(DRBD_MAJOR, "drbd");
2357
81a5d60e
PR
2358 idr_destroy(&minors);
2359
b411b363
PR
2360 printk(KERN_INFO "drbd: module cleanup done.\n");
2361}
2362
2363/**
2364 * drbd_congested() - Callback for pdflush
2365 * @congested_data: User data
2366 * @bdi_bits: Bits pdflush is currently interested in
2367 *
2368 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
2369 */
2370static int drbd_congested(void *congested_data, int bdi_bits)
2371{
2372 struct drbd_conf *mdev = congested_data;
2373 struct request_queue *q;
2374 char reason = '-';
2375 int r = 0;
2376
1b881ef7 2377 if (!may_inc_ap_bio(mdev)) {
b411b363
PR
2378 /* DRBD has frozen IO */
2379 r = bdi_bits;
2380 reason = 'd';
2381 goto out;
2382 }
2383
2384 if (get_ldev(mdev)) {
2385 q = bdev_get_queue(mdev->ldev->backing_bdev);
2386 r = bdi_congested(&q->backing_dev_info, bdi_bits);
2387 put_ldev(mdev);
2388 if (r)
2389 reason = 'b';
2390 }
2391
01a311a5 2392 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->tconn->flags)) {
b411b363
PR
2393 r |= (1 << BDI_async_congested);
2394 reason = reason == 'b' ? 'a' : 'n';
2395 }
2396
2397out:
2398 mdev->congestion_reason = reason;
2399 return r;
2400}
2401
6699b655
PR
2402static void drbd_init_workqueue(struct drbd_work_queue* wq)
2403{
2404 sema_init(&wq->s, 0);
2405 spin_lock_init(&wq->q_lock);
2406 INIT_LIST_HEAD(&wq->q);
2407}
2408
0ace9dfa 2409struct drbd_tconn *conn_get_by_name(const char *name)
1aba4d7f
PR
2410{
2411 struct drbd_tconn *tconn;
2412
3b98c0c2
LE
2413 if (!name || !name[0])
2414 return NULL;
2415
c141ebda 2416 rcu_read_lock();
ec0bddbc 2417 list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) {
0ace9dfa
PR
2418 if (!strcmp(tconn->name, name)) {
2419 kref_get(&tconn->kref);
1aba4d7f 2420 goto found;
0ace9dfa 2421 }
1aba4d7f
PR
2422 }
2423 tconn = NULL;
2424found:
c141ebda 2425 rcu_read_unlock();
1aba4d7f
PR
2426 return tconn;
2427}
2428
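/* Like conn_get_by_name(), but look the connection up by its exact local and
 * peer address pair; returns a referenced tconn, or NULL if none matches. */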
089c075d
AG
2429struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len,
2430 void *peer_addr, int peer_addr_len)
2431{
2432 struct drbd_tconn *tconn;
2433
2434 rcu_read_lock();
2435 list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) {
2436 if (tconn->my_addr_len == my_addr_len &&
2437 tconn->peer_addr_len == peer_addr_len &&
2438 !memcmp(&tconn->my_addr, my_addr, my_addr_len) &&
2439 !memcmp(&tconn->peer_addr, peer_addr, peer_addr_len)) {
2440 kref_get(&tconn->kref);
2441 goto found;
2442 }
2443 }
2444 tconn = NULL;
2445found:
2446 rcu_read_unlock();
2447 return tconn;
2448}
2449
e6ef8a5c
AG
2450static int drbd_alloc_socket(struct drbd_socket *socket)
2451{
2452 socket->rbuf = (void *) __get_free_page(GFP_KERNEL);
2453 if (!socket->rbuf)
2454 return -ENOMEM;
5a87d920
AG
2455 socket->sbuf = (void *) __get_free_page(GFP_KERNEL);
2456 if (!socket->sbuf)
2457 return -ENOMEM;
e6ef8a5c
AG
2458 return 0;
2459}
2460
2461static void drbd_free_socket(struct drbd_socket *socket)
2462{
5a87d920 2463 free_page((unsigned long) socket->sbuf);
e6ef8a5c
AG
2464 free_page((unsigned long) socket->rbuf);
2465}
2466
91fd4dad
PR
2467void conn_free_crypto(struct drbd_tconn *tconn)
2468{
1d041225
PR
2469 drbd_free_sock(tconn);
2470
2471 crypto_free_hash(tconn->csums_tfm);
2472 crypto_free_hash(tconn->verify_tfm);
91fd4dad 2473 crypto_free_hash(tconn->cram_hmac_tfm);
8d412fc6 2474 crypto_free_hash(tconn->integrity_tfm);
5b614abe 2475 crypto_free_hash(tconn->peer_integrity_tfm);
91fd4dad
PR
2476 kfree(tconn->int_dig_in);
2477 kfree(tconn->int_dig_vv);
1d041225
PR
2478
2479 tconn->csums_tfm = NULL;
2480 tconn->verify_tfm = NULL;
91fd4dad 2481 tconn->cram_hmac_tfm = NULL;
8d412fc6 2482 tconn->integrity_tfm = NULL;
5b614abe 2483 tconn->peer_integrity_tfm = NULL;
91fd4dad
PR
2484 tconn->int_dig_in = NULL;
2485 tconn->int_dig_vv = NULL;
2486}
2487
afbbfa88
AG
2488int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts)
2489{
2490 cpumask_var_t new_cpu_mask;
2491 int err;
2492
2493 if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL))
2494 return -ENOMEM;
2495 /*
2496 retcode = ERR_NOMEM;
2497 drbd_msg_put_info("unable to allocate cpumask");
2498 */
2499
2500 /* silently ignore cpu mask on UP kernel */
2501 if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
2502 /* FIXME: Get rid of constant 32 here */
2503 err = __bitmap_parse(res_opts->cpu_mask, 32, 0,
2504 cpumask_bits(new_cpu_mask), nr_cpu_ids);
2505 if (err) {
2506 conn_warn(tconn, "__bitmap_parse() failed with %d\n", err);
2507 /* retcode = ERR_CPU_MASK_PARSE; */
2508 goto fail;
2509 }
2510 }
2511 tconn->res_opts = *res_opts;
2512 if (!cpumask_equal(tconn->cpu_mask, new_cpu_mask)) {
2513 cpumask_copy(tconn->cpu_mask, new_cpu_mask);
2514 drbd_calc_cpu_mask(tconn);
2515 tconn->receiver.reset_cpu_mask = 1;
2516 tconn->asender.reset_cpu_mask = 1;
2517 tconn->worker.reset_cpu_mask = 1;
2518 }
2519 err = 0;
2520
2521fail:
2522 free_cpumask_var(new_cpu_mask);
2523 return err;
2524
2525}
2526
ec0bddbc 2527/* caller must be under genl_lock() */
afbbfa88 2528struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts)
2111438b
PR
2529{
2530 struct drbd_tconn *tconn;
2531
2532 tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL);
2533 if (!tconn)
2534 return NULL;
2535
2536 tconn->name = kstrdup(name, GFP_KERNEL);
2537 if (!tconn->name)
2538 goto fail;
2539
e6ef8a5c
AG
2540 if (drbd_alloc_socket(&tconn->data))
2541 goto fail;
2542 if (drbd_alloc_socket(&tconn->meta))
2543 goto fail;
2544
774b3055
PR
2545 if (!zalloc_cpumask_var(&tconn->cpu_mask, GFP_KERNEL))
2546 goto fail;
2547
afbbfa88
AG
2548 if (set_resource_options(tconn, res_opts))
2549 goto fail;
2550
2f5cdd0b
PR
2551 if (!tl_init(tconn))
2552 goto fail;
2553
bbeb641c 2554 tconn->cstate = C_STANDALONE;
8410da8f 2555 mutex_init(&tconn->cstate_mutex);
6699b655 2556 spin_lock_init(&tconn->req_lock);
a0095508 2557 mutex_init(&tconn->conf_update);
2a67d8b9 2558 init_waitqueue_head(&tconn->ping_wait);
062e879c 2559 idr_init(&tconn->volumes);
b2fb6dbe 2560
6699b655
PR
2561 drbd_init_workqueue(&tconn->data.work);
2562 mutex_init(&tconn->data.mutex);
2563
2564 drbd_init_workqueue(&tconn->meta.work);
2565 mutex_init(&tconn->meta.mutex);
2566
392c8801
PR
2567 drbd_thread_init(tconn, &tconn->receiver, drbdd_init, "receiver");
2568 drbd_thread_init(tconn, &tconn->worker, drbd_worker, "worker");
2569 drbd_thread_init(tconn, &tconn->asender, drbd_asender, "asender");
2570
9dc9fbb3 2571 kref_init(&tconn->kref);
ec0bddbc 2572 list_add_tail_rcu(&tconn->all_tconn, &drbd_tconns);
2111438b
PR
2573
2574 return tconn;
2575
2576fail:
2f5cdd0b 2577 tl_cleanup(tconn);
774b3055 2578 free_cpumask_var(tconn->cpu_mask);
e6ef8a5c
AG
2579 drbd_free_socket(&tconn->meta);
2580 drbd_free_socket(&tconn->data);
2111438b
PR
2581 kfree(tconn->name);
2582 kfree(tconn);
2583
2584 return NULL;
2585}
2586
9dc9fbb3 2587void conn_destroy(struct kref *kref)
2111438b 2588{
9dc9fbb3
PR
2589 struct drbd_tconn *tconn = container_of(kref, struct drbd_tconn, kref);
2590
062e879c 2591 idr_destroy(&tconn->volumes);
2111438b 2592
774b3055 2593 free_cpumask_var(tconn->cpu_mask);
e6ef8a5c
AG
2594 drbd_free_socket(&tconn->meta);
2595 drbd_free_socket(&tconn->data);
2111438b 2596 kfree(tconn->name);
b42a70ad
PR
2597 kfree(tconn->int_dig_in);
2598 kfree(tconn->int_dig_vv);
2111438b
PR
2599 kfree(tconn);
2600}
2601
774b3055 2602enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr)
b411b363
PR
2603{
2604 struct drbd_conf *mdev;
2605 struct gendisk *disk;
2606 struct request_queue *q;
774b3055 2607 int vnr_got = vnr;
81a5d60e 2608 int minor_got = minor;
8432b314 2609 enum drbd_ret_code err = ERR_NOMEM;
774b3055
PR
2610
2611 mdev = minor_to_mdev(minor);
2612 if (mdev)
2613 return ERR_MINOR_EXISTS;
b411b363
PR
2614
2615 /* GFP_KERNEL, we are outside of all write-out paths */
2616 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
2617 if (!mdev)
774b3055
PR
2618 return ERR_NOMEM;
2619
9dc9fbb3 2620 kref_get(&tconn->kref);
774b3055 2621 mdev->tconn = tconn;
9dc9fbb3 2622
b411b363 2623 mdev->minor = minor;
3b98c0c2 2624 mdev->vnr = vnr;
b411b363
PR
2625
2626 drbd_init_set_defaults(mdev);
2627
2628 q = blk_alloc_queue(GFP_KERNEL);
2629 if (!q)
2630 goto out_no_q;
2631 mdev->rq_queue = q;
2632 q->queuedata = mdev;
b411b363
PR
2633
2634 disk = alloc_disk(1);
2635 if (!disk)
2636 goto out_no_disk;
2637 mdev->vdisk = disk;
2638
81e84650 2639 set_disk_ro(disk, true);
b411b363
PR
2640
2641 disk->queue = q;
2642 disk->major = DRBD_MAJOR;
2643 disk->first_minor = minor;
2644 disk->fops = &drbd_ops;
2645 sprintf(disk->disk_name, "drbd%d", minor);
2646 disk->private_data = mdev;
2647
2648 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
2649 /* we have no partitions. we contain only ourselves. */
2650 mdev->this_bdev->bd_contains = mdev->this_bdev;
2651
2652 q->backing_dev_info.congested_fn = drbd_congested;
2653 q->backing_dev_info.congested_data = mdev;
2654
2f58dcfc 2655 blk_queue_make_request(q, drbd_make_request);
99432fcc
PR
 2656	/* Setting the max_hw_sectors to an odd value of 8kibyte here.
2657 This triggers a max_bio_size message upon first attach or connect */
2658 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
b411b363
PR
2659 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
2660 blk_queue_merge_bvec(q, drbd_merge_bvec);
87eeee41 2661 q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */
b411b363
PR
2662
2663 mdev->md_io_page = alloc_page(GFP_KERNEL);
2664 if (!mdev->md_io_page)
2665 goto out_no_io_page;
2666
2667 if (drbd_bm_init(mdev))
2668 goto out_no_bitmap;
dac1389c 2669 mdev->read_requests = RB_ROOT;
de696716 2670 mdev->write_requests = RB_ROOT;
b411b363 2671
b411b363
PR
2672 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
2673 if (!mdev->current_epoch)
2674 goto out_no_epoch;
2675
2676 INIT_LIST_HEAD(&mdev->current_epoch->list);
2677 mdev->epochs = 1;
2678
81a5d60e 2679 if (!idr_pre_get(&minors, GFP_KERNEL))
8432b314
LE
2680 goto out_no_minor_idr;
2681 if (idr_get_new_above(&minors, mdev, minor, &minor_got))
2682 goto out_no_minor_idr;
81a5d60e 2683 if (minor_got != minor) {
8432b314
LE
2684 err = ERR_MINOR_EXISTS;
2685 drbd_msg_put_info("requested minor exists already");
569083c0 2686 goto out_idr_remove_minor;
81a5d60e 2687 }
8432b314
LE
2688
2689 if (!idr_pre_get(&tconn->volumes, GFP_KERNEL))
2690 goto out_idr_remove_minor;
2691 if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got))
2692 goto out_idr_remove_minor;
2693 if (vnr_got != vnr) {
2694 err = ERR_INVALID_REQUEST;
2695 drbd_msg_put_info("requested volume exists already");
2696 goto out_idr_remove_vol;
2697 }
774b3055 2698 add_disk(disk);
81fa2e67 2699	kref_init(&mdev->kref); /* one ref for both idrs and the add_disk */
774b3055 2700
2325eb66
PR
2701 /* inherit the connection state */
2702 mdev->state.conn = tconn->cstate;
2703 if (mdev->state.conn == C_WF_REPORT_PARAMS)
c141ebda 2704 drbd_connected(mdev);
2325eb66 2705
774b3055 2706 return NO_ERROR;
b411b363 2707
569083c0
LE
2708out_idr_remove_vol:
2709 idr_remove(&tconn->volumes, vnr_got);
8432b314
LE
2710out_idr_remove_minor:
2711 idr_remove(&minors, minor_got);
569083c0 2712 synchronize_rcu();
8432b314 2713out_no_minor_idr:
81a5d60e 2714 kfree(mdev->current_epoch);
b411b363 2715out_no_epoch:
b411b363
PR
2716 drbd_bm_cleanup(mdev);
2717out_no_bitmap:
2718 __free_page(mdev->md_io_page);
2719out_no_io_page:
2720 put_disk(disk);
2721out_no_disk:
2722 blk_cleanup_queue(q);
2723out_no_q:
b411b363 2724 kfree(mdev);
9dc9fbb3 2725 kref_put(&tconn->kref, &conn_destroy);
8432b314 2726 return err;
b411b363
PR
2727}
2728
b411b363
PR
2729int __init drbd_init(void)
2730{
2731 int err;
2732
2b8a90b5 2733 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
b411b363 2734 printk(KERN_ERR
81a5d60e 2735 "drbd: invalid minor_count (%d)\n", minor_count);
b411b363
PR
2736#ifdef MODULE
2737 return -EINVAL;
2738#else
46530e85 2739 minor_count = DRBD_MINOR_COUNT_DEF;
b411b363
PR
2740#endif
2741 }
2742
b411b363
PR
2743 err = register_blkdev(DRBD_MAJOR, "drbd");
2744 if (err) {
2745 printk(KERN_ERR
2746 "drbd: unable to register block device major %d\n",
2747 DRBD_MAJOR);
2748 return err;
2749 }
2750
3b98c0c2
LE
2751 err = drbd_genl_register();
2752 if (err) {
2753 printk(KERN_ERR "drbd: unable to register generic netlink family\n");
2754 goto fail;
2755 }
2756
2757
b411b363
PR
2758 register_reboot_notifier(&drbd_notifier);
2759
2760 /*
2761 * allocate all necessary structs
2762 */
2763 err = -ENOMEM;
2764
2765 init_waitqueue_head(&drbd_pp_wait);
2766
2767 drbd_proc = NULL; /* play safe for drbd_cleanup */
81a5d60e 2768 idr_init(&minors);
b411b363
PR
2769
2770 err = drbd_create_mempools();
2771 if (err)
3b98c0c2 2772 goto fail;
b411b363 2773
8c484ee4 2774 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
b411b363
PR
2775 if (!drbd_proc) {
2776 printk(KERN_ERR "drbd: unable to register proc file\n");
3b98c0c2 2777 goto fail;
b411b363
PR
2778 }
2779
2780 rwlock_init(&global_state_lock);
2111438b 2781 INIT_LIST_HEAD(&drbd_tconns);
b411b363
PR
2782
2783 printk(KERN_INFO "drbd: initialized. "
2784 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
2785 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
2786 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
2787 printk(KERN_INFO "drbd: registered as block device major %d\n",
2788 DRBD_MAJOR);
b411b363
PR
2789
2790 return 0; /* Success! */
2791
3b98c0c2 2792fail:
b411b363
PR
2793 drbd_cleanup();
2794 if (err == -ENOMEM)
2795 /* currently always the case */
2796 printk(KERN_ERR "drbd: ran out of memory\n");
2797 else
2798 printk(KERN_ERR "drbd: initialization failure\n");
2799 return err;
2800}
2801
2802void drbd_free_bc(struct drbd_backing_dev *ldev)
2803{
2804 if (ldev == NULL)
2805 return;
2806
e525fd89
TH
2807 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2808 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
b411b363
PR
2809
2810 kfree(ldev);
2811}
2812
360cc740
PR
2813void drbd_free_sock(struct drbd_tconn *tconn)
2814{
2815 if (tconn->data.socket) {
2816 mutex_lock(&tconn->data.mutex);
2817 kernel_sock_shutdown(tconn->data.socket, SHUT_RDWR);
2818 sock_release(tconn->data.socket);
2819 tconn->data.socket = NULL;
2820 mutex_unlock(&tconn->data.mutex);
b411b363 2821 }
360cc740
PR
2822 if (tconn->meta.socket) {
2823 mutex_lock(&tconn->meta.mutex);
2824 kernel_sock_shutdown(tconn->meta.socket, SHUT_RDWR);
2825 sock_release(tconn->meta.socket);
2826 tconn->meta.socket = NULL;
2827 mutex_unlock(&tconn->meta.mutex);
b411b363
PR
2828 }
2829}
2830
b411b363
PR
2831/* meta data management */
2832
2833struct meta_data_on_disk {
2834 u64 la_size; /* last agreed size. */
2835 u64 uuid[UI_SIZE]; /* UUIDs. */
2836 u64 device_uuid;
2837 u64 reserved_u64_1;
2838 u32 flags; /* MDF */
2839 u32 magic;
2840 u32 md_size_sect;
2841 u32 al_offset; /* offset to this block */
2842 u32 al_nr_extents; /* important for restoring the AL */
f399002e 2843 /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
b411b363
PR
2844 u32 bm_offset; /* offset to the bitmap, from here */
2845 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
99432fcc
PR
2846 u32 la_peer_max_bio_size; /* last peer max_bio_size */
2847 u32 reserved_u32[3];
b411b363
PR
2848
2849} __packed;
2850
2851/**
2852 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
2853 * @mdev: DRBD device.
2854 */
2855void drbd_md_sync(struct drbd_conf *mdev)
2856{
2857 struct meta_data_on_disk *buffer;
2858 sector_t sector;
2859 int i;
2860
ee15b038
LE
2861 del_timer(&mdev->md_sync_timer);
2862 /* timer may be rearmed by drbd_md_mark_dirty() now. */
b411b363
PR
2863 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
2864 return;
b411b363
PR
2865
2866 /* We use here D_FAILED and not D_ATTACHING because we try to write
2867 * metadata even if we detach due to a disk failure! */
2868 if (!get_ldev_if_state(mdev, D_FAILED))
2869 return;
2870
b411b363
PR
2871 mutex_lock(&mdev->md_io_mutex);
2872 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
2873 memset(buffer, 0, 512);
2874
2875 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
2876 for (i = UI_CURRENT; i < UI_SIZE; i++)
2877 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
2878 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
2879 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
2880
2881 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
2882 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
2883 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
2884 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
2885 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
2886
2887 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
99432fcc 2888 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
b411b363
PR
2889
2890 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
2891 sector = mdev->ldev->md.md_offset;
2892
3fbf4d21 2893 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
b411b363
PR
2894 /* this was a try anyways ... */
2895 dev_err(DEV, "meta data update failed!\n");
81e84650 2896 drbd_chk_io_error(mdev, 1, true);
b411b363
PR
2897 }
2898
2899 /* Update mdev->ldev->md.la_size_sect,
2900 * since we updated it on metadata. */
2901 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
2902
2903 mutex_unlock(&mdev->md_io_mutex);
2904 put_ldev(mdev);
2905}
2906
2907/**
2908 * drbd_md_read() - Reads in the meta data super block
2909 * @mdev: DRBD device.
2910 * @bdev: Device from which the meta data should be read in.
2911 *
116676ca 2912 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
b411b363
PR
2913 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
2914 */
2915int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2916{
2917 struct meta_data_on_disk *buffer;
2918 int i, rv = NO_ERROR;
2919
2920 if (!get_ldev_if_state(mdev, D_ATTACHING))
2921 return ERR_IO_MD_DISK;
2922
b411b363
PR
2923 mutex_lock(&mdev->md_io_mutex);
2924 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
2925
3fbf4d21 2926 if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
25985edc 2927 /* NOTE: can't do normal error processing here as this is
b411b363
PR
2928 called BEFORE disk is attached */
2929 dev_err(DEV, "Error while reading metadata.\n");
2930 rv = ERR_IO_MD_DISK;
2931 goto err;
2932 }
2933
e7fad8af 2934 if (buffer->magic != cpu_to_be32(DRBD_MD_MAGIC)) {
b411b363
PR
2935 dev_err(DEV, "Error while reading metadata, magic not found.\n");
2936 rv = ERR_MD_INVALID;
2937 goto err;
2938 }
2939 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
2940 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
2941 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
2942 rv = ERR_MD_INVALID;
2943 goto err;
2944 }
2945 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
2946 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
2947 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
2948 rv = ERR_MD_INVALID;
2949 goto err;
2950 }
2951 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
2952 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
2953 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
2954 rv = ERR_MD_INVALID;
2955 goto err;
2956 }
2957
2958 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
2959 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
2960 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
2961 rv = ERR_MD_INVALID;
2962 goto err;
2963 }
2964
2965 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
2966 for (i = UI_CURRENT; i < UI_SIZE; i++)
2967 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
2968 bdev->md.flags = be32_to_cpu(buffer->flags);
b411b363
PR
2969 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
2970
87eeee41 2971 spin_lock_irq(&mdev->tconn->req_lock);
99432fcc
PR
2972 if (mdev->state.conn < C_CONNECTED) {
2973 int peer;
2974 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
2975 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
2976 mdev->peer_max_bio_size = peer;
2977 }
87eeee41 2978 spin_unlock_irq(&mdev->tconn->req_lock);
99432fcc 2979
daeda1cc
PR
 2980	/* This block wants to be removed... */
2981 bdev->disk_conf->al_extents = be32_to_cpu(buffer->al_nr_extents);
2982 if (bdev->disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
2983 bdev->disk_conf->al_extents = DRBD_AL_EXTENTS_DEF;
b411b363
PR
2984
2985 err:
2986 mutex_unlock(&mdev->md_io_mutex);
2987 put_ldev(mdev);
2988
2989 return rv;
2990}
2991
2992/**
2993 * drbd_md_mark_dirty() - Mark meta data super block as dirty
2994 * @mdev: DRBD device.
2995 *
2996 * Call this function if you change anything that should be written to
2997 * the meta-data super block. This function sets MD_DIRTY, and starts a
 2998 * timer that ensures drbd_md_sync() gets called within five seconds.
2999 */
ca0e6098 3000#ifdef DEBUG
ee15b038
LE
3001void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3002{
3003 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3004 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3005 mdev->last_md_mark_dirty.line = line;
3006 mdev->last_md_mark_dirty.func = func;
3007 }
3008}
3009#else
b411b363
PR
3010void drbd_md_mark_dirty(struct drbd_conf *mdev)
3011{
ee15b038 3012 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
ca0e6098 3013 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
b411b363 3014}
ee15b038 3015#endif
b411b363
PR
3016
3017static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3018{
3019 int i;
3020
62b0da3a 3021 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
b411b363 3022 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
b411b363
PR
3023}
3024
3025void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3026{
3027 if (idx == UI_CURRENT) {
3028 if (mdev->state.role == R_PRIMARY)
3029 val |= 1;
3030 else
3031 val &= ~((u64)1);
3032
3033 drbd_set_ed_uuid(mdev, val);
3034 }
3035
3036 mdev->ldev->md.uuid[idx] = val;
b411b363
PR
3037 drbd_md_mark_dirty(mdev);
3038}
3039
3040
3041void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3042{
3043 if (mdev->ldev->md.uuid[idx]) {
3044 drbd_uuid_move_history(mdev);
3045 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
b411b363
PR
3046 }
3047 _drbd_uuid_set(mdev, idx, val);
3048}
3049
3050/**
3051 * drbd_uuid_new_current() - Creates a new current UUID
3052 * @mdev: DRBD device.
3053 *
3054 * Creates a new current UUID, and rotates the old current UUID into
3055 * the bitmap slot. Causes an incremental resync upon next connect.
3056 */
3057void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3058{
3059 u64 val;
62b0da3a
LE
3060 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3061
3062 if (bm_uuid)
3063 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 3064
b411b363 3065 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
b411b363
PR
3066
3067 get_random_bytes(&val, sizeof(u64));
3068 _drbd_uuid_set(mdev, UI_CURRENT, val);
62b0da3a 3069 drbd_print_uuids(mdev, "new current UUID");
aaa8e2b3
LE
3070 /* get it to stable storage _now_ */
3071 drbd_md_sync(mdev);
b411b363
PR
3072}
3073
3074void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3075{
3076 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3077 return;
3078
3079 if (val == 0) {
3080 drbd_uuid_move_history(mdev);
3081 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3082 mdev->ldev->md.uuid[UI_BITMAP] = 0;
b411b363 3083 } else {
62b0da3a
LE
3084 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3085 if (bm_uuid)
3086 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
b411b363 3087
62b0da3a 3088 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
b411b363
PR
3089 }
3090 drbd_md_mark_dirty(mdev);
3091}
3092
3093/**
3094 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3095 * @mdev: DRBD device.
3096 *
3097 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3098 */
3099int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3100{
3101 int rv = -EIO;
3102
3103 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3104 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3105 drbd_md_sync(mdev);
3106 drbd_bm_set_all(mdev);
3107
3108 rv = drbd_bm_write(mdev);
3109
3110 if (!rv) {
3111 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3112 drbd_md_sync(mdev);
3113 }
3114
3115 put_ldev(mdev);
3116 }
3117
3118 return rv;
3119}
3120
3121/**
3122 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3123 * @mdev: DRBD device.
3124 *
3125 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3126 */
3127int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3128{
3129 int rv = -EIO;
3130
0778286a 3131 drbd_resume_al(mdev);
b411b363
PR
3132 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3133 drbd_bm_clear_all(mdev);
3134 rv = drbd_bm_write(mdev);
3135 put_ldev(mdev);
3136 }
3137
3138 return rv;
3139}
3140
99920dc5 3141static int w_bitmap_io(struct drbd_work *w, int unused)
b411b363
PR
3142{
3143 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
00d56944 3144 struct drbd_conf *mdev = w->mdev;
02851e9f 3145 int rv = -EIO;
b411b363
PR
3146
3147 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3148
02851e9f 3149 if (get_ldev(mdev)) {
20ceb2b2 3150 drbd_bm_lock(mdev, work->why, work->flags);
02851e9f
LE
3151 rv = work->io_fn(mdev);
3152 drbd_bm_unlock(mdev);
3153 put_ldev(mdev);
3154 }
b411b363 3155
4738fa16 3156 clear_bit_unlock(BITMAP_IO, &mdev->flags);
b411b363
PR
3157 wake_up(&mdev->misc_wait);
3158
3159 if (work->done)
3160 work->done(mdev, rv);
3161
3162 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3163 work->why = NULL;
20ceb2b2 3164 work->flags = 0;
b411b363 3165
99920dc5 3166 return 0;
b411b363
PR
3167}
3168
82f59cc6
LE
3169void drbd_ldev_destroy(struct drbd_conf *mdev)
3170{
3171 lc_destroy(mdev->resync);
3172 mdev->resync = NULL;
3173 lc_destroy(mdev->act_log);
3174 mdev->act_log = NULL;
3175 __no_warn(local,
3176 drbd_free_bc(mdev->ldev);
3177 mdev->ldev = NULL;);
3178
82f59cc6
LE
3179 clear_bit(GO_DISKLESS, &mdev->flags);
3180}
3181
99920dc5 3182static int w_go_diskless(struct drbd_work *w, int unused)
e9e6f3ec 3183{
00d56944
PR
3184 struct drbd_conf *mdev = w->mdev;
3185
e9e6f3ec 3186 D_ASSERT(mdev->state.disk == D_FAILED);
9d282875
LE
3187 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3188 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
82f59cc6
LE
3189 * the protected members anymore, though, so once put_ldev reaches zero
3190 * again, it will be safe to free them. */
e9e6f3ec 3191 drbd_force_state(mdev, NS(disk, D_DISKLESS));
99920dc5 3192 return 0;
e9e6f3ec
LE
3193}
3194
3195void drbd_go_diskless(struct drbd_conf *mdev)
3196{
3197 D_ASSERT(mdev->state.disk == D_FAILED);
3198 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
e42325a5 3199 drbd_queue_work(&mdev->tconn->data.work, &mdev->go_diskless);
e9e6f3ec
LE
3200}
3201
b411b363
PR
3202/**
3203 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3204 * @mdev: DRBD device.
3205 * @io_fn: IO callback to be called when bitmap IO is possible
3206 * @done: callback to be called after the bitmap IO was performed
3207 * @why: Descriptive text of the reason for doing the IO
3208 *
 3209 * While IO on the bitmap happens we freeze application IO, thus ensuring
 3210 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
3211 * called from worker context. It MUST NOT be used while a previous such
3212 * work is still pending!
3213 */
3214void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3215 int (*io_fn)(struct drbd_conf *),
3216 void (*done)(struct drbd_conf *, int),
20ceb2b2 3217 char *why, enum bm_flag flags)
b411b363 3218{
e6b3ea83 3219 D_ASSERT(current == mdev->tconn->worker.task);
b411b363
PR
3220
3221 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3222 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3223 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3224 if (mdev->bm_io_work.why)
3225 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3226 why, mdev->bm_io_work.why);
3227
3228 mdev->bm_io_work.io_fn = io_fn;
3229 mdev->bm_io_work.done = done;
3230 mdev->bm_io_work.why = why;
20ceb2b2 3231 mdev->bm_io_work.flags = flags;
b411b363 3232
87eeee41 3233 spin_lock_irq(&mdev->tconn->req_lock);
b411b363
PR
3234 set_bit(BITMAP_IO, &mdev->flags);
3235 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
127b3178 3236 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
e42325a5 3237 drbd_queue_work(&mdev->tconn->data.work, &mdev->bm_io_work.w);
b411b363 3238 }
87eeee41 3239 spin_unlock_irq(&mdev->tconn->req_lock);
b411b363
PR
3240}
3241
3242/**
3243 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3244 * @mdev: DRBD device.
3245 * @io_fn: IO callback to be called when bitmap IO is possible
3246 * @why: Descriptive text of the reason for doing the IO
3247 *
 3248 * Freezes application IO while the actual IO operation runs. This
 3249 * function MAY NOT be called from worker context.
3250 */
20ceb2b2
LE
3251int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
3252 char *why, enum bm_flag flags)
b411b363
PR
3253{
3254 int rv;
3255
e6b3ea83 3256 D_ASSERT(current != mdev->tconn->worker.task);
b411b363 3257
20ceb2b2
LE
3258 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
3259 drbd_suspend_io(mdev);
b411b363 3260
20ceb2b2 3261 drbd_bm_lock(mdev, why, flags);
b411b363
PR
3262 rv = io_fn(mdev);
3263 drbd_bm_unlock(mdev);
3264
20ceb2b2
LE
3265 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
3266 drbd_resume_io(mdev);
b411b363
PR
3267
3268 return rv;
3269}
3270
3271void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3272{
3273 if ((mdev->ldev->md.flags & flag) != flag) {
3274 drbd_md_mark_dirty(mdev);
3275 mdev->ldev->md.flags |= flag;
3276 }
3277}
3278
3279void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3280{
3281 if ((mdev->ldev->md.flags & flag) != 0) {
3282 drbd_md_mark_dirty(mdev);
3283 mdev->ldev->md.flags &= ~flag;
3284 }
3285}
3286int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3287{
3288 return (bdev->md.flags & flag) != 0;
3289}
3290
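/* Armed by drbd_md_mark_dirty(); the meta data write itself must run in
 * worker context, so the timer only queues w_md_sync(). */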
3291static void md_sync_timer_fn(unsigned long data)
3292{
3293 struct drbd_conf *mdev = (struct drbd_conf *) data;
3294
e42325a5 3295 drbd_queue_work_front(&mdev->tconn->data.work, &mdev->md_sync_work);
b411b363
PR
3296}
3297
99920dc5 3298static int w_md_sync(struct drbd_work *w, int unused)
b411b363 3299{
00d56944
PR
3300 struct drbd_conf *mdev = w->mdev;
3301
b411b363 3302 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
ee15b038
LE
3303#ifdef DEBUG
3304 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
3305 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
3306#endif
b411b363 3307 drbd_md_sync(mdev);
99920dc5 3308 return 0;
b411b363
PR
3309}
3310
d8763023 3311const char *cmdname(enum drbd_packet cmd)
f2ad9063
AG
3312{
3313 /* THINK may need to become several global tables
3314 * when we want to support more than
3315 * one PRO_VERSION */
3316 static const char *cmdnames[] = {
3317 [P_DATA] = "Data",
3318 [P_DATA_REPLY] = "DataReply",
3319 [P_RS_DATA_REPLY] = "RSDataReply",
3320 [P_BARRIER] = "Barrier",
3321 [P_BITMAP] = "ReportBitMap",
3322 [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
3323 [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
3324 [P_UNPLUG_REMOTE] = "UnplugRemote",
3325 [P_DATA_REQUEST] = "DataRequest",
3326 [P_RS_DATA_REQUEST] = "RSDataRequest",
3327 [P_SYNC_PARAM] = "SyncParam",
3328 [P_SYNC_PARAM89] = "SyncParam89",
3329 [P_PROTOCOL] = "ReportProtocol",
3330 [P_UUIDS] = "ReportUUIDs",
3331 [P_SIZES] = "ReportSizes",
3332 [P_STATE] = "ReportState",
3333 [P_SYNC_UUID] = "ReportSyncUUID",
3334 [P_AUTH_CHALLENGE] = "AuthChallenge",
3335 [P_AUTH_RESPONSE] = "AuthResponse",
3336 [P_PING] = "Ping",
3337 [P_PING_ACK] = "PingAck",
3338 [P_RECV_ACK] = "RecvAck",
3339 [P_WRITE_ACK] = "WriteAck",
3340 [P_RS_WRITE_ACK] = "RSWriteAck",
7be8da07 3341 [P_DISCARD_WRITE] = "DiscardWrite",
f2ad9063
AG
3342 [P_NEG_ACK] = "NegAck",
3343 [P_NEG_DREPLY] = "NegDReply",
3344 [P_NEG_RS_DREPLY] = "NegRSDReply",
3345 [P_BARRIER_ACK] = "BarrierAck",
3346 [P_STATE_CHG_REQ] = "StateChgRequest",
3347 [P_STATE_CHG_REPLY] = "StateChgReply",
3348 [P_OV_REQUEST] = "OVRequest",
3349 [P_OV_REPLY] = "OVReply",
3350 [P_OV_RESULT] = "OVResult",
3351 [P_CSUM_RS_REQUEST] = "CsumRSRequest",
3352 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
3353 [P_COMPRESSED_BITMAP] = "CBitmap",
3354 [P_DELAY_PROBE] = "DelayProbe",
3355 [P_OUT_OF_SYNC] = "OutOfSync",
7be8da07 3356 [P_RETRY_WRITE] = "RetryWrite",
ae25b336
LE
3357 [P_RS_CANCEL] = "RSCancel",
3358 [P_CONN_ST_CHG_REQ] = "conn_st_chg_req",
3359 [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply",
036b17ea
PR
3361 [P_PROTOCOL_UPDATE] = "protocol_update",
ae25b336
LE
3362
3363 /* enum drbd_packet, but not commands - obsoleted flags:
3364 * P_MAY_IGNORE
3365 * P_MAX_OPT_CMD
3366 */
f2ad9063
AG
3367 };
3368
ae25b336 3369 /* too big for the array: 0xfffX */
e5d6f33a
AG
3370 if (cmd == P_INITIAL_META)
3371 return "InitialMeta";
3372 if (cmd == P_INITIAL_DATA)
3373 return "InitialData";
6038178e
AG
3374 if (cmd == P_CONNECTION_FEATURES)
3375 return "ConnectionFeatures";
6e849ce8 3376 if (cmd >= ARRAY_SIZE(cmdnames))
f2ad9063
AG
3377 return "Unknown";
3378 return cmdnames[cmd];
3379}
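/* Illustrative sketch (editor's example, not part of this file): the
 * receiver paths use cmdname() to turn a raw packet code into a readable
 * name when logging packets they did not expect.  conn_err() and enum
 * drbd_packet come from the surrounding drbd code; the exact message
 * wording below is an assumption. */
static void example_complain_about_packet(struct drbd_tconn *tconn,
					  enum drbd_packet cmd)
{
	conn_err(tconn, "unexpected packet type %s (0x%04x)\n",
		 cmdname(cmd), cmd);
}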
3380
7be8da07
AG
3381/**
3382 * drbd_wait_misc - wait for a request to make progress
3383 * @mdev: device associated with the request
3384 * @i: the struct drbd_interval embedded in struct drbd_request or
3385 * struct drbd_peer_request
3386 */
3387int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i)
3388{
44ed167d 3389 struct net_conf *nc;
7be8da07
AG
3390 DEFINE_WAIT(wait);
3391 long timeout;
3392
44ed167d
PR
3393 rcu_read_lock();
3394 nc = rcu_dereference(mdev->tconn->net_conf);
3395 if (!nc) {
3396 rcu_read_unlock();
7be8da07 3397 return -ETIMEDOUT;
44ed167d
PR
3398 }
3399 timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT;
3400 rcu_read_unlock();
7be8da07
AG
3401
3402 /* Indicate to wake up mdev->misc_wait on progress. */
3403 i->waiting = true;
3404 prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE);
3405 spin_unlock_irq(&mdev->tconn->req_lock);
3406 timeout = schedule_timeout(timeout);
3407 finish_wait(&mdev->misc_wait, &wait);
3408 spin_lock_irq(&mdev->tconn->req_lock);
3409 if (!timeout || mdev->state.conn < C_CONNECTED)
3410 return -ETIMEDOUT;
3411 if (signal_pending(current))
3412 return -ERESTARTSYS;
3413 return 0;
3414}
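/* Illustrative sketch (editor's example, not part of this file): callers
 * hold tconn->req_lock around drbd_wait_misc(); as the code above shows,
 * the function drops that lock while it sleeps and re-acquires it before
 * returning 0, -ETIMEDOUT or -ERESTARTSYS.  The wrapper below only
 * demonstrates that locking contract. */
static int example_wait_for_interval(struct drbd_conf *mdev,
				     struct drbd_interval *i)
{
	int err;

	spin_lock_irq(&mdev->tconn->req_lock);
	err = drbd_wait_misc(mdev, i);	/* may sleep; req_lock is held again here */
	spin_unlock_irq(&mdev->tconn->req_lock);

	return err;
}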
3415
b411b363
PR
3416#ifdef CONFIG_DRBD_FAULT_INJECTION
3417/* Fault insertion support including random number generator shamelessly
3418 * stolen from kernel/rcutorture.c */
3419struct fault_random_state {
3420 unsigned long state;
3421 unsigned long count;
3422};
3423
3424#define FAULT_RANDOM_MULT 39916801 /* prime */
3425#define FAULT_RANDOM_ADD 479001701 /* prime */
3426#define FAULT_RANDOM_REFRESH 10000
3427
3428/*
3429 * Crude but fast random-number generator. Uses a linear congruential
3430 * generator, with occasional help from get_random_bytes().
3431 */
3432static unsigned long
3433_drbd_fault_random(struct fault_random_state *rsp)
3434{
3435 long refresh;
3436
49829ea7 3437 if (!rsp->count--) {
b411b363
PR
3438 get_random_bytes(&refresh, sizeof(refresh));
3439 rsp->state += refresh;
3440 rsp->count = FAULT_RANDOM_REFRESH;
3441 }
3442 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3443 return swahw32(rsp->state);
3444}
3445
3446static char *
3447_drbd_fault_str(unsigned int type) {
3448 static char *_faults[] = {
3449 [DRBD_FAULT_MD_WR] = "Meta-data write",
3450 [DRBD_FAULT_MD_RD] = "Meta-data read",
3451 [DRBD_FAULT_RS_WR] = "Resync write",
3452 [DRBD_FAULT_RS_RD] = "Resync read",
3453 [DRBD_FAULT_DT_WR] = "Data write",
3454 [DRBD_FAULT_DT_RD] = "Data read",
3455 [DRBD_FAULT_DT_RA] = "Data read ahead",
3456 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
6b4388ac
PR
3457 [DRBD_FAULT_AL_EE] = "EE allocation",
3458 [DRBD_FAULT_RECEIVE] = "receive data corruption",
b411b363
PR
3459 };
3460
3461 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3462}
3463
3464unsigned int
3465_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3466{
3467 static struct fault_random_state rrs = {0, 0};
3468
3469 unsigned int ret = (
3470 (fault_devs == 0 ||
3471 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3472 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3473
3474 if (ret) {
3475 fault_count++;
3476
7383506c 3477 if (__ratelimit(&drbd_ratelimit_state))
b411b363
PR
3478 dev_warn(DEV, "***Simulating %s failure\n",
3479 _drbd_fault_str(type));
3480 }
3481
3482 return ret;
3483}
3484#endif
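/* Illustrative sketch (editor's example, not part of this file): call sites
 * are assumed to go through a drbd_insert_fault() wrapper from drbd_int.h
 * that compiles to 0 when CONFIG_DRBD_FAULT_INJECTION is off, so a randomly
 * selected request can be failed instead of submitted.  The bio handling
 * below is a simplified assumption, not a verbatim drbd I/O path. */
static void example_submit_or_fail(struct drbd_conf *mdev, struct bio *bio)
{
	if (drbd_insert_fault(mdev, DRBD_FAULT_DT_WR))
		bio_endio(bio, -EIO);		/* pretend the data write failed */
	else
		generic_make_request(bio);
}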
3485
3486const char *drbd_buildtag(void)
3487{
3488 /* A DRBD driver built from external sources carries a reference to
3489 the git hash of its source code here. */
3490
3491 static char buildtag[38] = "\0uilt-in";
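	/* the leading NUL makes this read as an empty string until it is
	 * either overwritten with the srcversion below or, for a driver built
	 * into the kernel, turned into "built-in" by storing 'b' over the NUL */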
3492
3493 if (buildtag[0] == 0) {
3494#ifdef CONFIG_MODULES
3495 if (THIS_MODULE != NULL)
3496 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3497 else
3498#endif
3499 buildtag[0] = 'b';
3500 }
3501
3502 return buildtag;
3503}
3504
3505module_init(drbd_init)
3506module_exit(drbd_cleanup)
3507
b411b363
PR
3508EXPORT_SYMBOL(drbd_conn_str);
3509EXPORT_SYMBOL(drbd_role_str);
3510EXPORT_SYMBOL(drbd_disk_str);
3511EXPORT_SYMBOL(drbd_set_st_err_str);