staging: lustre: make o2iblnd local functions static
1 /*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26 /*
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2015, Intel Corporation.
31 */
32 /*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lnet/klnds/o2iblnd/o2iblnd.c
37 *
38 * Author: Eric Barton <eric@bartonsoftware.com>
39 */
40
41 #include <asm/div64.h>
42 #include <asm/page.h>
43 #include "o2iblnd.h"
44
45 static lnd_t the_o2iblnd;
46
47 kib_data_t kiblnd_data;
48
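/*
 * Simple software checksum used to guard wire messages: rotate the
 * running 32-bit sum left by one bit and add each byte. Returns 1
 * instead of 0 so that a zero ibm_cksum can still mean "no checksum".
 */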
49 static __u32 kiblnd_cksum(void *ptr, int nob)
50 {
51 char *c = ptr;
52 __u32 sum = 0;
53
54 while (nob-- > 0)
55 sum = ((sum << 1) | (sum >> 31)) + *c++;
56
57 /* ensure I don't return 0 (== no checksum) */
58 return !sum ? 1 : sum;
59 }
60
61 static char *kiblnd_msgtype2str(int type)
62 {
63 switch (type) {
64 case IBLND_MSG_CONNREQ:
65 return "CONNREQ";
66
67 case IBLND_MSG_CONNACK:
68 return "CONNACK";
69
70 case IBLND_MSG_NOOP:
71 return "NOOP";
72
73 case IBLND_MSG_IMMEDIATE:
74 return "IMMEDIATE";
75
76 case IBLND_MSG_PUT_REQ:
77 return "PUT_REQ";
78
79 case IBLND_MSG_PUT_NAK:
80 return "PUT_NAK";
81
82 case IBLND_MSG_PUT_ACK:
83 return "PUT_ACK";
84
85 case IBLND_MSG_PUT_DONE:
86 return "PUT_DONE";
87
88 case IBLND_MSG_GET_REQ:
89 return "GET_REQ";
90
91 case IBLND_MSG_GET_DONE:
92 return "GET_DONE";
93
94 default:
95 return "???";
96 }
97 }
98
99 static int kiblnd_msgtype2size(int type)
100 {
101 const int hdr_size = offsetof(kib_msg_t, ibm_u);
102
103 switch (type) {
104 case IBLND_MSG_CONNREQ:
105 case IBLND_MSG_CONNACK:
106 return hdr_size + sizeof(kib_connparams_t);
107
108 case IBLND_MSG_NOOP:
109 return hdr_size;
110
111 case IBLND_MSG_IMMEDIATE:
112 return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]);
113
114 case IBLND_MSG_PUT_REQ:
115 return hdr_size + sizeof(kib_putreq_msg_t);
116
117 case IBLND_MSG_PUT_ACK:
118 return hdr_size + sizeof(kib_putack_msg_t);
119
120 case IBLND_MSG_GET_REQ:
121 return hdr_size + sizeof(kib_get_msg_t);
122
123 case IBLND_MSG_PUT_NAK:
124 case IBLND_MSG_PUT_DONE:
125 case IBLND_MSG_GET_DONE:
126 return hdr_size + sizeof(kib_completion_msg_t);
127 default:
128 return -1;
129 }
130 }
131
132 static int kiblnd_unpack_rd(kib_msg_t *msg, int flip)
133 {
134 kib_rdma_desc_t *rd;
135 int nob;
136 int n;
137 int i;
138
139 LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ ||
140 msg->ibm_type == IBLND_MSG_PUT_ACK);
141
142 rd = msg->ibm_type == IBLND_MSG_GET_REQ ?
143 &msg->ibm_u.get.ibgm_rd :
144 &msg->ibm_u.putack.ibpam_rd;
145
146 if (flip) {
147 __swab32s(&rd->rd_key);
148 __swab32s(&rd->rd_nfrags);
149 }
150
151 n = rd->rd_nfrags;
152
153 if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) {
154 CERROR("Bad nfrags: %d, should be 0 < n <= %d\n",
155 n, IBLND_MAX_RDMA_FRAGS);
156 return 1;
157 }
158
159 nob = offsetof(kib_msg_t, ibm_u) +
160 kiblnd_rd_msg_size(rd, msg->ibm_type, n);
161
162 if (msg->ibm_nob < nob) {
163 CERROR("Short %s: %d(%d)\n",
164 kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob);
165 return 1;
166 }
167
168 if (!flip)
169 return 0;
170
171 for (i = 0; i < n; i++) {
172 __swab32s(&rd->rd_frags[i].rf_nob);
173 __swab64s(&rd->rd_frags[i].rf_addr);
174 }
175
176 return 0;
177 }
178
179 void kiblnd_pack_msg(lnet_ni_t *ni, kib_msg_t *msg, int version,
180 int credits, lnet_nid_t dstnid, __u64 dststamp)
181 {
182 kib_net_t *net = ni->ni_data;
183
184 /*
185 * CAVEAT EMPTOR! all message fields not set here should have been
186 * initialised previously.
187 */
188 msg->ibm_magic = IBLND_MSG_MAGIC;
189 msg->ibm_version = version;
190 /* ibm_type */
191 msg->ibm_credits = credits;
192 /* ibm_nob */
193 msg->ibm_cksum = 0;
194 msg->ibm_srcnid = ni->ni_nid;
195 msg->ibm_srcstamp = net->ibn_incarnation;
196 msg->ibm_dstnid = dstnid;
197 msg->ibm_dststamp = dststamp;
198
199 if (*kiblnd_tunables.kib_cksum) {
200 /* NB ibm_cksum zero while computing cksum */
201 msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
202 }
203 }
204
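/*
 * Validate an incoming wire message and convert it to host byte order:
 * check magic/version, overall length and checksum (the checksum is
 * verified with ibm_cksum zeroed, before any field is byte-swapped),
 * then byte-swap the header and any type-specific payload as needed.
 */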
205 int kiblnd_unpack_msg(kib_msg_t *msg, int nob)
206 {
207 const int hdr_size = offsetof(kib_msg_t, ibm_u);
208 __u32 msg_cksum;
209 __u16 version;
210 int msg_nob;
211 int flip;
212
213 /* 6 bytes are enough to have received magic + version */
214 if (nob < 6) {
215 CERROR("Short message: %d\n", nob);
216 return -EPROTO;
217 }
218
219 if (msg->ibm_magic == IBLND_MSG_MAGIC) {
220 flip = 0;
221 } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
222 flip = 1;
223 } else {
224 CERROR("Bad magic: %08x\n", msg->ibm_magic);
225 return -EPROTO;
226 }
227
228 version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
229 if (version != IBLND_MSG_VERSION &&
230 version != IBLND_MSG_VERSION_1) {
231 CERROR("Bad version: %x\n", version);
232 return -EPROTO;
233 }
234
235 if (nob < hdr_size) {
236 CERROR("Short message: %d\n", nob);
237 return -EPROTO;
238 }
239
240 msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
241 if (msg_nob > nob) {
242 CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
243 return -EPROTO;
244 }
245
246 /*
247 * checksum must be computed with ibm_cksum zero and BEFORE anything
248 * gets flipped
249 */
250 msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
251 msg->ibm_cksum = 0;
252 if (msg_cksum &&
253 msg_cksum != kiblnd_cksum(msg, msg_nob)) {
254 CERROR("Bad checksum\n");
255 return -EPROTO;
256 }
257
258 msg->ibm_cksum = msg_cksum;
259
260 if (flip) {
261 /* leave magic unflipped as a clue to peer endianness */
262 msg->ibm_version = version;
263 CLASSERT(sizeof(msg->ibm_type) == 1);
264 CLASSERT(sizeof(msg->ibm_credits) == 1);
265 msg->ibm_nob = msg_nob;
266 __swab64s(&msg->ibm_srcnid);
267 __swab64s(&msg->ibm_srcstamp);
268 __swab64s(&msg->ibm_dstnid);
269 __swab64s(&msg->ibm_dststamp);
270 }
271
272 if (msg->ibm_srcnid == LNET_NID_ANY) {
273 CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
274 return -EPROTO;
275 }
276
277 if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
278 CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
279 msg_nob, kiblnd_msgtype2size(msg->ibm_type));
280 return -EPROTO;
281 }
282
283 switch (msg->ibm_type) {
284 default:
285 CERROR("Unknown message type %x\n", msg->ibm_type);
286 return -EPROTO;
287
288 case IBLND_MSG_NOOP:
289 case IBLND_MSG_IMMEDIATE:
290 case IBLND_MSG_PUT_REQ:
291 break;
292
293 case IBLND_MSG_PUT_ACK:
294 case IBLND_MSG_GET_REQ:
295 if (kiblnd_unpack_rd(msg, flip))
296 return -EPROTO;
297 break;
298
299 case IBLND_MSG_PUT_NAK:
300 case IBLND_MSG_PUT_DONE:
301 case IBLND_MSG_GET_DONE:
302 if (flip)
303 __swab32s(&msg->ibm_u.completion.ibcm_status);
304 break;
305
306 case IBLND_MSG_CONNREQ:
307 case IBLND_MSG_CONNACK:
308 if (flip) {
309 __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
310 __swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
311 __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
312 }
313 break;
314 }
315 return 0;
316 }
317
318 int kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
319 {
320 kib_peer_t *peer;
321 kib_net_t *net = ni->ni_data;
322 int cpt = lnet_cpt_of_nid(nid);
323 unsigned long flags;
324
325 LASSERT(net);
326 LASSERT(nid != LNET_NID_ANY);
327
328 LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
329 if (!peer) {
330 CERROR("Cannot allocate peer\n");
331 return -ENOMEM;
332 }
333
334 peer->ibp_ni = ni;
335 peer->ibp_nid = nid;
336 peer->ibp_error = 0;
337 peer->ibp_last_alive = 0;
338 atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */
339
340 INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */
341 INIT_LIST_HEAD(&peer->ibp_conns);
342 INIT_LIST_HEAD(&peer->ibp_tx_queue);
343
344 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
345
346 /* always called with a ref on ni, which prevents ni being shutdown */
347 LASSERT(!net->ibn_shutdown);
348
349 /* npeers only grows with the global lock held */
350 atomic_inc(&net->ibn_npeers);
351
352 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
353
354 *peerp = peer;
355 return 0;
356 }
357
358 void kiblnd_destroy_peer(kib_peer_t *peer)
359 {
360 kib_net_t *net = peer->ibp_ni->ni_data;
361
362 LASSERT(net);
363 LASSERT(!atomic_read(&peer->ibp_refcount));
364 LASSERT(!kiblnd_peer_active(peer));
365 LASSERT(!peer->ibp_connecting);
366 LASSERT(!peer->ibp_accepting);
367 LASSERT(list_empty(&peer->ibp_conns));
368 LASSERT(list_empty(&peer->ibp_tx_queue));
369
370 LIBCFS_FREE(peer, sizeof(*peer));
371
372 /*
373 * NB a peer's connections keep a reference on their peer until
374 * they are destroyed, so we can be assured that _all_ state to do
375 * with this peer has been cleaned up when its refcount drops to
376 * zero.
377 */
378 atomic_dec(&net->ibn_npeers);
379 }
380
381 kib_peer_t *kiblnd_find_peer_locked(lnet_nid_t nid)
382 {
383 /*
384 * the caller is responsible for accounting the additional reference
385 * that this creates
386 */
387 struct list_head *peer_list = kiblnd_nid2peerlist(nid);
388 struct list_head *tmp;
389 kib_peer_t *peer;
390
391 list_for_each(tmp, peer_list) {
392 peer = list_entry(tmp, kib_peer_t, ibp_list);
393
394 LASSERT(peer->ibp_connecting > 0 || /* creating conns */
395 peer->ibp_accepting > 0 ||
396 !list_empty(&peer->ibp_conns)); /* active conn */
397
398 if (peer->ibp_nid != nid)
399 continue;
400
401 CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
402 peer, libcfs_nid2str(nid),
403 atomic_read(&peer->ibp_refcount),
404 peer->ibp_version);
405 return peer;
406 }
407 return NULL;
408 }
409
410 void kiblnd_unlink_peer_locked(kib_peer_t *peer)
411 {
412 LASSERT(list_empty(&peer->ibp_conns));
413
414 LASSERT(kiblnd_peer_active(peer));
415 list_del_init(&peer->ibp_list);
416 /* lose peerlist's ref */
417 kiblnd_peer_decref(peer);
418 }
419
420 static int kiblnd_get_peer_info(lnet_ni_t *ni, int index,
421 lnet_nid_t *nidp, int *count)
422 {
423 kib_peer_t *peer;
424 struct list_head *ptmp;
425 int i;
426 unsigned long flags;
427
428 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
429
430 for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
431 list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
432 peer = list_entry(ptmp, kib_peer_t, ibp_list);
433 LASSERT(peer->ibp_connecting > 0 ||
434 peer->ibp_accepting > 0 ||
435 !list_empty(&peer->ibp_conns));
436
437 if (peer->ibp_ni != ni)
438 continue;
439
440 if (index-- > 0)
441 continue;
442
443 *nidp = peer->ibp_nid;
444 *count = atomic_read(&peer->ibp_refcount);
445
446 read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
447 flags);
448 return 0;
449 }
450 }
451
452 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
453 return -ENOENT;
454 }
455
456 static void kiblnd_del_peer_locked(kib_peer_t *peer)
457 {
458 struct list_head *ctmp;
459 struct list_head *cnxt;
460 kib_conn_t *conn;
461
462 if (list_empty(&peer->ibp_conns)) {
463 kiblnd_unlink_peer_locked(peer);
464 } else {
465 list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
466 conn = list_entry(ctmp, kib_conn_t, ibc_list);
467
468 kiblnd_close_conn_locked(conn, 0);
469 }
470 /* NB closing peer's last conn unlinked it. */
471 }
472 /*
473 * NB peer now unlinked; might even be freed if the peer table had the
474 * last ref on it.
475 */
476 }
477
478 static int kiblnd_del_peer(lnet_ni_t *ni, lnet_nid_t nid)
479 {
480 LIST_HEAD(zombies);
481 struct list_head *ptmp;
482 struct list_head *pnxt;
483 kib_peer_t *peer;
484 int lo;
485 int hi;
486 int i;
487 unsigned long flags;
488 int rc = -ENOENT;
489
490 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
491
492 if (nid != LNET_NID_ANY) {
493 lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
494 hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
495 } else {
496 lo = 0;
497 hi = kiblnd_data.kib_peer_hash_size - 1;
498 }
499
500 for (i = lo; i <= hi; i++) {
501 list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
502 peer = list_entry(ptmp, kib_peer_t, ibp_list);
503 LASSERT(peer->ibp_connecting > 0 ||
504 peer->ibp_accepting > 0 ||
505 !list_empty(&peer->ibp_conns));
506
507 if (peer->ibp_ni != ni)
508 continue;
509
510 if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
511 continue;
512
513 if (!list_empty(&peer->ibp_tx_queue)) {
514 LASSERT(list_empty(&peer->ibp_conns));
515
516 list_splice_init(&peer->ibp_tx_queue,
517 &zombies);
518 }
519
520 kiblnd_del_peer_locked(peer);
521 rc = 0; /* matched something */
522 }
523 }
524
525 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
526
527 kiblnd_txlist_done(ni, &zombies, -EIO);
528
529 return rc;
530 }
531
532 static kib_conn_t *kiblnd_get_conn_by_idx(lnet_ni_t *ni, int index)
533 {
534 kib_peer_t *peer;
535 struct list_head *ptmp;
536 kib_conn_t *conn;
537 struct list_head *ctmp;
538 int i;
539 unsigned long flags;
540
541 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
542
543 for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
544 list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
545 peer = list_entry(ptmp, kib_peer_t, ibp_list);
546 LASSERT(peer->ibp_connecting > 0 ||
547 peer->ibp_accepting > 0 ||
548 !list_empty(&peer->ibp_conns));
549
550 if (peer->ibp_ni != ni)
551 continue;
552
553 list_for_each(ctmp, &peer->ibp_conns) {
554 if (index-- > 0)
555 continue;
556
557 conn = list_entry(ctmp, kib_conn_t,
558 ibc_list);
559 kiblnd_conn_addref(conn);
560 read_unlock_irqrestore(
561 &kiblnd_data.kib_global_lock,
562 flags);
563 return conn;
564 }
565 }
566 }
567
568 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
569 return NULL;
570 }
571
572 int kiblnd_translate_mtu(int value)
573 {
574 switch (value) {
575 default:
576 return -1;
577 case 0:
578 return 0;
579 case 256:
580 return IB_MTU_256;
581 case 512:
582 return IB_MTU_512;
583 case 1024:
584 return IB_MTU_1024;
585 case 2048:
586 return IB_MTU_2048;
587 case 4096:
588 return IB_MTU_4096;
589 }
590 }
591
592 static void kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
593 {
594 int mtu;
595
596 /* XXX There is no path record for iWARP, set by netdev->change_mtu? */
597 if (!cmid->route.path_rec)
598 return;
599
600 mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
601 LASSERT(mtu >= 0);
602 if (mtu)
603 cmid->route.path_rec->mtu = mtu;
604 }
605
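/*
 * Spread connections across the device's CQ completion vectors: hash
 * the peer NID onto a CPU within this CPT's cpumask and map that CPU
 * to one of the available completion vectors. Falls back to vector 0
 * when the device has a single vector or no cpumask is available.
 */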
606 static int kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
607 {
608 cpumask_t *mask;
609 int vectors;
610 int off;
611 int i;
612 lnet_nid_t nid = conn->ibc_peer->ibp_nid;
613
614 vectors = conn->ibc_cmid->device->num_comp_vectors;
615 if (vectors <= 1)
616 return 0;
617
618 mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
619 if (!mask)
620 return 0;
621
622 /* hash NID to CPU id in this partition... */
623 off = do_div(nid, cpumask_weight(mask));
624 for_each_cpu(i, mask) {
625 if (!off--)
626 return i % vectors;
627 }
628
629 LBUG();
630 return 1;
631 }
632
633 kib_conn_t *kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
634 int state, int version)
635 {
636 /*
637 * CAVEAT EMPTOR:
638 * If the new conn is created successfully it takes over the caller's
639 * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself
640 * is destroyed. On failure, the caller's ref on 'peer' remains and
641 * she must dispose of 'cmid'. (Actually I'd block forever if I tried
642 * to destroy 'cmid' here since I'm called from the CM which still has
643 * its ref on 'cmid').
644 */
645 rwlock_t *glock = &kiblnd_data.kib_global_lock;
646 kib_net_t *net = peer->ibp_ni->ni_data;
647 kib_dev_t *dev;
648 struct ib_qp_init_attr *init_qp_attr;
649 struct kib_sched_info *sched;
650 struct ib_cq_init_attr cq_attr = {};
651 kib_conn_t *conn;
652 struct ib_cq *cq;
653 unsigned long flags;
654 int cpt;
655 int rc;
656 int i;
657
658 LASSERT(net);
659 LASSERT(!in_interrupt());
660
661 dev = net->ibn_dev;
662
663 cpt = lnet_cpt_of_nid(peer->ibp_nid);
664 sched = kiblnd_data.kib_scheds[cpt];
665
666 LASSERT(sched->ibs_nthreads > 0);
667
668 LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
669 sizeof(*init_qp_attr));
670 if (!init_qp_attr) {
671 CERROR("Can't allocate qp_attr for %s\n",
672 libcfs_nid2str(peer->ibp_nid));
673 goto failed_0;
674 }
675
676 LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
677 if (!conn) {
678 CERROR("Can't allocate connection for %s\n",
679 libcfs_nid2str(peer->ibp_nid));
680 goto failed_1;
681 }
682
683 conn->ibc_state = IBLND_CONN_INIT;
684 conn->ibc_version = version;
685 conn->ibc_peer = peer; /* I take the caller's ref */
686 cmid->context = conn; /* for future CM callbacks */
687 conn->ibc_cmid = cmid;
688
689 INIT_LIST_HEAD(&conn->ibc_early_rxs);
690 INIT_LIST_HEAD(&conn->ibc_tx_noops);
691 INIT_LIST_HEAD(&conn->ibc_tx_queue);
692 INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
693 INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
694 INIT_LIST_HEAD(&conn->ibc_active_txs);
695 spin_lock_init(&conn->ibc_lock);
696
697 LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt,
698 sizeof(*conn->ibc_connvars));
699 if (!conn->ibc_connvars) {
700 CERROR("Can't allocate in-progress connection state\n");
701 goto failed_2;
702 }
703
704 write_lock_irqsave(glock, flags);
705 if (dev->ibd_failover) {
706 write_unlock_irqrestore(glock, flags);
707 CERROR("%s: failover in progress\n", dev->ibd_ifname);
708 goto failed_2;
709 }
710
711 if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
712 /* wakeup failover thread and teardown connection */
713 if (kiblnd_dev_can_failover(dev)) {
714 list_add_tail(&dev->ibd_fail_list,
715 &kiblnd_data.kib_failed_devs);
716 wake_up(&kiblnd_data.kib_failover_waitq);
717 }
718
719 write_unlock_irqrestore(glock, flags);
720 CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
721 cmid->device->name, dev->ibd_ifname);
722 goto failed_2;
723 }
724
725 kiblnd_hdev_addref_locked(dev->ibd_hdev);
726 conn->ibc_hdev = dev->ibd_hdev;
727
728 kiblnd_setup_mtu_locked(cmid);
729
730 write_unlock_irqrestore(glock, flags);
731
732 LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt,
733 IBLND_RX_MSGS(version) * sizeof(kib_rx_t));
734 if (!conn->ibc_rxs) {
735 CERROR("Cannot allocate RX buffers\n");
736 goto failed_2;
737 }
738
739 rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
740 IBLND_RX_MSG_PAGES(version));
741 if (rc)
742 goto failed_2;
743
744 kiblnd_map_rx_descs(conn);
745
746 cq_attr.cqe = IBLND_CQ_ENTRIES(version);
747 cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt);
748 cq = ib_create_cq(cmid->device,
749 kiblnd_cq_completion, kiblnd_cq_event, conn,
750 &cq_attr);
751 if (IS_ERR(cq)) {
752 CERROR("Can't create CQ: %ld, cqe: %d\n",
753 PTR_ERR(cq), IBLND_CQ_ENTRIES(version));
754 goto failed_2;
755 }
756
757 conn->ibc_cq = cq;
758
759 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
760 if (rc) {
761 CERROR("Can't request completion notificiation: %d\n", rc);
762 goto failed_2;
763 }
764
765 init_qp_attr->event_handler = kiblnd_qp_event;
766 init_qp_attr->qp_context = conn;
767 init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version);
768 init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(version);
769 init_qp_attr->cap.max_send_sge = 1;
770 init_qp_attr->cap.max_recv_sge = 1;
771 init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
772 init_qp_attr->qp_type = IB_QPT_RC;
773 init_qp_attr->send_cq = cq;
774 init_qp_attr->recv_cq = cq;
775
776 conn->ibc_sched = sched;
777
778 rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
779 if (rc) {
780 CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
781 rc, init_qp_attr->cap.max_send_wr,
782 init_qp_attr->cap.max_recv_wr);
783 goto failed_2;
784 }
785
786 LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
787
788 /* 1 ref for caller and each rxmsg */
789 atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version));
790 conn->ibc_nrx = IBLND_RX_MSGS(version);
791
792 /* post receives */
793 for (i = 0; i < IBLND_RX_MSGS(version); i++) {
794 rc = kiblnd_post_rx(&conn->ibc_rxs[i],
795 IBLND_POSTRX_NO_CREDIT);
796 if (rc) {
797 CERROR("Can't post rxmsg: %d\n", rc);
798
799 /* Make posted receives complete */
800 kiblnd_abort_receives(conn);
801
802 /*
803 * correct # of posted buffers
804 * NB locking needed now I'm racing with completion
805 */
806 spin_lock_irqsave(&sched->ibs_lock, flags);
807 conn->ibc_nrx -= IBLND_RX_MSGS(version) - i;
808 spin_unlock_irqrestore(&sched->ibs_lock, flags);
809
810 /*
811 * cmid will be destroyed by CM(ofed) after cm_callback
812 * returns, so we can't refer to it anymore
813 * (by kiblnd_connd()->kiblnd_destroy_conn)
814 */
815 rdma_destroy_qp(conn->ibc_cmid);
816 conn->ibc_cmid = NULL;
817
818 /* Drop my own and unused rxbuffer refcounts */
819 while (i++ <= IBLND_RX_MSGS(version))
820 kiblnd_conn_decref(conn);
821
822 return NULL;
823 }
824 }
825
826 /* Init successful! */
827 LASSERT(state == IBLND_CONN_ACTIVE_CONNECT ||
828 state == IBLND_CONN_PASSIVE_WAIT);
829 conn->ibc_state = state;
830
831 /* 1 more conn */
832 atomic_inc(&net->ibn_nconns);
833 return conn;
834
835 failed_2:
836 kiblnd_destroy_conn(conn);
837 failed_1:
838 LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
839 failed_0:
840 return NULL;
841 }
842
843 void kiblnd_destroy_conn(kib_conn_t *conn)
844 {
845 struct rdma_cm_id *cmid = conn->ibc_cmid;
846 kib_peer_t *peer = conn->ibc_peer;
847 int rc;
848
849 LASSERT(!in_interrupt());
850 LASSERT(!atomic_read(&conn->ibc_refcount));
851 LASSERT(list_empty(&conn->ibc_early_rxs));
852 LASSERT(list_empty(&conn->ibc_tx_noops));
853 LASSERT(list_empty(&conn->ibc_tx_queue));
854 LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd));
855 LASSERT(list_empty(&conn->ibc_tx_queue_nocred));
856 LASSERT(list_empty(&conn->ibc_active_txs));
857 LASSERT(!conn->ibc_noops_posted);
858 LASSERT(!conn->ibc_nsends_posted);
859
860 switch (conn->ibc_state) {
861 default:
862 /* conn must be completely disengaged from the network */
863 LBUG();
864
865 case IBLND_CONN_DISCONNECTED:
866 /* connvars should have been freed already */
867 LASSERT(!conn->ibc_connvars);
868 break;
869
870 case IBLND_CONN_INIT:
871 break;
872 }
873
874 /* conn->ibc_cmid might be destroyed by CM already */
875 if (cmid && cmid->qp)
876 rdma_destroy_qp(cmid);
877
878 if (conn->ibc_cq) {
879 rc = ib_destroy_cq(conn->ibc_cq);
880 if (rc)
881 CWARN("Error destroying CQ: %d\n", rc);
882 }
883
884 if (conn->ibc_rx_pages)
885 kiblnd_unmap_rx_descs(conn);
886
887 if (conn->ibc_rxs) {
888 LIBCFS_FREE(conn->ibc_rxs,
889 IBLND_RX_MSGS(conn->ibc_version)
890 * sizeof(kib_rx_t));
891 }
892
893 if (conn->ibc_connvars)
894 LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
895
896 if (conn->ibc_hdev)
897 kiblnd_hdev_decref(conn->ibc_hdev);
898
899 /* See CAVEAT EMPTOR above in kiblnd_create_conn */
900 if (conn->ibc_state != IBLND_CONN_INIT) {
901 kib_net_t *net = peer->ibp_ni->ni_data;
902
903 kiblnd_peer_decref(peer);
904 rdma_destroy_id(cmid);
905 atomic_dec(&net->ibn_nconns);
906 }
907
908 LIBCFS_FREE(conn, sizeof(*conn));
909 }
910
911 int kiblnd_close_peer_conns_locked(kib_peer_t *peer, int why)
912 {
913 kib_conn_t *conn;
914 struct list_head *ctmp;
915 struct list_head *cnxt;
916 int count = 0;
917
918 list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
919 conn = list_entry(ctmp, kib_conn_t, ibc_list);
920
921 CDEBUG(D_NET, "Closing conn -> %s, version: %x, reason: %d\n",
922 libcfs_nid2str(peer->ibp_nid),
923 conn->ibc_version, why);
924
925 kiblnd_close_conn_locked(conn, why);
926 count++;
927 }
928
929 return count;
930 }
931
932 int kiblnd_close_stale_conns_locked(kib_peer_t *peer,
933 int version, __u64 incarnation)
934 {
935 kib_conn_t *conn;
936 struct list_head *ctmp;
937 struct list_head *cnxt;
938 int count = 0;
939
940 list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
941 conn = list_entry(ctmp, kib_conn_t, ibc_list);
942
943 if (conn->ibc_version == version &&
944 conn->ibc_incarnation == incarnation)
945 continue;
946
947 CDEBUG(D_NET,
948 "Closing stale conn -> %s version: %x, incarnation:%#llx(%x, %#llx)\n",
949 libcfs_nid2str(peer->ibp_nid),
950 conn->ibc_version, conn->ibc_incarnation,
951 version, incarnation);
952
953 kiblnd_close_conn_locked(conn, -ESTALE);
954 count++;
955 }
956
957 return count;
958 }
959
960 static int kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid)
961 {
962 kib_peer_t *peer;
963 struct list_head *ptmp;
964 struct list_head *pnxt;
965 int lo;
966 int hi;
967 int i;
968 unsigned long flags;
969 int count = 0;
970
971 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
972
973 if (nid != LNET_NID_ANY) {
974 lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
975 hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
976 } else {
977 lo = 0;
978 hi = kiblnd_data.kib_peer_hash_size - 1;
979 }
980
981 for (i = lo; i <= hi; i++) {
982 list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
983 peer = list_entry(ptmp, kib_peer_t, ibp_list);
984 LASSERT(peer->ibp_connecting > 0 ||
985 peer->ibp_accepting > 0 ||
986 !list_empty(&peer->ibp_conns));
987
988 if (peer->ibp_ni != ni)
989 continue;
990
991 if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
992 continue;
993
994 count += kiblnd_close_peer_conns_locked(peer, 0);
995 }
996 }
997
998 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
999
1000 /* wildcards always succeed */
1001 if (nid == LNET_NID_ANY)
1002 return 0;
1003
1004 return !count ? -ENOENT : 0;
1005 }
1006
1007 static int kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
1008 {
1009 struct libcfs_ioctl_data *data = arg;
1010 int rc = -EINVAL;
1011
1012 switch (cmd) {
1013 case IOC_LIBCFS_GET_PEER: {
1014 lnet_nid_t nid = 0;
1015 int count = 0;
1016
1017 rc = kiblnd_get_peer_info(ni, data->ioc_count,
1018 &nid, &count);
1019 data->ioc_nid = nid;
1020 data->ioc_count = count;
1021 break;
1022 }
1023
1024 case IOC_LIBCFS_DEL_PEER: {
1025 rc = kiblnd_del_peer(ni, data->ioc_nid);
1026 break;
1027 }
1028 case IOC_LIBCFS_GET_CONN: {
1029 kib_conn_t *conn;
1030
1031 rc = 0;
1032 conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
1033 if (!conn) {
1034 rc = -ENOENT;
1035 break;
1036 }
1037
1038 LASSERT(conn->ibc_cmid);
1039 data->ioc_nid = conn->ibc_peer->ibp_nid;
1040 if (!conn->ibc_cmid->route.path_rec)
1041 data->ioc_u32[0] = 0; /* iWarp has no path MTU */
1042 else
1043 data->ioc_u32[0] =
1044 ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
1045 kiblnd_conn_decref(conn);
1046 break;
1047 }
1048 case IOC_LIBCFS_CLOSE_CONNECTION: {
1049 rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
1050 break;
1051 }
1052
1053 default:
1054 break;
1055 }
1056
1057 return rc;
1058 }
1059
1060 static void kiblnd_query(lnet_ni_t *ni, lnet_nid_t nid, unsigned long *when)
1061 {
1062 unsigned long last_alive = 0;
1063 unsigned long now = cfs_time_current();
1064 rwlock_t *glock = &kiblnd_data.kib_global_lock;
1065 kib_peer_t *peer;
1066 unsigned long flags;
1067
1068 read_lock_irqsave(glock, flags);
1069
1070 peer = kiblnd_find_peer_locked(nid);
1071 if (peer) {
1072 LASSERT(peer->ibp_connecting > 0 || /* creating conns */
1073 peer->ibp_accepting > 0 ||
1074 !list_empty(&peer->ibp_conns)); /* active conn */
1075 last_alive = peer->ibp_last_alive;
1076 }
1077
1078 read_unlock_irqrestore(glock, flags);
1079
1080 if (last_alive)
1081 *when = last_alive;
1082
1083 /*
1084 * peer is not persistent in hash, trigger peer creation
1085 * and connection establishment with a NULL tx
1086 */
1087 if (!peer)
1088 kiblnd_launch_tx(ni, NULL, nid);
1089
1090 CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
1091 libcfs_nid2str(nid), peer,
1092 last_alive ? cfs_duration_sec(now - last_alive) : -1);
1093 }
1094
1095 static void kiblnd_free_pages(kib_pages_t *p)
1096 {
1097 int npages = p->ibp_npages;
1098 int i;
1099
1100 for (i = 0; i < npages; i++) {
1101 if (p->ibp_pages[i])
1102 __free_page(p->ibp_pages[i]);
1103 }
1104
1105 LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages]));
1106 }
1107
1108 int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages)
1109 {
1110 kib_pages_t *p;
1111 int i;
1112
1113 LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt,
1114 offsetof(kib_pages_t, ibp_pages[npages]));
1115 if (!p) {
1116 CERROR("Can't allocate descriptor for %d pages\n", npages);
1117 return -ENOMEM;
1118 }
1119
1120 memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1121 p->ibp_npages = npages;
1122
1123 for (i = 0; i < npages; i++) {
1124 p->ibp_pages[i] = alloc_pages_node(
1125 cfs_cpt_spread_node(lnet_cpt_table(), cpt),
1126 GFP_NOFS, 0);
1127 if (!p->ibp_pages[i]) {
1128 CERROR("Can't allocate page %d of %d\n", i, npages);
1129 kiblnd_free_pages(p);
1130 return -ENOMEM;
1131 }
1132 }
1133
1134 *pp = p;
1135 return 0;
1136 }
1137
1138 void kiblnd_unmap_rx_descs(kib_conn_t *conn)
1139 {
1140 kib_rx_t *rx;
1141 int i;
1142
1143 LASSERT(conn->ibc_rxs);
1144 LASSERT(conn->ibc_hdev);
1145
1146 for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) {
1147 rx = &conn->ibc_rxs[i];
1148
1149 LASSERT(rx->rx_nob >= 0); /* not posted */
1150
1151 kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
1152 KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
1153 rx->rx_msgaddr),
1154 IBLND_MSG_SIZE, DMA_FROM_DEVICE);
1155 }
1156
1157 kiblnd_free_pages(conn->ibc_rx_pages);
1158
1159 conn->ibc_rx_pages = NULL;
1160 }
1161
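/*
 * Lay the connection's RX message buffers out across the pre-allocated
 * pages (IBLND_MSG_SIZE each) and DMA-map every buffer for the HCA.
 */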
1162 void kiblnd_map_rx_descs(kib_conn_t *conn)
1163 {
1164 kib_rx_t *rx;
1165 struct page *pg;
1166 int pg_off;
1167 int ipg;
1168 int i;
1169
1170 for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) {
1171 pg = conn->ibc_rx_pages->ibp_pages[ipg];
1172 rx = &conn->ibc_rxs[i];
1173
1174 rx->rx_conn = conn;
1175 rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off);
1176
1177 rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
1178 rx->rx_msg,
1179 IBLND_MSG_SIZE,
1180 DMA_FROM_DEVICE);
1181 LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
1182 rx->rx_msgaddr));
1183 KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);
1184
1185 CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n",
1186 i, rx->rx_msg, rx->rx_msgaddr,
1187 (__u64)(page_to_phys(pg) + pg_off));
1188
1189 pg_off += IBLND_MSG_SIZE;
1190 LASSERT(pg_off <= PAGE_SIZE);
1191
1192 if (pg_off == PAGE_SIZE) {
1193 pg_off = 0;
1194 ipg++;
1195 LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version));
1196 }
1197 }
1198 }
1199
1200 static void kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo)
1201 {
1202 kib_hca_dev_t *hdev = tpo->tpo_hdev;
1203 kib_tx_t *tx;
1204 int i;
1205
1206 LASSERT(!tpo->tpo_pool.po_allocated);
1207
1208 if (!hdev)
1209 return;
1210
1211 for (i = 0; i < tpo->tpo_pool.po_size; i++) {
1212 tx = &tpo->tpo_tx_descs[i];
1213 kiblnd_dma_unmap_single(hdev->ibh_ibdev,
1214 KIBLND_UNMAP_ADDR(tx, tx_msgunmap,
1215 tx->tx_msgaddr),
1216 IBLND_MSG_SIZE, DMA_TO_DEVICE);
1217 }
1218
1219 kiblnd_hdev_decref(hdev);
1220 tpo->tpo_hdev = NULL;
1221 }
1222
1223 static kib_hca_dev_t *kiblnd_current_hdev(kib_dev_t *dev)
1224 {
1225 kib_hca_dev_t *hdev;
1226 unsigned long flags;
1227 int i = 0;
1228
1229 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1230 while (dev->ibd_failover) {
1231 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1232 if (!(i++ % 50))
1233 CDEBUG(D_NET, "%s: Wait for failover\n",
1234 dev->ibd_ifname);
1235 schedule_timeout(cfs_time_seconds(1) / 100);
1236
1237 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1238 }
1239
1240 kiblnd_hdev_addref_locked(dev->ibd_hdev);
1241 hdev = dev->ibd_hdev;
1242
1243 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1244
1245 return hdev;
1246 }
1247
1248 static void kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
1249 {
1250 kib_pages_t *txpgs = tpo->tpo_tx_pages;
1251 kib_pool_t *pool = &tpo->tpo_pool;
1252 kib_net_t *net = pool->po_owner->ps_net;
1253 kib_dev_t *dev;
1254 struct page *page;
1255 kib_tx_t *tx;
1256 int page_offset;
1257 int ipage;
1258 int i;
1259
1260 LASSERT(net);
1261
1262 dev = net->ibn_dev;
1263
1264 /* pre-mapped messages are not bigger than 1 page */
1265 CLASSERT(IBLND_MSG_SIZE <= PAGE_SIZE);
1266
1267 /* No fancy arithmetic when we do the buffer calculations */
1268 CLASSERT(!(PAGE_SIZE % IBLND_MSG_SIZE));
1269
1270 tpo->tpo_hdev = kiblnd_current_hdev(dev);
1271
1272 for (ipage = page_offset = i = 0; i < pool->po_size; i++) {
1273 page = txpgs->ibp_pages[ipage];
1274 tx = &tpo->tpo_tx_descs[i];
1275
1276 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
1277 page_offset);
1278
1279 tx->tx_msgaddr = kiblnd_dma_map_single(
1280 tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
1281 IBLND_MSG_SIZE, DMA_TO_DEVICE);
1282 LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
1283 tx->tx_msgaddr));
1284 KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);
1285
1286 list_add(&tx->tx_list, &pool->po_free_list);
1287
1288 page_offset += IBLND_MSG_SIZE;
1289 LASSERT(page_offset <= PAGE_SIZE);
1290
1291 if (page_offset == PAGE_SIZE) {
1292 page_offset = 0;
1293 ipage++;
1294 LASSERT(ipage <= txpgs->ibp_npages);
1295 }
1296 }
1297 }
1298
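/*
 * Look up the pre-registered DMA MR covering [addr, addr + size): with a
 * single MR it is returned directly; otherwise the address is shifted by
 * ibh_mr_shift to index the MR array, and NULL is returned if the region
 * spans more than one MR.
 */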
1299 struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev, __u64 addr, __u64 size)
1300 {
1301 __u64 index;
1302
1303 LASSERT(hdev->ibh_mrs[0]);
1304
1305 if (hdev->ibh_nmrs == 1)
1306 return hdev->ibh_mrs[0];
1307
1308 index = addr >> hdev->ibh_mr_shift;
1309
1310 if (index < hdev->ibh_nmrs &&
1311 index == ((addr + size - 1) >> hdev->ibh_mr_shift))
1312 return hdev->ibh_mrs[index];
1313
1314 return NULL;
1315 }
1316
1317 struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd)
1318 {
1319 struct ib_mr *prev_mr;
1320 struct ib_mr *mr;
1321 int i;
1322
1323 LASSERT(hdev->ibh_mrs[0]);
1324
1325 if (*kiblnd_tunables.kib_map_on_demand > 0 &&
1326 *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags)
1327 return NULL;
1328
1329 if (hdev->ibh_nmrs == 1)
1330 return hdev->ibh_mrs[0];
1331
1332 for (i = 0, mr = prev_mr = NULL;
1333 i < rd->rd_nfrags; i++) {
1334 mr = kiblnd_find_dma_mr(hdev,
1335 rd->rd_frags[i].rf_addr,
1336 rd->rd_frags[i].rf_nob);
1337 if (!prev_mr)
1338 prev_mr = mr;
1339
1340 if (!mr || prev_mr != mr) {
1341 /* Can't be covered by a single MR */
1342 mr = NULL;
1343 break;
1344 }
1345 }
1346
1347 return mr;
1348 }
1349
1350 static void kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool)
1351 {
1352 LASSERT(!pool->fpo_map_count);
1353
1354 if (pool->fpo_fmr_pool)
1355 ib_destroy_fmr_pool(pool->fpo_fmr_pool);
1356
1357 if (pool->fpo_hdev)
1358 kiblnd_hdev_decref(pool->fpo_hdev);
1359
1360 LIBCFS_FREE(pool, sizeof(*pool));
1361 }
1362
1363 static void kiblnd_destroy_fmr_pool_list(struct list_head *head)
1364 {
1365 kib_fmr_pool_t *pool;
1366
1367 while (!list_empty(head)) {
1368 pool = list_entry(head->next, kib_fmr_pool_t, fpo_list);
1369 list_del(&pool->fpo_list);
1370 kiblnd_destroy_fmr_pool(pool);
1371 }
1372 }
1373
1374 static int kiblnd_fmr_pool_size(int ncpts)
1375 {
1376 int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts;
1377
1378 return max(IBLND_FMR_POOL, size);
1379 }
1380
1381 static int kiblnd_fmr_flush_trigger(int ncpts)
1382 {
1383 int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts;
1384
1385 return max(IBLND_FMR_POOL_FLUSH, size);
1386 }
1387
1388 static int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps,
1389 kib_fmr_pool_t **pp_fpo)
1390 {
1391 /* FMR pool for RDMA */
1392 kib_dev_t *dev = fps->fps_net->ibn_dev;
1393 kib_fmr_pool_t *fpo;
1394 struct ib_fmr_pool_param param = {
1395 .max_pages_per_fmr = LNET_MAX_PAYLOAD / PAGE_SIZE,
1396 .page_shift = PAGE_SHIFT,
1397 .access = (IB_ACCESS_LOCAL_WRITE |
1398 IB_ACCESS_REMOTE_WRITE),
1399 .pool_size = fps->fps_pool_size,
1400 .dirty_watermark = fps->fps_flush_trigger,
1401 .flush_function = NULL,
1402 .flush_arg = NULL,
1403 .cache = !!*kiblnd_tunables.kib_fmr_cache};
1404 int rc;
1405
1406 LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
1407 if (!fpo)
1408 return -ENOMEM;
1409
1410 fpo->fpo_hdev = kiblnd_current_hdev(dev);
1411
1412 fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, &param);
1413 if (IS_ERR(fpo->fpo_fmr_pool)) {
1414 rc = PTR_ERR(fpo->fpo_fmr_pool);
1415 CERROR("Failed to create FMR pool: %d\n", rc);
1416
1417 kiblnd_hdev_decref(fpo->fpo_hdev);
1418 LIBCFS_FREE(fpo, sizeof(*fpo));
1419 return rc;
1420 }
1421
1422 fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1423 fpo->fpo_owner = fps;
1424 *pp_fpo = fpo;
1425
1426 return 0;
1427 }
1428
1429 static void kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps,
1430 struct list_head *zombies)
1431 {
1432 if (!fps->fps_net) /* initialized? */
1433 return;
1434
1435 spin_lock(&fps->fps_lock);
1436
1437 while (!list_empty(&fps->fps_pool_list)) {
1438 kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next,
1439 kib_fmr_pool_t, fpo_list);
1440 fpo->fpo_failed = 1;
1441 list_del(&fpo->fpo_list);
1442 if (!fpo->fpo_map_count)
1443 list_add(&fpo->fpo_list, zombies);
1444 else
1445 list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
1446 }
1447
1448 spin_unlock(&fps->fps_lock);
1449 }
1450
1451 static void kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps)
1452 {
1453 if (fps->fps_net) { /* initialized? */
1454 kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
1455 kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
1456 }
1457 }
1458
1459 static int kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt,
1460 kib_net_t *net, int pool_size,
1461 int flush_trigger)
1462 {
1463 kib_fmr_pool_t *fpo;
1464 int rc;
1465
1466 memset(fps, 0, sizeof(*fps));
1467
1468 fps->fps_net = net;
1469 fps->fps_cpt = cpt;
1470 fps->fps_pool_size = pool_size;
1471 fps->fps_flush_trigger = flush_trigger;
1472 spin_lock_init(&fps->fps_lock);
1473 INIT_LIST_HEAD(&fps->fps_pool_list);
1474 INIT_LIST_HEAD(&fps->fps_failed_pool_list);
1475
1476 rc = kiblnd_create_fmr_pool(fps, &fpo);
1477 if (!rc)
1478 list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
1479
1480 return rc;
1481 }
1482
1483 static int kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, unsigned long now)
1484 {
1485 if (fpo->fpo_map_count) /* still in use */
1486 return 0;
1487 if (fpo->fpo_failed)
1488 return 1;
1489 return cfs_time_aftereq(now, fpo->fpo_deadline);
1490 }
1491
1492 void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
1493 {
1494 LIST_HEAD(zombies);
1495 kib_fmr_pool_t *fpo = fmr->fmr_pool;
1496 kib_fmr_poolset_t *fps = fpo->fpo_owner;
1497 unsigned long now = cfs_time_current();
1498 kib_fmr_pool_t *tmp;
1499 int rc;
1500
1501 rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
1502 LASSERT(!rc);
1503
1504 if (status) {
1505 rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool);
1506 LASSERT(!rc);
1507 }
1508
1509 fmr->fmr_pool = NULL;
1510 fmr->fmr_pfmr = NULL;
1511
1512 spin_lock(&fps->fps_lock);
1513 fpo->fpo_map_count--; /* decref the pool */
1514
1515 list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) {
1516 /* the first pool is persistent */
1517 if (fps->fps_pool_list.next == &fpo->fpo_list)
1518 continue;
1519
1520 if (kiblnd_fmr_pool_is_idle(fpo, now)) {
1521 list_move(&fpo->fpo_list, &zombies);
1522 fps->fps_version++;
1523 }
1524 }
1525 spin_unlock(&fps->fps_lock);
1526
1527 if (!list_empty(&zombies))
1528 kiblnd_destroy_fmr_pool_list(&zombies);
1529 }
1530
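/*
 * Map a set of pages through one of the poolset's FMR pools. Pools are
 * tried in turn; -EAGAIN moves on to the next pool (restarting the scan
 * if the poolset has changed in the meantime). When every pool is busy,
 * either wait for another thread that is already growing the poolset,
 * back off if a recent pool allocation failed, or create a new pool and
 * retry.
 */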
1531 int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
1532 __u64 iov, kib_fmr_t *fmr)
1533 {
1534 struct ib_pool_fmr *pfmr;
1535 kib_fmr_pool_t *fpo;
1536 __u64 version;
1537 int rc;
1538
1539 again:
1540 spin_lock(&fps->fps_lock);
1541 version = fps->fps_version;
1542 list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
1543 fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1544 fpo->fpo_map_count++;
1545 spin_unlock(&fps->fps_lock);
1546
1547 pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool,
1548 pages, npages, iov);
1549 if (likely(!IS_ERR(pfmr))) {
1550 fmr->fmr_pool = fpo;
1551 fmr->fmr_pfmr = pfmr;
1552 return 0;
1553 }
1554
1555 spin_lock(&fps->fps_lock);
1556 fpo->fpo_map_count--;
1557 if (PTR_ERR(pfmr) != -EAGAIN) {
1558 spin_unlock(&fps->fps_lock);
1559 return PTR_ERR(pfmr);
1560 }
1561
1562 /* EAGAIN and ... */
1563 if (version != fps->fps_version) {
1564 spin_unlock(&fps->fps_lock);
1565 goto again;
1566 }
1567 }
1568
1569 if (fps->fps_increasing) {
1570 spin_unlock(&fps->fps_lock);
1571 CDEBUG(D_NET, "Another thread is allocating new FMR pool, waiting for her to complete\n");
1572 schedule();
1573 goto again;
1574 }
1575
1576 if (time_before(cfs_time_current(), fps->fps_next_retry)) {
1577 /* someone failed recently */
1578 spin_unlock(&fps->fps_lock);
1579 return -EAGAIN;
1580 }
1581
1582 fps->fps_increasing = 1;
1583 spin_unlock(&fps->fps_lock);
1584
1585 CDEBUG(D_NET, "Allocate new FMR pool\n");
1586 rc = kiblnd_create_fmr_pool(fps, &fpo);
1587 spin_lock(&fps->fps_lock);
1588 fps->fps_increasing = 0;
1589 if (!rc) {
1590 fps->fps_version++;
1591 list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
1592 } else {
1593 fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
1594 }
1595 spin_unlock(&fps->fps_lock);
1596
1597 goto again;
1598 }
1599
1600 static void kiblnd_fini_pool(kib_pool_t *pool)
1601 {
1602 LASSERT(list_empty(&pool->po_free_list));
1603 LASSERT(!pool->po_allocated);
1604
1605 CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
1606 }
1607
1608 static void kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size)
1609 {
1610 CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
1611
1612 memset(pool, 0, sizeof(*pool));
1613 INIT_LIST_HEAD(&pool->po_free_list);
1614 pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1615 pool->po_owner = ps;
1616 pool->po_size = size;
1617 }
1618
1619 static void kiblnd_destroy_pool_list(struct list_head *head)
1620 {
1621 kib_pool_t *pool;
1622
1623 while (!list_empty(head)) {
1624 pool = list_entry(head->next, kib_pool_t, po_list);
1625 list_del(&pool->po_list);
1626
1627 LASSERT(pool->po_owner);
1628 pool->po_owner->ps_pool_destroy(pool);
1629 }
1630 }
1631
1632 static void kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies)
1633 {
1634 if (!ps->ps_net) /* initialized? */
1635 return;
1636
1637 spin_lock(&ps->ps_lock);
1638 while (!list_empty(&ps->ps_pool_list)) {
1639 kib_pool_t *po = list_entry(ps->ps_pool_list.next,
1640 kib_pool_t, po_list);
1641 po->po_failed = 1;
1642 list_del(&po->po_list);
1643 if (!po->po_allocated)
1644 list_add(&po->po_list, zombies);
1645 else
1646 list_add(&po->po_list, &ps->ps_failed_pool_list);
1647 }
1648 spin_unlock(&ps->ps_lock);
1649 }
1650
1651 static void kiblnd_fini_poolset(kib_poolset_t *ps)
1652 {
1653 if (ps->ps_net) { /* initialized? */
1654 kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
1655 kiblnd_destroy_pool_list(&ps->ps_pool_list);
1656 }
1657 }
1658
1659 static int kiblnd_init_poolset(kib_poolset_t *ps, int cpt,
1660 kib_net_t *net, char *name, int size,
1661 kib_ps_pool_create_t po_create,
1662 kib_ps_pool_destroy_t po_destroy,
1663 kib_ps_node_init_t nd_init,
1664 kib_ps_node_fini_t nd_fini)
1665 {
1666 kib_pool_t *pool;
1667 int rc;
1668
1669 memset(ps, 0, sizeof(*ps));
1670
1671 ps->ps_cpt = cpt;
1672 ps->ps_net = net;
1673 ps->ps_pool_create = po_create;
1674 ps->ps_pool_destroy = po_destroy;
1675 ps->ps_node_init = nd_init;
1676 ps->ps_node_fini = nd_fini;
1677 ps->ps_pool_size = size;
1678 if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
1679 >= sizeof(ps->ps_name))
1680 return -E2BIG;
1681 spin_lock_init(&ps->ps_lock);
1682 INIT_LIST_HEAD(&ps->ps_pool_list);
1683 INIT_LIST_HEAD(&ps->ps_failed_pool_list);
1684
1685 rc = ps->ps_pool_create(ps, size, &pool);
1686 if (!rc)
1687 list_add(&pool->po_list, &ps->ps_pool_list);
1688 else
1689 CERROR("Failed to create the first pool for %s\n", ps->ps_name);
1690
1691 return rc;
1692 }
1693
1694 static int kiblnd_pool_is_idle(kib_pool_t *pool, unsigned long now)
1695 {
1696 if (pool->po_allocated) /* still in use */
1697 return 0;
1698 if (pool->po_failed)
1699 return 1;
1700 return cfs_time_aftereq(now, pool->po_deadline);
1701 }
1702
1703 void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node)
1704 {
1705 LIST_HEAD(zombies);
1706 kib_poolset_t *ps = pool->po_owner;
1707 kib_pool_t *tmp;
1708 unsigned long now = cfs_time_current();
1709
1710 spin_lock(&ps->ps_lock);
1711
1712 if (ps->ps_node_fini)
1713 ps->ps_node_fini(pool, node);
1714
1715 LASSERT(pool->po_allocated > 0);
1716 list_add(node, &pool->po_free_list);
1717 pool->po_allocated--;
1718
1719 list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
1720 /* the first pool is persistent */
1721 if (ps->ps_pool_list.next == &pool->po_list)
1722 continue;
1723
1724 if (kiblnd_pool_is_idle(pool, now))
1725 list_move(&pool->po_list, &zombies);
1726 }
1727 spin_unlock(&ps->ps_lock);
1728
1729 if (!list_empty(&zombies))
1730 kiblnd_destroy_pool_list(&zombies);
1731 }
1732
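/*
 * Allocate a node (e.g. a tx descriptor) from the poolset: grab the
 * first free entry from any pool that has one, refreshing that pool's
 * deadline. If all pools are exhausted, either wait for another thread
 * that is growing the poolset, give up if a pool allocation failed
 * recently, or create a new pool and retry.
 */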
1733 struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps)
1734 {
1735 struct list_head *node;
1736 kib_pool_t *pool;
1737 int rc;
1738
1739 again:
1740 spin_lock(&ps->ps_lock);
1741 list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
1742 if (list_empty(&pool->po_free_list))
1743 continue;
1744
1745 pool->po_allocated++;
1746 pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1747 node = pool->po_free_list.next;
1748 list_del(node);
1749
1750 if (ps->ps_node_init) {
1751 /* still hold the lock */
1752 ps->ps_node_init(pool, node);
1753 }
1754 spin_unlock(&ps->ps_lock);
1755 return node;
1756 }
1757
1758 /* no available tx pool and ... */
1759 if (ps->ps_increasing) {
1760 /* another thread is allocating a new pool */
1761 spin_unlock(&ps->ps_lock);
1762 CDEBUG(D_NET, "Another thread is allocating new %s pool, waiting for her to complete\n",
1763 ps->ps_name);
1764 schedule();
1765 goto again;
1766 }
1767
1768 if (time_before(cfs_time_current(), ps->ps_next_retry)) {
1769 /* someone failed recently */
1770 spin_unlock(&ps->ps_lock);
1771 return NULL;
1772 }
1773
1774 ps->ps_increasing = 1;
1775 spin_unlock(&ps->ps_lock);
1776
1777 CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
1778
1779 rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
1780
1781 spin_lock(&ps->ps_lock);
1782 ps->ps_increasing = 0;
1783 if (!rc) {
1784 list_add_tail(&pool->po_list, &ps->ps_pool_list);
1785 } else {
1786 ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
1787 CERROR("Can't allocate new %s pool because out of memory\n",
1788 ps->ps_name);
1789 }
1790 spin_unlock(&ps->ps_lock);
1791
1792 goto again;
1793 }
1794
1795 static void kiblnd_destroy_tx_pool(kib_pool_t *pool)
1796 {
1797 kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool);
1798 int i;
1799
1800 LASSERT(!pool->po_allocated);
1801
1802 if (tpo->tpo_tx_pages) {
1803 kiblnd_unmap_tx_pool(tpo);
1804 kiblnd_free_pages(tpo->tpo_tx_pages);
1805 }
1806
1807 if (!tpo->tpo_tx_descs)
1808 goto out;
1809
1810 for (i = 0; i < pool->po_size; i++) {
1811 kib_tx_t *tx = &tpo->tpo_tx_descs[i];
1812
1813 list_del(&tx->tx_list);
1814 if (tx->tx_pages)
1815 LIBCFS_FREE(tx->tx_pages,
1816 LNET_MAX_IOV *
1817 sizeof(*tx->tx_pages));
1818 if (tx->tx_frags)
1819 LIBCFS_FREE(tx->tx_frags,
1820 IBLND_MAX_RDMA_FRAGS *
1821 sizeof(*tx->tx_frags));
1822 if (tx->tx_wrq)
1823 LIBCFS_FREE(tx->tx_wrq,
1824 (1 + IBLND_MAX_RDMA_FRAGS) *
1825 sizeof(*tx->tx_wrq));
1826 if (tx->tx_sge)
1827 LIBCFS_FREE(tx->tx_sge,
1828 (1 + IBLND_MAX_RDMA_FRAGS) *
1829 sizeof(*tx->tx_sge));
1830 if (tx->tx_rd)
1831 LIBCFS_FREE(tx->tx_rd,
1832 offsetof(kib_rdma_desc_t,
1833 rd_frags[IBLND_MAX_RDMA_FRAGS]));
1834 }
1835
1836 LIBCFS_FREE(tpo->tpo_tx_descs,
1837 pool->po_size * sizeof(kib_tx_t));
1838 out:
1839 kiblnd_fini_pool(pool);
1840 LIBCFS_FREE(tpo, sizeof(*tpo));
1841 }
1842
1843 static int kiblnd_tx_pool_size(int ncpts)
1844 {
1845 int ntx = *kiblnd_tunables.kib_ntx / ncpts;
1846
1847 return max(IBLND_TX_POOL, ntx);
1848 }
1849
1850 static int kiblnd_create_tx_pool(kib_poolset_t *ps, int size,
1851 kib_pool_t **pp_po)
1852 {
1853 int i;
1854 int npg;
1855 kib_pool_t *pool;
1856 kib_tx_pool_t *tpo;
1857
1858 LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo));
1859 if (!tpo) {
1860 CERROR("Failed to allocate TX pool\n");
1861 return -ENOMEM;
1862 }
1863
1864 pool = &tpo->tpo_pool;
1865 kiblnd_init_pool(ps, pool, size);
1866 tpo->tpo_tx_descs = NULL;
1867 tpo->tpo_tx_pages = NULL;
1868
1869 npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
1870 if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg)) {
1871 CERROR("Can't allocate tx pages: %d\n", npg);
1872 LIBCFS_FREE(tpo, sizeof(*tpo));
1873 return -ENOMEM;
1874 }
1875
1876 LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt,
1877 size * sizeof(kib_tx_t));
1878 if (!tpo->tpo_tx_descs) {
1879 CERROR("Can't allocate %d tx descriptors\n", size);
1880 ps->ps_pool_destroy(pool);
1881 return -ENOMEM;
1882 }
1883
1884 memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
1885
1886 for (i = 0; i < size; i++) {
1887 kib_tx_t *tx = &tpo->tpo_tx_descs[i];
1888
1889 tx->tx_pool = tpo;
1890 if (ps->ps_net->ibn_fmr_ps) {
1891 LIBCFS_CPT_ALLOC(tx->tx_pages,
1892 lnet_cpt_table(), ps->ps_cpt,
1893 LNET_MAX_IOV * sizeof(*tx->tx_pages));
1894 if (!tx->tx_pages)
1895 break;
1896 }
1897
1898 LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
1899 IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags));
1900 if (!tx->tx_frags)
1901 break;
1902
1903 sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS);
1904
1905 LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
1906 (1 + IBLND_MAX_RDMA_FRAGS) *
1907 sizeof(*tx->tx_wrq));
1908 if (!tx->tx_wrq)
1909 break;
1910
1911 LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
1912 (1 + IBLND_MAX_RDMA_FRAGS) *
1913 sizeof(*tx->tx_sge));
1914 if (!tx->tx_sge)
1915 break;
1916
1917 LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt,
1918 offsetof(kib_rdma_desc_t,
1919 rd_frags[IBLND_MAX_RDMA_FRAGS]));
1920 if (!tx->tx_rd)
1921 break;
1922 }
1923
1924 if (i == size) {
1925 kiblnd_map_tx_pool(tpo);
1926 *pp_po = pool;
1927 return 0;
1928 }
1929
1930 ps->ps_pool_destroy(pool);
1931 return -ENOMEM;
1932 }
1933
1934 static void kiblnd_tx_init(kib_pool_t *pool, struct list_head *node)
1935 {
1936 kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t,
1937 tps_poolset);
1938 kib_tx_t *tx = list_entry(node, kib_tx_t, tx_list);
1939
1940 tx->tx_cookie = tps->tps_next_tx_cookie++;
1941 }
1942
1943 static void kiblnd_net_fini_pools(kib_net_t *net)
1944 {
1945 int i;
1946
1947 cfs_cpt_for_each(i, lnet_cpt_table()) {
1948 kib_tx_poolset_t *tps;
1949 kib_fmr_poolset_t *fps;
1950
1951 if (net->ibn_tx_ps) {
1952 tps = net->ibn_tx_ps[i];
1953 kiblnd_fini_poolset(&tps->tps_poolset);
1954 }
1955
1956 if (net->ibn_fmr_ps) {
1957 fps = net->ibn_fmr_ps[i];
1958 kiblnd_fini_fmr_poolset(fps);
1959 }
1960 }
1961
1962 if (net->ibn_tx_ps) {
1963 cfs_percpt_free(net->ibn_tx_ps);
1964 net->ibn_tx_ps = NULL;
1965 }
1966
1967 if (net->ibn_fmr_ps) {
1968 cfs_percpt_free(net->ibn_fmr_ps);
1969 net->ibn_fmr_ps = NULL;
1970 }
1971 }
1972
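/*
 * Set up per-CPT pools for this network: FMR poolsets first (skipped when
 * the device premapped all memory with a single MR and map-on-demand is
 * off), then TX poolsets. On any failure everything allocated so far is
 * torn down again.
 */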
1973 static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
1974 {
1975 unsigned long flags;
1976 int cpt;
1977 int rc = 0;
1978 int i;
1979
1980 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1981 if (!*kiblnd_tunables.kib_map_on_demand &&
1982 net->ibn_dev->ibd_hdev->ibh_nmrs == 1) {
1983 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1984 goto create_tx_pool;
1985 }
1986
1987 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1988
1989 if (*kiblnd_tunables.kib_fmr_pool_size <
1990 *kiblnd_tunables.kib_ntx / 4) {
1991 CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
1992 *kiblnd_tunables.kib_fmr_pool_size,
1993 *kiblnd_tunables.kib_ntx / 4);
1994 rc = -EINVAL;
1995 goto failed;
1996 }
1997
1998 /*
1999 * TX pool must be created later than FMR, see LU-2268
2000 * for details
2001 */
2002 LASSERT(!net->ibn_tx_ps);
2003
2004 /*
2005 * premapping can fail if ibd_nmr > 1, so we always create
2006 * FMR pool and map-on-demand if premapping failed
2007 */
2008
2009 net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
2010 sizeof(kib_fmr_poolset_t));
2011 if (!net->ibn_fmr_ps) {
2012 CERROR("Failed to allocate FMR pool array\n");
2013 rc = -ENOMEM;
2014 goto failed;
2015 }
2016
2017 for (i = 0; i < ncpts; i++) {
2018 cpt = !cpts ? i : cpts[i];
2019 rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net,
2020 kiblnd_fmr_pool_size(ncpts),
2021 kiblnd_fmr_flush_trigger(ncpts));
2022 if (rc == -ENOSYS && !i) /* no FMR */
2023 break;
2024
2025 if (rc) { /* a real error */
2026 CERROR("Can't initialize FMR pool for CPT %d: %d\n",
2027 cpt, rc);
2028 goto failed;
2029 }
2030 }
2031
2032 if (i > 0) {
2033 LASSERT(i == ncpts);
2034 goto create_tx_pool;
2035 }
2036
2037 cfs_percpt_free(net->ibn_fmr_ps);
2038 net->ibn_fmr_ps = NULL;
2039
2040 CWARN("Device does not support FMR\n");
2041 goto failed;
2042
2043 create_tx_pool:
2044 net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
2045 sizeof(kib_tx_poolset_t));
2046 if (!net->ibn_tx_ps) {
2047 CERROR("Failed to allocate tx pool array\n");
2048 rc = -ENOMEM;
2049 goto failed;
2050 }
2051
2052 for (i = 0; i < ncpts; i++) {
2053 cpt = !cpts ? i : cpts[i];
2054 rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset,
2055 cpt, net, "TX",
2056 kiblnd_tx_pool_size(ncpts),
2057 kiblnd_create_tx_pool,
2058 kiblnd_destroy_tx_pool,
2059 kiblnd_tx_init, NULL);
2060 if (rc) {
2061 CERROR("Can't initialize TX pool for CPT %d: %d\n",
2062 cpt, rc);
2063 goto failed;
2064 }
2065 }
2066
2067 return 0;
2068 failed:
2069 kiblnd_net_fini_pools(net);
2070 LASSERT(rc);
2071 return rc;
2072 }
2073
2074 static int kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
2075 {
2076 /*
2077 * It's safe to assume a HCA can handle a page size
2078 * matching that of the native system
2079 */
2080 hdev->ibh_page_shift = PAGE_SHIFT;
2081 hdev->ibh_page_size = 1 << PAGE_SHIFT;
2082 hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1);
2083
2084 hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size;
2085 if (hdev->ibh_mr_size == ~0ULL) {
2086 hdev->ibh_mr_shift = 64;
2087 return 0;
2088 }
2089
2090 for (hdev->ibh_mr_shift = 0;
2091 hdev->ibh_mr_shift < 64; hdev->ibh_mr_shift++) {
2092 if (hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) ||
2093 hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) - 1)
2094 return 0;
2095 }
2096
2097 CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size);
2098 return -EINVAL;
2099 }
2100
2101 static void kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
2102 {
2103 int i;
2104
2105 if (!hdev->ibh_nmrs || !hdev->ibh_mrs)
2106 return;
2107
2108 for (i = 0; i < hdev->ibh_nmrs; i++) {
2109 if (!hdev->ibh_mrs[i])
2110 break;
2111
2112 ib_dereg_mr(hdev->ibh_mrs[i]);
2113 }
2114
2115 LIBCFS_FREE(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
2116 hdev->ibh_mrs = NULL;
2117 hdev->ibh_nmrs = 0;
2118 }
2119
2120 void kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
2121 {
2122 kiblnd_hdev_cleanup_mrs(hdev);
2123
2124 if (hdev->ibh_pd)
2125 ib_dealloc_pd(hdev->ibh_pd);
2126
2127 if (hdev->ibh_cmid)
2128 rdma_destroy_id(hdev->ibh_cmid);
2129
2130 LIBCFS_FREE(hdev, sizeof(*hdev));
2131 }
2132
2133 static int kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
2134 {
2135 struct ib_mr *mr;
2136 int rc;
2137 int acflags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
2138
2139 rc = kiblnd_hdev_get_attr(hdev);
2140 if (rc)
2141 return rc;
2142
2143 LIBCFS_ALLOC(hdev->ibh_mrs, 1 * sizeof(*hdev->ibh_mrs));
2144 if (!hdev->ibh_mrs) {
2145 CERROR("Failed to allocate MRs table\n");
2146 return -ENOMEM;
2147 }
2148
2149 hdev->ibh_mrs[0] = NULL;
2150 hdev->ibh_nmrs = 1;
2151
2152 mr = ib_get_dma_mr(hdev->ibh_pd, acflags);
2153 if (IS_ERR(mr)) {
2154 CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr));
2155 kiblnd_hdev_cleanup_mrs(hdev);
2156 return PTR_ERR(mr);
2157 }
2158
2159 hdev->ibh_mrs[0] = mr;
2160
2161 return 0;
2162 }
2163
2164 /* DUMMY */
2165 static int kiblnd_dummy_callback(struct rdma_cm_id *cmid,
2166 struct rdma_cm_event *event)
2167 {
2168 return 0;
2169 }
2170
2171 static int kiblnd_dev_need_failover(kib_dev_t *dev)
2172 {
2173 struct rdma_cm_id *cmid;
2174 struct sockaddr_in srcaddr;
2175 struct sockaddr_in dstaddr;
2176 int rc;
2177
2178 if (!dev->ibd_hdev || /* initializing */
2179 !dev->ibd_hdev->ibh_cmid || /* listener is dead */
2180 *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
2181 return 1;
2182
2183 	/*
2184 	 * XXX: this is ugly, but there is no better way to detect
2185 	 * ib-bonding HCA failover because:
2186 	 *
2187 	 * a. there is no reliable CM event for HCA failover...
2188 	 * b. there is no OFED API to get the ib_device for the current net_device...
2189 	 *
2190 	 * We have only two choices at this point:
2191 	 *
2192 	 * a. rdma_bind_addr(), which would conflict with the listener cmid
2193 	 * b. rdma_resolve_addr() to the zero address
2194 	 */
2195 cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
2196 IB_QPT_RC);
2197 if (IS_ERR(cmid)) {
2198 rc = PTR_ERR(cmid);
2199 CERROR("Failed to create cmid for failover: %d\n", rc);
2200 return rc;
2201 }
2202
2203 memset(&srcaddr, 0, sizeof(srcaddr));
2204 srcaddr.sin_family = AF_INET;
2205 srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
2206
2207 memset(&dstaddr, 0, sizeof(dstaddr));
2208 dstaddr.sin_family = AF_INET;
2209 rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
2210 (struct sockaddr *)&dstaddr, 1);
2211 if (rc || !cmid->device) {
2212 CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
2213 dev->ibd_ifname, &dev->ibd_ifip,
2214 cmid->device, rc);
2215 rdma_destroy_id(cmid);
2216 return rc;
2217 }
2218
2219 rc = dev->ibd_hdev->ibh_ibdev != cmid->device; /* true for failover */
2220 rdma_destroy_id(cmid);
2221
2222 return rc;
2223 }
2224
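/*
 * Re-bind the device to whatever HCA currently owns its interface address:
 * tear down the old listener, create and bind a new cmid, allocate a fresh
 * PD and global MR, restart listening, swap the new hdev in under the
 * global lock, and fail every per-net TX/FMR pool so new pools are created
 * against the new device.  Zombie pools are destroyed outside the lock.
 */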
2225 int kiblnd_dev_failover(kib_dev_t *dev)
2226 {
2227 LIST_HEAD(zombie_tpo);
2228 LIST_HEAD(zombie_ppo);
2229 LIST_HEAD(zombie_fpo);
2230 struct rdma_cm_id *cmid = NULL;
2231 kib_hca_dev_t *hdev = NULL;
2232 struct ib_pd *pd;
2233 kib_net_t *net;
2234 struct sockaddr_in addr;
2235 unsigned long flags;
2236 int rc = 0;
2237 int i;
2238
2239 LASSERT(*kiblnd_tunables.kib_dev_failover > 1 ||
2240 dev->ibd_can_failover || !dev->ibd_hdev);
2241
2242 rc = kiblnd_dev_need_failover(dev);
2243 if (rc <= 0)
2244 goto out;
2245
2246 if (dev->ibd_hdev &&
2247 dev->ibd_hdev->ibh_cmid) {
2248 		/*
2249 		 * XXX it's not ideal to close the old listener here,
2250 		 * because creating the new listener can still fail.
2251 		 * But we have to close it now, otherwise rdma_bind_addr()
2252 		 * will return EADDRINUSE.
2253 		 */
2254 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2255
2256 cmid = dev->ibd_hdev->ibh_cmid;
2257 		/*
2258 		 * ensure the next call to kiblnd_dev_need_failover()
2259 		 * returns 1
2260 		 */
2261 dev->ibd_hdev->ibh_cmid = NULL;
2262 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2263
2264 rdma_destroy_id(cmid);
2265 }
2266
2267 cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
2268 IB_QPT_RC);
2269 if (IS_ERR(cmid)) {
2270 rc = PTR_ERR(cmid);
2271 CERROR("Failed to create cmid for failover: %d\n", rc);
2272 goto out;
2273 }
2274
2275 memset(&addr, 0, sizeof(addr));
2276 addr.sin_family = AF_INET;
2277 addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
2278 addr.sin_port = htons(*kiblnd_tunables.kib_service);
2279
2280 /* Bind to failover device or port */
2281 rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
2282 if (rc || !cmid->device) {
2283 CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
2284 dev->ibd_ifname, &dev->ibd_ifip,
2285 cmid->device, rc);
2286 rdma_destroy_id(cmid);
2287 goto out;
2288 }
2289
2290 LIBCFS_ALLOC(hdev, sizeof(*hdev));
2291 if (!hdev) {
2292 CERROR("Failed to allocate kib_hca_dev\n");
2293 rdma_destroy_id(cmid);
2294 rc = -ENOMEM;
2295 goto out;
2296 }
2297
2298 atomic_set(&hdev->ibh_ref, 1);
2299 hdev->ibh_dev = dev;
2300 hdev->ibh_cmid = cmid;
2301 hdev->ibh_ibdev = cmid->device;
2302
2303 pd = ib_alloc_pd(cmid->device);
2304 if (IS_ERR(pd)) {
2305 rc = PTR_ERR(pd);
2306 CERROR("Can't allocate PD: %d\n", rc);
2307 goto out;
2308 }
2309
2310 hdev->ibh_pd = pd;
2311
2312 rc = rdma_listen(cmid, 0);
2313 if (rc) {
2314 CERROR("Can't start new listener: %d\n", rc);
2315 goto out;
2316 }
2317
2318 rc = kiblnd_hdev_setup_mrs(hdev);
2319 if (rc) {
2320 CERROR("Can't setup device: %d\n", rc);
2321 goto out;
2322 }
2323
2324 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2325
2326 swap(dev->ibd_hdev, hdev); /* take over the refcount */
2327
2328 list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
2329 cfs_cpt_for_each(i, lnet_cpt_table()) {
2330 kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset,
2331 &zombie_tpo);
2332
2333 if (net->ibn_fmr_ps)
2334 kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i],
2335 &zombie_fpo);
2336 }
2337 }
2338
2339 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2340 out:
2341 if (!list_empty(&zombie_tpo))
2342 kiblnd_destroy_pool_list(&zombie_tpo);
2343 if (!list_empty(&zombie_ppo))
2344 kiblnd_destroy_pool_list(&zombie_ppo);
2345 if (!list_empty(&zombie_fpo))
2346 kiblnd_destroy_fmr_pool_list(&zombie_fpo);
2347 if (hdev)
2348 kiblnd_hdev_decref(hdev);
2349
2350 if (rc)
2351 dev->ibd_failed_failover++;
2352 else
2353 dev->ibd_failed_failover = 0;
2354
2355 return rc;
2356 }
2357
2358 void kiblnd_destroy_dev(kib_dev_t *dev)
2359 {
2360 LASSERT(!dev->ibd_nnets);
2361 LASSERT(list_empty(&dev->ibd_nets));
2362
2363 list_del(&dev->ibd_fail_list);
2364 list_del(&dev->ibd_list);
2365
2366 if (dev->ibd_hdev)
2367 kiblnd_hdev_decref(dev->ibd_hdev);
2368
2369 LIBCFS_FREE(dev, sizeof(*dev));
2370 }
2371
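/*
 * Create a kib_dev for the named IPoIB interface: check that the interface
 * is up and grab its IP address, note whether it is a bonding master (and
 * therefore capable of failover), then use kiblnd_dev_failover() to bind
 * the listener before adding the device to kib_devs.
 */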
2372 static kib_dev_t *kiblnd_create_dev(char *ifname)
2373 {
2374 struct net_device *netdev;
2375 kib_dev_t *dev;
2376 __u32 netmask;
2377 __u32 ip;
2378 int up;
2379 int rc;
2380
2381 rc = lnet_ipif_query(ifname, &up, &ip, &netmask);
2382 if (rc) {
2383 CERROR("Can't query IPoIB interface %s: %d\n",
2384 ifname, rc);
2385 return NULL;
2386 }
2387
2388 if (!up) {
2389 CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
2390 return NULL;
2391 }
2392
2393 LIBCFS_ALLOC(dev, sizeof(*dev));
2394 if (!dev)
2395 return NULL;
2396
2397 netdev = dev_get_by_name(&init_net, ifname);
2398 if (!netdev) {
2399 dev->ibd_can_failover = 0;
2400 } else {
2401 dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER);
2402 dev_put(netdev);
2403 }
2404
2405 INIT_LIST_HEAD(&dev->ibd_nets);
2406 INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
2407 INIT_LIST_HEAD(&dev->ibd_fail_list);
2408 dev->ibd_ifip = ip;
2409 strcpy(&dev->ibd_ifname[0], ifname);
2410
2411 /* initialize the device */
2412 rc = kiblnd_dev_failover(dev);
2413 if (rc) {
2414 CERROR("Can't initialize device: %d\n", rc);
2415 LIBCFS_FREE(dev, sizeof(*dev));
2416 return NULL;
2417 }
2418
2419 list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs);
2420 return dev;
2421 }
2422
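/*
 * Module-wide teardown, called once the last device is gone: flag shutdown,
 * wake the scheduler, connd and failover threads and wait for them to exit,
 * then free the peer hash table and per-CPT scheduler state and drop the
 * module reference taken in kiblnd_base_startup().
 */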
2423 static void kiblnd_base_shutdown(void)
2424 {
2425 struct kib_sched_info *sched;
2426 int i;
2427
2428 LASSERT(list_empty(&kiblnd_data.kib_devs));
2429
2430 switch (kiblnd_data.kib_init) {
2431 default:
2432 LBUG();
2433
2434 case IBLND_INIT_ALL:
2435 case IBLND_INIT_DATA:
2436 LASSERT(kiblnd_data.kib_peers);
2437 for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
2438 LASSERT(list_empty(&kiblnd_data.kib_peers[i]));
2439 LASSERT(list_empty(&kiblnd_data.kib_connd_zombies));
2440 LASSERT(list_empty(&kiblnd_data.kib_connd_conns));
2441
2442 /* flag threads to terminate; wake and wait for them to die */
2443 kiblnd_data.kib_shutdown = 1;
2444
2445 		/*
2446 		 * NB: we really want to stop scheduler threads net by net
2447 		 * instead of for the whole module; this should be improved
2448 		 * once LNet supports dynamic configuration
2449 		 */
2450 cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds)
2451 wake_up_all(&sched->ibs_waitq);
2452
2453 wake_up_all(&kiblnd_data.kib_connd_waitq);
2454 wake_up_all(&kiblnd_data.kib_failover_waitq);
2455
2456 i = 2;
2457 while (atomic_read(&kiblnd_data.kib_nthreads)) {
2458 i++;
2459 /* power of 2 ? */
2460 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
2461 "Waiting for %d threads to terminate\n",
2462 atomic_read(&kiblnd_data.kib_nthreads));
2463 set_current_state(TASK_UNINTERRUPTIBLE);
2464 schedule_timeout(cfs_time_seconds(1));
2465 }
2466
2467 /* fall through */
2468
2469 case IBLND_INIT_NOTHING:
2470 break;
2471 }
2472
2473 if (kiblnd_data.kib_peers) {
2474 LIBCFS_FREE(kiblnd_data.kib_peers,
2475 sizeof(struct list_head) *
2476 kiblnd_data.kib_peer_hash_size);
2477 }
2478
2479 if (kiblnd_data.kib_scheds)
2480 cfs_percpt_free(kiblnd_data.kib_scheds);
2481
2482 kiblnd_data.kib_init = IBLND_INIT_NOTHING;
2483 module_put(THIS_MODULE);
2484 }
2485
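/*
 * Per-NI teardown: mark the net as shutting down, delete all of its peers
 * and wait for them to disconnect, release its pools, unlink it from its
 * device (destroying the device once no nets remain), and finally run
 * kiblnd_base_shutdown() when the last device disappears.
 */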
2486 static void kiblnd_shutdown(lnet_ni_t *ni)
2487 {
2488 kib_net_t *net = ni->ni_data;
2489 rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
2490 int i;
2491 unsigned long flags;
2492
2493 LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
2494
2495 if (!net)
2496 goto out;
2497
2498 write_lock_irqsave(g_lock, flags);
2499 net->ibn_shutdown = 1;
2500 write_unlock_irqrestore(g_lock, flags);
2501
2502 switch (net->ibn_init) {
2503 default:
2504 LBUG();
2505
2506 case IBLND_INIT_ALL:
2507 /* nuke all existing peers within this net */
2508 kiblnd_del_peer(ni, LNET_NID_ANY);
2509
2510 		/* Wait for all peer state to be cleaned up */
2511 i = 2;
2512 while (atomic_read(&net->ibn_npeers)) {
2513 i++;
2514 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
2515 "%s: waiting for %d peers to disconnect\n",
2516 libcfs_nid2str(ni->ni_nid),
2517 atomic_read(&net->ibn_npeers));
2518 set_current_state(TASK_UNINTERRUPTIBLE);
2519 schedule_timeout(cfs_time_seconds(1));
2520 }
2521
2522 kiblnd_net_fini_pools(net);
2523
2524 write_lock_irqsave(g_lock, flags);
2525 LASSERT(net->ibn_dev->ibd_nnets > 0);
2526 net->ibn_dev->ibd_nnets--;
2527 list_del(&net->ibn_list);
2528 write_unlock_irqrestore(g_lock, flags);
2529
2530 /* fall through */
2531
2532 case IBLND_INIT_NOTHING:
2533 LASSERT(!atomic_read(&net->ibn_nconns));
2534
2535 if (net->ibn_dev && !net->ibn_dev->ibd_nnets)
2536 kiblnd_destroy_dev(net->ibn_dev);
2537
2538 break;
2539 }
2540
2541 net->ibn_init = IBLND_INIT_NOTHING;
2542 ni->ni_data = NULL;
2543
2544 LIBCFS_FREE(net, sizeof(*net));
2545
2546 out:
2547 if (list_empty(&kiblnd_data.kib_devs))
2548 kiblnd_base_shutdown();
2549 }
2550
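/*
 * One-time module initialisation: set up the global lock, device lists and
 * peer hash table, size the per-CPT scheduler thread limits from the CPT
 * weight and the nscheds tunable, then start the connd thread and, if
 * failover is enabled, the failover thread.
 */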
2551 static int kiblnd_base_startup(void)
2552 {
2553 struct kib_sched_info *sched;
2554 int rc;
2555 int i;
2556
2557 LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING);
2558
2559 try_module_get(THIS_MODULE);
2560 /* zero pointers, flags etc */
2561 memset(&kiblnd_data, 0, sizeof(kiblnd_data));
2562
2563 rwlock_init(&kiblnd_data.kib_global_lock);
2564
2565 INIT_LIST_HEAD(&kiblnd_data.kib_devs);
2566 INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
2567
2568 kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
2569 LIBCFS_ALLOC(kiblnd_data.kib_peers,
2570 sizeof(struct list_head) * kiblnd_data.kib_peer_hash_size);
2571 if (!kiblnd_data.kib_peers)
2572 goto failed;
2573 for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
2574 INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
2575
2576 spin_lock_init(&kiblnd_data.kib_connd_lock);
2577 INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
2578 INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
2579 init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
2580 init_waitqueue_head(&kiblnd_data.kib_failover_waitq);
2581
2582 kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(),
2583 sizeof(*sched));
2584 if (!kiblnd_data.kib_scheds)
2585 goto failed;
2586
2587 cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
2588 int nthrs;
2589
2590 spin_lock_init(&sched->ibs_lock);
2591 INIT_LIST_HEAD(&sched->ibs_conns);
2592 init_waitqueue_head(&sched->ibs_waitq);
2593
2594 nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
2595 if (*kiblnd_tunables.kib_nscheds > 0) {
2596 nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds);
2597 } else {
2598 			/*
2599 			 * cap at half of the CPUs; the other half is reserved
2600 			 * for upper-layer modules
2601 			 */
2602 nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
2603 }
2604
2605 sched->ibs_nthreads_max = nthrs;
2606 sched->ibs_cpt = i;
2607 }
2608
2609 kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
2610
2611 /* lists/ptrs/locks initialised */
2612 kiblnd_data.kib_init = IBLND_INIT_DATA;
2613 /*****************************************************/
2614
2615 rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd");
2616 if (rc) {
2617 CERROR("Can't spawn o2iblnd connd: %d\n", rc);
2618 goto failed;
2619 }
2620
2621 if (*kiblnd_tunables.kib_dev_failover)
2622 rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
2623 "kiblnd_failover");
2624
2625 if (rc) {
2626 CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
2627 goto failed;
2628 }
2629
2630 /* flag everything initialised */
2631 kiblnd_data.kib_init = IBLND_INIT_ALL;
2632 /*****************************************************/
2633
2634 return 0;
2635
2636 failed:
2637 kiblnd_base_shutdown();
2638 return -ENETDOWN;
2639 }
2640
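/*
 * Start scheduler threads for one CPT: the full complement the first time
 * the CPT is used, or at most one extra thread when a new interface shows
 * up later.  Threads are named kiblnd_sd_<cpt>_<tid>.
 */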
2641 static int kiblnd_start_schedulers(struct kib_sched_info *sched)
2642 {
2643 int rc = 0;
2644 int nthrs;
2645 int i;
2646
2647 if (!sched->ibs_nthreads) {
2648 if (*kiblnd_tunables.kib_nscheds > 0) {
2649 nthrs = sched->ibs_nthreads_max;
2650 } else {
2651 nthrs = cfs_cpt_weight(lnet_cpt_table(),
2652 sched->ibs_cpt);
2653 nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
2654 nthrs = min(IBLND_N_SCHED_HIGH, nthrs);
2655 }
2656 } else {
2657 LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max);
2658 		/* add one more thread if a new interface has appeared */
2659 nthrs = sched->ibs_nthreads < sched->ibs_nthreads_max;
2660 }
2661
2662 for (i = 0; i < nthrs; i++) {
2663 long id;
2664 char name[20];
2665
2666 id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
2667 snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
2668 KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
2669 rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
2670 if (!rc)
2671 continue;
2672
2673 CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
2674 sched->ibs_cpt, sched->ibs_nthreads + i, rc);
2675 break;
2676 }
2677
2678 sched->ibs_nthreads += i;
2679 return rc;
2680 }
2681
2682 static int kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts,
2683 int ncpts)
2684 {
2685 int cpt;
2686 int rc;
2687 int i;
2688
2689 for (i = 0; i < ncpts; i++) {
2690 struct kib_sched_info *sched;
2691
2692 cpt = !cpts ? i : cpts[i];
2693 sched = kiblnd_data.kib_scheds[cpt];
2694
2695 if (!newdev && sched->ibs_nthreads > 0)
2696 continue;
2697
2698 rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]);
2699 if (rc) {
2700 CERROR("Failed to start scheduler threads for %s\n",
2701 dev->ibd_ifname);
2702 return rc;
2703 }
2704 }
2705 return 0;
2706 }
2707
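/*
 * Find an existing kib_dev by interface name.  An exact match wins;
 * otherwise fall back to comparing the names with any ":alias" suffix
 * stripped from both sides.
 */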
2708 static kib_dev_t *kiblnd_dev_search(char *ifname)
2709 {
2710 kib_dev_t *alias = NULL;
2711 kib_dev_t *dev;
2712 char *colon;
2713 char *colon2;
2714
2715 colon = strchr(ifname, ':');
2716 list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
2717 if (!strcmp(&dev->ibd_ifname[0], ifname))
2718 return dev;
2719
2720 if (alias)
2721 continue;
2722
2723 colon2 = strchr(dev->ibd_ifname, ':');
2724 if (colon)
2725 *colon = 0;
2726 if (colon2)
2727 *colon2 = 0;
2728
2729 if (!strcmp(&dev->ibd_ifname[0], ifname))
2730 alias = dev;
2731
2732 if (colon)
2733 *colon = ':';
2734 if (colon2)
2735 *colon2 = ':';
2736 }
2737 return alias;
2738 }
2739
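/*
 * Bring up one NI: run the base startup on first use, allocate the kib_net,
 * pick the IPoIB interface from ni_interfaces[] (or the default tunable),
 * find or create the matching kib_dev, derive the NID from the interface
 * IP, then start scheduler threads and initialise the pools before linking
 * the net onto its device.
 */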
2740 static int kiblnd_startup(lnet_ni_t *ni)
2741 {
2742 char *ifname;
2743 kib_dev_t *ibdev = NULL;
2744 kib_net_t *net;
2745 struct timespec64 tv;
2746 unsigned long flags;
2747 int rc;
2748 int newdev;
2749
2750 LASSERT(ni->ni_lnd == &the_o2iblnd);
2751
2752 if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
2753 rc = kiblnd_base_startup();
2754 if (rc)
2755 return rc;
2756 }
2757
2758 LIBCFS_ALLOC(net, sizeof(*net));
2759 ni->ni_data = net;
2760 if (!net)
2761 goto net_failed;
2762
2763 ktime_get_real_ts64(&tv);
2764 net->ibn_incarnation = tv.tv_sec * USEC_PER_SEC +
2765 tv.tv_nsec / NSEC_PER_USEC;
2766
2767 ni->ni_peertimeout = *kiblnd_tunables.kib_peertimeout;
2768 ni->ni_maxtxcredits = *kiblnd_tunables.kib_credits;
2769 ni->ni_peertxcredits = *kiblnd_tunables.kib_peertxcredits;
2770 ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits;
2771
2772 if (ni->ni_interfaces[0]) {
2773 /* Use the IPoIB interface specified in 'networks=' */
2774
2775 CLASSERT(LNET_MAX_INTERFACES > 1);
2776 if (ni->ni_interfaces[1]) {
2777 CERROR("Multiple interfaces not supported\n");
2778 goto failed;
2779 }
2780
2781 ifname = ni->ni_interfaces[0];
2782 } else {
2783 ifname = *kiblnd_tunables.kib_default_ipif;
2784 }
2785
2786 if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
2787 CERROR("IPoIB interface name too long: %s\n", ifname);
2788 goto failed;
2789 }
2790
2791 ibdev = kiblnd_dev_search(ifname);
2792
2793 newdev = !ibdev;
2794 	/* create a kib_dev even for an alias interface */
2795 if (!ibdev || strcmp(&ibdev->ibd_ifname[0], ifname))
2796 ibdev = kiblnd_create_dev(ifname);
2797
2798 if (!ibdev)
2799 goto failed;
2800
2801 net->ibn_dev = ibdev;
2802 ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
2803
2804 rc = kiblnd_dev_start_threads(ibdev, newdev,
2805 ni->ni_cpts, ni->ni_ncpts);
2806 if (rc)
2807 goto failed;
2808
2809 rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts);
2810 if (rc) {
2811 CERROR("Failed to initialize NI pools: %d\n", rc);
2812 goto failed;
2813 }
2814
2815 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2816 ibdev->ibd_nnets++;
2817 list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
2818 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2819
2820 net->ibn_init = IBLND_INIT_ALL;
2821
2822 return 0;
2823
2824 failed:
2825 if (!net->ibn_dev && ibdev)
2826 kiblnd_destroy_dev(ibdev);
2827
2828 net_failed:
2829 kiblnd_shutdown(ni);
2830
2831 CDEBUG(D_NET, "kiblnd_startup failed\n");
2832 return -ENETDOWN;
2833 }
2834
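/*
 * The lnd_t descriptor registered with LNet in ko2iblnd_init(); LNet calls
 * these hooks for every NI of type O2IBLND.  As an illustrative example
 * (not taken from this file), such an NI is typically configured with an
 * lnet module option along the lines of networks="o2ib(ib0)".
 */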
2835 static lnd_t the_o2iblnd = {
2836 .lnd_type = O2IBLND,
2837 .lnd_startup = kiblnd_startup,
2838 .lnd_shutdown = kiblnd_shutdown,
2839 .lnd_ctl = kiblnd_ctl,
2840 .lnd_query = kiblnd_query,
2841 .lnd_send = kiblnd_send,
2842 .lnd_recv = kiblnd_recv,
2843 };
2844
2845 static void __exit ko2iblnd_exit(void)
2846 {
2847 lnet_unregister_lnd(&the_o2iblnd);
2848 }
2849
2850 static int __init ko2iblnd_init(void)
2851 {
2852 int rc;
2853
2854 CLASSERT(sizeof(kib_msg_t) <= IBLND_MSG_SIZE);
2855 CLASSERT(offsetof(kib_msg_t,
2856 ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
2857 <= IBLND_MSG_SIZE);
2858 CLASSERT(offsetof(kib_msg_t,
2859 ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
2860 <= IBLND_MSG_SIZE);
2861
2862 rc = kiblnd_tunables_init();
2863 if (rc)
2864 return rc;
2865
2866 lnet_register_lnd(&the_o2iblnd);
2867
2868 return 0;
2869 }
2870
2871 MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
2872 MODULE_DESCRIPTION("OpenIB gen2 LNet Network Driver");
2873 MODULE_VERSION("2.7.0");
2874 MODULE_LICENSE("GPL");
2875
2876 module_init(ko2iblnd_init);
2877 module_exit(ko2iblnd_exit);