staging: lustre: lnet: Remove an error code indent should use tabs where possible
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2012, 2015, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lnet/klnds/o2iblnd/o2iblnd_cb.c
 *
 * Author: Eric Barton <eric@bartonsoftware.com>
 */

#include "o2iblnd.h"

static void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx);

static void
kiblnd_tx_done(lnet_ni_t *ni, kib_tx_t *tx)
{
	lnet_msg_t *lntmsg[2];
	kib_net_t *net = ni->ni_data;
	int rc;
	int i;

	LASSERT(net);
	LASSERT(!in_interrupt());
	LASSERT(!tx->tx_queued);	/* mustn't be queued for sending */
	LASSERT(!tx->tx_sending);	/* mustn't be awaiting sent callback */
	LASSERT(!tx->tx_waiting);	/* mustn't be awaiting peer response */
	LASSERT(tx->tx_pool);

	kiblnd_unmap_tx(ni, tx);

	/* tx may have up to 2 lnet msgs to finalise */
	lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
	lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
	rc = tx->tx_status;

	if (tx->tx_conn) {
		LASSERT(ni == tx->tx_conn->ibc_peer->ibp_ni);

		kiblnd_conn_decref(tx->tx_conn);
		tx->tx_conn = NULL;
	}

	tx->tx_nwrq = 0;
	tx->tx_status = 0;

	kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);

	/* delay finalize until my descs have been freed */
	for (i = 0; i < 2; i++) {
		if (!lntmsg[i])
			continue;

		lnet_finalize(ni, lntmsg[i], rc);
	}
}

void
kiblnd_txlist_done(lnet_ni_t *ni, struct list_head *txlist, int status)
{
	kib_tx_t *tx;

	while (!list_empty(txlist)) {
		tx = list_entry(txlist->next, kib_tx_t, tx_list);

		list_del(&tx->tx_list);
		/* complete now */
		tx->tx_waiting = 0;
		tx->tx_status = status;
		kiblnd_tx_done(ni, tx);
	}
}

static kib_tx_t *
kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target)
{
	kib_net_t *net = (kib_net_t *)ni->ni_data;
	struct list_head *node;
	kib_tx_t *tx;
	kib_tx_poolset_t *tps;

	tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)];
	node = kiblnd_pool_alloc_node(&tps->tps_poolset);
	if (!node)
		return NULL;
	tx = container_of(node, kib_tx_t, tx_list);

	LASSERT(!tx->tx_nwrq);
	LASSERT(!tx->tx_queued);
	LASSERT(!tx->tx_sending);
	LASSERT(!tx->tx_waiting);
	LASSERT(!tx->tx_status);
	LASSERT(!tx->tx_conn);
	LASSERT(!tx->tx_lntmsg[0]);
	LASSERT(!tx->tx_lntmsg[1]);
	LASSERT(!tx->tx_nfrags);

	return tx;
}

static void
kiblnd_drop_rx(kib_rx_t *rx)
{
	kib_conn_t *conn = rx->rx_conn;
	struct kib_sched_info *sched = conn->ibc_sched;
	unsigned long flags;

	spin_lock_irqsave(&sched->ibs_lock, flags);
	LASSERT(conn->ibc_nrx > 0);
	conn->ibc_nrx--;
	spin_unlock_irqrestore(&sched->ibs_lock, flags);

	kiblnd_conn_decref(conn);
}

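/*
 * Repost a receive buffer on 'conn'. 'credit' says how to account the
 * flow-control credit this rx consumed: IBLND_POSTRX_NO_CREDIT returns
 * nothing, IBLND_POSTRX_PEER_CREDIT queues a credit to return to the peer
 * (ibc_outstanding_credits), and IBLND_POSTRX_RSRVD_CREDIT banks it for
 * reserved traffic (ibc_reserved_credits).
 */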
int
kiblnd_post_rx(kib_rx_t *rx, int credit)
{
	kib_conn_t *conn = rx->rx_conn;
	kib_net_t *net = conn->ibc_peer->ibp_ni->ni_data;
	struct ib_recv_wr *bad_wrq = NULL;
	struct ib_mr *mr;
	int rc;

	LASSERT(net);
	LASSERT(!in_interrupt());
	LASSERT(credit == IBLND_POSTRX_NO_CREDIT ||
		credit == IBLND_POSTRX_PEER_CREDIT ||
		credit == IBLND_POSTRX_RSRVD_CREDIT);

	mr = kiblnd_find_dma_mr(conn->ibc_hdev, rx->rx_msgaddr, IBLND_MSG_SIZE);
	LASSERT(mr);

	rx->rx_sge.lkey = mr->lkey;
	rx->rx_sge.addr = rx->rx_msgaddr;
	rx->rx_sge.length = IBLND_MSG_SIZE;

	rx->rx_wrq.next = NULL;
	rx->rx_wrq.sg_list = &rx->rx_sge;
	rx->rx_wrq.num_sge = 1;
	rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX);

	LASSERT(conn->ibc_state >= IBLND_CONN_INIT);
	LASSERT(rx->rx_nob >= 0);	/* not posted */

	if (conn->ibc_state > IBLND_CONN_ESTABLISHED) {
		kiblnd_drop_rx(rx);	/* No more posts for this rx */
		return 0;
	}

	rx->rx_nob = -1;		/* flag posted */

	/* NB: need an extra reference after ib_post_recv because we don't
	 * own this rx (and rx::rx_conn) anymore, LU-5678.
	 */
	kiblnd_conn_addref(conn);
	rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq);
	if (unlikely(rc)) {
		CERROR("Can't post rx for %s: %d, bad_wrq: %p\n",
		       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq);
		rx->rx_nob = 0;
	}

	if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */
		goto out;

	if (unlikely(rc)) {
		kiblnd_close_conn(conn, rc);
		kiblnd_drop_rx(rx);	/* No more posts for this rx */
		goto out;
	}

	if (credit == IBLND_POSTRX_NO_CREDIT)
		goto out;

	spin_lock(&conn->ibc_lock);
	if (credit == IBLND_POSTRX_PEER_CREDIT)
		conn->ibc_outstanding_credits++;
	else
		conn->ibc_reserved_credits++;
	spin_unlock(&conn->ibc_lock);

	kiblnd_check_sends(conn);
out:
	kiblnd_conn_decref(conn);
	return rc;
}

static kib_tx_t *
kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
{
	struct list_head *tmp;

	list_for_each(tmp, &conn->ibc_active_txs) {
		kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);

		LASSERT(!tx->tx_queued);
		LASSERT(tx->tx_sending || tx->tx_waiting);

		if (tx->tx_cookie != cookie)
			continue;

		if (tx->tx_waiting &&
		    tx->tx_msg->ibm_type == txtype)
			return tx;

		CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
		      tx->tx_waiting ? "" : "NOT ",
		      tx->tx_msg->ibm_type, txtype);
	}
	return NULL;
}

static void
kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
{
	kib_tx_t *tx;
	lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
	int idle;

	spin_lock(&conn->ibc_lock);

	tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie);
	if (!tx) {
		spin_unlock(&conn->ibc_lock);

		CWARN("Unmatched completion type %x cookie %#llx from %s\n",
		      txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
		kiblnd_close_conn(conn, -EPROTO);
		return;
	}

	if (!tx->tx_status) {		/* success so far */
		if (status < 0)		/* failed? */
			tx->tx_status = status;
		else if (txtype == IBLND_MSG_GET_REQ)
			lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
	}

	tx->tx_waiting = 0;

	idle = !tx->tx_queued && !tx->tx_sending;
	if (idle)
		list_del(&tx->tx_list);

	spin_unlock(&conn->ibc_lock);

	if (idle)
		kiblnd_tx_done(ni, tx);
}

static void
kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie)
{
	lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
	kib_tx_t *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);

	if (!tx) {
		CERROR("Can't get tx for completion %x for %s\n",
		       type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
		return;
	}

	tx->tx_msg->ibm_u.completion.ibcm_status = status;
	tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
	kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t));

	kiblnd_queue_tx(tx, conn);
}

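/*
 * Handle a completed receive: bank any send credits returned in the message
 * header, dispatch on message type, then decide (via 'post_credit') how
 * kiblnd_post_rx() should repost this buffer.
 */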
static void
kiblnd_handle_rx(kib_rx_t *rx)
{
	kib_msg_t *msg = rx->rx_msg;
	kib_conn_t *conn = rx->rx_conn;
	lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
	int credits = msg->ibm_credits;
	kib_tx_t *tx;
	int rc = 0;
	int rc2;
	int post_credit;

	LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);

	CDEBUG(D_NET, "Received %x[%d] from %s\n",
	       msg->ibm_type, credits,
	       libcfs_nid2str(conn->ibc_peer->ibp_nid));

	if (credits) {
		/* Have I received credits that will let me send? */
		spin_lock(&conn->ibc_lock);

		if (conn->ibc_credits + credits >
		    IBLND_MSG_QUEUE_SIZE(conn->ibc_version)) {
			rc2 = conn->ibc_credits;
			spin_unlock(&conn->ibc_lock);

			CERROR("Bad credits from %s: %d + %d > %d\n",
			       libcfs_nid2str(conn->ibc_peer->ibp_nid),
			       rc2, credits,
			       IBLND_MSG_QUEUE_SIZE(conn->ibc_version));

			kiblnd_close_conn(conn, -EPROTO);
			kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT);
			return;
		}

		conn->ibc_credits += credits;

		/* This ensures the credit taken by NOOP can be returned */
		if (msg->ibm_type == IBLND_MSG_NOOP &&
		    !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
			conn->ibc_outstanding_credits++;

		spin_unlock(&conn->ibc_lock);
		kiblnd_check_sends(conn);
	}

	switch (msg->ibm_type) {
	default:
		CERROR("Bad IBLND message type %x from %s\n",
		       msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
		post_credit = IBLND_POSTRX_NO_CREDIT;
		rc = -EPROTO;
		break;

	case IBLND_MSG_NOOP:
		if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
			post_credit = IBLND_POSTRX_NO_CREDIT;
			break;
		}

		if (credits)	/* credit already posted */
			post_credit = IBLND_POSTRX_NO_CREDIT;
		else		/* a keepalive NOOP */
			post_credit = IBLND_POSTRX_PEER_CREDIT;
		break;

	case IBLND_MSG_IMMEDIATE:
		post_credit = IBLND_POSTRX_DONT_POST;
		rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr,
				msg->ibm_srcnid, rx, 0);
		if (rc < 0) /* repost on error */
			post_credit = IBLND_POSTRX_PEER_CREDIT;
		break;

	case IBLND_MSG_PUT_REQ:
		post_credit = IBLND_POSTRX_DONT_POST;
		rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr,
				msg->ibm_srcnid, rx, 1);
		if (rc < 0) /* repost on error */
			post_credit = IBLND_POSTRX_PEER_CREDIT;
		break;

	case IBLND_MSG_PUT_NAK:
		CWARN("PUT_NACK from %s\n",
		      libcfs_nid2str(conn->ibc_peer->ibp_nid));
		post_credit = IBLND_POSTRX_RSRVD_CREDIT;
		kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ,
					 msg->ibm_u.completion.ibcm_status,
					 msg->ibm_u.completion.ibcm_cookie);
		break;

	case IBLND_MSG_PUT_ACK:
		post_credit = IBLND_POSTRX_RSRVD_CREDIT;

		spin_lock(&conn->ibc_lock);
		tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ,
						   msg->ibm_u.putack.ibpam_src_cookie);
		if (tx)
			list_del(&tx->tx_list);
		spin_unlock(&conn->ibc_lock);

		if (!tx) {
			CERROR("Unmatched PUT_ACK from %s\n",
			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
			rc = -EPROTO;
			break;
		}

		LASSERT(tx->tx_waiting);
		/*
		 * CAVEAT EMPTOR: I could be racing with tx_complete, but...
		 * (a) I can overwrite tx_msg since my peer has received it!
		 * (b) tx_waiting set tells tx_complete() it's not done.
		 */
		tx->tx_nwrq = 0; /* overwrite PUT_REQ */

		rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE,
				       kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
				       &msg->ibm_u.putack.ibpam_rd,
				       msg->ibm_u.putack.ibpam_dst_cookie);
		if (rc2 < 0)
			CERROR("Can't setup rdma for PUT to %s: %d\n",
			       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);

		spin_lock(&conn->ibc_lock);
		tx->tx_waiting = 0; /* clear waiting and queue atomically */
		kiblnd_queue_tx_locked(tx, conn);
		spin_unlock(&conn->ibc_lock);
		break;

	case IBLND_MSG_PUT_DONE:
		post_credit = IBLND_POSTRX_PEER_CREDIT;
		kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK,
					 msg->ibm_u.completion.ibcm_status,
					 msg->ibm_u.completion.ibcm_cookie);
		break;

	case IBLND_MSG_GET_REQ:
		post_credit = IBLND_POSTRX_DONT_POST;
		rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr,
				msg->ibm_srcnid, rx, 1);
		if (rc < 0) /* repost on error */
			post_credit = IBLND_POSTRX_PEER_CREDIT;
		break;

	case IBLND_MSG_GET_DONE:
		post_credit = IBLND_POSTRX_RSRVD_CREDIT;
		kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ,
					 msg->ibm_u.completion.ibcm_status,
					 msg->ibm_u.completion.ibcm_cookie);
		break;
	}

	if (rc < 0) /* protocol error */
		kiblnd_close_conn(conn, rc);

	if (post_credit != IBLND_POSTRX_DONT_POST)
		kiblnd_post_rx(rx, post_credit);
}

static void
kiblnd_rx_complete(kib_rx_t *rx, int status, int nob)
{
	kib_msg_t *msg = rx->rx_msg;
	kib_conn_t *conn = rx->rx_conn;
	lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
	kib_net_t *net = ni->ni_data;
	int rc;
	int err = -EIO;

	LASSERT(net);
	LASSERT(rx->rx_nob < 0);	/* was posted */
	rx->rx_nob = 0;			/* isn't now */

	if (conn->ibc_state > IBLND_CONN_ESTABLISHED)
		goto ignore;

	if (status != IB_WC_SUCCESS) {
		CNETERR("Rx from %s failed: %d\n",
			libcfs_nid2str(conn->ibc_peer->ibp_nid), status);
		goto failed;
	}

	LASSERT(nob >= 0);
	rx->rx_nob = nob;

	rc = kiblnd_unpack_msg(msg, rx->rx_nob);
	if (rc) {
		CERROR("Error %d unpacking rx from %s\n",
		       rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
		goto failed;
	}

	if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
	    msg->ibm_dstnid != ni->ni_nid ||
	    msg->ibm_srcstamp != conn->ibc_incarnation ||
	    msg->ibm_dststamp != net->ibn_incarnation) {
		CERROR("Stale rx from %s\n",
		       libcfs_nid2str(conn->ibc_peer->ibp_nid));
		err = -ESTALE;
		goto failed;
	}

	/* set time last known alive */
	kiblnd_peer_alive(conn->ibc_peer);

	/* racing with connection establishment/teardown! */

	if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
		rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
		unsigned long flags;

		write_lock_irqsave(g_lock, flags);
		/* must check holding global lock to eliminate race */
		if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
			list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
			write_unlock_irqrestore(g_lock, flags);
			return;
		}
		write_unlock_irqrestore(g_lock, flags);
	}
	kiblnd_handle_rx(rx);
	return;

failed:
	CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
	kiblnd_close_conn(conn, err);
ignore:
	kiblnd_drop_rx(rx); /* Don't re-post rx. */
}

static struct page *
kiblnd_kvaddr_to_page(unsigned long vaddr)
{
	struct page *page;

	if (is_vmalloc_addr((void *)vaddr)) {
		page = vmalloc_to_page((void *)vaddr);
		LASSERT(page);
		return page;
	}
#ifdef CONFIG_HIGHMEM
	if (vaddr >= PKMAP_BASE &&
	    vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
		/* No highmem pages only used for bulk (kiov) I/O */
		CERROR("find page for address in highmem\n");
		LBUG();
	}
#endif
	page = virt_to_page(vaddr);
	LASSERT(page);
	return page;
}

static int
kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
{
	kib_hca_dev_t *hdev;
	__u64 *pages = tx->tx_pages;
	kib_fmr_poolset_t *fps;
	int npages;
	int size;
	int cpt;
	int rc;
	int i;

	LASSERT(tx->tx_pool);
	LASSERT(tx->tx_pool->tpo_pool.po_owner);

	hdev = tx->tx_pool->tpo_hdev;

	for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
		for (size = 0; size < rd->rd_frags[i].rf_nob;
		     size += hdev->ibh_page_size) {
			pages[npages++] = (rd->rd_frags[i].rf_addr &
					   hdev->ibh_page_mask) + size;
		}
	}

	cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;

	fps = net->ibn_fmr_ps[cpt];
	rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->fmr);
	if (rc) {
		CERROR("Can't map %d pages: %d\n", npages, rc);
		return rc;
	}

	/*
	 * If rd is not tx_rd, it's going to get sent to a peer, who will need
	 * the rkey
	 */
	rd->rd_key = (rd != tx->tx_rd) ? tx->fmr.fmr_pfmr->fmr->rkey :
					 tx->fmr.fmr_pfmr->fmr->lkey;
	rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
	rd->rd_frags[0].rf_nob = nob;
	rd->rd_nfrags = 1;

	return 0;
}

static void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx)
{
	kib_net_t *net = ni->ni_data;

	LASSERT(net);

	if (net->ibn_fmr_ps && tx->fmr.fmr_pfmr) {
		kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status);
		tx->fmr.fmr_pfmr = NULL;
	}

	if (tx->tx_nfrags) {
		kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
				    tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
		tx->tx_nfrags = 0;
	}
}

static int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
			 int nfrags)
{
	kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
	kib_net_t *net = ni->ni_data;
	struct ib_mr *mr = NULL;
	__u32 nob;
	int i;

	/*
	 * If rd is not tx_rd, it's going to get sent to a peer and I'm the
	 * RDMA sink
	 */
	tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	tx->tx_nfrags = nfrags;

	rd->rd_nfrags = kiblnd_dma_map_sg(hdev->ibh_ibdev, tx->tx_frags,
					  tx->tx_nfrags, tx->tx_dmadir);

	for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
		rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len(
			hdev->ibh_ibdev, &tx->tx_frags[i]);
		rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address(
			hdev->ibh_ibdev, &tx->tx_frags[i]);
		nob += rd->rd_frags[i].rf_nob;
	}

	/* looking for pre-mapping MR */
	mr = kiblnd_find_rd_dma_mr(hdev, rd);
	if (mr) {
		/* found pre-mapping MR */
		rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey;
		return 0;
	}

	if (net->ibn_fmr_ps)
		return kiblnd_fmr_map_tx(net, tx, rd, nob);

	return -EINVAL;
}

static int
kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
		    unsigned int niov, struct kvec *iov, int offset, int nob)
{
	kib_net_t *net = ni->ni_data;
	struct page *page;
	struct scatterlist *sg;
	unsigned long vaddr;
	int fragnob;
	int page_offset;

	LASSERT(nob > 0);
	LASSERT(niov > 0);
	LASSERT(net);

	while (offset >= iov->iov_len) {
		offset -= iov->iov_len;
		niov--;
		iov++;
		LASSERT(niov > 0);
	}

	sg = tx->tx_frags;
	do {
		LASSERT(niov > 0);

		vaddr = ((unsigned long)iov->iov_base) + offset;
		page_offset = vaddr & (PAGE_SIZE - 1);
		page = kiblnd_kvaddr_to_page(vaddr);
		if (!page) {
			CERROR("Can't find page\n");
			return -EFAULT;
		}

		fragnob = min((int)(iov->iov_len - offset), nob);
		fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);

		sg_set_page(sg, page, fragnob, page_offset);
		sg++;

		if (offset + fragnob < iov->iov_len) {
			offset += fragnob;
		} else {
			offset = 0;
			iov++;
			niov--;
		}
		nob -= fragnob;
	} while (nob > 0);

	return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
}

static int
kiblnd_setup_rd_kiov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
		     int nkiov, lnet_kiov_t *kiov, int offset, int nob)
{
	kib_net_t *net = ni->ni_data;
	struct scatterlist *sg;
	int fragnob;

	CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);

	LASSERT(nob > 0);
	LASSERT(nkiov > 0);
	LASSERT(net);

	while (offset >= kiov->kiov_len) {
		offset -= kiov->kiov_len;
		nkiov--;
		kiov++;
		LASSERT(nkiov > 0);
	}

	sg = tx->tx_frags;
	do {
		LASSERT(nkiov > 0);

		fragnob = min((int)(kiov->kiov_len - offset), nob);

		sg_set_page(sg, kiov->kiov_page, fragnob,
			    kiov->kiov_offset + offset);
		sg++;

		offset = 0;
		kiov++;
		nkiov--;
		nob -= fragnob;
	} while (nob > 0);

	return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
}

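/*
 * Attempt to post one queued tx. Returns 0 when the tx is consumed (posted,
 * or completed immediately as a redundant NOOP), -EAGAIN when it must stay
 * queued (send window full, no credits, or only the last credit is left and
 * that is reserved for a NOOP), and -EIO when the post fails and the
 * connection is being closed.
 */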
static int
kiblnd_post_tx_locked(kib_conn_t *conn, kib_tx_t *tx, int credit)
	__must_hold(&conn->ibc_lock)
{
	kib_msg_t *msg = tx->tx_msg;
	kib_peer_t *peer = conn->ibc_peer;
	int ver = conn->ibc_version;
	int rc;
	int done;
	struct ib_send_wr *bad_wrq;

	LASSERT(tx->tx_queued);
	/* We rely on this for QP sizing */
	LASSERT(tx->tx_nwrq > 0);
	LASSERT(tx->tx_nwrq <= 1 + IBLND_RDMA_FRAGS(ver));

	LASSERT(!credit || credit == 1);
	LASSERT(conn->ibc_outstanding_credits >= 0);
	LASSERT(conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE(ver));
	LASSERT(conn->ibc_credits >= 0);
	LASSERT(conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE(ver));

	if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) {
		/* tx completions outstanding... */
		CDEBUG(D_NET, "%s: posted enough\n",
		       libcfs_nid2str(peer->ibp_nid));
		return -EAGAIN;
	}

	if (credit && !conn->ibc_credits) {	/* no credits */
		CDEBUG(D_NET, "%s: no credits\n",
		       libcfs_nid2str(peer->ibp_nid));
		return -EAGAIN;
	}

	if (credit && !IBLND_OOB_CAPABLE(ver) &&
	    conn->ibc_credits == 1 &&		/* last credit reserved */
	    msg->ibm_type != IBLND_MSG_NOOP) {	/* for NOOP */
		CDEBUG(D_NET, "%s: not using last credit\n",
		       libcfs_nid2str(peer->ibp_nid));
		return -EAGAIN;
	}

	/* NB don't drop ibc_lock before bumping tx_sending */
	list_del(&tx->tx_list);
	tx->tx_queued = 0;

	if (msg->ibm_type == IBLND_MSG_NOOP &&
	    (!kiblnd_need_noop(conn) ||		/* redundant NOOP */
	     (IBLND_OOB_CAPABLE(ver) &&		/* posted enough NOOP */
	      conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) {
		/*
		 * OK to drop when posted enough NOOPs, since
		 * kiblnd_check_sends will queue NOOP again when
		 * posted NOOPs complete
		 */
		spin_unlock(&conn->ibc_lock);
		kiblnd_tx_done(peer->ibp_ni, tx);
		spin_lock(&conn->ibc_lock);
		CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
		       libcfs_nid2str(peer->ibp_nid),
		       conn->ibc_noops_posted);
		return 0;
	}

	kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits,
			peer->ibp_nid, conn->ibc_incarnation);

	conn->ibc_credits -= credit;
	conn->ibc_outstanding_credits = 0;
	conn->ibc_nsends_posted++;
	if (msg->ibm_type == IBLND_MSG_NOOP)
		conn->ibc_noops_posted++;

	/*
	 * CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA
	 * PUT. If so, it was first queued here as a PUT_REQ, sent and
	 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
	 * and then re-queued here. It's (just) possible that
	 * tx_sending is non-zero if we've not done the tx_complete()
	 * from the first send; hence the ++ rather than = below.
	 */
	tx->tx_sending++;
	list_add(&tx->tx_list, &conn->ibc_active_txs);

	/* I'm still holding ibc_lock! */
	if (conn->ibc_state != IBLND_CONN_ESTABLISHED) {
		rc = -ECONNABORTED;
	} else if (tx->tx_pool->tpo_pool.po_failed ||
		   conn->ibc_hdev != tx->tx_pool->tpo_hdev) {
		/* close_conn will launch failover */
		rc = -ENETDOWN;
	} else {
		rc = ib_post_send(conn->ibc_cmid->qp, &tx->tx_wrq->wr, &bad_wrq);
	}

	conn->ibc_last_send = jiffies;

	if (!rc)
		return 0;

	/*
	 * NB credits are transferred in the actual
	 * message, which can only be the last work item
	 */
	conn->ibc_credits += credit;
	conn->ibc_outstanding_credits += msg->ibm_credits;
	conn->ibc_nsends_posted--;
	if (msg->ibm_type == IBLND_MSG_NOOP)
		conn->ibc_noops_posted--;

	tx->tx_status = rc;
	tx->tx_waiting = 0;
	tx->tx_sending--;

	done = !tx->tx_sending;
	if (done)
		list_del(&tx->tx_list);

	spin_unlock(&conn->ibc_lock);

	if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
		CERROR("Error %d posting transmit to %s\n",
		       rc, libcfs_nid2str(peer->ibp_nid));
	else
		CDEBUG(D_NET, "Error %d posting transmit to %s\n",
		       rc, libcfs_nid2str(peer->ibp_nid));

	kiblnd_close_conn(conn, rc);

	if (done)
		kiblnd_tx_done(peer->ibp_ni, tx);

	spin_lock(&conn->ibc_lock);

	return -EIO;
}

void
kiblnd_check_sends(kib_conn_t *conn)
{
	int ver = conn->ibc_version;
	lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
	kib_tx_t *tx;

	/* Don't send anything until after the connection is established */
	if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
		CDEBUG(D_NET, "%s too soon\n",
		       libcfs_nid2str(conn->ibc_peer->ibp_nid));
		return;
	}

	spin_lock(&conn->ibc_lock);

	LASSERT(conn->ibc_nsends_posted <= IBLND_CONCURRENT_SENDS(ver));
	LASSERT(!IBLND_OOB_CAPABLE(ver) ||
		conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
	LASSERT(conn->ibc_reserved_credits >= 0);

	while (conn->ibc_reserved_credits > 0 &&
	       !list_empty(&conn->ibc_tx_queue_rsrvd)) {
		tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
				kib_tx_t, tx_list);
		list_del(&tx->tx_list);
		list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
		conn->ibc_reserved_credits--;
	}

	if (kiblnd_need_noop(conn)) {
		spin_unlock(&conn->ibc_lock);

		tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
		if (tx)
			kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0);

		spin_lock(&conn->ibc_lock);
		if (tx)
			kiblnd_queue_tx_locked(tx, conn);
	}

	kiblnd_conn_addref(conn); /* 1 ref for me.... (see b21911) */

	for (;;) {
		int credit;

		if (!list_empty(&conn->ibc_tx_queue_nocred)) {
			credit = 0;
			tx = list_entry(conn->ibc_tx_queue_nocred.next,
					kib_tx_t, tx_list);
		} else if (!list_empty(&conn->ibc_tx_noops)) {
			LASSERT(!IBLND_OOB_CAPABLE(ver));
			credit = 1;
			tx = list_entry(conn->ibc_tx_noops.next,
					kib_tx_t, tx_list);
		} else if (!list_empty(&conn->ibc_tx_queue)) {
			credit = 1;
			tx = list_entry(conn->ibc_tx_queue.next,
					kib_tx_t, tx_list);
		} else {
			break;
		}

		if (kiblnd_post_tx_locked(conn, tx, credit))
			break;
	}

	spin_unlock(&conn->ibc_lock);

	kiblnd_conn_decref(conn); /* ...until here */
}

static void
kiblnd_tx_complete(kib_tx_t *tx, int status)
{
	int failed = (status != IB_WC_SUCCESS);
	kib_conn_t *conn = tx->tx_conn;
	int idle;

	LASSERT(tx->tx_sending > 0);

	if (failed) {
		if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
			CNETERR("Tx -> %s cookie %#llx sending %d waiting %d: failed %d\n",
				libcfs_nid2str(conn->ibc_peer->ibp_nid),
				tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
				status);

		kiblnd_close_conn(conn, -EIO);
	} else {
		kiblnd_peer_alive(conn->ibc_peer);
	}

	spin_lock(&conn->ibc_lock);

	/*
	 * I could be racing with rdma completion. Whoever makes 'tx' idle
	 * gets to free it, which also drops its ref on 'conn'.
	 */
	tx->tx_sending--;
	conn->ibc_nsends_posted--;
	if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP)
		conn->ibc_noops_posted--;

	if (failed) {
		tx->tx_waiting = 0;	/* don't wait for peer */
		tx->tx_status = -EIO;
	}

	idle = !tx->tx_sending &&	/* This is the final callback */
	       !tx->tx_waiting &&	/* Not waiting for peer */
	       !tx->tx_queued;		/* Not re-queued (PUT_DONE) */
	if (idle)
		list_del(&tx->tx_list);

	kiblnd_conn_addref(conn); /* 1 ref for me.... */

	spin_unlock(&conn->ibc_lock);

	if (idle)
		kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx);

	kiblnd_check_sends(conn);

	kiblnd_conn_decref(conn); /* ...until here */
}

void
kiblnd_init_tx_msg(lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob)
{
	kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
	struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq];
	struct ib_rdma_wr *wrq = &tx->tx_wrq[tx->tx_nwrq];
	int nob = offsetof(kib_msg_t, ibm_u) + body_nob;
	struct ib_mr *mr;

	LASSERT(tx->tx_nwrq >= 0);
	LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
	LASSERT(nob <= IBLND_MSG_SIZE);

	kiblnd_init_msg(tx->tx_msg, type, body_nob);

	mr = kiblnd_find_dma_mr(hdev, tx->tx_msgaddr, nob);
	LASSERT(mr);

	sge->lkey = mr->lkey;
	sge->addr = tx->tx_msgaddr;
	sge->length = nob;

	memset(wrq, 0, sizeof(*wrq));

	wrq->wr.next = NULL;
	wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX);
	wrq->wr.sg_list = sge;
	wrq->wr.num_sge = 1;
	wrq->wr.opcode = IB_WR_SEND;
	wrq->wr.send_flags = IB_SEND_SIGNALED;

	tx->tx_nwrq++;
}

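/*
 * Build the chain of IB_WR_RDMA_WRITE work requests that transfers 'resid'
 * bytes from this tx's source descriptor into 'dstrd', pairing source and
 * destination fragments as it walks them, then append the GET_DONE or
 * PUT_DONE completion message as the final work request.
 */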
int
kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
		 int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie)
{
	kib_msg_t *ibmsg = tx->tx_msg;
	kib_rdma_desc_t *srcrd = tx->tx_rd;
	struct ib_sge *sge = &tx->tx_sge[0];
	struct ib_rdma_wr *wrq = &tx->tx_wrq[0], *next;
	int rc = resid;
	int srcidx = 0;
	int dstidx = 0;
	int wrknob;

	LASSERT(!in_interrupt());
	LASSERT(!tx->tx_nwrq);
	LASSERT(type == IBLND_MSG_GET_DONE ||
		type == IBLND_MSG_PUT_DONE);

	while (resid > 0) {
		if (srcidx >= srcrd->rd_nfrags) {
			CERROR("Src buffer exhausted: %d frags\n", srcidx);
			rc = -EPROTO;
			break;
		}

		if (dstidx == dstrd->rd_nfrags) {
			CERROR("Dst buffer exhausted: %d frags\n", dstidx);
			rc = -EPROTO;
			break;
		}

		if (tx->tx_nwrq == IBLND_RDMA_FRAGS(conn->ibc_version)) {
			CERROR("RDMA too fragmented for %s (%d): %d/%d src %d/%d dst frags\n",
			       libcfs_nid2str(conn->ibc_peer->ibp_nid),
			       IBLND_RDMA_FRAGS(conn->ibc_version),
			       srcidx, srcrd->rd_nfrags,
			       dstidx, dstrd->rd_nfrags);
			rc = -EMSGSIZE;
			break;
		}

		wrknob = min(min(kiblnd_rd_frag_size(srcrd, srcidx),
				 kiblnd_rd_frag_size(dstrd, dstidx)),
			     (__u32)resid);

		sge = &tx->tx_sge[tx->tx_nwrq];
		sge->addr = kiblnd_rd_frag_addr(srcrd, srcidx);
		sge->lkey = kiblnd_rd_frag_key(srcrd, srcidx);
		sge->length = wrknob;

		wrq = &tx->tx_wrq[tx->tx_nwrq];
		next = wrq + 1;

		wrq->wr.next = &next->wr;
		wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
		wrq->wr.sg_list = sge;
		wrq->wr.num_sge = 1;
		wrq->wr.opcode = IB_WR_RDMA_WRITE;
		wrq->wr.send_flags = 0;

		wrq->remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
		wrq->rkey = kiblnd_rd_frag_key(dstrd, dstidx);

		srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
		dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);

		resid -= wrknob;

		tx->tx_nwrq++;
		wrq++;
		sge++;
	}

	if (rc < 0) /* no RDMA if completing with failure */
		tx->tx_nwrq = 0;

	ibmsg->ibm_u.completion.ibcm_status = rc;
	ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
	kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx,
			   type, sizeof(kib_completion_msg_t));

	return rc;
}

void
kiblnd_queue_tx_locked(kib_tx_t *tx, kib_conn_t *conn)
{
	struct list_head *q;

	LASSERT(tx->tx_nwrq > 0);	/* work items set up */
	LASSERT(!tx->tx_queued);	/* not queued for sending already */
	LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);

	tx->tx_queued = 1;
	tx->tx_deadline = jiffies + (*kiblnd_tunables.kib_timeout * HZ);

	if (!tx->tx_conn) {
		kiblnd_conn_addref(conn);
		tx->tx_conn = conn;
		LASSERT(tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE);
	} else {
		/* PUT_DONE first attached to conn as a PUT_REQ */
		LASSERT(tx->tx_conn == conn);
		LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE);
	}

	switch (tx->tx_msg->ibm_type) {
	default:
		LBUG();

	case IBLND_MSG_PUT_REQ:
	case IBLND_MSG_GET_REQ:
		q = &conn->ibc_tx_queue_rsrvd;
		break;

	case IBLND_MSG_PUT_NAK:
	case IBLND_MSG_PUT_ACK:
	case IBLND_MSG_PUT_DONE:
	case IBLND_MSG_GET_DONE:
		q = &conn->ibc_tx_queue_nocred;
		break;

	case IBLND_MSG_NOOP:
		if (IBLND_OOB_CAPABLE(conn->ibc_version))
			q = &conn->ibc_tx_queue_nocred;
		else
			q = &conn->ibc_tx_noops;
		break;

	case IBLND_MSG_IMMEDIATE:
		q = &conn->ibc_tx_queue;
		break;
	}

	list_add_tail(&tx->tx_list, q);
}

void
kiblnd_queue_tx(kib_tx_t *tx, kib_conn_t *conn)
{
	spin_lock(&conn->ibc_lock);
	kiblnd_queue_tx_locked(tx, conn);
	spin_unlock(&conn->ibc_lock);

	kiblnd_check_sends(conn);
}

static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
			       struct sockaddr_in *srcaddr,
			       struct sockaddr_in *dstaddr,
			       int timeout_ms)
{
	unsigned short port;
	int rc;

	/* allow the port to be reused */
	rc = rdma_set_reuseaddr(cmid, 1);
	if (rc) {
		CERROR("Unable to set reuse on cmid: %d\n", rc);
		return rc;
	}

	/* look for a free privileged port */
	for (port = PROT_SOCK - 1; port > 0; port--) {
		srcaddr->sin_port = htons(port);
		rc = rdma_resolve_addr(cmid,
				       (struct sockaddr *)srcaddr,
				       (struct sockaddr *)dstaddr,
				       timeout_ms);
		if (!rc) {
			CDEBUG(D_NET, "bound to port %hu\n", port);
			return 0;
		} else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) {
			CDEBUG(D_NET, "bind to port %hu failed: %d\n",
			       port, rc);
		} else {
			return rc;
		}
	}

	CERROR("Failed to bind to a free privileged port\n");
	return rc;
}

static void
kiblnd_connect_peer(kib_peer_t *peer)
{
	struct rdma_cm_id *cmid;
	kib_dev_t *dev;
	kib_net_t *net = peer->ibp_ni->ni_data;
	struct sockaddr_in srcaddr;
	struct sockaddr_in dstaddr;
	int rc;

	LASSERT(net);
	LASSERT(peer->ibp_connecting > 0);

	cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP,
				     IB_QPT_RC);

	if (IS_ERR(cmid)) {
		CERROR("Can't create CMID for %s: %ld\n",
		       libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid));
		rc = PTR_ERR(cmid);
		goto failed;
	}

	dev = net->ibn_dev;
	memset(&srcaddr, 0, sizeof(srcaddr));
	srcaddr.sin_family = AF_INET;
	srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip);

	memset(&dstaddr, 0, sizeof(dstaddr));
	dstaddr.sin_family = AF_INET;
	dstaddr.sin_port = htons(*kiblnd_tunables.kib_service);
	dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));

	kiblnd_peer_addref(peer); /* cmid's ref */

	if (*kiblnd_tunables.kib_use_priv_port) {
		rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
					 *kiblnd_tunables.kib_timeout * 1000);
	} else {
		rc = rdma_resolve_addr(cmid,
				       (struct sockaddr *)&srcaddr,
				       (struct sockaddr *)&dstaddr,
				       *kiblnd_tunables.kib_timeout * 1000);
	}
	if (rc) {
		/* Can't initiate address resolution: */
		CERROR("Can't resolve addr for %s: %d\n",
		       libcfs_nid2str(peer->ibp_nid), rc);
		goto failed2;
	}

	LASSERT(cmid->device);
	CDEBUG(D_NET, "%s: connection bound to %s:%pI4h:%s\n",
	       libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname,
	       &dev->ibd_ifip, cmid->device->name);

	return;

failed2:
	kiblnd_peer_decref(peer); /* cmid's ref */
	rdma_destroy_id(cmid);
failed:
	kiblnd_peer_connect_failed(peer, 1, rc);
}

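/*
 * Commit 'tx' (if any) to be sent to 'nid'. The common case is found under
 * the global read lock: a peer with an established connection. Otherwise
 * retry under the write lock, queueing on a still-connecting peer, and as
 * a last resort create a brand new peer and initiate a connection to it.
 */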
void
kiblnd_launch_tx(lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
{
	kib_peer_t *peer;
	kib_peer_t *peer2;
	kib_conn_t *conn;
	rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
	unsigned long flags;
	int rc;

	/*
	 * If I get here, I've committed to send, so I complete the tx with
	 * failure on any problems
	 */
	LASSERT(!tx || !tx->tx_conn);	 /* only set when assigned a conn */
	LASSERT(!tx || tx->tx_nwrq > 0); /* work items have been set up */

	/*
	 * First time, just use a read lock since I expect to find my peer
	 * connected
	 */
	read_lock_irqsave(g_lock, flags);

	peer = kiblnd_find_peer_locked(nid);
	if (peer && !list_empty(&peer->ibp_conns)) {
		/* Found a peer with an established connection */
		conn = kiblnd_get_conn_locked(peer);
		kiblnd_conn_addref(conn); /* 1 ref for me... */

		read_unlock_irqrestore(g_lock, flags);

		if (tx)
			kiblnd_queue_tx(tx, conn);
		kiblnd_conn_decref(conn); /* ...to here */
		return;
	}

	read_unlock(g_lock);
	/* Re-try with a write lock */
	write_lock(g_lock);

	peer = kiblnd_find_peer_locked(nid);
	if (peer) {
		if (list_empty(&peer->ibp_conns)) {
			/* found a peer, but it's still connecting... */
			LASSERT(peer->ibp_connecting ||
				peer->ibp_accepting);
			if (tx)
				list_add_tail(&tx->tx_list,
					      &peer->ibp_tx_queue);
			write_unlock_irqrestore(g_lock, flags);
		} else {
			conn = kiblnd_get_conn_locked(peer);
			kiblnd_conn_addref(conn); /* 1 ref for me... */

			write_unlock_irqrestore(g_lock, flags);

			if (tx)
				kiblnd_queue_tx(tx, conn);
			kiblnd_conn_decref(conn); /* ...to here */
		}
		return;
	}

	write_unlock_irqrestore(g_lock, flags);

	/* Allocate a peer ready to add to the peer table and retry */
	rc = kiblnd_create_peer(ni, &peer, nid);
	if (rc) {
		CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
		if (tx) {
			tx->tx_status = -EHOSTUNREACH;
			tx->tx_waiting = 0;
			kiblnd_tx_done(ni, tx);
		}
		return;
	}

	write_lock_irqsave(g_lock, flags);

	peer2 = kiblnd_find_peer_locked(nid);
	if (peer2) {
		if (list_empty(&peer2->ibp_conns)) {
			/* found a peer, but it's still connecting... */
			LASSERT(peer2->ibp_connecting ||
				peer2->ibp_accepting);
			if (tx)
				list_add_tail(&tx->tx_list,
					      &peer2->ibp_tx_queue);
			write_unlock_irqrestore(g_lock, flags);
		} else {
			conn = kiblnd_get_conn_locked(peer2);
			kiblnd_conn_addref(conn); /* 1 ref for me... */

			write_unlock_irqrestore(g_lock, flags);

			if (tx)
				kiblnd_queue_tx(tx, conn);
			kiblnd_conn_decref(conn); /* ...to here */
		}

		kiblnd_peer_decref(peer);
		return;
	}

	/* Brand new peer */
	LASSERT(!peer->ibp_connecting);
	peer->ibp_connecting = 1;

	/* always called with a ref on ni, which prevents ni being shutdown */
	LASSERT(!((kib_net_t *)ni->ni_data)->ibn_shutdown);

	if (tx)
		list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);

	kiblnd_peer_addref(peer);
	list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));

	write_unlock_irqrestore(g_lock, flags);

	kiblnd_connect_peer(peer);
	kiblnd_peer_decref(peer);
}

int
kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
{
	lnet_hdr_t *hdr = &lntmsg->msg_hdr;
	int type = lntmsg->msg_type;
	lnet_process_id_t target = lntmsg->msg_target;
	int target_is_router = lntmsg->msg_target_is_router;
	int routing = lntmsg->msg_routing;
	unsigned int payload_niov = lntmsg->msg_niov;
	struct kvec *payload_iov = lntmsg->msg_iov;
	lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
	unsigned int payload_offset = lntmsg->msg_offset;
	unsigned int payload_nob = lntmsg->msg_len;
	kib_msg_t *ibmsg;
	kib_rdma_desc_t *rd;
	kib_tx_t *tx;
	int nob;
	int rc;

	/* NB 'private' is different depending on what we're sending.... */

	CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
	       payload_nob, payload_niov, libcfs_id2str(target));

	LASSERT(!payload_nob || payload_niov > 0);
	LASSERT(payload_niov <= LNET_MAX_IOV);

	/* Thread context */
	LASSERT(!in_interrupt());
	/* payload is either all vaddrs or all pages */
	LASSERT(!(payload_kiov && payload_iov));

	switch (type) {
	default:
		LBUG();
		return -EIO;

	case LNET_MSG_ACK:
		LASSERT(!payload_nob);
		break;

	case LNET_MSG_GET:
		if (routing || target_is_router)
			break;	/* send IMMEDIATE */

		/* is the REPLY message too small for RDMA? */
		nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
		if (nob <= IBLND_MSG_SIZE)
			break;	/* send IMMEDIATE */

		tx = kiblnd_get_idle_tx(ni, target.nid);
		if (!tx) {
			CERROR("Can't allocate txd for GET to %s\n",
			       libcfs_nid2str(target.nid));
			return -ENOMEM;
		}

		ibmsg = tx->tx_msg;
		rd = &ibmsg->ibm_u.get.ibgm_rd;
		if (!(lntmsg->msg_md->md_options & LNET_MD_KIOV))
			rc = kiblnd_setup_rd_iov(ni, tx, rd,
						 lntmsg->msg_md->md_niov,
						 lntmsg->msg_md->md_iov.iov,
						 0, lntmsg->msg_md->md_length);
		else
			rc = kiblnd_setup_rd_kiov(ni, tx, rd,
						  lntmsg->msg_md->md_niov,
						  lntmsg->msg_md->md_iov.kiov,
						  0, lntmsg->msg_md->md_length);
		if (rc) {
			CERROR("Can't setup GET sink for %s: %d\n",
			       libcfs_nid2str(target.nid), rc);
			kiblnd_tx_done(ni, tx);
			return -EIO;
		}

		nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[rd->rd_nfrags]);
		ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
		ibmsg->ibm_u.get.ibgm_hdr = *hdr;

		kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);

		tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
		if (!tx->tx_lntmsg[1]) {
			CERROR("Can't create reply for GET -> %s\n",
			       libcfs_nid2str(target.nid));
			kiblnd_tx_done(ni, tx);
			return -EIO;
		}

		tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */
		tx->tx_waiting = 1;	   /* waiting for GET_DONE */
		kiblnd_launch_tx(ni, tx, target.nid);
		return 0;

	case LNET_MSG_REPLY:
	case LNET_MSG_PUT:
		/* Is the payload small enough not to need RDMA? */
		nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
		if (nob <= IBLND_MSG_SIZE)
			break;	/* send IMMEDIATE */

		tx = kiblnd_get_idle_tx(ni, target.nid);
		if (!tx) {
			CERROR("Can't allocate %s txd for %s\n",
			       type == LNET_MSG_PUT ? "PUT" : "REPLY",
			       libcfs_nid2str(target.nid));
			return -ENOMEM;
		}

		if (!payload_kiov)
			rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
						 payload_niov, payload_iov,
						 payload_offset, payload_nob);
		else
			rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
						  payload_niov, payload_kiov,
						  payload_offset, payload_nob);
		if (rc) {
			CERROR("Can't setup PUT src for %s: %d\n",
			       libcfs_nid2str(target.nid), rc);
			kiblnd_tx_done(ni, tx);
			return -EIO;
		}

		ibmsg = tx->tx_msg;
		ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
		ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
		kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));

		tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
		tx->tx_waiting = 1;	   /* waiting for PUT_{ACK,NAK} */
		kiblnd_launch_tx(ni, tx, target.nid);
		return 0;
	}

	/* send IMMEDIATE */

	LASSERT(offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
		<= IBLND_MSG_SIZE);

	tx = kiblnd_get_idle_tx(ni, target.nid);
	if (!tx) {
		CERROR("Can't send %d to %s: tx descs exhausted\n",
		       type, libcfs_nid2str(target.nid));
		return -ENOMEM;
	}

	ibmsg = tx->tx_msg;
	ibmsg->ibm_u.immediate.ibim_hdr = *hdr;

	if (payload_kiov)
		lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg,
				    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
				    payload_niov, payload_kiov,
				    payload_offset, payload_nob);
	else
		lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg,
				   offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
				   payload_niov, payload_iov,
				   payload_offset, payload_nob);

	nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
	kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob);

	tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
	kiblnd_launch_tx(ni, tx, target.nid);
	return 0;
}

static void
kiblnd_reply(lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
{
	lnet_process_id_t target = lntmsg->msg_target;
	unsigned int niov = lntmsg->msg_niov;
	struct kvec *iov = lntmsg->msg_iov;
	lnet_kiov_t *kiov = lntmsg->msg_kiov;
	unsigned int offset = lntmsg->msg_offset;
	unsigned int nob = lntmsg->msg_len;
	kib_tx_t *tx;
	int rc;

	tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid);
	if (!tx) {
		CERROR("Can't get tx for REPLY to %s\n",
		       libcfs_nid2str(target.nid));
		goto failed_0;
	}

	if (!nob)
		rc = 0;
	else if (!kiov)
		rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
					 niov, iov, offset, nob);
	else
		rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
					  niov, kiov, offset, nob);

	if (rc) {
		CERROR("Can't setup GET src for %s: %d\n",
		       libcfs_nid2str(target.nid), rc);
		goto failed_1;
	}

	rc = kiblnd_init_rdma(rx->rx_conn, tx,
			      IBLND_MSG_GET_DONE, nob,
			      &rx->rx_msg->ibm_u.get.ibgm_rd,
			      rx->rx_msg->ibm_u.get.ibgm_cookie);
	if (rc < 0) {
		CERROR("Can't setup rdma for GET from %s: %d\n",
		       libcfs_nid2str(target.nid), rc);
		goto failed_1;
	}

	if (!nob) {
		/* No RDMA: local completion may happen now! */
		lnet_finalize(ni, lntmsg, 0);
	} else {
		/* RDMA: lnet_finalize(lntmsg) when it completes */
		tx->tx_lntmsg[0] = lntmsg;
	}

	kiblnd_queue_tx(tx, rx->rx_conn);
	return;

failed_1:
	kiblnd_tx_done(ni, tx);
failed_0:
	lnet_finalize(ni, lntmsg, -EIO);
}

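/*
 * LNet receive entry point. IMMEDIATE payloads are copied straight out of
 * the rx buffer; a PUT_REQ is answered with a PUT_ACK describing the sink
 * buffer (or a PUT_NAK when nothing is wanted); a GET_REQ RDMAs the reply
 * payload back via kiblnd_reply().
 */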
1660int
6fe7f962 1661kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
c314c319
JS
1662 unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov,
1663 unsigned int offset, unsigned int mlen, unsigned int rlen)
d7e09d03 1664{
ec3d17c0
MS
1665 kib_rx_t *rx = private;
1666 kib_msg_t *rxmsg = rx->rx_msg;
1667 kib_conn_t *conn = rx->rx_conn;
1668 kib_tx_t *tx;
ec3d17c0
MS
1669 int nob;
1670 int post_credit = IBLND_POSTRX_PEER_CREDIT;
1671 int rc = 0;
d7e09d03 1672
6fe7f962
GM
1673 LASSERT(mlen <= rlen);
1674 LASSERT(!in_interrupt());
d7e09d03 1675 /* Either all pages or all vaddrs */
06ace26e 1676 LASSERT(!(kiov && iov));
d7e09d03
PT
1677
1678 switch (rxmsg->ibm_type) {
1679 default:
1680 LBUG();
1681
1682 case IBLND_MSG_IMMEDIATE:
1683 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
1684 if (nob > rx->rx_nob) {
6fe7f962 1685 CERROR("Immediate message from %s too big: %d(%d)\n",
c314c319
JS
1686 libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
1687 nob, rx->rx_nob);
d7e09d03
PT
1688 rc = -EPROTO;
1689 break;
1690 }
1691
06ace26e 1692 if (kiov)
d7e09d03
PT
1693 lnet_copy_flat2kiov(niov, kiov, offset,
1694 IBLND_MSG_SIZE, rxmsg,
1695 offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1696 mlen);
1697 else
1698 lnet_copy_flat2iov(niov, iov, offset,
1699 IBLND_MSG_SIZE, rxmsg,
1700 offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
1701 mlen);
6fe7f962 1702 lnet_finalize(ni, lntmsg, 0);
d7e09d03
PT
1703 break;
1704
2f3622b9
IH
1705 case IBLND_MSG_PUT_REQ: {
1706 kib_msg_t *txmsg;
1707 kib_rdma_desc_t *rd;
1708
5fd88337 1709 if (!mlen) {
d7e09d03
PT
1710 lnet_finalize(ni, lntmsg, 0);
1711 kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0,
1712 rxmsg->ibm_u.putreq.ibprm_cookie);
1713 break;
1714 }
1715
1716 tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
06ace26e 1717 if (!tx) {
d7e09d03
PT
1718 CERROR("Can't allocate tx for %s\n",
1719 libcfs_nid2str(conn->ibc_peer->ibp_nid));
1720 /* Not replying will break the connection */
1721 rc = -ENOMEM;
1722 break;
1723 }
1724
1725 txmsg = tx->tx_msg;
2f3622b9 1726 rd = &txmsg->ibm_u.putack.ibpam_rd;
06ace26e 1727 if (!kiov)
2f3622b9 1728 rc = kiblnd_setup_rd_iov(ni, tx, rd,
d7e09d03
PT
1729 niov, iov, offset, mlen);
1730 else
2f3622b9 1731 rc = kiblnd_setup_rd_kiov(ni, tx, rd,
d7e09d03 1732 niov, kiov, offset, mlen);
5fd88337 1733 if (rc) {
d7e09d03
PT
1734 CERROR("Can't setup PUT sink for %s: %d\n",
1735 libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
1736 kiblnd_tx_done(ni, tx);
1737 /* tell peer it's over */
1738 kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc,
1739 rxmsg->ibm_u.putreq.ibprm_cookie);
1740 break;
1741 }
1742
2f3622b9 1743 nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[rd->rd_nfrags]);
d7e09d03
PT
1744 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
1745 txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
1746
1747 kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob);
1748
1749 tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
1750 tx->tx_waiting = 1; /* waiting for PUT_DONE */
1751 kiblnd_queue_tx(tx, conn);
1752
1753 /* reposted buffer reserved for PUT_DONE */
1754 post_credit = IBLND_POSTRX_NO_CREDIT;
1755 break;
2f3622b9 1756 }
d7e09d03
PT
1757
1758 case IBLND_MSG_GET_REQ:
06ace26e 1759 if (lntmsg) {
d7e09d03
PT
1760 /* Optimized GET; RDMA lntmsg's payload */
1761 kiblnd_reply(ni, rx, lntmsg);
1762 } else {
1763 /* GET didn't match anything */
1764 kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE,
1765 -ENODATA,
1766 rxmsg->ibm_u.get.ibgm_cookie);
1767 }
1768 break;
1769 }
1770
1771 kiblnd_post_rx(rx, post_credit);
1772 return rc;
1773}
1774
1775int
1776kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name)
1777{
9edf0f67 1778 struct task_struct *task = kthread_run(fn, arg, "%s", name);
d7e09d03
PT
1779
1780 if (IS_ERR(task))
1781 return PTR_ERR(task);
1782
1783 atomic_inc(&kiblnd_data.kib_nthreads);
1784 return 0;
1785}
1786
a8046a28 1787static void
6fe7f962 1788kiblnd_thread_fini(void)
d7e09d03 1789{
6fe7f962 1790 atomic_dec(&kiblnd_data.kib_nthreads);
d7e09d03
PT
1791}
1792
1793void
6fe7f962 1794kiblnd_peer_alive(kib_peer_t *peer)
d7e09d03
PT
1795{
1796 /* This is racy, but everyone's only writing cfs_time_current() */
1797 peer->ibp_last_alive = cfs_time_current();
1798 mb();
1799}
1800
a8046a28 1801static void
6fe7f962 1802kiblnd_peer_notify(kib_peer_t *peer)
d7e09d03 1803{
ec3d17c0
MS
1804 int error = 0;
1805 unsigned long last_alive = 0;
d7e09d03
PT
1806 unsigned long flags;
1807
1808 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1809
1810 if (list_empty(&peer->ibp_conns) &&
5fd88337
JS
1811 !peer->ibp_accepting &&
1812 !peer->ibp_connecting &&
1813 peer->ibp_error) {
d7e09d03
PT
1814 error = peer->ibp_error;
1815 peer->ibp_error = 0;
1816
1817 last_alive = peer->ibp_last_alive;
1818 }
1819
1820 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1821
5fd88337 1822 if (error)
d7e09d03
PT
1823 lnet_notify(peer->ibp_ni,
1824 peer->ibp_nid, 0, last_alive);
1825}
1826
1827void
6fe7f962 1828kiblnd_close_conn_locked(kib_conn_t *conn, int error)
d7e09d03 1829{
4420cfd3
JS
1830 /*
1831 * This just does the immediate housekeeping. 'error' is zero for a
d7e09d03
PT
1832 * normal shutdown which can happen only after the connection has been
1833 * established. If the connection is established, schedule the
4420cfd3 1834 * connection to be finished off by the connd. Otherwise the connd is
d7e09d03 1835 * already dealing with it (either to set it up or tear it down).
4420cfd3
JS
1836 * Caller holds kib_global_lock exclusively in irq context
1837 */
ec3d17c0
MS
1838 kib_peer_t *peer = conn->ibc_peer;
1839 kib_dev_t *dev;
1840 unsigned long flags;
d7e09d03 1841
5fd88337 1842 LASSERT(error || conn->ibc_state >= IBLND_CONN_ESTABLISHED);
d7e09d03 1843
5fd88337 1844 if (error && !conn->ibc_comms_error)
1845 conn->ibc_comms_error = error;
1846
1847 if (conn->ibc_state != IBLND_CONN_ESTABLISHED)
1848 return; /* already being handled */
1849
5fd88337 1850 if (!error &&
1851 list_empty(&conn->ibc_tx_noops) &&
1852 list_empty(&conn->ibc_tx_queue) &&
1853 list_empty(&conn->ibc_tx_queue_rsrvd) &&
1854 list_empty(&conn->ibc_tx_queue_nocred) &&
1855 list_empty(&conn->ibc_active_txs)) {
1856 CDEBUG(D_NET, "closing conn to %s\n",
1857 libcfs_nid2str(peer->ibp_nid));
1858 } else {
1859 CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
1860 libcfs_nid2str(peer->ibp_nid), error,
1861 list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
1862 list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
1863 list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
1864 list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
1865 list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
1866 }
1867
1868 dev = ((kib_net_t *)peer->ibp_ni->ni_data)->ibn_dev;
1869 list_del(&conn->ibc_list);
1870 /* connd (see below) takes over ibc_list's ref */
1871
6fe7f962 1872 if (list_empty(&peer->ibp_conns) && /* no more conns */
1873 kiblnd_peer_active(peer)) { /* still in peer table */
1874 kiblnd_unlink_peer_locked(peer);
1875
1876 /* set/clear error on last conn */
1877 peer->ibp_error = conn->ibc_comms_error;
1878 }
1879
1880 kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING);
1881
5fd88337 1882 if (error &&
1883 kiblnd_dev_can_failover(dev)) {
1884 list_add_tail(&dev->ibd_fail_list,
1885 &kiblnd_data.kib_failed_devs);
1886 wake_up(&kiblnd_data.kib_failover_waitq);
1887 }
1888
1889 spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
1890
1891 list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns);
1892 wake_up(&kiblnd_data.kib_connd_waitq);
1893
1894 spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
1895}
1896
1897void
1898kiblnd_close_conn(kib_conn_t *conn, int error)
1899{
1900 unsigned long flags;
1901
1902 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1903
1904 kiblnd_close_conn_locked(conn, error);
1905
1906 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1907}
1908
a8046a28 1909static void
1910kiblnd_handle_early_rxs(kib_conn_t *conn)
1911{
1912 unsigned long flags;
1913 kib_rx_t *rx;
5a2ca43f 1914 kib_rx_t *tmp;
1915
1916 LASSERT(!in_interrupt());
1917 LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
1918
1919 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
5a2ca43f 1920 list_for_each_entry_safe(rx, tmp, &conn->ibc_early_rxs, rx_list) {
1921 list_del(&rx->rx_list);
1922 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1923
1924 kiblnd_handle_rx(rx);
1925
1926 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1927 }
1928 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1929}
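/*
 * Editor's note: the loop above must drop kib_global_lock around each
 * kiblnd_handle_rx() call; the iteration stays safe because every rx is
 * unlinked from ibc_early_rxs before the unlock, so no other thread can
 * requeue or free it under the iterator.
 */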
1930
a8046a28 1931static void
1932kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs)
1933{
6fe7f962 1934 LIST_HEAD(zombies);
1935 struct list_head *tmp;
1936 struct list_head *nxt;
1937 kib_tx_t *tx;
1938
1939 spin_lock(&conn->ibc_lock);
1940
1941 list_for_each_safe(tmp, nxt, txs) {
1942 tx = list_entry(tmp, kib_tx_t, tx_list);
1943
1944 if (txs == &conn->ibc_active_txs) {
6fe7f962 1945 LASSERT(!tx->tx_queued);
5fd88337 1946 LASSERT(tx->tx_waiting || tx->tx_sending);
d7e09d03 1947 } else {
6fe7f962 1948 LASSERT(tx->tx_queued);
1949 }
1950
1951 tx->tx_status = -ECONNABORTED;
1952 tx->tx_waiting = 0;
1953
5fd88337 1954 if (!tx->tx_sending) {
d7e09d03 1955 tx->tx_queued = 0;
1956 list_del(&tx->tx_list);
1957 list_add(&tx->tx_list, &zombies);
1958 }
1959 }
1960
1961 spin_unlock(&conn->ibc_lock);
1962
1963 kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED);
1964}
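/*
 * Editor's sketch of the two-phase teardown pattern used above (a generic
 * illustration; some_lock, victims and finalise_list() are hypothetical):
 * collect victims under the spinlock, then complete them after dropping it,
 * because completion callbacks must not run under ibc_lock.
 */
#if 0	/* illustrative only */
	LIST_HEAD(zombies);

	spin_lock(&some_lock);
	list_splice_init(&victims, &zombies);	/* phase 1: collect under lock */
	spin_unlock(&some_lock);

	finalise_list(&zombies);		/* phase 2: complete unlocked */
#endif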
1965
a8046a28 1966static void
6fe7f962 1967kiblnd_finalise_conn(kib_conn_t *conn)
d7e09d03 1968{
1969 LASSERT(!in_interrupt());
1970 LASSERT(conn->ibc_state > IBLND_CONN_INIT);
1971
1972 kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
1973
1974 /*
1975 * abort_receives moves QP state to IB_QPS_ERR. This is only required
d7e09d03 1976 * for connections that didn't get as far as being connected, because
1977 * rdma_disconnect() does this for free.
1978 */
1979 kiblnd_abort_receives(conn);
1980
1981 /*
1982 * Complete all tx descs not waiting for sends to complete.
1983 * NB we should be safe from RDMA now that the QP has changed state
1984 */
1985 kiblnd_abort_txs(conn, &conn->ibc_tx_noops);
1986 kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
1987 kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
1988 kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
1989 kiblnd_abort_txs(conn, &conn->ibc_active_txs);
1990
1991 kiblnd_handle_early_rxs(conn);
1992}
1993
1994void
6fe7f962 1995kiblnd_peer_connect_failed(kib_peer_t *peer, int active, int error)
d7e09d03 1996{
6fe7f962 1997 LIST_HEAD(zombies);
ec3d17c0 1998 unsigned long flags;
d7e09d03 1999
5fd88337 2000 LASSERT(error);
6fe7f962 2001 LASSERT(!in_interrupt());
2002
2003 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2004
2005 if (active) {
6fe7f962 2006 LASSERT(peer->ibp_connecting > 0);
2007 peer->ibp_connecting--;
2008 } else {
6fe7f962 2009 LASSERT(peer->ibp_accepting > 0);
d7e09d03
PT
2010 peer->ibp_accepting--;
2011 }
2012
2013 if (peer->ibp_connecting ||
2014 peer->ibp_accepting) {
2015 /* another connection attempt under way... */
2016 write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
c314c319 2017 flags);
2018 return;
2019 }
2020
2021 if (list_empty(&peer->ibp_conns)) {
2022 /* Take peer's blocked transmits to complete with error */
2023 list_add(&zombies, &peer->ibp_tx_queue);
2024 list_del_init(&peer->ibp_tx_queue);
2025
2026 if (kiblnd_peer_active(peer))
2027 kiblnd_unlink_peer_locked(peer);
2028
2029 peer->ibp_error = error;
2030 } else {
2031 /* Can't have blocked transmits if there are connections */
6fe7f962 2032 LASSERT(list_empty(&peer->ibp_tx_queue));
2033 }
2034
2035 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2036
2037 kiblnd_peer_notify(peer);
2038
6fe7f962 2039 if (list_empty(&zombies))
2040 return;
2041
2042 CNETERR("Deleting messages for %s: connection failed\n",
2043 libcfs_nid2str(peer->ibp_nid));
2044
2045 kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH);
2046}
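/*
 * Editor's note: the list_add() + list_del_init() pair above is an O(1)
 * splice - 'zombies' takes over the whole ibp_tx_queue chain - so the
 * blocked transmits can then complete with -EHOSTUNREACH outside
 * kib_global_lock.
 */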
2047
2048void
2049kiblnd_connreq_done(kib_conn_t *conn, int status)
2050{
2051 kib_peer_t *peer = conn->ibc_peer;
2052 kib_tx_t *tx;
5a2ca43f 2053 kib_tx_t *tmp;
2054 struct list_head txs;
2055 unsigned long flags;
2056 int active;
2057
2058 active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
2059
1d8cb70c 2060 CDEBUG(D_NET, "%s: active(%d), version(%x), status(%d)\n",
2061 libcfs_nid2str(peer->ibp_nid), active,
2062 conn->ibc_version, status);
2063
2064 LASSERT(!in_interrupt());
2065 LASSERT((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
c314c319 2066 peer->ibp_connecting > 0) ||
d7e09d03 2067 (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
c314c319 2068 peer->ibp_accepting > 0));
2069
2070 LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
2071 conn->ibc_connvars = NULL;
2072
5fd88337 2073 if (status) {
2074 /* failed to establish connection */
2075 kiblnd_peer_connect_failed(peer, active, status);
2076 kiblnd_finalise_conn(conn);
2077 return;
2078 }
2079
2080 /* connection established */
2081 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2082
2083 conn->ibc_last_send = jiffies;
2084 kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
2085 kiblnd_peer_alive(peer);
2086
2087 /*
2088 * Add conn to peer's list and nuke any dangling conns from a different
2089 * peer instance...
2090 */
2091 kiblnd_conn_addref(conn); /* +1 ref for ibc_list */
2092 list_add(&conn->ibc_list, &peer->ibp_conns);
2093 if (active)
2094 peer->ibp_connecting--;
2095 else
2096 peer->ibp_accepting--;
2097
5fd88337 2098 if (!peer->ibp_version) {
2099 peer->ibp_version = conn->ibc_version;
2100 peer->ibp_incarnation = conn->ibc_incarnation;
2101 }
2102
2103 if (peer->ibp_version != conn->ibc_version ||
2104 peer->ibp_incarnation != conn->ibc_incarnation) {
2105 kiblnd_close_stale_conns_locked(peer, conn->ibc_version,
2106 conn->ibc_incarnation);
2107 peer->ibp_version = conn->ibc_version;
2108 peer->ibp_incarnation = conn->ibc_incarnation;
2109 }
2110
2111 /* grab pending txs while I have the lock */
2112 list_add(&txs, &peer->ibp_tx_queue);
2113 list_del_init(&peer->ibp_tx_queue);
2114
2115 if (!kiblnd_peer_active(peer) || /* peer has been deleted */
5fd88337 2116 conn->ibc_comms_error) { /* error has happened already */
2117 lnet_ni_t *ni = peer->ibp_ni;
2118
2119 /* start to shut down connection */
2120 kiblnd_close_conn_locked(conn, -ECONNABORTED);
2121 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2122
2123 kiblnd_txlist_done(ni, &txs, -ECONNABORTED);
2124
2125 return;
2126 }
2127
2128 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2129
2130 /* Schedule blocked txs */
2131 spin_lock(&conn->ibc_lock);
5a2ca43f 2132 list_for_each_entry_safe(tx, tmp, &txs, tx_list) {
2133 list_del(&tx->tx_list);
2134
2135 kiblnd_queue_tx_locked(tx, conn);
2136 }
2137 spin_unlock(&conn->ibc_lock);
2138
2139 kiblnd_check_sends(conn);
2140
2141 /* schedule blocked rxs */
2142 kiblnd_handle_early_rxs(conn);
2143}
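/*
 * Editor's note: on success kiblnd_connreq_done() does three things in
 * order - marks the conn ESTABLISHED under kib_global_lock, drains the
 * peer's queued transmits onto the new connection, then replays any early
 * receives. If the peer was deleted or a comms error raced in, the newly
 * established conn is closed again and the queued txs complete with
 * -ECONNABORTED instead.
 */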
2144
a8046a28 2145static void
2146kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej)
2147{
ec3d17c0 2148 int rc;
2149
2150 rc = rdma_reject(cmid, rej, sizeof(*rej));
2151
5fd88337 2152 if (rc)
2153 CWARN("Error %d sending reject\n", rc);
2154}
2155
a8046a28 2156static int
6fe7f962 2157kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
d7e09d03 2158{
2159 rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
2160 kib_msg_t *reqmsg = priv;
2161 kib_msg_t *ackmsg;
2162 kib_dev_t *ibdev;
2163 kib_peer_t *peer;
2164 kib_peer_t *peer2;
2165 kib_conn_t *conn;
2166 lnet_ni_t *ni = NULL;
2167 kib_net_t *net = NULL;
2168 lnet_nid_t nid;
d7e09d03 2169 struct rdma_conn_param cp;
2170 kib_rej_t rej;
2171 int version = IBLND_MSG_VERSION;
2172 unsigned long flags;
2173 int rc;
2174 struct sockaddr_in *peer_addr;
50ffcb7e 2175
6fe7f962 2176 LASSERT(!in_interrupt());
2177
2178 /* cmid inherits 'context' from the corresponding listener id */
2179 ibdev = (kib_dev_t *)cmid->context;
06ace26e 2180 LASSERT(ibdev);
2181
2182 memset(&rej, 0, sizeof(rej));
2183 rej.ibr_magic = IBLND_MSG_MAGIC;
2184 rej.ibr_why = IBLND_REJECT_FATAL;
2185 rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE;
2186
0eee6778 2187 peer_addr = (struct sockaddr_in *)&cmid->route.addr.dst_addr;
2188 if (*kiblnd_tunables.kib_require_priv_port &&
2189 ntohs(peer_addr->sin_port) >= PROT_SOCK) {
2190 __u32 ip = ntohl(peer_addr->sin_addr.s_addr);
50ffcb7e 2191
2192 CERROR("Peer's port (%pI4h:%hu) is not privileged\n",
2193 &ip, ntohs(peer_addr->sin_port));
2194 goto failed;
2195 }
2196
2197 if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
2198 CERROR("Short connection request\n");
2199 goto failed;
2200 }
2201
2202 /*
2203 * Future protocol version compatibility support! If the
2204 * o2iblnd-specific protocol changes, or when LNET unifies
2205 * protocols over all LNDs, the initial connection will
2206 * negotiate a protocol version. I trap this here to avoid
2207 * console errors; the reject tells the peer which protocol I
2208 * speak.
2209 */
2210 if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
2211 reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
2212 goto failed;
2213 if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
2214 reqmsg->ibm_version != IBLND_MSG_VERSION &&
2215 reqmsg->ibm_version != IBLND_MSG_VERSION_1)
2216 goto failed;
2217 if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
2218 reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
2219 reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
2220 goto failed;
2221
2222 rc = kiblnd_unpack_msg(reqmsg, priv_nob);
5fd88337 2223 if (rc) {
2224 CERROR("Can't parse connection request: %d\n", rc);
2225 goto failed;
2226 }
2227
2228 nid = reqmsg->ibm_srcnid;
ec3d17c0 2229 ni = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
d7e09d03 2230
06ace26e 2231 if (ni) {
2232 net = (kib_net_t *)ni->ni_data;
2233 rej.ibr_incarnation = net->ibn_incarnation;
2234 }
2235
06ace26e 2236 if (!ni || /* no matching net */
2237 ni->ni_nid != reqmsg->ibm_dstnid || /* right NET, wrong NID! */
2238 net->ibn_dev != ibdev) { /* wrong device */
2239 CERROR("Can't accept %s on %s (%s:%d:%pI4h): bad dst nid %s\n",
2240 libcfs_nid2str(nid),
06ace26e 2241 !ni ? "NA" : libcfs_nid2str(ni->ni_nid),
d7e09d03 2242 ibdev->ibd_ifname, ibdev->ibd_nnets,
5e8f6920 2243 &ibdev->ibd_ifip,
2244 libcfs_nid2str(reqmsg->ibm_dstnid));
2245
2246 goto failed;
2247 }
2248
2249 /* check time stamp as soon as possible */
5fd88337 2250 if (reqmsg->ibm_dststamp &&
2251 reqmsg->ibm_dststamp != net->ibn_incarnation) {
2252 CWARN("Stale connection request\n");
2253 rej.ibr_why = IBLND_REJECT_CONN_STALE;
2254 goto failed;
2255 }
2256
2257 /* I can accept peer's version */
2258 version = reqmsg->ibm_version;
2259
2260 if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
2261 CERROR("Unexpected connreq msg type: %x from %s\n",
2262 reqmsg->ibm_type, libcfs_nid2str(nid));
2263 goto failed;
2264 }
2265
2266 if (reqmsg->ibm_u.connparams.ibcp_queue_depth !=
2267 IBLND_MSG_QUEUE_SIZE(version)) {
2268 CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
2269 libcfs_nid2str(nid), reqmsg->ibm_u.connparams.ibcp_queue_depth,
2270 IBLND_MSG_QUEUE_SIZE(version));
2271
2272 if (version == IBLND_MSG_VERSION)
2273 rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
2274
2275 goto failed;
2276 }
2277
2278 if (reqmsg->ibm_u.connparams.ibcp_max_frags !=
2279 IBLND_RDMA_FRAGS(version)) {
2d00bd17 2280 CERROR("Can't accept %s(version %x): incompatible max_frags %d (%d wanted)\n",
2281 libcfs_nid2str(nid), version,
2282 reqmsg->ibm_u.connparams.ibcp_max_frags,
2283 IBLND_RDMA_FRAGS(version));
2284
2285 if (version == IBLND_MSG_VERSION)
2286 rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
2287
2288 goto failed;
2289 }
2290
2291 if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
2292 CERROR("Can't accept %s: message size %d too big (%d max)\n",
2293 libcfs_nid2str(nid),
2294 reqmsg->ibm_u.connparams.ibcp_max_msg_size,
2295 IBLND_MSG_SIZE);
2296 goto failed;
2297 }
2298
2299 /* assume 'nid' is a new peer; create */
2300 rc = kiblnd_create_peer(ni, &peer, nid);
5fd88337 2301 if (rc) {
2302 CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
2303 rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
2304 goto failed;
2305 }
2306
2307 write_lock_irqsave(g_lock, flags);
2308
2309 peer2 = kiblnd_find_peer_locked(nid);
06ace26e 2310 if (peer2) {
5fd88337 2311 if (!peer2->ibp_version) {
2312 peer2->ibp_version = version;
2313 peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
2314 }
2315
2316 /* not the guy I've talked with */
2317 if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp ||
2318 peer2->ibp_version != version) {
2319 kiblnd_close_peer_conns_locked(peer2, -ESTALE);
2320 write_unlock_irqrestore(g_lock, flags);
2321
2322 CWARN("Conn stale %s [old ver: %x, new ver: %x]\n",
2323 libcfs_nid2str(nid), peer2->ibp_version, version);
2324
2325 kiblnd_peer_decref(peer);
2326 rej.ibr_why = IBLND_REJECT_CONN_STALE;
2327 goto failed;
2328 }
2329
2330 /* tie-break connection race in favour of the higher NID */
5fd88337 2331 if (peer2->ibp_connecting &&
2332 nid < ni->ni_nid) {
2333 write_unlock_irqrestore(g_lock, flags);
2334
2335 CWARN("Conn race %s\n", libcfs_nid2str(peer2->ibp_nid));
2336
2337 kiblnd_peer_decref(peer);
2338 rej.ibr_why = IBLND_REJECT_CONN_RACE;
2339 goto failed;
2340 }
2341
2342 peer2->ibp_accepting++;
2343 kiblnd_peer_addref(peer2);
2344
2345 write_unlock_irqrestore(g_lock, flags);
2346 kiblnd_peer_decref(peer);
2347 peer = peer2;
2348 } else {
2349 /* Brand new peer */
2350 LASSERT(!peer->ibp_accepting);
2351 LASSERT(!peer->ibp_version &&
2352 !peer->ibp_incarnation);
2353
2354 peer->ibp_accepting = 1;
2355 peer->ibp_version = version;
2356 peer->ibp_incarnation = reqmsg->ibm_srcstamp;
2357
2358 /* I have a ref on ni that prevents it being shutdown */
5fd88337 2359 LASSERT(!net->ibn_shutdown);
2360
2361 kiblnd_peer_addref(peer);
2362 list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
2363
2364 write_unlock_irqrestore(g_lock, flags);
2365 }
2366
2367 conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version);
06ace26e 2368 if (!conn) {
2369 kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
2370 kiblnd_peer_decref(peer);
2371 rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
2372 goto failed;
2373 }
2374
2375 /*
2376 * conn now "owns" cmid, so I return success from here on to ensure the
2377 * CM callback doesn't destroy cmid.
2378 */
d7e09d03 2379 conn->ibc_incarnation = reqmsg->ibm_srcstamp;
ec3d17c0 2380 conn->ibc_credits = IBLND_MSG_QUEUE_SIZE(version);
d7e09d03 2381 conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(version);
6fe7f962 2382 LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(version)
2383 <= IBLND_RX_MSGS(version));
2384
2385 ackmsg = &conn->ibc_connvars->cv_msg;
2386 memset(ackmsg, 0, sizeof(*ackmsg));
2387
2388 kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
2389 sizeof(ackmsg->ibm_u.connparams));
2390 ackmsg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
2391 ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
2392 ackmsg->ibm_u.connparams.ibcp_max_frags = IBLND_RDMA_FRAGS(version);
2393
2394 kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp);
2395
2396 memset(&cp, 0, sizeof(cp));
2397 cp.private_data = ackmsg;
ec3d17c0 2398 cp.private_data_len = ackmsg->ibm_nob;
d7e09d03 2399 cp.responder_resources = 0; /* No atomic ops or RDMA reads */
ec3d17c0 2400 cp.initiator_depth = 0;
d7e09d03 2401 cp.flow_control = 1;
2402 cp.retry_count = *kiblnd_tunables.kib_retry_count;
2403 cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count;
2404
2405 CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid));
2406
2407 rc = rdma_accept(cmid, &cp);
5fd88337 2408 if (rc) {
2409 CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc);
2410 rej.ibr_version = version;
2411 rej.ibr_why = IBLND_REJECT_FATAL;
2412
2413 kiblnd_reject(cmid, &rej);
2414 kiblnd_connreq_done(conn, rc);
2415 kiblnd_conn_decref(conn);
2416 }
2417
2418 lnet_ni_decref(ni);
2419 return 0;
2420
2421 failed:
06ace26e 2422 if (ni)
2423 lnet_ni_decref(ni);
2424
ec3d17c0 2425 rej.ibr_version = version;
2426 rej.ibr_cp.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
2427 rej.ibr_cp.ibcp_max_frags = IBLND_RDMA_FRAGS(version);
2428 kiblnd_reject(cmid, &rej);
2429
2430 return -ECONNREFUSED;
2431}
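/*
 * Editor's note: every reject sent above carries this side's version and
 * connparams in the kib_rej_t, so an incompatible active peer can adapt and
 * retry (see kiblnd_reconnect() below). Returning -ECONNREFUSED also tells
 * the CM to destroy the passive cmid, which is why 0 must be returned once
 * the new conn has taken ownership of it.
 */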
2432
a8046a28 2433static void
6fe7f962 2434kiblnd_reconnect(kib_conn_t *conn, int version,
c314c319 2435 __u64 incarnation, int why, kib_connparams_t *cp)
d7e09d03 2436{
2437 kib_peer_t *peer = conn->ibc_peer;
2438 char *reason;
2439 int retry = 0;
2440 unsigned long flags;
d7e09d03 2441
2442 LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
2443 LASSERT(peer->ibp_connecting > 0); /* 'conn' at least */
2444
2445 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2446
2447 /*
2448 * retry connection if it's still needed and no other connection
2449 * attempts (active or passive) are in progress
2450 * NB: reconnect is still needed even when ibp_tx_queue is
2451 * empty if ibp_version != version because reconnect may be
2452 * initiated by kiblnd_query()
2453 */
2454 if ((!list_empty(&peer->ibp_tx_queue) ||
2455 peer->ibp_version != version) &&
2456 peer->ibp_connecting == 1 &&
5fd88337 2457 !peer->ibp_accepting) {
d7e09d03
PT
2458 retry = 1;
2459 peer->ibp_connecting++;
2460
2461 peer->ibp_version = version;
2462 peer->ibp_incarnation = incarnation;
2463 }
2464
2465 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2466
2467 if (!retry)
2468 return;
2469
2470 switch (why) {
2471 default:
2472 reason = "Unknown";
2473 break;
2474
2475 case IBLND_REJECT_CONN_STALE:
2476 reason = "stale";
2477 break;
2478
2479 case IBLND_REJECT_CONN_RACE:
2480 reason = "conn race";
2481 break;
2482
2483 case IBLND_REJECT_CONN_UNCOMPAT:
2484 reason = "version negotiation";
2485 break;
2486 }
2487
2d00bd17 2488 CNETERR("%s: retrying (%s), %x, %x, queue_dep: %d, max_frag: %d, msg_size: %d\n",
2489 libcfs_nid2str(peer->ibp_nid),
2490 reason, IBLND_MSG_VERSION, version,
2491 cp ? cp->ibcp_queue_depth : IBLND_MSG_QUEUE_SIZE(version),
2492 cp ? cp->ibcp_max_frags : IBLND_RDMA_FRAGS(version),
2493 cp ? cp->ibcp_max_msg_size : IBLND_MSG_SIZE);
2494
2495 kiblnd_connect_peer(peer);
2496}
2497
a8046a28 2498static void
6fe7f962 2499kiblnd_rejected(kib_conn_t *conn, int reason, void *priv, int priv_nob)
d7e09d03 2500{
ec3d17c0 2501 kib_peer_t *peer = conn->ibc_peer;
d7e09d03 2502
2503 LASSERT(!in_interrupt());
2504 LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
2505
2506 switch (reason) {
2507 case IB_CM_REJ_STALE_CONN:
2508 kiblnd_reconnect(conn, IBLND_MSG_VERSION, 0,
2509 IBLND_REJECT_CONN_STALE, NULL);
2510 break;
2511
2512 case IB_CM_REJ_INVALID_SERVICE_ID:
2513 CNETERR("%s rejected: no listener at %d\n",
2514 libcfs_nid2str(peer->ibp_nid),
2515 *kiblnd_tunables.kib_service);
2516 break;
2517
2518 case IB_CM_REJ_CONSUMER_DEFINED:
2519 if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) {
2520 kib_rej_t *rej = priv;
2521 kib_connparams_t *cp = NULL;
2522 int flip = 0;
2523 __u64 incarnation = -1;
2524
2525 /* NB. default incarnation is -1 because:
2526 * a) V1 will ignore dst incarnation in connreq.
2527 * b) V2 will provide the incarnation while rejecting me,
2528 * so the -1 will be overwritten.
2529 *
2530 * If I try to connect to a V1 peer with the V2 protocol and it
2531 * rejects me, then upgrades to V2, I know nothing about the
2532 * upgrade and try to reconnect with V1. In that case the upgraded
2533 * V2 peer can tell I'm trying to talk to the old version and
2534 * rejects me (incarnation is -1).
2535 */
2536
2537 if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
2538 rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) {
2539 __swab32s(&rej->ibr_magic);
2540 __swab16s(&rej->ibr_version);
2541 flip = 1;
2542 }
2543
2544 if (priv_nob >= sizeof(kib_rej_t) &&
2545 rej->ibr_version > IBLND_MSG_VERSION_1) {
2546 /*
2547 * priv_nob is always 148 in current version
d7e09d03 2548 * of OFED, so we still need to check version.
2549 * (define of IB_CM_REJ_PRIVATE_DATA_SIZE)
2550 */
2551 cp = &rej->ibr_cp;
2552
2553 if (flip) {
2554 __swab64s(&rej->ibr_incarnation);
2555 __swab16s(&cp->ibcp_queue_depth);
2556 __swab16s(&cp->ibcp_max_frags);
2557 __swab32s(&cp->ibcp_max_msg_size);
2558 }
2559
2560 incarnation = rej->ibr_incarnation;
2561 }
2562
2563 if (rej->ibr_magic != IBLND_MSG_MAGIC &&
2564 rej->ibr_magic != LNET_PROTO_MAGIC) {
2565 CERROR("%s rejected: consumer defined fatal error\n",
2566 libcfs_nid2str(peer->ibp_nid));
2567 break;
2568 }
2569
2570 if (rej->ibr_version != IBLND_MSG_VERSION &&
2571 rej->ibr_version != IBLND_MSG_VERSION_1) {
2572 CERROR("%s rejected: o2iblnd version %x error\n",
2573 libcfs_nid2str(peer->ibp_nid),
2574 rej->ibr_version);
2575 break;
2576 }
2577
2578 if (rej->ibr_why == IBLND_REJECT_FATAL &&
2579 rej->ibr_version == IBLND_MSG_VERSION_1) {
2580 CDEBUG(D_NET, "rejected by old version peer %s: %x\n",
2581 libcfs_nid2str(peer->ibp_nid), rej->ibr_version);
2582
2583 if (conn->ibc_version != IBLND_MSG_VERSION_1)
2584 rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT;
2585 }
2586
2587 switch (rej->ibr_why) {
2588 case IBLND_REJECT_CONN_RACE:
2589 case IBLND_REJECT_CONN_STALE:
2590 case IBLND_REJECT_CONN_UNCOMPAT:
2591 kiblnd_reconnect(conn, rej->ibr_version,
2592 incarnation, rej->ibr_why, cp);
2593 break;
2594
2595 case IBLND_REJECT_MSG_QUEUE_SIZE:
2596 CERROR("%s rejected: incompatible message queue depth %d, %d\n",
109dae8d 2597 libcfs_nid2str(peer->ibp_nid),
06ace26e 2598 cp ? cp->ibcp_queue_depth :
109dae8d 2599 IBLND_MSG_QUEUE_SIZE(rej->ibr_version),
2600 IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
2601 break;
2602
2603 case IBLND_REJECT_RDMA_FRAGS:
2604 CERROR("%s rejected: incompatible # of RDMA fragments %d, %d\n",
109dae8d 2605 libcfs_nid2str(peer->ibp_nid),
06ace26e 2606 cp ? cp->ibcp_max_frags :
109dae8d 2607 IBLND_RDMA_FRAGS(rej->ibr_version),
2608 IBLND_RDMA_FRAGS(conn->ibc_version));
2609 break;
2610
2611 case IBLND_REJECT_NO_RESOURCES:
2612 CERROR("%s rejected: o2iblnd no resources\n",
2613 libcfs_nid2str(peer->ibp_nid));
2614 break;
2615
2616 case IBLND_REJECT_FATAL:
2617 CERROR("%s rejected: o2iblnd fatal error\n",
2618 libcfs_nid2str(peer->ibp_nid));
2619 break;
2620
2621 default:
2622 CERROR("%s rejected: o2iblnd reason %d\n",
2623 libcfs_nid2str(peer->ibp_nid),
2624 rej->ibr_why);
2625 break;
2626 }
2627 break;
2628 }
2629 /* fall through */
2630 default:
2631 CNETERR("%s rejected: reason %d, size %d\n",
2632 libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
2633 break;
2634 }
2635
2636 kiblnd_connreq_done(conn, -ECONNREFUSED);
2637}
2638
a8046a28 2639static void
6fe7f962 2640kiblnd_check_connreply(kib_conn_t *conn, void *priv, int priv_nob)
d7e09d03 2641{
2642 kib_peer_t *peer = conn->ibc_peer;
2643 lnet_ni_t *ni = peer->ibp_ni;
2644 kib_net_t *net = ni->ni_data;
2645 kib_msg_t *msg = priv;
2646 int ver = conn->ibc_version;
2647 int rc = kiblnd_unpack_msg(msg, priv_nob);
2648 unsigned long flags;
d7e09d03 2649
06ace26e 2650 LASSERT(net);
d7e09d03 2651
5fd88337 2652 if (rc) {
2653 CERROR("Can't unpack connack from %s: %d\n",
2654 libcfs_nid2str(peer->ibp_nid), rc);
2655 goto failed;
2656 }
2657
2658 if (msg->ibm_type != IBLND_MSG_CONNACK) {
2659 CERROR("Unexpected message %d from %s\n",
2660 msg->ibm_type, libcfs_nid2str(peer->ibp_nid));
2661 rc = -EPROTO;
2662 goto failed;
2663 }
2664
2665 if (ver != msg->ibm_version) {
2d00bd17 2666 CERROR("%s replied version %x is different with requested version %x\n",
2667 libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver);
2668 rc = -EPROTO;
2669 goto failed;
2670 }
2671
2672 if (msg->ibm_u.connparams.ibcp_queue_depth !=
2673 IBLND_MSG_QUEUE_SIZE(ver)) {
2674 CERROR("%s has incompatible queue depth %d(%d wanted)\n",
2675 libcfs_nid2str(peer->ibp_nid),
2676 msg->ibm_u.connparams.ibcp_queue_depth,
2677 IBLND_MSG_QUEUE_SIZE(ver));
2678 rc = -EPROTO;
2679 goto failed;
2680 }
2681
2682 if (msg->ibm_u.connparams.ibcp_max_frags !=
2683 IBLND_RDMA_FRAGS(ver)) {
2684 CERROR("%s has incompatible max_frags %d (%d wanted)\n",
2685 libcfs_nid2str(peer->ibp_nid),
2686 msg->ibm_u.connparams.ibcp_max_frags,
2687 IBLND_RDMA_FRAGS(ver));
2688 rc = -EPROTO;
2689 goto failed;
2690 }
2691
2692 if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
2693 CERROR("%s max message size %d too big (%d max)\n",
2694 libcfs_nid2str(peer->ibp_nid),
2695 msg->ibm_u.connparams.ibcp_max_msg_size,
2696 IBLND_MSG_SIZE);
2697 rc = -EPROTO;
2698 goto failed;
2699 }
2700
2701 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2702 if (msg->ibm_dstnid == ni->ni_nid &&
2703 msg->ibm_dststamp == net->ibn_incarnation)
2704 rc = 0;
2705 else
2706 rc = -ESTALE;
2707 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2708
5fd88337 2709 if (rc) {
2d00bd17 2710 CERROR("Bad connection reply from %s, rc = %d, version: %x max_frags: %d\n",
2711 libcfs_nid2str(peer->ibp_nid), rc,
2712 msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags);
2713 goto failed;
2714 }
2715
2716 conn->ibc_incarnation = msg->ibm_srcstamp;
2717 conn->ibc_credits =
d7e09d03 2718 conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(ver);
6fe7f962 2719 LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver)
2720 <= IBLND_RX_MSGS(ver));
2721
2722 kiblnd_connreq_done(conn, 0);
2723 return;
2724
2725 failed:
2726 /*
2727 * NB My QP has already established itself, so I handle anything going
2728 * wrong here by setting ibc_comms_error.
2729 * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then
2730 * immediately tears it down.
2731 */
5fd88337 2732 LASSERT(rc);
2733 conn->ibc_comms_error = rc;
2734 kiblnd_connreq_done(conn, 0);
2735}
2736
a8046a28 2737static int
6fe7f962 2738kiblnd_active_connect(struct rdma_cm_id *cmid)
d7e09d03 2739{
2740 kib_peer_t *peer = (kib_peer_t *)cmid->context;
2741 kib_conn_t *conn;
2742 kib_msg_t *msg;
2743 struct rdma_conn_param cp;
2744 int version;
2745 __u64 incarnation;
2746 unsigned long flags;
2747 int rc;
2748
2749 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2750
2751 incarnation = peer->ibp_incarnation;
2752 version = !peer->ibp_version ? IBLND_MSG_VERSION :
2753 peer->ibp_version;
2754
2755 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2756
2757 conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, version);
06ace26e 2758 if (!conn) {
2759 kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
2760 kiblnd_peer_decref(peer); /* lose cmid's ref */
2761 return -ENOMEM;
2762 }
2763
2764 /*
2765 * conn "owns" cmid now, so I return success from here on to ensure the
d7e09d03 2766 * CM callback doesn't destroy cmid. conn also takes over cmid's ref
2767 * on peer
2768 */
2769 msg = &conn->ibc_connvars->cv_msg;
2770
2771 memset(msg, 0, sizeof(*msg));
2772 kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
2773 msg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
2774 msg->ibm_u.connparams.ibcp_max_frags = IBLND_RDMA_FRAGS(version);
2775 msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
2776
2777 kiblnd_pack_msg(peer->ibp_ni, msg, version,
2778 0, peer->ibp_nid, incarnation);
2779
2780 memset(&cp, 0, sizeof(cp));
2781 cp.private_data = msg;
2782 cp.private_data_len = msg->ibm_nob;
2783 cp.responder_resources = 0; /* No atomic ops or RDMA reads */
2784 cp.initiator_depth = 0;
2785 cp.flow_control = 1;
2786 cp.retry_count = *kiblnd_tunables.kib_retry_count;
2787 cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count;
2788
2789 LASSERT(cmid->context == (void *)conn);
2790 LASSERT(conn->ibc_cmid == cmid);
2791
2792 rc = rdma_connect(cmid, &cp);
5fd88337 2793 if (rc) {
2794 CERROR("Can't connect to %s: %d\n",
2795 libcfs_nid2str(peer->ibp_nid), rc);
2796 kiblnd_connreq_done(conn, rc);
2797 kiblnd_conn_decref(conn);
2798 }
2799
2800 return 0;
2801}
2802
2803int
2804kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
2805{
2806 kib_peer_t *peer;
2807 kib_conn_t *conn;
2808 int rc;
2809
2810 switch (event->event) {
2811 default:
2812 CERROR("Unexpected event: %d, status: %d\n",
2813 event->event, event->status);
2814 LBUG();
2815
2816 case RDMA_CM_EVENT_CONNECT_REQUEST:
2817 /* destroy cmid on failure */
2818 rc = kiblnd_passive_connect(cmid,
2819 (void *)KIBLND_CONN_PARAM(event),
2820 KIBLND_CONN_PARAM_LEN(event));
2821 CDEBUG(D_NET, "connreq: %d\n", rc);
2822 return rc;
2823
2824 case RDMA_CM_EVENT_ADDR_ERROR:
2825 peer = (kib_peer_t *)cmid->context;
2826 CNETERR("%s: ADDR ERROR %d\n",
c314c319 2827 libcfs_nid2str(peer->ibp_nid), event->status);
2828 kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
2829 kiblnd_peer_decref(peer);
5fd88337 2830 return -EHOSTUNREACH; /* rc destroys cmid */
2831
2832 case RDMA_CM_EVENT_ADDR_RESOLVED:
2833 peer = (kib_peer_t *)cmid->context;
2834
1d8cb70c 2835 CDEBUG(D_NET, "%s Addr resolved: %d\n",
2836 libcfs_nid2str(peer->ibp_nid), event->status);
2837
5fd88337 2838 if (event->status) {
2839 CNETERR("Can't resolve address for %s: %d\n",
2840 libcfs_nid2str(peer->ibp_nid), event->status);
2841 rc = event->status;
2842 } else {
2843 rc = rdma_resolve_route(
2844 cmid, *kiblnd_tunables.kib_timeout * 1000);
5fd88337 2845 if (!rc)
2846 return 0;
2847 /* Can't initiate route resolution */
2848 CERROR("Can't resolve route for %s: %d\n",
2849 libcfs_nid2str(peer->ibp_nid), rc);
2850 }
2851 kiblnd_peer_connect_failed(peer, 1, rc);
2852 kiblnd_peer_decref(peer);
5fd88337 2853 return rc; /* rc destroys cmid */
2854
2855 case RDMA_CM_EVENT_ROUTE_ERROR:
2856 peer = (kib_peer_t *)cmid->context;
2857 CNETERR("%s: ROUTE ERROR %d\n",
2858 libcfs_nid2str(peer->ibp_nid), event->status);
2859 kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
2860 kiblnd_peer_decref(peer);
5fd88337 2861 return -EHOSTUNREACH; /* rc destroys cmid */
2862
2863 case RDMA_CM_EVENT_ROUTE_RESOLVED:
2864 peer = (kib_peer_t *)cmid->context;
1d8cb70c 2865 CDEBUG(D_NET, "%s Route resolved: %d\n",
2866 libcfs_nid2str(peer->ibp_nid), event->status);
2867
5fd88337 2868 if (!event->status)
2869 return kiblnd_active_connect(cmid);
2870
2871 CNETERR("Can't resolve route for %s: %d\n",
c314c319 2872 libcfs_nid2str(peer->ibp_nid), event->status);
2873 kiblnd_peer_connect_failed(peer, 1, event->status);
2874 kiblnd_peer_decref(peer);
5fd88337 2875 return event->status; /* rc destroys cmid */
2876
2877 case RDMA_CM_EVENT_UNREACHABLE:
2878 conn = (kib_conn_t *)cmid->context;
2879 LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
2880 conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
2881 CNETERR("%s: UNREACHABLE %d\n",
c314c319 2882 libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
2883 kiblnd_connreq_done(conn, -ENETDOWN);
2884 kiblnd_conn_decref(conn);
2885 return 0;
2886
2887 case RDMA_CM_EVENT_CONNECT_ERROR:
2888 conn = (kib_conn_t *)cmid->context;
2889 LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
2890 conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
2891 CNETERR("%s: CONNECT ERROR %d\n",
2892 libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
2893 kiblnd_connreq_done(conn, -ENOTCONN);
2894 kiblnd_conn_decref(conn);
2895 return 0;
2896
2897 case RDMA_CM_EVENT_REJECTED:
2898 conn = (kib_conn_t *)cmid->context;
2899 switch (conn->ibc_state) {
2900 default:
2901 LBUG();
2902
2903 case IBLND_CONN_PASSIVE_WAIT:
6fe7f962 2904 CERROR("%s: REJECTED %d\n",
2905 libcfs_nid2str(conn->ibc_peer->ibp_nid),
2906 event->status);
2907 kiblnd_connreq_done(conn, -ECONNRESET);
2908 break;
2909
2910 case IBLND_CONN_ACTIVE_CONNECT:
2911 kiblnd_rejected(conn, event->status,
2912 (void *)KIBLND_CONN_PARAM(event),
2913 KIBLND_CONN_PARAM_LEN(event));
2914 break;
2915 }
2916 kiblnd_conn_decref(conn);
2917 return 0;
2918
2919 case RDMA_CM_EVENT_ESTABLISHED:
2920 conn = (kib_conn_t *)cmid->context;
2921 switch (conn->ibc_state) {
2922 default:
2923 LBUG();
2924
2925 case IBLND_CONN_PASSIVE_WAIT:
2926 CDEBUG(D_NET, "ESTABLISHED (passive): %s\n",
2927 libcfs_nid2str(conn->ibc_peer->ibp_nid));
2928 kiblnd_connreq_done(conn, 0);
2929 break;
2930
2931 case IBLND_CONN_ACTIVE_CONNECT:
2932 CDEBUG(D_NET, "ESTABLISHED(active): %s\n",
2933 libcfs_nid2str(conn->ibc_peer->ibp_nid));
2934 kiblnd_check_connreply(conn,
2935 (void *)KIBLND_CONN_PARAM(event),
2936 KIBLND_CONN_PARAM_LEN(event));
2937 break;
2938 }
2939 /* net keeps its ref on conn! */
2940 return 0;
2941
2942 case RDMA_CM_EVENT_TIMEWAIT_EXIT:
2943 CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n");
2944 return 0;
2945 case RDMA_CM_EVENT_DISCONNECTED:
2946 conn = (kib_conn_t *)cmid->context;
2947 if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
2948 CERROR("%s DISCONNECTED\n",
2949 libcfs_nid2str(conn->ibc_peer->ibp_nid));
2950 kiblnd_connreq_done(conn, -ECONNRESET);
2951 } else {
2952 kiblnd_close_conn(conn, 0);
2953 }
2954 kiblnd_conn_decref(conn);
2955 cmid->context = NULL;
2956 return 0;
2957
2958 case RDMA_CM_EVENT_DEVICE_REMOVAL:
2959 LCONSOLE_ERROR_MSG(0x131,
2960 "Received notification of device removal\n"
2961 "Please shutdown LNET to allow this to proceed\n");
2962 /*
2963 * Can't remove network from underneath LNET for now, so I have
2964 * to ignore this
2965 */
2966 return 0;
2967
2968 case RDMA_CM_EVENT_ADDR_CHANGE:
2969 LCONSOLE_INFO("Physical link changed (eg hca/port)\n");
2970 return 0;
2971 }
2972}
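/*
 * Editor's note: the contract in kiblnd_cm_callback() is that a non-zero
 * return makes the RDMA CM destroy the cmid, so errors are returned only on
 * paths where no conn owns it yet; once a conn has taken the cmid over, the
 * handler returns 0 and lets connection teardown destroy it.
 */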
2973
2974static int
2975kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs)
2976{
2977 kib_tx_t *tx;
2978 struct list_head *ttmp;
d7e09d03 2979
2980 list_for_each(ttmp, txs) {
2981 tx = list_entry(ttmp, kib_tx_t, tx_list);
2982
2983 if (txs != &conn->ibc_active_txs) {
6fe7f962 2984 LASSERT(tx->tx_queued);
d7e09d03 2985 } else {
6fe7f962 2986 LASSERT(!tx->tx_queued);
5fd88337 2987 LASSERT(tx->tx_waiting || tx->tx_sending);
2988 }
2989
6fe7f962 2990 if (cfs_time_aftereq(jiffies, tx->tx_deadline)) {
2991 CERROR("Timed out tx: %s, %lu seconds\n",
2992 kiblnd_queue2str(conn, txs),
2993 cfs_duration_sec(jiffies - tx->tx_deadline));
2994 return 1;
2995 }
2996 }
2997
2998 return 0;
2999}
3000
3001static int
3002kiblnd_conn_timed_out_locked(kib_conn_t *conn)
3003{
3004 return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) ||
3005 kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) ||
3006 kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) ||
3007 kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) ||
3008 kiblnd_check_txs_locked(conn, &conn->ibc_active_txs);
3009}
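/*
 * Editor's note: tx_deadline is assumed to be stamped when a tx is first
 * queued (kiblnd_queue_tx_locked(), earlier in this file) as jiffies plus
 * the kib_timeout tunable, so the checks above catch transmits stuck in any
 * send queue as well as those awaiting a peer response.
 */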
3010
a8046a28 3011static void
6fe7f962 3012kiblnd_check_conns(int idx)
d7e09d03 3013{
3014 LIST_HEAD(closes);
3015 LIST_HEAD(checksends);
3016 struct list_head *peers = &kiblnd_data.kib_peers[idx];
3017 struct list_head *ptmp;
3018 kib_peer_t *peer;
3019 kib_conn_t *conn;
5a2ca43f 3020 kib_conn_t *tmp;
3021 struct list_head *ctmp;
3022 unsigned long flags;
d7e09d03 3023
3024 /*
3025 * NB. We expect to have a look at all the peers and not find any
d7e09d03 3026 * RDMAs to time out, so we just use a shared lock while we
3027 * take a look...
3028 */
3029 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
3030
3031 list_for_each(ptmp, peers) {
3032 peer = list_entry(ptmp, kib_peer_t, ibp_list);
d7e09d03 3033
6fe7f962 3034 list_for_each(ctmp, &peer->ibp_conns) {
3035 int timedout;
3036 int sendnoop;
3037
3038 conn = list_entry(ctmp, kib_conn_t, ibc_list);
3039
6fe7f962 3040 LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED);
3041
3042 spin_lock(&conn->ibc_lock);
3043
3044 sendnoop = kiblnd_need_noop(conn);
3045 timedout = kiblnd_conn_timed_out_locked(conn);
3046 if (!sendnoop && !timedout) {
3047 spin_unlock(&conn->ibc_lock);
3048 continue;
3049 }
3050
3051 if (timedout) {
2d00bd17 3052 CERROR("Timed out RDMA with %s (%lu): c: %u, oc: %u, rc: %u\n",
3053 libcfs_nid2str(peer->ibp_nid),
3054 cfs_duration_sec(cfs_time_current() -
3055 peer->ibp_last_alive),
3056 conn->ibc_credits,
3057 conn->ibc_outstanding_credits,
3058 conn->ibc_reserved_credits);
3059 list_add(&conn->ibc_connd_list, &closes);
3060 } else {
c314c319 3061 list_add(&conn->ibc_connd_list, &checksends);
3062 }
3063 /* +ref for 'closes' or 'checksends' */
3064 kiblnd_conn_addref(conn);
3065
3066 spin_unlock(&conn->ibc_lock);
3067 }
3068 }
3069
3070 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
3071
3072 /*
3073 * Handle timeout by closing the whole
d7e09d03 3074 * connection. We can only be sure RDMA activity
3075 * has ceased once the QP has been modified.
3076 */
5a2ca43f 3077 list_for_each_entry_safe(conn, tmp, &closes, ibc_connd_list) {
3078 list_del(&conn->ibc_connd_list);
3079 kiblnd_close_conn(conn, -ETIMEDOUT);
3080 kiblnd_conn_decref(conn);
3081 }
3082
3083 /*
3084 * In case we have enough credits to return via a
d7e09d03 3085 * NOOP, but there were no non-blocking tx descs
3086 * free to do it last time...
3087 */
d7e09d03 3088 while (!list_empty(&checksends)) {
c314c319 3089 conn = list_entry(checksends.next, kib_conn_t, ibc_connd_list);
3090 list_del(&conn->ibc_connd_list);
3091 kiblnd_check_sends(conn);
3092 kiblnd_conn_decref(conn);
3093 }
3094}
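/*
 * Editor's note: conns are moved onto the private 'closes' and 'checksends'
 * lists with an extra reference taken under the lock, so they cannot be
 * freed between the scan and the second pass that closes or kicks them; the
 * decref at the bottom of each second loop drops exactly that reference.
 */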
3095
a8046a28 3096static void
6fe7f962 3097kiblnd_disconnect_conn(kib_conn_t *conn)
d7e09d03 3098{
3099 LASSERT(!in_interrupt());
3100 LASSERT(current == kiblnd_data.kib_connd);
3101 LASSERT(conn->ibc_state == IBLND_CONN_CLOSING);
3102
3103 rdma_disconnect(conn->ibc_cmid);
3104 kiblnd_finalise_conn(conn);
3105
3106 kiblnd_peer_notify(conn->ibc_peer);
3107}
3108
3109int
6fe7f962 3110kiblnd_connd(void *arg)
d7e09d03 3111{
3112 wait_queue_t wait;
3113 unsigned long flags;
3114 kib_conn_t *conn;
3115 int timeout;
3116 int i;
3117 int dropped_lock;
3118 int peer_index = 0;
3119 unsigned long deadline = jiffies;
d7e09d03 3120
6fe7f962 3121 cfs_block_allsigs();
d7e09d03 3122
9e795d35 3123 init_waitqueue_entry(&wait, current);
3124 kiblnd_data.kib_connd = current;
3125
3126 spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
3127
3128 while (!kiblnd_data.kib_shutdown) {
3129 dropped_lock = 0;
3130
6fe7f962 3131 if (!list_empty(&kiblnd_data.kib_connd_zombies)) {
66a84f8a 3132 conn = list_entry(kiblnd_data.kib_connd_zombies.next,
c314c319 3133 kib_conn_t, ibc_list);
3134 list_del(&conn->ibc_list);
3135
3136 spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
3137 flags);
3138 dropped_lock = 1;
3139
3140 kiblnd_destroy_conn(conn);
3141
3142 spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
3143 }
3144
3145 if (!list_empty(&kiblnd_data.kib_connd_conns)) {
3146 conn = list_entry(kiblnd_data.kib_connd_conns.next,
c314c319 3147 kib_conn_t, ibc_list);
3148 list_del(&conn->ibc_list);
3149
3150 spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
3151 flags);
3152 dropped_lock = 1;
3153
3154 kiblnd_disconnect_conn(conn);
3155 kiblnd_conn_decref(conn);
3156
3157 spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
3158 }
3159
3160 /* careful with the jiffy wrap... */
3161 timeout = (int)(deadline - jiffies);
3162 if (timeout <= 0) {
3163 const int n = 4;
3164 const int p = 1;
ec3d17c0 3165 int chunk = kiblnd_data.kib_peer_hash_size;
3166
3167 spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
3168 dropped_lock = 1;
3169
3170 /*
3171 * Time to check for RDMA timeouts on a few more
3172 * peers: I do checks every 'p' seconds on a
3173 * proportion of the peer table and I need to check
3174 * every connection 'n' times within a timeout
3175 * interval, to ensure I detect a timeout on any
3176 * connection within (n+1)/n times the timeout
3177 * interval.
3178 */
3179 if (*kiblnd_tunables.kib_timeout > n * p)
3180 chunk = (chunk * n * p) /
3181 *kiblnd_tunables.kib_timeout;
5fd88337 3182 if (!chunk)
3183 chunk = 1;
3184
3185 for (i = 0; i < chunk; i++) {
3186 kiblnd_check_conns(peer_index);
3187 peer_index = (peer_index + 1) %
3188 kiblnd_data.kib_peer_hash_size;
3189 }
3190
3191 deadline += p * HZ;
3192 spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
3193 }
3194
3195 if (dropped_lock)
3196 continue;
3197
3198 /* Nothing to do for 'timeout' */
3199 set_current_state(TASK_INTERRUPTIBLE);
3200 add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
3201 spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
3202
b7efb98d 3203 schedule_timeout(timeout);
d7e09d03 3204
3205 remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
3206 spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
3207 }
3208
3209 spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
3210
3211 kiblnd_thread_fini();
3212 return 0;
3213}
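/*
 * Editor's worked example of the chunk arithmetic above (numbers are
 * illustrative): with kib_peer_hash_size = 1024, kib_timeout = 50s, n = 4
 * and p = 1, chunk = 1024 * 4 * 1 / 50 = 81 hash buckets per wakeup, i.e.
 * the whole table is covered roughly every 13 seconds - about the intended
 * n = 4 sweeps per 50s timeout interval.
 */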
3214
3215void
3216kiblnd_qp_event(struct ib_event *event, void *arg)
3217{
3218 kib_conn_t *conn = arg;
3219
3220 switch (event->event) {
3221 case IB_EVENT_COMM_EST:
3222 CDEBUG(D_NET, "%s established\n",
3223 libcfs_nid2str(conn->ibc_peer->ibp_nid));
3224 return;
3225
3226 default:
3227 CERROR("%s: Async QP event type %d\n",
3228 libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
3229 return;
3230 }
3231}
3232
a8046a28 3233static void
6fe7f962 3234kiblnd_complete(struct ib_wc *wc)
3235{
3236 switch (kiblnd_wreqid2type(wc->wr_id)) {
3237 default:
3238 LBUG();
3239
3240 case IBLND_WID_RDMA:
3241 /*
3242 * We only get RDMA completion notification if it fails. All
3243 * subsequent work items, including the final SEND will fail
3244 * too. However we can't print out any more info about the
3245 * failing RDMA because 'tx' might be back on the idle list or
3246 * even reused already if we didn't manage to post all our work
3247 * items
3248 */
3249 CNETERR("RDMA (tx: %p) failed: %d\n",
3250 kiblnd_wreqid2ptr(wc->wr_id), wc->status);
3251 return;
3252
3253 case IBLND_WID_TX:
3254 kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status);
3255 return;
3256
3257 case IBLND_WID_RX:
3258 kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status,
3259 wc->byte_len);
3260 return;
3261 }
3262}
3263
3264void
3265kiblnd_cq_completion(struct ib_cq *cq, void *arg)
3266{
3267 /*
3268 * NB I'm not allowed to schedule this conn once its refcount has
3269 * reached 0. Since fundamentally I'm racing with scheduler threads
3270 * consuming my CQ I could be called after all completions have
5fd88337 3271 * occurred. But in this case, !ibc_nrx && !ibc_nsends_posted
3272 * and this CQ is about to be destroyed so I NOOP.
3273 */
cca32416 3274 kib_conn_t *conn = arg;
3275 struct kib_sched_info *sched = conn->ibc_sched;
3276 unsigned long flags;
3277
3278 LASSERT(cq == conn->ibc_cq);
3279
3280 spin_lock_irqsave(&sched->ibs_lock, flags);
3281
3282 conn->ibc_ready = 1;
3283
3284 if (!conn->ibc_scheduled &&
3285 (conn->ibc_nrx > 0 ||
3286 conn->ibc_nsends_posted > 0)) {
3287 kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
3288 conn->ibc_scheduled = 1;
3289 list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns);
3290
3291 if (waitqueue_active(&sched->ibs_waitq))
3292 wake_up(&sched->ibs_waitq);
3293 }
3294
3295 spin_unlock_irqrestore(&sched->ibs_lock, flags);
3296}
3297
3298void
3299kiblnd_cq_event(struct ib_event *event, void *arg)
3300{
3301 kib_conn_t *conn = arg;
3302
3303 CERROR("%s: async CQ event type %d\n",
3304 libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
3305}
3306
3307int
3308kiblnd_scheduler(void *arg)
3309{
3310 long id = (long)arg;
3311 struct kib_sched_info *sched;
3312 kib_conn_t *conn;
3313 wait_queue_t wait;
3314 unsigned long flags;
3315 struct ib_wc wc;
3316 int did_something;
3317 int busy_loops = 0;
3318 int rc;
3319
3320 cfs_block_allsigs();
3321
9e795d35 3322 init_waitqueue_entry(&wait, current);
3323
3324 sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)];
3325
3326 rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt);
5fd88337 3327 if (rc) {
3328 CWARN("Failed to bind on CPT %d, please verify whether all CPUs are healthy and reload modules if necessary, otherwise your system might under risk of low performance\n",
3329 sched->ibs_cpt);
3330 }
3331
3332 spin_lock_irqsave(&sched->ibs_lock, flags);
3333
3334 while (!kiblnd_data.kib_shutdown) {
3335 if (busy_loops++ >= IBLND_RESCHED) {
3336 spin_unlock_irqrestore(&sched->ibs_lock, flags);
3337
3338 cond_resched();
3339 busy_loops = 0;
3340
3341 spin_lock_irqsave(&sched->ibs_lock, flags);
3342 }
3343
3344 did_something = 0;
3345
3346 if (!list_empty(&sched->ibs_conns)) {
3347 conn = list_entry(sched->ibs_conns.next, kib_conn_t,
3348 ibc_sched_list);
3349 /* take over kib_sched_conns' ref on conn... */
3350 LASSERT(conn->ibc_scheduled);
3351 list_del(&conn->ibc_sched_list);
3352 conn->ibc_ready = 0;
3353
3354 spin_unlock_irqrestore(&sched->ibs_lock, flags);
3355
3356 rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
5fd88337 3357 if (!rc) {
3358 rc = ib_req_notify_cq(conn->ibc_cq,
3359 IB_CQ_NEXT_COMP);
3360 if (rc < 0) {
2d00bd17 3361 CWARN("%s: ib_req_notify_cq failed: %d, closing connection\n",
3362 libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
3363 kiblnd_close_conn(conn, -EIO);
3364 kiblnd_conn_decref(conn);
3365 spin_lock_irqsave(&sched->ibs_lock,
c314c319 3366 flags);
3367 continue;
3368 }
3369
3370 rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
3371 }
3372
3373 if (rc < 0) {
2d00bd17 3374 CWARN("%s: ib_poll_cq failed: %d, closing connection\n",
3375 libcfs_nid2str(conn->ibc_peer->ibp_nid),
3376 rc);
3377 kiblnd_close_conn(conn, -EIO);
3378 kiblnd_conn_decref(conn);
3379 spin_lock_irqsave(&sched->ibs_lock, flags);
3380 continue;
3381 }
3382
3383 spin_lock_irqsave(&sched->ibs_lock, flags);
3384
5fd88337 3385 if (rc || conn->ibc_ready) {
3386 /*
3387 * There may be another completion waiting; get
d7e09d03 3388 * another scheduler to check while I handle
3389 * this one...
3390 */
3391 /* +1 ref for sched_conns */
3392 kiblnd_conn_addref(conn);
3393 list_add_tail(&conn->ibc_sched_list,
c314c319 3394 &sched->ibs_conns);
3395 if (waitqueue_active(&sched->ibs_waitq))
3396 wake_up(&sched->ibs_waitq);
3397 } else {
3398 conn->ibc_scheduled = 0;
3399 }
3400
5fd88337 3401 if (rc) {
3402 spin_unlock_irqrestore(&sched->ibs_lock, flags);
3403 kiblnd_complete(&wc);
3404
3405 spin_lock_irqsave(&sched->ibs_lock, flags);
3406 }
3407
3408 kiblnd_conn_decref(conn); /* ...drop my ref from above */
3409 did_something = 1;
3410 }
3411
3412 if (did_something)
3413 continue;
3414
3415 set_current_state(TASK_INTERRUPTIBLE);
3416 add_wait_queue_exclusive(&sched->ibs_waitq, &wait);
3417 spin_unlock_irqrestore(&sched->ibs_lock, flags);
3418
b3669a7f 3419 schedule();
3420 busy_loops = 0;
3421
3422 remove_wait_queue(&sched->ibs_waitq, &wait);
3423 spin_lock_irqsave(&sched->ibs_lock, flags);
3424 }
3425
3426 spin_unlock_irqrestore(&sched->ibs_lock, flags);
3427
3428 kiblnd_thread_fini();
3429 return 0;
3430}
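/*
 * Editor's sketch: the poll / re-arm / poll-again sequence above is the
 * standard race-free CQ idiom - a completion arriving between the empty
 * poll and ib_req_notify_cq() would otherwise be stranded until the next
 * event. Generic illustration (handle() is hypothetical):
 */
#if 0	/* illustrative only */
	while (ib_poll_cq(cq, 1, &wc) > 0)
		handle(&wc);
	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	if (ib_poll_cq(cq, 1, &wc) > 0)
		handle(&wc);	/* catches the re-arm race window */
#endif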
3431
3432int
3433kiblnd_failover_thread(void *arg)
3434{
3435 rwlock_t *glock = &kiblnd_data.kib_global_lock;
3436 kib_dev_t *dev;
3437 wait_queue_t wait;
3438 unsigned long flags;
3439 int rc;
d7e09d03 3440
5fd88337 3441 LASSERT(*kiblnd_tunables.kib_dev_failover);
d7e09d03 3442
6fe7f962 3443 cfs_block_allsigs();
d7e09d03 3444
9e795d35 3445 init_waitqueue_entry(&wait, current);
3446 write_lock_irqsave(glock, flags);
3447
3448 while (!kiblnd_data.kib_shutdown) {
3449 int do_failover = 0;
3450 int long_sleep;
d7e09d03
PT
3451
3452 list_for_each_entry(dev, &kiblnd_data.kib_failed_devs,
3453 ibd_fail_list) {
3454 if (time_before(cfs_time_current(),
3455 dev->ibd_next_failover))
3456 continue;
3457 do_failover = 1;
3458 break;
3459 }
3460
3461 if (do_failover) {
3462 list_del_init(&dev->ibd_fail_list);
3463 dev->ibd_failover = 1;
3464 write_unlock_irqrestore(glock, flags);
3465
3466 rc = kiblnd_dev_failover(dev);
3467
3468 write_lock_irqsave(glock, flags);
3469
6fe7f962 3470 LASSERT(dev->ibd_failover);
3471 dev->ibd_failover = 0;
3472 if (rc >= 0) { /* Device is OK or failover succeed */
3473 dev->ibd_next_failover = cfs_time_shift(3);
3474 continue;
3475 }
3476
3477 /* failed to failover, retry later */
3478 dev->ibd_next_failover =
3479 cfs_time_shift(min(dev->ibd_failed_failover, 10));
3480 if (kiblnd_dev_can_failover(dev)) {
3481 list_add_tail(&dev->ibd_fail_list,
3482 &kiblnd_data.kib_failed_devs);
3483 }
3484
3485 continue;
3486 }
3487
3488 /* long sleep if no more pending failover */
3489 long_sleep = list_empty(&kiblnd_data.kib_failed_devs);
3490
3491 set_current_state(TASK_INTERRUPTIBLE);
3492 add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
3493 write_unlock_irqrestore(glock, flags);
3494
3495 rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) :
3496 cfs_time_seconds(1));
3497 remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
3498 write_lock_irqsave(glock, flags);
3499
5fd88337 3500 if (!long_sleep || rc)
3501 continue;
3502
3503 /*
3504 * After a long sleep, routinely check all active devices;
3505 * we need a check like this because if there is no active
3506 * connection on a dev and no SEND from the local node, we may
3507 * listen on the wrong HCA forever while a bonding failover occurs.
3508 */
3507 * on wrong HCA for ever while there is a bonding failover
3508 */
d7e09d03
PT
3509 list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
3510 if (kiblnd_dev_can_failover(dev)) {
3511 list_add_tail(&dev->ibd_fail_list,
3512 &kiblnd_data.kib_failed_devs);
3513 }
3514 }
3515 }
3516
3517 write_unlock_irqrestore(glock, flags);
3518
3519 kiblnd_thread_fini();
3520 return 0;
3521}