Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
30 | * Copyright (c) 2011, 2012, Intel Corporation. | |
31 | */ | |
32 | /* | |
33 | * This file is part of Lustre, http://www.lustre.org/ | |
34 | * Lustre is a trademark of Sun Microsystems, Inc. | |
35 | * | |
36 | * lnet/klnds/o2iblnd/o2iblnd.c | |
37 | * | |
38 | * Author: Eric Barton <eric@bartonsoftware.com> | |
39 | */ | |
40 | ||
5f43264c | 41 | #include <asm/div64.h> |
d664d1fd JH |
42 | #include <asm/page.h> |
43 | #include "o2iblnd.h" | |
d7e09d03 | 44 | |
c272937f | 45 | static lnd_t the_o2iblnd = { |
ec3d17c0 MS |
46 | .lnd_type = O2IBLND, |
47 | .lnd_startup = kiblnd_startup, | |
48 | .lnd_shutdown = kiblnd_shutdown, | |
49 | .lnd_ctl = kiblnd_ctl, | |
50 | .lnd_query = kiblnd_query, | |
51 | .lnd_send = kiblnd_send, | |
52 | .lnd_recv = kiblnd_recv, | |
d7e09d03 PT |
53 | }; |
54 | ||
ec3d17c0 | 55 | kib_data_t kiblnd_data; |
d7e09d03 | 56 | |
febe73bd | 57 | static __u32 kiblnd_cksum(void *ptr, int nob) |
d7e09d03 | 58 | { |
ec3d17c0 MS |
59 | char *c = ptr; |
60 | __u32 sum = 0; | |
d7e09d03 PT |
61 | |
62 | while (nob-- > 0) | |
63 | sum = ((sum << 1) | (sum >> 31)) + *c++; | |
64 | ||
65 | /* ensure I don't return 0 (== no checksum) */ | |
66 | return (sum == 0) ? 1 : sum; | |
67 | } | |
68 | ||
febe73bd | 69 | static char *kiblnd_msgtype2str(int type) |
d7e09d03 PT |
70 | { |
71 | switch (type) { | |
72 | case IBLND_MSG_CONNREQ: | |
73 | return "CONNREQ"; | |
74 | ||
75 | case IBLND_MSG_CONNACK: | |
76 | return "CONNACK"; | |
77 | ||
78 | case IBLND_MSG_NOOP: | |
79 | return "NOOP"; | |
80 | ||
81 | case IBLND_MSG_IMMEDIATE: | |
82 | return "IMMEDIATE"; | |
83 | ||
84 | case IBLND_MSG_PUT_REQ: | |
85 | return "PUT_REQ"; | |
86 | ||
87 | case IBLND_MSG_PUT_NAK: | |
88 | return "PUT_NAK"; | |
89 | ||
90 | case IBLND_MSG_PUT_ACK: | |
91 | return "PUT_ACK"; | |
92 | ||
93 | case IBLND_MSG_PUT_DONE: | |
94 | return "PUT_DONE"; | |
95 | ||
96 | case IBLND_MSG_GET_REQ: | |
97 | return "GET_REQ"; | |
98 | ||
99 | case IBLND_MSG_GET_DONE: | |
100 | return "GET_DONE"; | |
101 | ||
102 | default: | |
103 | return "???"; | |
104 | } | |
105 | } | |
106 | ||
febe73bd | 107 | static int kiblnd_msgtype2size(int type) |
d7e09d03 PT |
108 | { |
109 | const int hdr_size = offsetof(kib_msg_t, ibm_u); | |
110 | ||
111 | switch (type) { | |
112 | case IBLND_MSG_CONNREQ: | |
113 | case IBLND_MSG_CONNACK: | |
114 | return hdr_size + sizeof(kib_connparams_t); | |
115 | ||
116 | case IBLND_MSG_NOOP: | |
117 | return hdr_size; | |
118 | ||
119 | case IBLND_MSG_IMMEDIATE: | |
120 | return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]); | |
121 | ||
122 | case IBLND_MSG_PUT_REQ: | |
123 | return hdr_size + sizeof(kib_putreq_msg_t); | |
124 | ||
125 | case IBLND_MSG_PUT_ACK: | |
126 | return hdr_size + sizeof(kib_putack_msg_t); | |
127 | ||
128 | case IBLND_MSG_GET_REQ: | |
129 | return hdr_size + sizeof(kib_get_msg_t); | |
130 | ||
131 | case IBLND_MSG_PUT_NAK: | |
132 | case IBLND_MSG_PUT_DONE: | |
133 | case IBLND_MSG_GET_DONE: | |
134 | return hdr_size + sizeof(kib_completion_msg_t); | |
135 | default: | |
136 | return -1; | |
137 | } | |
138 | } | |
139 | ||
febe73bd | 140 | static int kiblnd_unpack_rd(kib_msg_t *msg, int flip) |
d7e09d03 | 141 | { |
ec3d17c0 MS |
142 | kib_rdma_desc_t *rd; |
143 | int nob; | |
144 | int n; | |
145 | int i; | |
d7e09d03 | 146 | |
febe73bd | 147 | LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ || |
d7e09d03 PT |
148 | msg->ibm_type == IBLND_MSG_PUT_ACK); |
149 | ||
150 | rd = msg->ibm_type == IBLND_MSG_GET_REQ ? | |
151 | &msg->ibm_u.get.ibgm_rd : | |
152 | &msg->ibm_u.putack.ibpam_rd; | |
153 | ||
154 | if (flip) { | |
155 | __swab32s(&rd->rd_key); | |
156 | __swab32s(&rd->rd_nfrags); | |
157 | } | |
158 | ||
159 | n = rd->rd_nfrags; | |
160 | ||
161 | if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) { | |
162 | CERROR("Bad nfrags: %d, should be 0 < n <= %d\n", | |
163 | n, IBLND_MAX_RDMA_FRAGS); | |
164 | return 1; | |
165 | } | |
166 | ||
febe73bd | 167 | nob = offsetof(kib_msg_t, ibm_u) + |
d7e09d03 PT |
168 | kiblnd_rd_msg_size(rd, msg->ibm_type, n); |
169 | ||
170 | if (msg->ibm_nob < nob) { | |
171 | CERROR("Short %s: %d(%d)\n", | |
172 | kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob); | |
173 | return 1; | |
174 | } | |
175 | ||
176 | if (!flip) | |
177 | return 0; | |
178 | ||
179 | for (i = 0; i < n; i++) { | |
180 | __swab32s(&rd->rd_frags[i].rf_nob); | |
181 | __swab64s(&rd->rd_frags[i].rf_addr); | |
182 | } | |
183 | ||
184 | return 0; | |
185 | } | |
186 | ||
febe73bd GM |
187 | void kiblnd_pack_msg(lnet_ni_t *ni, kib_msg_t *msg, int version, |
188 | int credits, lnet_nid_t dstnid, __u64 dststamp) | |
d7e09d03 PT |
189 | { |
190 | kib_net_t *net = ni->ni_data; | |
191 | ||
192 | /* CAVEAT EMPTOR! all message fields not set here should have been | |
193 | * initialised previously. */ | |
194 | msg->ibm_magic = IBLND_MSG_MAGIC; | |
195 | msg->ibm_version = version; | |
196 | /* ibm_type */ | |
197 | msg->ibm_credits = credits; | |
198 | /* ibm_nob */ | |
199 | msg->ibm_cksum = 0; | |
200 | msg->ibm_srcnid = ni->ni_nid; | |
201 | msg->ibm_srcstamp = net->ibn_incarnation; | |
202 | msg->ibm_dstnid = dstnid; | |
203 | msg->ibm_dststamp = dststamp; | |
204 | ||
205 | if (*kiblnd_tunables.kib_cksum) { | |
206 | /* NB ibm_cksum zero while computing cksum */ | |
207 | msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob); | |
208 | } | |
209 | } | |
210 | ||
febe73bd | 211 | int kiblnd_unpack_msg(kib_msg_t *msg, int nob) |
d7e09d03 PT |
212 | { |
213 | const int hdr_size = offsetof(kib_msg_t, ibm_u); | |
ec3d17c0 MS |
214 | __u32 msg_cksum; |
215 | __u16 version; | |
216 | int msg_nob; | |
217 | int flip; | |
d7e09d03 PT |
218 | |
219 | /* 6 bytes are enough to have received magic + version */ | |
220 | if (nob < 6) { | |
221 | CERROR("Short message: %d\n", nob); | |
222 | return -EPROTO; | |
223 | } | |
224 | ||
225 | if (msg->ibm_magic == IBLND_MSG_MAGIC) { | |
226 | flip = 0; | |
227 | } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) { | |
228 | flip = 1; | |
229 | } else { | |
230 | CERROR("Bad magic: %08x\n", msg->ibm_magic); | |
231 | return -EPROTO; | |
232 | } | |
233 | ||
234 | version = flip ? __swab16(msg->ibm_version) : msg->ibm_version; | |
235 | if (version != IBLND_MSG_VERSION && | |
236 | version != IBLND_MSG_VERSION_1) { | |
237 | CERROR("Bad version: %x\n", version); | |
238 | return -EPROTO; | |
239 | } | |
240 | ||
241 | if (nob < hdr_size) { | |
242 | CERROR("Short message: %d\n", nob); | |
243 | return -EPROTO; | |
244 | } | |
245 | ||
246 | msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; | |
247 | if (msg_nob > nob) { | |
248 | CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); | |
249 | return -EPROTO; | |
250 | } | |
251 | ||
252 | /* checksum must be computed with ibm_cksum zero and BEFORE anything | |
253 | * gets flipped */ | |
254 | msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum; | |
255 | msg->ibm_cksum = 0; | |
256 | if (msg_cksum != 0 && | |
257 | msg_cksum != kiblnd_cksum(msg, msg_nob)) { | |
258 | CERROR("Bad checksum\n"); | |
259 | return -EPROTO; | |
260 | } | |
261 | ||
262 | msg->ibm_cksum = msg_cksum; | |
263 | ||
264 | if (flip) { | |
265 | /* leave magic unflipped as a clue to peer endianness */ | |
266 | msg->ibm_version = version; | |
febe73bd GM |
267 | CLASSERT(sizeof(msg->ibm_type) == 1); |
268 | CLASSERT(sizeof(msg->ibm_credits) == 1); | |
d7e09d03 PT |
269 | msg->ibm_nob = msg_nob; |
270 | __swab64s(&msg->ibm_srcnid); | |
271 | __swab64s(&msg->ibm_srcstamp); | |
272 | __swab64s(&msg->ibm_dstnid); | |
273 | __swab64s(&msg->ibm_dststamp); | |
274 | } | |
275 | ||
276 | if (msg->ibm_srcnid == LNET_NID_ANY) { | |
277 | CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); | |
278 | return -EPROTO; | |
279 | } | |
280 | ||
281 | if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) { | |
282 | CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type), | |
283 | msg_nob, kiblnd_msgtype2size(msg->ibm_type)); | |
284 | return -EPROTO; | |
285 | } | |
286 | ||
287 | switch (msg->ibm_type) { | |
288 | default: | |
289 | CERROR("Unknown message type %x\n", msg->ibm_type); | |
290 | return -EPROTO; | |
291 | ||
292 | case IBLND_MSG_NOOP: | |
293 | case IBLND_MSG_IMMEDIATE: | |
294 | case IBLND_MSG_PUT_REQ: | |
295 | break; | |
296 | ||
297 | case IBLND_MSG_PUT_ACK: | |
298 | case IBLND_MSG_GET_REQ: | |
299 | if (kiblnd_unpack_rd(msg, flip)) | |
300 | return -EPROTO; | |
301 | break; | |
302 | ||
303 | case IBLND_MSG_PUT_NAK: | |
304 | case IBLND_MSG_PUT_DONE: | |
305 | case IBLND_MSG_GET_DONE: | |
306 | if (flip) | |
307 | __swab32s(&msg->ibm_u.completion.ibcm_status); | |
308 | break; | |
309 | ||
310 | case IBLND_MSG_CONNREQ: | |
311 | case IBLND_MSG_CONNACK: | |
312 | if (flip) { | |
313 | __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth); | |
314 | __swab16s(&msg->ibm_u.connparams.ibcp_max_frags); | |
315 | __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); | |
316 | } | |
317 | break; | |
318 | } | |
319 | return 0; | |
320 | } | |
321 | ||
febe73bd | 322 | int kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid) |
d7e09d03 | 323 | { |
ec3d17c0 MS |
324 | kib_peer_t *peer; |
325 | kib_net_t *net = ni->ni_data; | |
326 | int cpt = lnet_cpt_of_nid(nid); | |
327 | unsigned long flags; | |
d7e09d03 PT |
328 | |
329 | LASSERT(net != NULL); | |
330 | LASSERT(nid != LNET_NID_ANY); | |
331 | ||
332 | LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer)); | |
333 | if (peer == NULL) { | |
334 | CERROR("Cannot allocate peer\n"); | |
335 | return -ENOMEM; | |
336 | } | |
337 | ||
338 | memset(peer, 0, sizeof(*peer)); /* zero flags etc */ | |
339 | ||
340 | peer->ibp_ni = ni; | |
341 | peer->ibp_nid = nid; | |
342 | peer->ibp_error = 0; | |
343 | peer->ibp_last_alive = 0; | |
344 | atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */ | |
345 | ||
346 | INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */ | |
347 | INIT_LIST_HEAD(&peer->ibp_conns); | |
348 | INIT_LIST_HEAD(&peer->ibp_tx_queue); | |
349 | ||
350 | write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
351 | ||
352 | /* always called with a ref on ni, which prevents ni being shutdown */ | |
febe73bd | 353 | LASSERT(net->ibn_shutdown == 0); |
d7e09d03 PT |
354 | |
355 | /* npeers only grows with the global lock held */ | |
356 | atomic_inc(&net->ibn_npeers); | |
357 | ||
358 | write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
359 | ||
360 | *peerp = peer; | |
361 | return 0; | |
362 | } | |
363 | ||
febe73bd | 364 | void kiblnd_destroy_peer(kib_peer_t *peer) |
d7e09d03 PT |
365 | { |
366 | kib_net_t *net = peer->ibp_ni->ni_data; | |
367 | ||
febe73bd GM |
368 | LASSERT(net != NULL); |
369 | LASSERT(atomic_read(&peer->ibp_refcount) == 0); | |
370 | LASSERT(!kiblnd_peer_active(peer)); | |
371 | LASSERT(peer->ibp_connecting == 0); | |
372 | LASSERT(peer->ibp_accepting == 0); | |
373 | LASSERT(list_empty(&peer->ibp_conns)); | |
374 | LASSERT(list_empty(&peer->ibp_tx_queue)); | |
d7e09d03 PT |
375 | |
376 | LIBCFS_FREE(peer, sizeof(*peer)); | |
377 | ||
378 | /* NB a peer's connections keep a reference on their peer until | |
379 | * they are destroyed, so we can be assured that _all_ state to do | |
380 | * with this peer has been cleaned up when its refcount drops to | |
381 | * zero. */ | |
382 | atomic_dec(&net->ibn_npeers); | |
383 | } | |
384 | ||
febe73bd | 385 | kib_peer_t *kiblnd_find_peer_locked(lnet_nid_t nid) |
d7e09d03 PT |
386 | { |
387 | /* the caller is responsible for accounting the additional reference | |
388 | * that this creates */ | |
ec3d17c0 MS |
389 | struct list_head *peer_list = kiblnd_nid2peerlist(nid); |
390 | struct list_head *tmp; | |
391 | kib_peer_t *peer; | |
d7e09d03 | 392 | |
febe73bd | 393 | list_for_each(tmp, peer_list) { |
d7e09d03 PT |
394 | |
395 | peer = list_entry(tmp, kib_peer_t, ibp_list); | |
396 | ||
febe73bd | 397 | LASSERT(peer->ibp_connecting > 0 || /* creating conns */ |
d7e09d03 PT |
398 | peer->ibp_accepting > 0 || |
399 | !list_empty(&peer->ibp_conns)); /* active conn */ | |
400 | ||
401 | if (peer->ibp_nid != nid) | |
402 | continue; | |
403 | ||
404 | CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n", | |
405 | peer, libcfs_nid2str(nid), | |
406 | atomic_read(&peer->ibp_refcount), | |
407 | peer->ibp_version); | |
408 | return peer; | |
409 | } | |
410 | return NULL; | |
411 | } | |
412 | ||
febe73bd | 413 | void kiblnd_unlink_peer_locked(kib_peer_t *peer) |
d7e09d03 | 414 | { |
febe73bd | 415 | LASSERT(list_empty(&peer->ibp_conns)); |
d7e09d03 | 416 | |
febe73bd | 417 | LASSERT(kiblnd_peer_active(peer)); |
d7e09d03 PT |
418 | list_del_init(&peer->ibp_list); |
419 | /* lose peerlist's ref */ | |
420 | kiblnd_peer_decref(peer); | |
421 | } | |
422 | ||
febe73bd GM |
423 | static int kiblnd_get_peer_info(lnet_ni_t *ni, int index, |
424 | lnet_nid_t *nidp, int *count) | |
d7e09d03 | 425 | { |
ec3d17c0 MS |
426 | kib_peer_t *peer; |
427 | struct list_head *ptmp; | |
428 | int i; | |
429 | unsigned long flags; | |
d7e09d03 PT |
430 | |
431 | read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
432 | ||
433 | for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { | |
434 | ||
febe73bd | 435 | list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { |
d7e09d03 PT |
436 | |
437 | peer = list_entry(ptmp, kib_peer_t, ibp_list); | |
febe73bd | 438 | LASSERT(peer->ibp_connecting > 0 || |
d7e09d03 PT |
439 | peer->ibp_accepting > 0 || |
440 | !list_empty(&peer->ibp_conns)); | |
441 | ||
442 | if (peer->ibp_ni != ni) | |
443 | continue; | |
444 | ||
445 | if (index-- > 0) | |
446 | continue; | |
447 | ||
448 | *nidp = peer->ibp_nid; | |
449 | *count = atomic_read(&peer->ibp_refcount); | |
450 | ||
451 | read_unlock_irqrestore(&kiblnd_data.kib_global_lock, | |
452 | flags); | |
453 | return 0; | |
454 | } | |
455 | } | |
456 | ||
457 | read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
458 | return -ENOENT; | |
459 | } | |
460 | ||
febe73bd | 461 | static void kiblnd_del_peer_locked(kib_peer_t *peer) |
d7e09d03 | 462 | { |
ec3d17c0 MS |
463 | struct list_head *ctmp; |
464 | struct list_head *cnxt; | |
465 | kib_conn_t *conn; | |
d7e09d03 PT |
466 | |
467 | if (list_empty(&peer->ibp_conns)) { | |
468 | kiblnd_unlink_peer_locked(peer); | |
469 | } else { | |
febe73bd | 470 | list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { |
d7e09d03 PT |
471 | conn = list_entry(ctmp, kib_conn_t, ibc_list); |
472 | ||
473 | kiblnd_close_conn_locked(conn, 0); | |
474 | } | |
475 | /* NB closing peer's last conn unlinked it. */ | |
476 | } | |
477 | /* NB peer now unlinked; might even be freed if the peer table had the | |
478 | * last ref on it. */ | |
479 | } | |
480 | ||
febe73bd | 481 | static int kiblnd_del_peer(lnet_ni_t *ni, lnet_nid_t nid) |
d7e09d03 | 482 | { |
febe73bd | 483 | LIST_HEAD(zombies); |
ec3d17c0 MS |
484 | struct list_head *ptmp; |
485 | struct list_head *pnxt; | |
486 | kib_peer_t *peer; | |
487 | int lo; | |
488 | int hi; | |
489 | int i; | |
490 | unsigned long flags; | |
491 | int rc = -ENOENT; | |
d7e09d03 PT |
492 | |
493 | write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
494 | ||
495 | if (nid != LNET_NID_ANY) { | |
496 | lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; | |
497 | } else { | |
498 | lo = 0; | |
499 | hi = kiblnd_data.kib_peer_hash_size - 1; | |
500 | } | |
501 | ||
502 | for (i = lo; i <= hi; i++) { | |
febe73bd | 503 | list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { |
d7e09d03 | 504 | peer = list_entry(ptmp, kib_peer_t, ibp_list); |
febe73bd | 505 | LASSERT(peer->ibp_connecting > 0 || |
d7e09d03 PT |
506 | peer->ibp_accepting > 0 || |
507 | !list_empty(&peer->ibp_conns)); | |
508 | ||
509 | if (peer->ibp_ni != ni) | |
510 | continue; | |
511 | ||
512 | if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) | |
513 | continue; | |
514 | ||
515 | if (!list_empty(&peer->ibp_tx_queue)) { | |
febe73bd | 516 | LASSERT(list_empty(&peer->ibp_conns)); |
d7e09d03 PT |
517 | |
518 | list_splice_init(&peer->ibp_tx_queue, | |
519 | &zombies); | |
520 | } | |
521 | ||
522 | kiblnd_del_peer_locked(peer); | |
523 | rc = 0; /* matched something */ | |
524 | } | |
525 | } | |
526 | ||
527 | write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
528 | ||
529 | kiblnd_txlist_done(ni, &zombies, -EIO); | |
530 | ||
531 | return rc; | |
532 | } | |
533 | ||
febe73bd | 534 | static kib_conn_t *kiblnd_get_conn_by_idx(lnet_ni_t *ni, int index) |
d7e09d03 | 535 | { |
ec3d17c0 MS |
536 | kib_peer_t *peer; |
537 | struct list_head *ptmp; | |
538 | kib_conn_t *conn; | |
539 | struct list_head *ctmp; | |
540 | int i; | |
541 | unsigned long flags; | |
d7e09d03 PT |
542 | |
543 | read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
544 | ||
545 | for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { | |
febe73bd | 546 | list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { |
d7e09d03 PT |
547 | |
548 | peer = list_entry(ptmp, kib_peer_t, ibp_list); | |
febe73bd | 549 | LASSERT(peer->ibp_connecting > 0 || |
d7e09d03 PT |
550 | peer->ibp_accepting > 0 || |
551 | !list_empty(&peer->ibp_conns)); | |
552 | ||
553 | if (peer->ibp_ni != ni) | |
554 | continue; | |
555 | ||
febe73bd | 556 | list_for_each(ctmp, &peer->ibp_conns) { |
d7e09d03 PT |
557 | if (index-- > 0) |
558 | continue; | |
559 | ||
560 | conn = list_entry(ctmp, kib_conn_t, | |
561 | ibc_list); | |
562 | kiblnd_conn_addref(conn); | |
7a3888a3 GM |
563 | read_unlock_irqrestore( |
564 | &kiblnd_data.kib_global_lock, | |
565 | flags); | |
d7e09d03 PT |
566 | return conn; |
567 | } | |
568 | } | |
569 | } | |
570 | ||
571 | read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
572 | return NULL; | |
573 | } | |
574 | ||
febe73bd | 575 | int kiblnd_translate_mtu(int value) |
d7e09d03 PT |
576 | { |
577 | switch (value) { | |
578 | default: | |
579 | return -1; | |
580 | case 0: | |
581 | return 0; | |
582 | case 256: | |
583 | return IB_MTU_256; | |
584 | case 512: | |
585 | return IB_MTU_512; | |
586 | case 1024: | |
587 | return IB_MTU_1024; | |
588 | case 2048: | |
589 | return IB_MTU_2048; | |
590 | case 4096: | |
591 | return IB_MTU_4096; | |
592 | } | |
593 | } | |
594 | ||
febe73bd | 595 | static void kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) |
d7e09d03 | 596 | { |
ec3d17c0 | 597 | int mtu; |
d7e09d03 PT |
598 | |
599 | /* XXX There is no path record for iWARP, set by netdev->change_mtu? */ | |
600 | if (cmid->route.path_rec == NULL) | |
601 | return; | |
602 | ||
603 | mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu); | |
febe73bd | 604 | LASSERT(mtu >= 0); |
d7e09d03 PT |
605 | if (mtu != 0) |
606 | cmid->route.path_rec->mtu = mtu; | |
607 | } | |
608 | ||
febe73bd | 609 | static int kiblnd_get_completion_vector(kib_conn_t *conn, int cpt) |
d7e09d03 | 610 | { |
ec3d17c0 MS |
611 | cpumask_t *mask; |
612 | int vectors; | |
613 | int off; | |
614 | int i; | |
615 | lnet_nid_t nid = conn->ibc_peer->ibp_nid; | |
d7e09d03 PT |
616 | |
617 | vectors = conn->ibc_cmid->device->num_comp_vectors; | |
618 | if (vectors <= 1) | |
619 | return 0; | |
620 | ||
621 | mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt); | |
3867ea5a PT |
622 | if (mask == NULL) |
623 | return 0; | |
d7e09d03 PT |
624 | |
625 | /* hash NID to CPU id in this partition... */ | |
4a316f79 OD |
626 | off = do_div(nid, cpumask_weight(mask)); |
627 | for_each_cpu(i, mask) { | |
d7e09d03 PT |
628 | if (off-- == 0) |
629 | return i % vectors; | |
630 | } | |
631 | ||
632 | LBUG(); | |
633 | return 1; | |
634 | } | |
635 | ||
febe73bd GM |
636 | kib_conn_t *kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, |
637 | int state, int version) | |
d7e09d03 PT |
638 | { |
639 | /* CAVEAT EMPTOR: | |
640 | * If the new conn is created successfully it takes over the caller's | |
641 | * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself | |
642 | * is destroyed. On failure, the caller's ref on 'peer' remains and | |
643 | * she must dispose of 'cmid'. (Actually I'd block forever if I tried | |
644 | * to destroy 'cmid' here since I'm called from the CM which still has | |
645 | * its ref on 'cmid'). */ | |
ec3d17c0 MS |
646 | rwlock_t *glock = &kiblnd_data.kib_global_lock; |
647 | kib_net_t *net = peer->ibp_ni->ni_data; | |
648 | kib_dev_t *dev; | |
d7e09d03 | 649 | struct ib_qp_init_attr *init_qp_attr; |
ec3d17c0 | 650 | struct kib_sched_info *sched; |
23908db4 | 651 | struct ib_cq_init_attr cq_attr = {}; |
ec3d17c0 MS |
652 | kib_conn_t *conn; |
653 | struct ib_cq *cq; | |
654 | unsigned long flags; | |
655 | int cpt; | |
656 | int rc; | |
657 | int i; | |
d7e09d03 PT |
658 | |
659 | LASSERT(net != NULL); | |
660 | LASSERT(!in_interrupt()); | |
661 | ||
662 | dev = net->ibn_dev; | |
663 | ||
664 | cpt = lnet_cpt_of_nid(peer->ibp_nid); | |
665 | sched = kiblnd_data.kib_scheds[cpt]; | |
666 | ||
667 | LASSERT(sched->ibs_nthreads > 0); | |
668 | ||
669 | LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt, | |
670 | sizeof(*init_qp_attr)); | |
671 | if (init_qp_attr == NULL) { | |
672 | CERROR("Can't allocate qp_attr for %s\n", | |
673 | libcfs_nid2str(peer->ibp_nid)); | |
674 | goto failed_0; | |
675 | } | |
676 | ||
677 | LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn)); | |
678 | if (conn == NULL) { | |
679 | CERROR("Can't allocate connection for %s\n", | |
680 | libcfs_nid2str(peer->ibp_nid)); | |
681 | goto failed_1; | |
682 | } | |
683 | ||
684 | conn->ibc_state = IBLND_CONN_INIT; | |
685 | conn->ibc_version = version; | |
686 | conn->ibc_peer = peer; /* I take the caller's ref */ | |
687 | cmid->context = conn; /* for future CM callbacks */ | |
688 | conn->ibc_cmid = cmid; | |
689 | ||
690 | INIT_LIST_HEAD(&conn->ibc_early_rxs); | |
691 | INIT_LIST_HEAD(&conn->ibc_tx_noops); | |
692 | INIT_LIST_HEAD(&conn->ibc_tx_queue); | |
693 | INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); | |
694 | INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); | |
695 | INIT_LIST_HEAD(&conn->ibc_active_txs); | |
696 | spin_lock_init(&conn->ibc_lock); | |
697 | ||
698 | LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt, | |
699 | sizeof(*conn->ibc_connvars)); | |
700 | if (conn->ibc_connvars == NULL) { | |
701 | CERROR("Can't allocate in-progress connection state\n"); | |
702 | goto failed_2; | |
703 | } | |
704 | ||
705 | write_lock_irqsave(glock, flags); | |
706 | if (dev->ibd_failover) { | |
707 | write_unlock_irqrestore(glock, flags); | |
708 | CERROR("%s: failover in progress\n", dev->ibd_ifname); | |
709 | goto failed_2; | |
710 | } | |
711 | ||
712 | if (dev->ibd_hdev->ibh_ibdev != cmid->device) { | |
713 | /* wakeup failover thread and teardown connection */ | |
714 | if (kiblnd_dev_can_failover(dev)) { | |
715 | list_add_tail(&dev->ibd_fail_list, | |
716 | &kiblnd_data.kib_failed_devs); | |
717 | wake_up(&kiblnd_data.kib_failover_waitq); | |
718 | } | |
719 | ||
720 | write_unlock_irqrestore(glock, flags); | |
721 | CERROR("cmid HCA(%s), kib_dev(%s) need failover\n", | |
722 | cmid->device->name, dev->ibd_ifname); | |
723 | goto failed_2; | |
724 | } | |
725 | ||
726 | kiblnd_hdev_addref_locked(dev->ibd_hdev); | |
727 | conn->ibc_hdev = dev->ibd_hdev; | |
728 | ||
729 | kiblnd_setup_mtu_locked(cmid); | |
730 | ||
731 | write_unlock_irqrestore(glock, flags); | |
732 | ||
733 | LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, | |
734 | IBLND_RX_MSGS(version) * sizeof(kib_rx_t)); | |
735 | if (conn->ibc_rxs == NULL) { | |
736 | CERROR("Cannot allocate RX buffers\n"); | |
737 | goto failed_2; | |
738 | } | |
739 | ||
740 | rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt, | |
741 | IBLND_RX_MSG_PAGES(version)); | |
742 | if (rc != 0) | |
743 | goto failed_2; | |
744 | ||
745 | kiblnd_map_rx_descs(conn); | |
746 | ||
8e37210b MB |
747 | cq_attr.cqe = IBLND_CQ_ENTRIES(version); |
748 | cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt); | |
d7e09d03 PT |
749 | cq = ib_create_cq(cmid->device, |
750 | kiblnd_cq_completion, kiblnd_cq_event, conn, | |
8e37210b | 751 | &cq_attr); |
d7e09d03 PT |
752 | if (IS_ERR(cq)) { |
753 | CERROR("Can't create CQ: %ld, cqe: %d\n", | |
754 | PTR_ERR(cq), IBLND_CQ_ENTRIES(version)); | |
755 | goto failed_2; | |
756 | } | |
757 | ||
758 | conn->ibc_cq = cq; | |
759 | ||
760 | rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); | |
761 | if (rc != 0) { | |
762 | CERROR("Can't request completion notificiation: %d\n", rc); | |
763 | goto failed_2; | |
764 | } | |
765 | ||
766 | init_qp_attr->event_handler = kiblnd_qp_event; | |
767 | init_qp_attr->qp_context = conn; | |
768 | init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version); | |
769 | init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(version); | |
770 | init_qp_attr->cap.max_send_sge = 1; | |
771 | init_qp_attr->cap.max_recv_sge = 1; | |
772 | init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR; | |
773 | init_qp_attr->qp_type = IB_QPT_RC; | |
774 | init_qp_attr->send_cq = cq; | |
775 | init_qp_attr->recv_cq = cq; | |
776 | ||
777 | conn->ibc_sched = sched; | |
778 | ||
779 | rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); | |
780 | if (rc != 0) { | |
781 | CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n", | |
782 | rc, init_qp_attr->cap.max_send_wr, | |
783 | init_qp_attr->cap.max_recv_wr); | |
784 | goto failed_2; | |
785 | } | |
786 | ||
787 | LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); | |
788 | ||
789 | /* 1 ref for caller and each rxmsg */ | |
790 | atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version)); | |
791 | conn->ibc_nrx = IBLND_RX_MSGS(version); | |
792 | ||
793 | /* post receives */ | |
794 | for (i = 0; i < IBLND_RX_MSGS(version); i++) { | |
795 | rc = kiblnd_post_rx(&conn->ibc_rxs[i], | |
796 | IBLND_POSTRX_NO_CREDIT); | |
797 | if (rc != 0) { | |
798 | CERROR("Can't post rxmsg: %d\n", rc); | |
799 | ||
800 | /* Make posted receives complete */ | |
801 | kiblnd_abort_receives(conn); | |
802 | ||
803 | /* correct # of posted buffers | |
804 | * NB locking needed now I'm racing with completion */ | |
805 | spin_lock_irqsave(&sched->ibs_lock, flags); | |
806 | conn->ibc_nrx -= IBLND_RX_MSGS(version) - i; | |
807 | spin_unlock_irqrestore(&sched->ibs_lock, flags); | |
808 | ||
809 | /* cmid will be destroyed by CM(ofed) after cm_callback | |
810 | * returned, so we can't refer it anymore | |
811 | * (by kiblnd_connd()->kiblnd_destroy_conn) */ | |
812 | rdma_destroy_qp(conn->ibc_cmid); | |
813 | conn->ibc_cmid = NULL; | |
814 | ||
815 | /* Drop my own and unused rxbuffer refcounts */ | |
816 | while (i++ <= IBLND_RX_MSGS(version)) | |
817 | kiblnd_conn_decref(conn); | |
818 | ||
819 | return NULL; | |
820 | } | |
821 | } | |
822 | ||
823 | /* Init successful! */ | |
febe73bd | 824 | LASSERT(state == IBLND_CONN_ACTIVE_CONNECT || |
d7e09d03 PT |
825 | state == IBLND_CONN_PASSIVE_WAIT); |
826 | conn->ibc_state = state; | |
827 | ||
828 | /* 1 more conn */ | |
829 | atomic_inc(&net->ibn_nconns); | |
830 | return conn; | |
831 | ||
832 | failed_2: | |
833 | kiblnd_destroy_conn(conn); | |
834 | failed_1: | |
835 | LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); | |
836 | failed_0: | |
837 | return NULL; | |
838 | } | |
839 | ||
febe73bd | 840 | void kiblnd_destroy_conn(kib_conn_t *conn) |
d7e09d03 PT |
841 | { |
842 | struct rdma_cm_id *cmid = conn->ibc_cmid; | |
ec3d17c0 MS |
843 | kib_peer_t *peer = conn->ibc_peer; |
844 | int rc; | |
d7e09d03 | 845 | |
febe73bd GM |
846 | LASSERT(!in_interrupt()); |
847 | LASSERT(atomic_read(&conn->ibc_refcount) == 0); | |
848 | LASSERT(list_empty(&conn->ibc_early_rxs)); | |
849 | LASSERT(list_empty(&conn->ibc_tx_noops)); | |
850 | LASSERT(list_empty(&conn->ibc_tx_queue)); | |
851 | LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd)); | |
852 | LASSERT(list_empty(&conn->ibc_tx_queue_nocred)); | |
853 | LASSERT(list_empty(&conn->ibc_active_txs)); | |
854 | LASSERT(conn->ibc_noops_posted == 0); | |
855 | LASSERT(conn->ibc_nsends_posted == 0); | |
d7e09d03 PT |
856 | |
857 | switch (conn->ibc_state) { | |
858 | default: | |
859 | /* conn must be completely disengaged from the network */ | |
860 | LBUG(); | |
861 | ||
862 | case IBLND_CONN_DISCONNECTED: | |
863 | /* connvars should have been freed already */ | |
febe73bd | 864 | LASSERT(conn->ibc_connvars == NULL); |
d7e09d03 PT |
865 | break; |
866 | ||
867 | case IBLND_CONN_INIT: | |
868 | break; | |
869 | } | |
870 | ||
871 | /* conn->ibc_cmid might be destroyed by CM already */ | |
872 | if (cmid != NULL && cmid->qp != NULL) | |
873 | rdma_destroy_qp(cmid); | |
874 | ||
875 | if (conn->ibc_cq != NULL) { | |
876 | rc = ib_destroy_cq(conn->ibc_cq); | |
877 | if (rc != 0) | |
878 | CWARN("Error destroying CQ: %d\n", rc); | |
879 | } | |
880 | ||
881 | if (conn->ibc_rx_pages != NULL) | |
882 | kiblnd_unmap_rx_descs(conn); | |
883 | ||
884 | if (conn->ibc_rxs != NULL) { | |
885 | LIBCFS_FREE(conn->ibc_rxs, | |
7a3888a3 GM |
886 | IBLND_RX_MSGS(conn->ibc_version) |
887 | * sizeof(kib_rx_t)); | |
d7e09d03 PT |
888 | } |
889 | ||
890 | if (conn->ibc_connvars != NULL) | |
891 | LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); | |
892 | ||
893 | if (conn->ibc_hdev != NULL) | |
894 | kiblnd_hdev_decref(conn->ibc_hdev); | |
895 | ||
896 | /* See CAVEAT EMPTOR above in kiblnd_create_conn */ | |
897 | if (conn->ibc_state != IBLND_CONN_INIT) { | |
898 | kib_net_t *net = peer->ibp_ni->ni_data; | |
899 | ||
900 | kiblnd_peer_decref(peer); | |
901 | rdma_destroy_id(cmid); | |
902 | atomic_dec(&net->ibn_nconns); | |
903 | } | |
904 | ||
905 | LIBCFS_FREE(conn, sizeof(*conn)); | |
906 | } | |
907 | ||
febe73bd | 908 | int kiblnd_close_peer_conns_locked(kib_peer_t *peer, int why) |
d7e09d03 | 909 | { |
ec3d17c0 MS |
910 | kib_conn_t *conn; |
911 | struct list_head *ctmp; | |
912 | struct list_head *cnxt; | |
913 | int count = 0; | |
d7e09d03 | 914 | |
febe73bd | 915 | list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { |
d7e09d03 PT |
916 | conn = list_entry(ctmp, kib_conn_t, ibc_list); |
917 | ||
2d00bd17 | 918 | CDEBUG(D_NET, "Closing conn -> %s, version: %x, reason: %d\n", |
d7e09d03 PT |
919 | libcfs_nid2str(peer->ibp_nid), |
920 | conn->ibc_version, why); | |
921 | ||
922 | kiblnd_close_conn_locked(conn, why); | |
923 | count++; | |
924 | } | |
925 | ||
926 | return count; | |
927 | } | |
928 | ||
febe73bd GM |
929 | int kiblnd_close_stale_conns_locked(kib_peer_t *peer, |
930 | int version, __u64 incarnation) | |
d7e09d03 | 931 | { |
ec3d17c0 MS |
932 | kib_conn_t *conn; |
933 | struct list_head *ctmp; | |
934 | struct list_head *cnxt; | |
935 | int count = 0; | |
d7e09d03 | 936 | |
febe73bd | 937 | list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { |
d7e09d03 PT |
938 | conn = list_entry(ctmp, kib_conn_t, ibc_list); |
939 | ||
940 | if (conn->ibc_version == version && | |
941 | conn->ibc_incarnation == incarnation) | |
942 | continue; | |
943 | ||
7a3888a3 GM |
944 | CDEBUG(D_NET, |
945 | "Closing stale conn -> %s version: %x, incarnation:%#llx(%x, %#llx)\n", | |
d7e09d03 PT |
946 | libcfs_nid2str(peer->ibp_nid), |
947 | conn->ibc_version, conn->ibc_incarnation, | |
948 | version, incarnation); | |
949 | ||
950 | kiblnd_close_conn_locked(conn, -ESTALE); | |
951 | count++; | |
952 | } | |
953 | ||
954 | return count; | |
955 | } | |
956 | ||
febe73bd | 957 | static int kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid) |
d7e09d03 | 958 | { |
ec3d17c0 MS |
959 | kib_peer_t *peer; |
960 | struct list_head *ptmp; | |
961 | struct list_head *pnxt; | |
962 | int lo; | |
963 | int hi; | |
964 | int i; | |
965 | unsigned long flags; | |
966 | int count = 0; | |
d7e09d03 PT |
967 | |
968 | write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
969 | ||
970 | if (nid != LNET_NID_ANY) | |
971 | lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; | |
972 | else { | |
973 | lo = 0; | |
974 | hi = kiblnd_data.kib_peer_hash_size - 1; | |
975 | } | |
976 | ||
977 | for (i = lo; i <= hi; i++) { | |
febe73bd | 978 | list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { |
d7e09d03 PT |
979 | |
980 | peer = list_entry(ptmp, kib_peer_t, ibp_list); | |
febe73bd | 981 | LASSERT(peer->ibp_connecting > 0 || |
d7e09d03 PT |
982 | peer->ibp_accepting > 0 || |
983 | !list_empty(&peer->ibp_conns)); | |
984 | ||
985 | if (peer->ibp_ni != ni) | |
986 | continue; | |
987 | ||
988 | if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) | |
989 | continue; | |
990 | ||
991 | count += kiblnd_close_peer_conns_locked(peer, 0); | |
992 | } | |
993 | } | |
994 | ||
995 | write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
996 | ||
997 | /* wildcards always succeed */ | |
998 | if (nid == LNET_NID_ANY) | |
999 | return 0; | |
1000 | ||
1001 | return (count == 0) ? -ENOENT : 0; | |
1002 | } | |
1003 | ||
febe73bd | 1004 | int kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) |
d7e09d03 PT |
1005 | { |
1006 | struct libcfs_ioctl_data *data = arg; | |
ec3d17c0 | 1007 | int rc = -EINVAL; |
d7e09d03 | 1008 | |
a58a38ac | 1009 | switch (cmd) { |
d7e09d03 | 1010 | case IOC_LIBCFS_GET_PEER: { |
ec3d17c0 MS |
1011 | lnet_nid_t nid = 0; |
1012 | int count = 0; | |
d7e09d03 PT |
1013 | |
1014 | rc = kiblnd_get_peer_info(ni, data->ioc_count, | |
1015 | &nid, &count); | |
ec3d17c0 MS |
1016 | data->ioc_nid = nid; |
1017 | data->ioc_count = count; | |
d7e09d03 PT |
1018 | break; |
1019 | } | |
1020 | ||
1021 | case IOC_LIBCFS_DEL_PEER: { | |
1022 | rc = kiblnd_del_peer(ni, data->ioc_nid); | |
1023 | break; | |
1024 | } | |
1025 | case IOC_LIBCFS_GET_CONN: { | |
1026 | kib_conn_t *conn; | |
1027 | ||
1028 | rc = 0; | |
1029 | conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); | |
1030 | if (conn == NULL) { | |
1031 | rc = -ENOENT; | |
1032 | break; | |
1033 | } | |
1034 | ||
febe73bd | 1035 | LASSERT(conn->ibc_cmid != NULL); |
d7e09d03 PT |
1036 | data->ioc_nid = conn->ibc_peer->ibp_nid; |
1037 | if (conn->ibc_cmid->route.path_rec == NULL) | |
1038 | data->ioc_u32[0] = 0; /* iWarp has no path MTU */ | |
1039 | else | |
1040 | data->ioc_u32[0] = | |
1041 | ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu); | |
1042 | kiblnd_conn_decref(conn); | |
1043 | break; | |
1044 | } | |
1045 | case IOC_LIBCFS_CLOSE_CONNECTION: { | |
1046 | rc = kiblnd_close_matching_conns(ni, data->ioc_nid); | |
1047 | break; | |
1048 | } | |
1049 | ||
1050 | default: | |
1051 | break; | |
1052 | } | |
1053 | ||
1054 | return rc; | |
1055 | } | |
1056 | ||
febe73bd | 1057 | void kiblnd_query(lnet_ni_t *ni, lnet_nid_t nid, unsigned long *when) |
d7e09d03 | 1058 | { |
ec3d17c0 MS |
1059 | unsigned long last_alive = 0; |
1060 | unsigned long now = cfs_time_current(); | |
1061 | rwlock_t *glock = &kiblnd_data.kib_global_lock; | |
1062 | kib_peer_t *peer; | |
1063 | unsigned long flags; | |
d7e09d03 PT |
1064 | |
1065 | read_lock_irqsave(glock, flags); | |
1066 | ||
1067 | peer = kiblnd_find_peer_locked(nid); | |
1068 | if (peer != NULL) { | |
febe73bd | 1069 | LASSERT(peer->ibp_connecting > 0 || /* creating conns */ |
d7e09d03 PT |
1070 | peer->ibp_accepting > 0 || |
1071 | !list_empty(&peer->ibp_conns)); /* active conn */ | |
1072 | last_alive = peer->ibp_last_alive; | |
1073 | } | |
1074 | ||
1075 | read_unlock_irqrestore(glock, flags); | |
1076 | ||
1077 | if (last_alive != 0) | |
1078 | *when = last_alive; | |
1079 | ||
1080 | /* peer is not persistent in hash, trigger peer creation | |
1081 | * and connection establishment with a NULL tx */ | |
1082 | if (peer == NULL) | |
1083 | kiblnd_launch_tx(ni, NULL, nid); | |
1084 | ||
1085 | CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n", | |
1086 | libcfs_nid2str(nid), peer, | |
1087 | last_alive ? cfs_duration_sec(now - last_alive) : -1); | |
d7e09d03 PT |
1088 | } |
1089 | ||
febe73bd | 1090 | void kiblnd_free_pages(kib_pages_t *p) |
d7e09d03 | 1091 | { |
ec3d17c0 MS |
1092 | int npages = p->ibp_npages; |
1093 | int i; | |
d7e09d03 PT |
1094 | |
1095 | for (i = 0; i < npages; i++) { | |
1096 | if (p->ibp_pages[i] != NULL) | |
1097 | __free_page(p->ibp_pages[i]); | |
1098 | } | |
1099 | ||
1100 | LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages])); | |
1101 | } | |
1102 | ||
febe73bd | 1103 | int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages) |
d7e09d03 | 1104 | { |
ec3d17c0 MS |
1105 | kib_pages_t *p; |
1106 | int i; | |
d7e09d03 PT |
1107 | |
1108 | LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt, | |
1109 | offsetof(kib_pages_t, ibp_pages[npages])); | |
1110 | if (p == NULL) { | |
1111 | CERROR("Can't allocate descriptor for %d pages\n", npages); | |
1112 | return -ENOMEM; | |
1113 | } | |
1114 | ||
1115 | memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages])); | |
1116 | p->ibp_npages = npages; | |
1117 | ||
1118 | for (i = 0; i < npages; i++) { | |
49c02a75 PT |
1119 | p->ibp_pages[i] = alloc_pages_node( |
1120 | cfs_cpt_spread_node(lnet_cpt_table(), cpt), | |
0be19afa | 1121 | GFP_NOFS, 0); |
d7e09d03 PT |
1122 | if (p->ibp_pages[i] == NULL) { |
1123 | CERROR("Can't allocate page %d of %d\n", i, npages); | |
1124 | kiblnd_free_pages(p); | |
1125 | return -ENOMEM; | |
1126 | } | |
1127 | } | |
1128 | ||
1129 | *pp = p; | |
1130 | return 0; | |
1131 | } | |
1132 | ||
febe73bd | 1133 | void kiblnd_unmap_rx_descs(kib_conn_t *conn) |
d7e09d03 PT |
1134 | { |
1135 | kib_rx_t *rx; | |
ec3d17c0 | 1136 | int i; |
d7e09d03 | 1137 | |
febe73bd GM |
1138 | LASSERT(conn->ibc_rxs != NULL); |
1139 | LASSERT(conn->ibc_hdev != NULL); | |
d7e09d03 PT |
1140 | |
1141 | for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) { | |
1142 | rx = &conn->ibc_rxs[i]; | |
1143 | ||
febe73bd | 1144 | LASSERT(rx->rx_nob >= 0); /* not posted */ |
d7e09d03 PT |
1145 | |
1146 | kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev, | |
1147 | KIBLND_UNMAP_ADDR(rx, rx_msgunmap, | |
1148 | rx->rx_msgaddr), | |
1149 | IBLND_MSG_SIZE, DMA_FROM_DEVICE); | |
1150 | } | |
1151 | ||
1152 | kiblnd_free_pages(conn->ibc_rx_pages); | |
1153 | ||
1154 | conn->ibc_rx_pages = NULL; | |
1155 | } | |
1156 | ||
febe73bd | 1157 | void kiblnd_map_rx_descs(kib_conn_t *conn) |
d7e09d03 | 1158 | { |
ec3d17c0 MS |
1159 | kib_rx_t *rx; |
1160 | struct page *pg; | |
1161 | int pg_off; | |
1162 | int ipg; | |
1163 | int i; | |
d7e09d03 | 1164 | |
ec3d17c0 | 1165 | for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) { |
d7e09d03 PT |
1166 | pg = conn->ibc_rx_pages->ibp_pages[ipg]; |
1167 | rx = &conn->ibc_rxs[i]; | |
1168 | ||
1169 | rx->rx_conn = conn; | |
1170 | rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off); | |
1171 | ||
1172 | rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, | |
7a3888a3 GM |
1173 | rx->rx_msg, |
1174 | IBLND_MSG_SIZE, | |
d7e09d03 | 1175 | DMA_FROM_DEVICE); |
febe73bd | 1176 | LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev, |
d7e09d03 PT |
1177 | rx->rx_msgaddr)); |
1178 | KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr); | |
1179 | ||
1d8cb70c | 1180 | CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n", |
d7e09d03 | 1181 | i, rx->rx_msg, rx->rx_msgaddr, |
d664d1fd | 1182 | (__u64)(page_to_phys(pg) + pg_off)); |
d7e09d03 PT |
1183 | |
1184 | pg_off += IBLND_MSG_SIZE; | |
febe73bd | 1185 | LASSERT(pg_off <= PAGE_SIZE); |
d7e09d03 PT |
1186 | |
1187 | if (pg_off == PAGE_SIZE) { | |
1188 | pg_off = 0; | |
1189 | ipg++; | |
febe73bd | 1190 | LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version)); |
d7e09d03 PT |
1191 | } |
1192 | } | |
1193 | } | |
1194 | ||
febe73bd | 1195 | static void kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo) |
d7e09d03 | 1196 | { |
ec3d17c0 MS |
1197 | kib_hca_dev_t *hdev = tpo->tpo_hdev; |
1198 | kib_tx_t *tx; | |
1199 | int i; | |
d7e09d03 | 1200 | |
febe73bd | 1201 | LASSERT(tpo->tpo_pool.po_allocated == 0); |
d7e09d03 PT |
1202 | |
1203 | if (hdev == NULL) | |
1204 | return; | |
1205 | ||
1206 | for (i = 0; i < tpo->tpo_pool.po_size; i++) { | |
1207 | tx = &tpo->tpo_tx_descs[i]; | |
1208 | kiblnd_dma_unmap_single(hdev->ibh_ibdev, | |
1209 | KIBLND_UNMAP_ADDR(tx, tx_msgunmap, | |
1210 | tx->tx_msgaddr), | |
1211 | IBLND_MSG_SIZE, DMA_TO_DEVICE); | |
1212 | } | |
1213 | ||
1214 | kiblnd_hdev_decref(hdev); | |
1215 | tpo->tpo_hdev = NULL; | |
1216 | } | |
1217 | ||
febe73bd | 1218 | static kib_hca_dev_t *kiblnd_current_hdev(kib_dev_t *dev) |
d7e09d03 PT |
1219 | { |
1220 | kib_hca_dev_t *hdev; | |
ec3d17c0 MS |
1221 | unsigned long flags; |
1222 | int i = 0; | |
d7e09d03 PT |
1223 | |
1224 | read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
1225 | while (dev->ibd_failover) { | |
1226 | read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
1227 | if (i++ % 50 == 0) | |
1228 | CDEBUG(D_NET, "%s: Wait for failover\n", | |
1229 | dev->ibd_ifname); | |
1230 | schedule_timeout(cfs_time_seconds(1) / 100); | |
1231 | ||
1232 | read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
1233 | } | |
1234 | ||
1235 | kiblnd_hdev_addref_locked(dev->ibd_hdev); | |
1236 | hdev = dev->ibd_hdev; | |
1237 | ||
1238 | read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
1239 | ||
1240 | return hdev; | |
1241 | } | |
1242 | ||
febe73bd | 1243 | static void kiblnd_map_tx_pool(kib_tx_pool_t *tpo) |
d7e09d03 | 1244 | { |
ec3d17c0 MS |
1245 | kib_pages_t *txpgs = tpo->tpo_tx_pages; |
1246 | kib_pool_t *pool = &tpo->tpo_pool; | |
1247 | kib_net_t *net = pool->po_owner->ps_net; | |
1248 | kib_dev_t *dev; | |
1249 | struct page *page; | |
1250 | kib_tx_t *tx; | |
1251 | int page_offset; | |
1252 | int ipage; | |
1253 | int i; | |
d7e09d03 | 1254 | |
febe73bd | 1255 | LASSERT(net != NULL); |
d7e09d03 PT |
1256 | |
1257 | dev = net->ibn_dev; | |
1258 | ||
1259 | /* pre-mapped messages are not bigger than 1 page */ | |
febe73bd | 1260 | CLASSERT(IBLND_MSG_SIZE <= PAGE_SIZE); |
d7e09d03 PT |
1261 | |
1262 | /* No fancy arithmetic when we do the buffer calculations */ | |
febe73bd | 1263 | CLASSERT(PAGE_SIZE % IBLND_MSG_SIZE == 0); |
d7e09d03 PT |
1264 | |
1265 | tpo->tpo_hdev = kiblnd_current_hdev(dev); | |
1266 | ||
1267 | for (ipage = page_offset = i = 0; i < pool->po_size; i++) { | |
1268 | page = txpgs->ibp_pages[ipage]; | |
1269 | tx = &tpo->tpo_tx_descs[i]; | |
1270 | ||
1271 | tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + | |
1272 | page_offset); | |
1273 | ||
1274 | tx->tx_msgaddr = kiblnd_dma_map_single( | |
1275 | tpo->tpo_hdev->ibh_ibdev, tx->tx_msg, | |
1276 | IBLND_MSG_SIZE, DMA_TO_DEVICE); | |
febe73bd | 1277 | LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev, |
d7e09d03 PT |
1278 | tx->tx_msgaddr)); |
1279 | KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr); | |
1280 | ||
1281 | list_add(&tx->tx_list, &pool->po_free_list); | |
1282 | ||
1283 | page_offset += IBLND_MSG_SIZE; | |
febe73bd | 1284 | LASSERT(page_offset <= PAGE_SIZE); |
d7e09d03 PT |
1285 | |
1286 | if (page_offset == PAGE_SIZE) { | |
1287 | page_offset = 0; | |
1288 | ipage++; | |
febe73bd | 1289 | LASSERT(ipage <= txpgs->ibp_npages); |
d7e09d03 PT |
1290 | } |
1291 | } | |
1292 | } | |
1293 | ||
febe73bd | 1294 | struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev, __u64 addr, __u64 size) |
d7e09d03 | 1295 | { |
ec3d17c0 | 1296 | __u64 index; |
d7e09d03 | 1297 | |
febe73bd | 1298 | LASSERT(hdev->ibh_mrs[0] != NULL); |
d7e09d03 PT |
1299 | |
1300 | if (hdev->ibh_nmrs == 1) | |
1301 | return hdev->ibh_mrs[0]; | |
1302 | ||
1303 | index = addr >> hdev->ibh_mr_shift; | |
1304 | ||
1305 | if (index < hdev->ibh_nmrs && | |
1306 | index == ((addr + size - 1) >> hdev->ibh_mr_shift)) | |
1307 | return hdev->ibh_mrs[index]; | |
1308 | ||
1309 | return NULL; | |
1310 | } | |
1311 | ||
febe73bd | 1312 | struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd) |
d7e09d03 PT |
1313 | { |
1314 | struct ib_mr *prev_mr; | |
1315 | struct ib_mr *mr; | |
ec3d17c0 | 1316 | int i; |
d7e09d03 | 1317 | |
febe73bd | 1318 | LASSERT(hdev->ibh_mrs[0] != NULL); |
d7e09d03 PT |
1319 | |
1320 | if (*kiblnd_tunables.kib_map_on_demand > 0 && | |
1321 | *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags) | |
1322 | return NULL; | |
1323 | ||
1324 | if (hdev->ibh_nmrs == 1) | |
1325 | return hdev->ibh_mrs[0]; | |
1326 | ||
1327 | for (i = 0, mr = prev_mr = NULL; | |
1328 | i < rd->rd_nfrags; i++) { | |
1329 | mr = kiblnd_find_dma_mr(hdev, | |
1330 | rd->rd_frags[i].rf_addr, | |
1331 | rd->rd_frags[i].rf_nob); | |
1332 | if (prev_mr == NULL) | |
1333 | prev_mr = mr; | |
1334 | ||
1335 | if (mr == NULL || prev_mr != mr) { | |
1336 | /* Can't covered by one single MR */ | |
1337 | mr = NULL; | |
1338 | break; | |
1339 | } | |
1340 | } | |
1341 | ||
1342 | return mr; | |
1343 | } | |
1344 | ||
febe73bd | 1345 | static void kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool) |
d7e09d03 | 1346 | { |
febe73bd | 1347 | LASSERT(pool->fpo_map_count == 0); |
d7e09d03 PT |
1348 | |
1349 | if (pool->fpo_fmr_pool != NULL) | |
1350 | ib_destroy_fmr_pool(pool->fpo_fmr_pool); | |
1351 | ||
1352 | if (pool->fpo_hdev != NULL) | |
1353 | kiblnd_hdev_decref(pool->fpo_hdev); | |
1354 | ||
1355 | LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t)); | |
1356 | } | |
1357 | ||
febe73bd | 1358 | static void kiblnd_destroy_fmr_pool_list(struct list_head *head) |
d7e09d03 PT |
1359 | { |
1360 | kib_fmr_pool_t *pool; | |
1361 | ||
1362 | while (!list_empty(head)) { | |
1363 | pool = list_entry(head->next, kib_fmr_pool_t, fpo_list); | |
1364 | list_del(&pool->fpo_list); | |
1365 | kiblnd_destroy_fmr_pool(pool); | |
1366 | } | |
1367 | } | |
1368 | ||
1369 | static int kiblnd_fmr_pool_size(int ncpts) | |
1370 | { | |
1371 | int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts; | |
1372 | ||
1373 | return max(IBLND_FMR_POOL, size); | |
1374 | } | |
1375 | ||
1376 | static int kiblnd_fmr_flush_trigger(int ncpts) | |
1377 | { | |
1378 | int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts; | |
1379 | ||
1380 | return max(IBLND_FMR_POOL_FLUSH, size); | |
1381 | } | |
1382 | ||
febe73bd GM |
1383 | static int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, |
1384 | kib_fmr_pool_t **pp_fpo) | |
d7e09d03 PT |
1385 | { |
1386 | /* FMR pool for RDMA */ | |
ec3d17c0 MS |
1387 | kib_dev_t *dev = fps->fps_net->ibn_dev; |
1388 | kib_fmr_pool_t *fpo; | |
d7e09d03 PT |
1389 | struct ib_fmr_pool_param param = { |
1390 | .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE, | |
ec3d17c0 MS |
1391 | .page_shift = PAGE_SHIFT, |
1392 | .access = (IB_ACCESS_LOCAL_WRITE | | |
1393 | IB_ACCESS_REMOTE_WRITE), | |
1394 | .pool_size = fps->fps_pool_size, | |
d7e09d03 PT |
1395 | .dirty_watermark = fps->fps_flush_trigger, |
1396 | .flush_function = NULL, | |
ec3d17c0 MS |
1397 | .flush_arg = NULL, |
1398 | .cache = !!*kiblnd_tunables.kib_fmr_cache}; | |
d7e09d03 PT |
1399 | int rc; |
1400 | ||
1401 | LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo)); | |
1402 | if (fpo == NULL) | |
1403 | return -ENOMEM; | |
1404 | ||
1405 | fpo->fpo_hdev = kiblnd_current_hdev(dev); | |
1406 | ||
1407 | fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, ¶m); | |
1408 | if (IS_ERR(fpo->fpo_fmr_pool)) { | |
1409 | rc = PTR_ERR(fpo->fpo_fmr_pool); | |
1410 | CERROR("Failed to create FMR pool: %d\n", rc); | |
1411 | ||
1412 | kiblnd_hdev_decref(fpo->fpo_hdev); | |
1413 | LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t)); | |
1414 | return rc; | |
1415 | } | |
1416 | ||
1417 | fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); | |
1418 | fpo->fpo_owner = fps; | |
1419 | *pp_fpo = fpo; | |
1420 | ||
1421 | return 0; | |
1422 | } | |
1423 | ||
febe73bd GM |
1424 | static void kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, |
1425 | struct list_head *zombies) | |
d7e09d03 PT |
1426 | { |
1427 | if (fps->fps_net == NULL) /* intialized? */ | |
1428 | return; | |
1429 | ||
1430 | spin_lock(&fps->fps_lock); | |
1431 | ||
1432 | while (!list_empty(&fps->fps_pool_list)) { | |
1433 | kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next, | |
1434 | kib_fmr_pool_t, fpo_list); | |
1435 | fpo->fpo_failed = 1; | |
1436 | list_del(&fpo->fpo_list); | |
1437 | if (fpo->fpo_map_count == 0) | |
1438 | list_add(&fpo->fpo_list, zombies); | |
1439 | else | |
1440 | list_add(&fpo->fpo_list, &fps->fps_failed_pool_list); | |
1441 | } | |
1442 | ||
1443 | spin_unlock(&fps->fps_lock); | |
1444 | } | |
1445 | ||
febe73bd | 1446 | static void kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps) |
d7e09d03 PT |
1447 | { |
1448 | if (fps->fps_net != NULL) { /* initialized? */ | |
1449 | kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); | |
1450 | kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list); | |
1451 | } | |
1452 | } | |
1453 | ||
7a3888a3 GM |
1454 | static int kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, |
1455 | kib_net_t *net, int pool_size, | |
1456 | int flush_trigger) | |
d7e09d03 PT |
1457 | { |
1458 | kib_fmr_pool_t *fpo; | |
ec3d17c0 | 1459 | int rc; |
d7e09d03 PT |
1460 | |
1461 | memset(fps, 0, sizeof(kib_fmr_poolset_t)); | |
1462 | ||
1463 | fps->fps_net = net; | |
1464 | fps->fps_cpt = cpt; | |
1465 | fps->fps_pool_size = pool_size; | |
1466 | fps->fps_flush_trigger = flush_trigger; | |
1467 | spin_lock_init(&fps->fps_lock); | |
1468 | INIT_LIST_HEAD(&fps->fps_pool_list); | |
1469 | INIT_LIST_HEAD(&fps->fps_failed_pool_list); | |
1470 | ||
1471 | rc = kiblnd_create_fmr_pool(fps, &fpo); | |
1472 | if (rc == 0) | |
1473 | list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); | |
1474 | ||
1475 | return rc; | |
1476 | } | |
1477 | ||
febe73bd | 1478 | static int kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, unsigned long now) |
d7e09d03 PT |
1479 | { |
1480 | if (fpo->fpo_map_count != 0) /* still in use */ | |
1481 | return 0; | |
1482 | if (fpo->fpo_failed) | |
1483 | return 1; | |
1484 | return cfs_time_aftereq(now, fpo->fpo_deadline); | |
1485 | } | |
1486 | ||
febe73bd | 1487 | void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) |
d7e09d03 | 1488 | { |
febe73bd | 1489 | LIST_HEAD(zombies); |
ec3d17c0 | 1490 | kib_fmr_pool_t *fpo = fmr->fmr_pool; |
d7e09d03 | 1491 | kib_fmr_poolset_t *fps = fpo->fpo_owner; |
ec3d17c0 MS |
1492 | unsigned long now = cfs_time_current(); |
1493 | kib_fmr_pool_t *tmp; | |
1494 | int rc; | |
d7e09d03 PT |
1495 | |
1496 | rc = ib_fmr_pool_unmap(fmr->fmr_pfmr); | |
febe73bd | 1497 | LASSERT(rc == 0); |
d7e09d03 PT |
1498 | |
1499 | if (status != 0) { | |
1500 | rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool); | |
febe73bd | 1501 | LASSERT(rc == 0); |
d7e09d03 PT |
1502 | } |
1503 | ||
1504 | fmr->fmr_pool = NULL; | |
1505 | fmr->fmr_pfmr = NULL; | |
1506 | ||
1507 | spin_lock(&fps->fps_lock); | |
74732797 | 1508 | fpo->fpo_map_count--; /* decref the pool */ |
d7e09d03 PT |
1509 | |
1510 | list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) { | |
1511 | /* the first pool is persistent */ | |
1512 | if (fps->fps_pool_list.next == &fpo->fpo_list) | |
1513 | continue; | |
1514 | ||
1515 | if (kiblnd_fmr_pool_is_idle(fpo, now)) { | |
1516 | list_move(&fpo->fpo_list, &zombies); | |
74732797 | 1517 | fps->fps_version++; |
d7e09d03 PT |
1518 | } |
1519 | } | |
1520 | spin_unlock(&fps->fps_lock); | |
1521 | ||
1522 | if (!list_empty(&zombies)) | |
1523 | kiblnd_destroy_fmr_pool_list(&zombies); | |
1524 | } | |
1525 | ||
febe73bd GM |
1526 | int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages, |
1527 | __u64 iov, kib_fmr_t *fmr) | |
d7e09d03 PT |
1528 | { |
1529 | struct ib_pool_fmr *pfmr; | |
ec3d17c0 MS |
1530 | kib_fmr_pool_t *fpo; |
1531 | __u64 version; | |
1532 | int rc; | |
d7e09d03 PT |
1533 | |
1534 | again: | |
1535 | spin_lock(&fps->fps_lock); | |
1536 | version = fps->fps_version; | |
1537 | list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) { | |
1538 | fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); | |
1539 | fpo->fpo_map_count++; | |
1540 | spin_unlock(&fps->fps_lock); | |
1541 | ||
1542 | pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool, | |
1543 | pages, npages, iov); | |
1544 | if (likely(!IS_ERR(pfmr))) { | |
1545 | fmr->fmr_pool = fpo; | |
1546 | fmr->fmr_pfmr = pfmr; | |
1547 | return 0; | |
1548 | } | |
1549 | ||
1550 | spin_lock(&fps->fps_lock); | |
1551 | fpo->fpo_map_count--; | |
1552 | if (PTR_ERR(pfmr) != -EAGAIN) { | |
1553 | spin_unlock(&fps->fps_lock); | |
1554 | return PTR_ERR(pfmr); | |
1555 | } | |
1556 | ||
1557 | /* EAGAIN and ... */ | |
1558 | if (version != fps->fps_version) { | |
1559 | spin_unlock(&fps->fps_lock); | |
1560 | goto again; | |
1561 | } | |
1562 | } | |
1563 | ||
1564 | if (fps->fps_increasing) { | |
1565 | spin_unlock(&fps->fps_lock); | |
7a3888a3 GM |
1566 | CDEBUG(D_NET, |
1567 | "Another thread is allocating new FMR pool, waiting for her to complete\n"); | |
d7e09d03 PT |
1568 | schedule(); |
1569 | goto again; | |
1570 | ||
1571 | } | |
1572 | ||
699503bc | 1573 | if (time_before(cfs_time_current(), fps->fps_next_retry)) { |
d7e09d03 PT |
1574 | /* someone failed recently */ |
1575 | spin_unlock(&fps->fps_lock); | |
1576 | return -EAGAIN; | |
1577 | } | |
1578 | ||
1579 | fps->fps_increasing = 1; | |
1580 | spin_unlock(&fps->fps_lock); | |
1581 | ||
1582 | CDEBUG(D_NET, "Allocate new FMR pool\n"); | |
1583 | rc = kiblnd_create_fmr_pool(fps, &fpo); | |
1584 | spin_lock(&fps->fps_lock); | |
1585 | fps->fps_increasing = 0; | |
1586 | if (rc == 0) { | |
1587 | fps->fps_version++; | |
1588 | list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); | |
1589 | } else { | |
1590 | fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); | |
1591 | } | |
1592 | spin_unlock(&fps->fps_lock); | |
1593 | ||
1594 | goto again; | |
1595 | } | |
1596 | ||
febe73bd | 1597 | static void kiblnd_fini_pool(kib_pool_t *pool) |
d7e09d03 | 1598 | { |
febe73bd GM |
1599 | LASSERT(list_empty(&pool->po_free_list)); |
1600 | LASSERT(pool->po_allocated == 0); | |
d7e09d03 PT |
1601 | |
1602 | CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name); | |
1603 | } | |
1604 | ||
febe73bd | 1605 | static void kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size) |
d7e09d03 PT |
1606 | { |
1607 | CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name); | |
1608 | ||
1609 | memset(pool, 0, sizeof(kib_pool_t)); | |
1610 | INIT_LIST_HEAD(&pool->po_free_list); | |
1611 | pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); | |
1612 | pool->po_owner = ps; | |
1613 | pool->po_size = size; | |
1614 | } | |
1615 | ||
febe73bd | 1616 | static void kiblnd_destroy_pool_list(struct list_head *head) |
d7e09d03 PT |
1617 | { |
1618 | kib_pool_t *pool; | |
1619 | ||
1620 | while (!list_empty(head)) { | |
1621 | pool = list_entry(head->next, kib_pool_t, po_list); | |
1622 | list_del(&pool->po_list); | |
1623 | ||
febe73bd | 1624 | LASSERT(pool->po_owner != NULL); |
d7e09d03 PT |
1625 | pool->po_owner->ps_pool_destroy(pool); |
1626 | } | |
1627 | } | |
1628 | ||
febe73bd | 1629 | static void kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies) |
d7e09d03 PT |
1630 | { |
1631 | if (ps->ps_net == NULL) /* intialized? */ | |
1632 | return; | |
1633 | ||
1634 | spin_lock(&ps->ps_lock); | |
1635 | while (!list_empty(&ps->ps_pool_list)) { | |
1636 | kib_pool_t *po = list_entry(ps->ps_pool_list.next, | |
1637 | kib_pool_t, po_list); | |
1638 | po->po_failed = 1; | |
1639 | list_del(&po->po_list); | |
1640 | if (po->po_allocated == 0) | |
1641 | list_add(&po->po_list, zombies); | |
1642 | else | |
1643 | list_add(&po->po_list, &ps->ps_failed_pool_list); | |
1644 | } | |
1645 | spin_unlock(&ps->ps_lock); | |
1646 | } | |
1647 | ||
febe73bd | 1648 | static void kiblnd_fini_poolset(kib_poolset_t *ps) |
d7e09d03 PT |
1649 | { |
1650 | if (ps->ps_net != NULL) { /* initialized? */ | |
1651 | kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); | |
1652 | kiblnd_destroy_pool_list(&ps->ps_pool_list); | |
1653 | } | |
1654 | } | |
1655 | ||
febe73bd GM |
1656 | static int kiblnd_init_poolset(kib_poolset_t *ps, int cpt, |
1657 | kib_net_t *net, char *name, int size, | |
1658 | kib_ps_pool_create_t po_create, | |
1659 | kib_ps_pool_destroy_t po_destroy, | |
1660 | kib_ps_node_init_t nd_init, | |
1661 | kib_ps_node_fini_t nd_fini) | |
d7e09d03 | 1662 | { |
ec3d17c0 MS |
1663 | kib_pool_t *pool; |
1664 | int rc; | |
d7e09d03 PT |
1665 | |
1666 | memset(ps, 0, sizeof(kib_poolset_t)); | |
1667 | ||
ec3d17c0 MS |
1668 | ps->ps_cpt = cpt; |
1669 | ps->ps_net = net; | |
d7e09d03 PT |
1670 | ps->ps_pool_create = po_create; |
1671 | ps->ps_pool_destroy = po_destroy; | |
1672 | ps->ps_node_init = nd_init; | |
1673 | ps->ps_node_fini = nd_fini; | |
1674 | ps->ps_pool_size = size; | |
1675 | if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name)) | |
1676 | >= sizeof(ps->ps_name)) | |
1677 | return -E2BIG; | |
1678 | spin_lock_init(&ps->ps_lock); | |
1679 | INIT_LIST_HEAD(&ps->ps_pool_list); | |
1680 | INIT_LIST_HEAD(&ps->ps_failed_pool_list); | |
1681 | ||
1682 | rc = ps->ps_pool_create(ps, size, &pool); | |
1683 | if (rc == 0) | |
1684 | list_add(&pool->po_list, &ps->ps_pool_list); | |
1685 | else | |
1686 | CERROR("Failed to create the first pool for %s\n", ps->ps_name); | |
1687 | ||
1688 | return rc; | |
1689 | } | |
1690 | ||
febe73bd | 1691 | static int kiblnd_pool_is_idle(kib_pool_t *pool, unsigned long now) |
d7e09d03 PT |
1692 | { |
1693 | if (pool->po_allocated != 0) /* still in use */ | |
1694 | return 0; | |
1695 | if (pool->po_failed) | |
1696 | return 1; | |
1697 | return cfs_time_aftereq(now, pool->po_deadline); | |
1698 | } | |
1699 | ||
febe73bd | 1700 | void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node) |
d7e09d03 | 1701 | { |
febe73bd | 1702 | LIST_HEAD(zombies); |
ec3d17c0 MS |
1703 | kib_poolset_t *ps = pool->po_owner; |
1704 | kib_pool_t *tmp; | |
1705 | unsigned long now = cfs_time_current(); | |
d7e09d03 PT |
1706 | |
1707 | spin_lock(&ps->ps_lock); | |
1708 | ||
1709 | if (ps->ps_node_fini != NULL) | |
1710 | ps->ps_node_fini(pool, node); | |
1711 | ||
febe73bd | 1712 | LASSERT(pool->po_allocated > 0); |
d7e09d03 | 1713 | list_add(node, &pool->po_free_list); |
74732797 | 1714 | pool->po_allocated--; |
d7e09d03 PT |
1715 | |
1716 | list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) { | |
1717 | /* the first pool is persistent */ | |
1718 | if (ps->ps_pool_list.next == &pool->po_list) | |
1719 | continue; | |
1720 | ||
1721 | if (kiblnd_pool_is_idle(pool, now)) | |
1722 | list_move(&pool->po_list, &zombies); | |
1723 | } | |
1724 | spin_unlock(&ps->ps_lock); | |
1725 | ||
1726 | if (!list_empty(&zombies)) | |
1727 | kiblnd_destroy_pool_list(&zombies); | |
1728 | } | |
1729 | ||
febe73bd | 1730 | struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps) |
d7e09d03 | 1731 | { |
ec3d17c0 MS |
1732 | struct list_head *node; |
1733 | kib_pool_t *pool; | |
1734 | int rc; | |
d7e09d03 PT |
1735 | |
1736 | again: | |
1737 | spin_lock(&ps->ps_lock); | |
1738 | list_for_each_entry(pool, &ps->ps_pool_list, po_list) { | |
1739 | if (list_empty(&pool->po_free_list)) | |
1740 | continue; | |
1741 | ||
74732797 | 1742 | pool->po_allocated++; |
d7e09d03 PT |
1743 | pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); |
1744 | node = pool->po_free_list.next; | |
1745 | list_del(node); | |
1746 | ||
1747 | if (ps->ps_node_init != NULL) { | |
1748 | /* still hold the lock */ | |
1749 | ps->ps_node_init(pool, node); | |
1750 | } | |
1751 | spin_unlock(&ps->ps_lock); | |
1752 | return node; | |
1753 | } | |
1754 | ||
1755 | /* no available tx pool and ... */ | |
1756 | if (ps->ps_increasing) { | |
1757 | /* another thread is allocating a new pool */ | |
1758 | spin_unlock(&ps->ps_lock); | |
2d00bd17 | 1759 | CDEBUG(D_NET, "Another thread is allocating new %s pool, waiting for her to complete\n", |
d7e09d03 PT |
1760 | ps->ps_name); |
1761 | schedule(); | |
1762 | goto again; | |
1763 | } | |
1764 | ||
699503bc | 1765 | if (time_before(cfs_time_current(), ps->ps_next_retry)) { |
d7e09d03 PT |
1766 | /* someone failed recently */ |
1767 | spin_unlock(&ps->ps_lock); | |
1768 | return NULL; | |
1769 | } | |
1770 | ||
1771 | ps->ps_increasing = 1; | |
1772 | spin_unlock(&ps->ps_lock); | |
1773 | ||
1774 | CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name); | |
1775 | ||
1776 | rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); | |
1777 | ||
1778 | spin_lock(&ps->ps_lock); | |
1779 | ps->ps_increasing = 0; | |
1780 | if (rc == 0) { | |
1781 | list_add_tail(&pool->po_list, &ps->ps_pool_list); | |
1782 | } else { | |
1783 | ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); | |
1784 | CERROR("Can't allocate new %s pool because out of memory\n", | |
1785 | ps->ps_name); | |
1786 | } | |
1787 | spin_unlock(&ps->ps_lock); | |
1788 | ||
1789 | goto again; | |
1790 | } | |
1791 | ||
febe73bd | 1792 | static void kiblnd_destroy_tx_pool(kib_pool_t *pool) |
d7e09d03 | 1793 | { |
ec3d17c0 MS |
1794 | kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool); |
1795 | int i; | |
d7e09d03 | 1796 | |
febe73bd | 1797 | LASSERT(pool->po_allocated == 0); |
d7e09d03 PT |
1798 | |
1799 | if (tpo->tpo_tx_pages != NULL) { | |
1800 | kiblnd_unmap_tx_pool(tpo); | |
1801 | kiblnd_free_pages(tpo->tpo_tx_pages); | |
1802 | } | |
1803 | ||
1804 | if (tpo->tpo_tx_descs == NULL) | |
1805 | goto out; | |
1806 | ||
1807 | for (i = 0; i < pool->po_size; i++) { | |
1808 | kib_tx_t *tx = &tpo->tpo_tx_descs[i]; | |
1809 | ||
1810 | list_del(&tx->tx_list); | |
1811 | if (tx->tx_pages != NULL) | |
1812 | LIBCFS_FREE(tx->tx_pages, | |
1813 | LNET_MAX_IOV * | |
1814 | sizeof(*tx->tx_pages)); | |
1815 | if (tx->tx_frags != NULL) | |
1816 | LIBCFS_FREE(tx->tx_frags, | |
1817 | IBLND_MAX_RDMA_FRAGS * | |
1818 | sizeof(*tx->tx_frags)); | |
1819 | if (tx->tx_wrq != NULL) | |
1820 | LIBCFS_FREE(tx->tx_wrq, | |
1821 | (1 + IBLND_MAX_RDMA_FRAGS) * | |
1822 | sizeof(*tx->tx_wrq)); | |
1823 | if (tx->tx_sge != NULL) | |
1824 | LIBCFS_FREE(tx->tx_sge, | |
1825 | (1 + IBLND_MAX_RDMA_FRAGS) * | |
1826 | sizeof(*tx->tx_sge)); | |
1827 | if (tx->tx_rd != NULL) | |
1828 | LIBCFS_FREE(tx->tx_rd, | |
1829 | offsetof(kib_rdma_desc_t, | |
1830 | rd_frags[IBLND_MAX_RDMA_FRAGS])); | |
1831 | } | |
1832 | ||
1833 | LIBCFS_FREE(tpo->tpo_tx_descs, | |
1834 | pool->po_size * sizeof(kib_tx_t)); | |
1835 | out: | |
1836 | kiblnd_fini_pool(pool); | |
1837 | LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); | |
1838 | } | |
1839 | ||
1840 | static int kiblnd_tx_pool_size(int ncpts) | |
1841 | { | |
1842 | int ntx = *kiblnd_tunables.kib_ntx / ncpts; | |
1843 | ||
1844 | return max(IBLND_TX_POOL, ntx); | |
1845 | } | |
1846 | ||
febe73bd GM |
1847 | static int kiblnd_create_tx_pool(kib_poolset_t *ps, int size, |
1848 | kib_pool_t **pp_po) | |
d7e09d03 | 1849 | { |
ec3d17c0 MS |
1850 | int i; |
1851 | int npg; | |
1852 | kib_pool_t *pool; | |
d7e09d03 PT |
1853 | kib_tx_pool_t *tpo; |
1854 | ||
1855 | LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo)); | |
1856 | if (tpo == NULL) { | |
1857 | CERROR("Failed to allocate TX pool\n"); | |
1858 | return -ENOMEM; | |
1859 | } | |
1860 | ||
1861 | pool = &tpo->tpo_pool; | |
1862 | kiblnd_init_pool(ps, pool, size); | |
1863 | tpo->tpo_tx_descs = NULL; | |
1864 | tpo->tpo_tx_pages = NULL; | |
1865 | ||
1866 | npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE; | |
1867 | if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) { | |
1868 | CERROR("Can't allocate tx pages: %d\n", npg); | |
1869 | LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); | |
1870 | return -ENOMEM; | |
1871 | } | |
1872 | ||
1873 | LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt, | |
1874 | size * sizeof(kib_tx_t)); | |
1875 | if (tpo->tpo_tx_descs == NULL) { | |
1876 | CERROR("Can't allocate %d tx descriptors\n", size); | |
1877 | ps->ps_pool_destroy(pool); | |
1878 | return -ENOMEM; | |
1879 | } | |
1880 | ||
1881 | memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t)); | |
1882 | ||
1883 | for (i = 0; i < size; i++) { | |
1884 | kib_tx_t *tx = &tpo->tpo_tx_descs[i]; | |
1885 | ||
1886 | tx->tx_pool = tpo; | |
1887 | if (ps->ps_net->ibn_fmr_ps != NULL) { | |
1888 | LIBCFS_CPT_ALLOC(tx->tx_pages, | |
1889 | lnet_cpt_table(), ps->ps_cpt, | |
1890 | LNET_MAX_IOV * sizeof(*tx->tx_pages)); | |
1891 | if (tx->tx_pages == NULL) | |
1892 | break; | |
1893 | } | |
1894 | ||
1895 | LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt, | |
1896 | IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags)); | |
1897 | if (tx->tx_frags == NULL) | |
1898 | break; | |
1899 | ||
1900 | sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS); | |
1901 | ||
1902 | LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt, | |
1903 | (1 + IBLND_MAX_RDMA_FRAGS) * | |
1904 | sizeof(*tx->tx_wrq)); | |
1905 | if (tx->tx_wrq == NULL) | |
1906 | break; | |
1907 | ||
1908 | LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt, | |
1909 | (1 + IBLND_MAX_RDMA_FRAGS) * | |
1910 | sizeof(*tx->tx_sge)); | |
1911 | if (tx->tx_sge == NULL) | |
1912 | break; | |
1913 | ||
1914 | LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt, | |
1915 | offsetof(kib_rdma_desc_t, | |
1916 | rd_frags[IBLND_MAX_RDMA_FRAGS])); | |
1917 | if (tx->tx_rd == NULL) | |
1918 | break; | |
1919 | } | |
1920 | ||
1921 | if (i == size) { | |
1922 | kiblnd_map_tx_pool(tpo); | |
1923 | *pp_po = pool; | |
1924 | return 0; | |
1925 | } | |
1926 | ||
1927 | ps->ps_pool_destroy(pool); | |
1928 | return -ENOMEM; | |
1929 | } | |
1930 | ||
febe73bd | 1931 | static void kiblnd_tx_init(kib_pool_t *pool, struct list_head *node) |
d7e09d03 PT |
1932 | { |
1933 | kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t, | |
1934 | tps_poolset); | |
ec3d17c0 | 1935 | kib_tx_t *tx = list_entry(node, kib_tx_t, tx_list); |
d7e09d03 | 1936 | |
74732797 | 1937 | tx->tx_cookie = tps->tps_next_tx_cookie++; |
d7e09d03 PT |
1938 | } |
1939 | ||
febe73bd | 1940 | static void kiblnd_net_fini_pools(kib_net_t *net) |
d7e09d03 | 1941 | { |
ec3d17c0 | 1942 | int i; |
d7e09d03 PT |
1943 | |
1944 | cfs_cpt_for_each(i, lnet_cpt_table()) { | |
ec3d17c0 MS |
1945 | kib_tx_poolset_t *tps; |
1946 | kib_fmr_poolset_t *fps; | |
d7e09d03 PT |
1947 | |
1948 | if (net->ibn_tx_ps != NULL) { | |
1949 | tps = net->ibn_tx_ps[i]; | |
1950 | kiblnd_fini_poolset(&tps->tps_poolset); | |
1951 | } | |
1952 | ||
1953 | if (net->ibn_fmr_ps != NULL) { | |
1954 | fps = net->ibn_fmr_ps[i]; | |
1955 | kiblnd_fini_fmr_poolset(fps); | |
1956 | } | |
d7e09d03 PT |
1957 | } |
1958 | ||
1959 | if (net->ibn_tx_ps != NULL) { | |
1960 | cfs_percpt_free(net->ibn_tx_ps); | |
1961 | net->ibn_tx_ps = NULL; | |
1962 | } | |
1963 | ||
1964 | if (net->ibn_fmr_ps != NULL) { | |
1965 | cfs_percpt_free(net->ibn_fmr_ps); | |
1966 | net->ibn_fmr_ps = NULL; | |
1967 | } | |
d7e09d03 PT |
1968 | } |
1969 | ||
febe73bd | 1970 | static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts) |
d7e09d03 | 1971 | { |
ec3d17c0 MS |
1972 | unsigned long flags; |
1973 | int cpt; | |
a6970317 | 1974 | int rc = 0; |
ec3d17c0 | 1975 | int i; |
d7e09d03 PT |
1976 | |
1977 | read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
1978 | if (*kiblnd_tunables.kib_map_on_demand == 0 && | |
1979 | net->ibn_dev->ibd_hdev->ibh_nmrs == 1) { | |
ec3d17c0 | 1980 | read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); |
d7e09d03 PT |
1981 | goto create_tx_pool; |
1982 | } | |
1983 | ||
1984 | read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
1985 | ||
1986 | if (*kiblnd_tunables.kib_fmr_pool_size < | |
1987 | *kiblnd_tunables.kib_ntx / 4) { | |
1988 | CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n", | |
1989 | *kiblnd_tunables.kib_fmr_pool_size, | |
1990 | *kiblnd_tunables.kib_ntx / 4); | |
1991 | rc = -EINVAL; | |
1992 | goto failed; | |
1993 | } | |
1994 | ||
a6970317 OD |
1995 | /* |
1996 | * TX pool must be created later than FMR, see LU-2268 | |
1997 | * for details | |
1998 | */ | |
d7e09d03 PT |
1999 | LASSERT(net->ibn_tx_ps == NULL); |
2000 | ||
a6970317 OD |
2001 | /* |
2002 | * premapping can fail if ibd_nmr > 1, so we always create | |
2003 | * FMR pool and map-on-demand if premapping failed | |
2004 | */ | |
d7e09d03 PT |
2005 | |
2006 | net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), | |
2007 | sizeof(kib_fmr_poolset_t)); | |
2008 | if (net->ibn_fmr_ps == NULL) { | |
2009 | CERROR("Failed to allocate FMR pool array\n"); | |
2010 | rc = -ENOMEM; | |
2011 | goto failed; | |
2012 | } | |
2013 | ||
2014 | for (i = 0; i < ncpts; i++) { | |
2015 | cpt = (cpts == NULL) ? i : cpts[i]; | |
2016 | rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net, | |
2017 | kiblnd_fmr_pool_size(ncpts), | |
2018 | kiblnd_fmr_flush_trigger(ncpts)); | |
2019 | if (rc == -ENOSYS && i == 0) /* no FMR */ | |
a6970317 | 2020 | break; |
d7e09d03 PT |
2021 | |
2022 | if (rc != 0) { /* a real error */ | |
2023 | CERROR("Can't initialize FMR pool for CPT %d: %d\n", | |
2024 | cpt, rc); | |
2025 | goto failed; | |
2026 | } | |
2027 | } | |
2028 | ||
2029 | if (i > 0) { | |
2030 | LASSERT(i == ncpts); | |
2031 | goto create_tx_pool; | |
2032 | } | |
2033 | ||
2034 | cfs_percpt_free(net->ibn_fmr_ps); | |
2035 | net->ibn_fmr_ps = NULL; | |
2036 | ||
a6970317 | 2037 | CWARN("Device does not support FMR\n"); |
d7e09d03 | 2038 | goto failed; |
d7e09d03 PT |
2039 | |
2040 | create_tx_pool: | |
2041 | net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), | |
2042 | sizeof(kib_tx_poolset_t)); | |
2043 | if (net->ibn_tx_ps == NULL) { | |
2044 | CERROR("Failed to allocate tx pool array\n"); | |
2045 | rc = -ENOMEM; | |
2046 | goto failed; | |
2047 | } | |
2048 | ||
2049 | for (i = 0; i < ncpts; i++) { | |
2050 | cpt = (cpts == NULL) ? i : cpts[i]; | |
2051 | rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset, | |
2052 | cpt, net, "TX", | |
2053 | kiblnd_tx_pool_size(ncpts), | |
2054 | kiblnd_create_tx_pool, | |
2055 | kiblnd_destroy_tx_pool, | |
2056 | kiblnd_tx_init, NULL); | |
2057 | if (rc != 0) { | |
2058 | CERROR("Can't initialize TX pool for CPT %d: %d\n", | |
2059 | cpt, rc); | |
2060 | goto failed; | |
2061 | } | |
2062 | } | |
2063 | ||
2064 | return 0; | |
2065 | failed: | |
2066 | kiblnd_net_fini_pools(net); | |
2067 | LASSERT(rc != 0); | |
2068 | return rc; | |
2069 | } | |
2070 | ||
febe73bd | 2071 | static int kiblnd_hdev_get_attr(kib_hca_dev_t *hdev) |
d7e09d03 PT |
2072 | { |
2073 | struct ib_device_attr *attr; | |
ec3d17c0 | 2074 | int rc; |
d7e09d03 PT |
2075 | |
2076 | /* It's safe to assume a HCA can handle a page size | |
2077 | * matching that of the native system */ | |
2078 | hdev->ibh_page_shift = PAGE_SHIFT; | |
2079 | hdev->ibh_page_size = 1 << PAGE_SHIFT; | |
2080 | hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); | |
2081 | ||
2082 | LIBCFS_ALLOC(attr, sizeof(*attr)); | |
2083 | if (attr == NULL) { | |
2084 | CERROR("Out of memory\n"); | |
2085 | return -ENOMEM; | |
2086 | } | |
2087 | ||
2088 | rc = ib_query_device(hdev->ibh_ibdev, attr); | |
2089 | if (rc == 0) | |
2090 | hdev->ibh_mr_size = attr->max_mr_size; | |
2091 | ||
2092 | LIBCFS_FREE(attr, sizeof(*attr)); | |
2093 | ||
2094 | if (rc != 0) { | |
2095 | CERROR("Failed to query IB device: %d\n", rc); | |
2096 | return rc; | |
2097 | } | |
2098 | ||
2099 | if (hdev->ibh_mr_size == ~0ULL) { | |
2100 | hdev->ibh_mr_shift = 64; | |
2101 | return 0; | |
2102 | } | |
2103 | ||
2104 | for (hdev->ibh_mr_shift = 0; | |
74732797 | 2105 | hdev->ibh_mr_shift < 64; hdev->ibh_mr_shift++) { |
d7e09d03 PT |
2106 | if (hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) || |
2107 | hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) - 1) | |
2108 | return 0; | |
2109 | } | |
2110 | ||
55f5a824 | 2111 | CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); |
d7e09d03 PT |
2112 | return -EINVAL; |
2113 | } | |
2114 | ||
febe73bd | 2115 | static void kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev) |
d7e09d03 | 2116 | { |
ec3d17c0 | 2117 | int i; |
d7e09d03 PT |
2118 | |
2119 | if (hdev->ibh_nmrs == 0 || hdev->ibh_mrs == NULL) | |
2120 | return; | |
2121 | ||
2122 | for (i = 0; i < hdev->ibh_nmrs; i++) { | |
2123 | if (hdev->ibh_mrs[i] == NULL) | |
2124 | break; | |
2125 | ||
2126 | ib_dereg_mr(hdev->ibh_mrs[i]); | |
2127 | } | |
2128 | ||
2129 | LIBCFS_FREE(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs); | |
2130 | hdev->ibh_mrs = NULL; | |
2131 | hdev->ibh_nmrs = 0; | |
2132 | } | |
2133 | ||
febe73bd | 2134 | void kiblnd_hdev_destroy(kib_hca_dev_t *hdev) |
d7e09d03 PT |
2135 | { |
2136 | kiblnd_hdev_cleanup_mrs(hdev); | |
2137 | ||
2138 | if (hdev->ibh_pd != NULL) | |
2139 | ib_dealloc_pd(hdev->ibh_pd); | |
2140 | ||
2141 | if (hdev->ibh_cmid != NULL) | |
2142 | rdma_destroy_id(hdev->ibh_cmid); | |
2143 | ||
2144 | LIBCFS_FREE(hdev, sizeof(*hdev)); | |
2145 | } | |
2146 | ||
febe73bd | 2147 | static int kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev) |
d7e09d03 PT |
2148 | { |
2149 | struct ib_mr *mr; | |
ec3d17c0 | 2150 | int rc; |
ec3d17c0 | 2151 | int acflags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE; |
d7e09d03 PT |
2152 | |
2153 | rc = kiblnd_hdev_get_attr(hdev); | |
2154 | if (rc != 0) | |
2155 | return rc; | |
2156 | ||
d7e09d03 PT |
2157 | LIBCFS_ALLOC(hdev->ibh_mrs, 1 * sizeof(*hdev->ibh_mrs)); |
2158 | if (hdev->ibh_mrs == NULL) { | |
2159 | CERROR("Failed to allocate MRs table\n"); | |
2160 | return -ENOMEM; | |
2161 | } | |
2162 | ||
2163 | hdev->ibh_mrs[0] = NULL; | |
2164 | hdev->ibh_nmrs = 1; | |
2165 | ||
2166 | mr = ib_get_dma_mr(hdev->ibh_pd, acflags); | |
2167 | if (IS_ERR(mr)) { | |
2168 | CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr)); | |
2169 | kiblnd_hdev_cleanup_mrs(hdev); | |
2170 | return PTR_ERR(mr); | |
2171 | } | |
2172 | ||
2173 | hdev->ibh_mrs[0] = mr; | |
2174 | ||
d7e09d03 PT |
2175 | return 0; |
2176 | } | |
2177 | ||
/*
 * No-op CM event handler: ignores @cmid and @event and always reports
 * success.  Used for the throw-away cmid created by
 * kiblnd_dev_need_failover().
 */
static int kiblnd_dummy_callback(struct rdma_cm_id *cmid,
				 struct rdma_cm_event *event)
{
	return 0;
}
2184 | ||
febe73bd | 2185 | static int kiblnd_dev_need_failover(kib_dev_t *dev) |
d7e09d03 | 2186 | { |
ec3d17c0 MS |
2187 | struct rdma_cm_id *cmid; |
2188 | struct sockaddr_in srcaddr; | |
2189 | struct sockaddr_in dstaddr; | |
2190 | int rc; | |
d7e09d03 PT |
2191 | |
2192 | if (dev->ibd_hdev == NULL || /* initializing */ | |
2193 | dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */ | |
2194 | *kiblnd_tunables.kib_dev_failover > 1) /* debugging */ | |
2195 | return 1; | |
2196 | ||
2197 | /* XXX: it's UGLY, but I don't have better way to find | |
2198 | * ib-bonding HCA failover because: | |
2199 | * | |
2200 | * a. no reliable CM event for HCA failover... | |
2201 | * b. no OFED API to get ib_device for current net_device... | |
2202 | * | |
2203 | * We have only two choices at this point: | |
2204 | * | |
2205 | * a. rdma_bind_addr(), it will conflict with listener cmid | |
2206 | * b. rdma_resolve_addr() to zero addr */ | |
2207 | cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP, | |
2208 | IB_QPT_RC); | |
2209 | if (IS_ERR(cmid)) { | |
2210 | rc = PTR_ERR(cmid); | |
2211 | CERROR("Failed to create cmid for failover: %d\n", rc); | |
2212 | return rc; | |
2213 | } | |
2214 | ||
2215 | memset(&srcaddr, 0, sizeof(srcaddr)); | |
ec3d17c0 | 2216 | srcaddr.sin_family = AF_INET; |
d7e09d03 PT |
2217 | srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); |
2218 | ||
2219 | memset(&dstaddr, 0, sizeof(dstaddr)); | |
2220 | dstaddr.sin_family = AF_INET; | |
2221 | rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr, | |
2222 | (struct sockaddr *)&dstaddr, 1); | |
2223 | if (rc != 0 || cmid->device == NULL) { | |
5e8f6920 PT |
2224 | CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", |
2225 | dev->ibd_ifname, &dev->ibd_ifip, | |
d7e09d03 PT |
2226 | cmid->device, rc); |
2227 | rdma_destroy_id(cmid); | |
2228 | return rc; | |
2229 | } | |
2230 | ||
2231 | if (dev->ibd_hdev->ibh_ibdev == cmid->device) { | |
2232 | /* don't need device failover */ | |
2233 | rdma_destroy_id(cmid); | |
2234 | return 0; | |
2235 | } | |
2236 | ||
2237 | return 1; | |
2238 | } | |
2239 | ||
febe73bd | 2240 | int kiblnd_dev_failover(kib_dev_t *dev) |
d7e09d03 | 2241 | { |
febe73bd GM |
2242 | LIST_HEAD(zombie_tpo); |
2243 | LIST_HEAD(zombie_ppo); | |
2244 | LIST_HEAD(zombie_fpo); | |
ec3d17c0 MS |
2245 | struct rdma_cm_id *cmid = NULL; |
2246 | kib_hca_dev_t *hdev = NULL; | |
ec3d17c0 MS |
2247 | struct ib_pd *pd; |
2248 | kib_net_t *net; | |
2249 | struct sockaddr_in addr; | |
2250 | unsigned long flags; | |
2251 | int rc = 0; | |
2252 | int i; | |
d7e09d03 | 2253 | |
febe73bd | 2254 | LASSERT(*kiblnd_tunables.kib_dev_failover > 1 || |
d7e09d03 PT |
2255 | dev->ibd_can_failover || |
2256 | dev->ibd_hdev == NULL); | |
2257 | ||
2258 | rc = kiblnd_dev_need_failover(dev); | |
2259 | if (rc <= 0) | |
2260 | goto out; | |
2261 | ||
2262 | if (dev->ibd_hdev != NULL && | |
2263 | dev->ibd_hdev->ibh_cmid != NULL) { | |
2264 | /* XXX it's not good to close old listener at here, | |
2265 | * because we can fail to create new listener. | |
2266 | * But we have to close it now, otherwise rdma_bind_addr | |
2267 | * will return EADDRINUSE... How crap! */ | |
2268 | write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
2269 | ||
2270 | cmid = dev->ibd_hdev->ibh_cmid; | |
2271 | /* make next schedule of kiblnd_dev_need_failover() | |
2272 | * return 1 for me */ | |
2273 | dev->ibd_hdev->ibh_cmid = NULL; | |
2274 | write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
2275 | ||
2276 | rdma_destroy_id(cmid); | |
2277 | } | |
2278 | ||
2279 | cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP, | |
2280 | IB_QPT_RC); | |
2281 | if (IS_ERR(cmid)) { | |
2282 | rc = PTR_ERR(cmid); | |
2283 | CERROR("Failed to create cmid for failover: %d\n", rc); | |
2284 | goto out; | |
2285 | } | |
2286 | ||
2287 | memset(&addr, 0, sizeof(addr)); | |
2288 | addr.sin_family = AF_INET; | |
2289 | addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); | |
2290 | addr.sin_port = htons(*kiblnd_tunables.kib_service); | |
2291 | ||
2292 | /* Bind to failover device or port */ | |
2293 | rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr); | |
2294 | if (rc != 0 || cmid->device == NULL) { | |
5e8f6920 PT |
2295 | CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", |
2296 | dev->ibd_ifname, &dev->ibd_ifip, | |
d7e09d03 PT |
2297 | cmid->device, rc); |
2298 | rdma_destroy_id(cmid); | |
2299 | goto out; | |
2300 | } | |
2301 | ||
2302 | LIBCFS_ALLOC(hdev, sizeof(*hdev)); | |
2303 | if (hdev == NULL) { | |
2304 | CERROR("Failed to allocate kib_hca_dev\n"); | |
2305 | rdma_destroy_id(cmid); | |
2306 | rc = -ENOMEM; | |
2307 | goto out; | |
2308 | } | |
2309 | ||
2310 | atomic_set(&hdev->ibh_ref, 1); | |
2311 | hdev->ibh_dev = dev; | |
2312 | hdev->ibh_cmid = cmid; | |
2313 | hdev->ibh_ibdev = cmid->device; | |
2314 | ||
2315 | pd = ib_alloc_pd(cmid->device); | |
2316 | if (IS_ERR(pd)) { | |
2317 | rc = PTR_ERR(pd); | |
2318 | CERROR("Can't allocate PD: %d\n", rc); | |
2319 | goto out; | |
2320 | } | |
2321 | ||
2322 | hdev->ibh_pd = pd; | |
2323 | ||
2324 | rc = rdma_listen(cmid, 0); | |
2325 | if (rc != 0) { | |
2326 | CERROR("Can't start new listener: %d\n", rc); | |
2327 | goto out; | |
2328 | } | |
2329 | ||
2330 | rc = kiblnd_hdev_setup_mrs(hdev); | |
2331 | if (rc != 0) { | |
2332 | CERROR("Can't setup device: %d\n", rc); | |
2333 | goto out; | |
2334 | } | |
2335 | ||
2336 | write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
2337 | ||
6d37b171 | 2338 | swap(dev->ibd_hdev, hdev); /* take over the refcount */ |
d7e09d03 PT |
2339 | |
2340 | list_for_each_entry(net, &dev->ibd_nets, ibn_list) { | |
2341 | cfs_cpt_for_each(i, lnet_cpt_table()) { | |
2342 | kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset, | |
2343 | &zombie_tpo); | |
2344 | ||
a6970317 | 2345 | if (net->ibn_fmr_ps) |
d7e09d03 PT |
2346 | kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i], |
2347 | &zombie_fpo); | |
d7e09d03 PT |
2348 | } |
2349 | } | |
2350 | ||
2351 | write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
2352 | out: | |
2353 | if (!list_empty(&zombie_tpo)) | |
2354 | kiblnd_destroy_pool_list(&zombie_tpo); | |
2355 | if (!list_empty(&zombie_ppo)) | |
2356 | kiblnd_destroy_pool_list(&zombie_ppo); | |
2357 | if (!list_empty(&zombie_fpo)) | |
2358 | kiblnd_destroy_fmr_pool_list(&zombie_fpo); | |
2359 | if (hdev != NULL) | |
2360 | kiblnd_hdev_decref(hdev); | |
2361 | ||
2362 | if (rc != 0) | |
2363 | dev->ibd_failed_failover++; | |
2364 | else | |
2365 | dev->ibd_failed_failover = 0; | |
2366 | ||
2367 | return rc; | |
2368 | } | |
2369 | ||
febe73bd | 2370 | void kiblnd_destroy_dev(kib_dev_t *dev) |
d7e09d03 | 2371 | { |
febe73bd GM |
2372 | LASSERT(dev->ibd_nnets == 0); |
2373 | LASSERT(list_empty(&dev->ibd_nets)); | |
d7e09d03 PT |
2374 | |
2375 | list_del(&dev->ibd_fail_list); | |
2376 | list_del(&dev->ibd_list); | |
2377 | ||
2378 | if (dev->ibd_hdev != NULL) | |
2379 | kiblnd_hdev_decref(dev->ibd_hdev); | |
2380 | ||
2381 | LIBCFS_FREE(dev, sizeof(*dev)); | |
2382 | } | |
2383 | ||
febe73bd | 2384 | static kib_dev_t *kiblnd_create_dev(char *ifname) |
d7e09d03 PT |
2385 | { |
2386 | struct net_device *netdev; | |
ec3d17c0 MS |
2387 | kib_dev_t *dev; |
2388 | __u32 netmask; | |
2389 | __u32 ip; | |
2390 | int up; | |
2391 | int rc; | |
d7e09d03 | 2392 | |
1ad6a73e | 2393 | rc = lnet_ipif_query(ifname, &up, &ip, &netmask); |
d7e09d03 PT |
2394 | if (rc != 0) { |
2395 | CERROR("Can't query IPoIB interface %s: %d\n", | |
2396 | ifname, rc); | |
2397 | return NULL; | |
2398 | } | |
2399 | ||
2400 | if (!up) { | |
2401 | CERROR("Can't query IPoIB interface %s: it's down\n", ifname); | |
2402 | return NULL; | |
2403 | } | |
2404 | ||
2405 | LIBCFS_ALLOC(dev, sizeof(*dev)); | |
2406 | if (dev == NULL) | |
2407 | return NULL; | |
2408 | ||
d7e09d03 PT |
2409 | netdev = dev_get_by_name(&init_net, ifname); |
2410 | if (netdev == NULL) { | |
2411 | dev->ibd_can_failover = 0; | |
2412 | } else { | |
2413 | dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER); | |
2414 | dev_put(netdev); | |
2415 | } | |
2416 | ||
2417 | INIT_LIST_HEAD(&dev->ibd_nets); | |
2418 | INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */ | |
2419 | INIT_LIST_HEAD(&dev->ibd_fail_list); | |
2420 | dev->ibd_ifip = ip; | |
2421 | strcpy(&dev->ibd_ifname[0], ifname); | |
2422 | ||
2423 | /* initialize the device */ | |
2424 | rc = kiblnd_dev_failover(dev); | |
2425 | if (rc != 0) { | |
2426 | CERROR("Can't initialize device: %d\n", rc); | |
2427 | LIBCFS_FREE(dev, sizeof(*dev)); | |
2428 | return NULL; | |
2429 | } | |
2430 | ||
2431 | list_add_tail(&dev->ibd_list, | |
2432 | &kiblnd_data.kib_devs); | |
2433 | return dev; | |
2434 | } | |
2435 | ||
febe73bd | 2436 | static void kiblnd_base_shutdown(void) |
d7e09d03 | 2437 | { |
ec3d17c0 MS |
2438 | struct kib_sched_info *sched; |
2439 | int i; | |
d7e09d03 | 2440 | |
febe73bd | 2441 | LASSERT(list_empty(&kiblnd_data.kib_devs)); |
d7e09d03 PT |
2442 | |
2443 | CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n", | |
2444 | atomic_read(&libcfs_kmemory)); | |
2445 | ||
2446 | switch (kiblnd_data.kib_init) { | |
2447 | default: | |
2448 | LBUG(); | |
2449 | ||
2450 | case IBLND_INIT_ALL: | |
2451 | case IBLND_INIT_DATA: | |
febe73bd | 2452 | LASSERT(kiblnd_data.kib_peers != NULL); |
7a3888a3 | 2453 | for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) |
febe73bd | 2454 | LASSERT(list_empty(&kiblnd_data.kib_peers[i])); |
febe73bd GM |
2455 | LASSERT(list_empty(&kiblnd_data.kib_connd_zombies)); |
2456 | LASSERT(list_empty(&kiblnd_data.kib_connd_conns)); | |
d7e09d03 PT |
2457 | |
2458 | /* flag threads to terminate; wake and wait for them to die */ | |
2459 | kiblnd_data.kib_shutdown = 1; | |
2460 | ||
2461 | /* NB: we really want to stop scheduler threads net by net | |
2462 | * instead of the whole module, this should be improved | |
2463 | * with dynamic configuration LNet */ | |
2464 | cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) | |
2465 | wake_up_all(&sched->ibs_waitq); | |
2466 | ||
2467 | wake_up_all(&kiblnd_data.kib_connd_waitq); | |
2468 | wake_up_all(&kiblnd_data.kib_failover_waitq); | |
2469 | ||
2470 | i = 2; | |
2471 | while (atomic_read(&kiblnd_data.kib_nthreads) != 0) { | |
2472 | i++; | |
7a3888a3 GM |
2473 | /* power of 2 ? */ |
2474 | CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, | |
d7e09d03 PT |
2475 | "Waiting for %d threads to terminate\n", |
2476 | atomic_read(&kiblnd_data.kib_nthreads)); | |
d3caf4d5 PT |
2477 | set_current_state(TASK_UNINTERRUPTIBLE); |
2478 | schedule_timeout(cfs_time_seconds(1)); | |
d7e09d03 PT |
2479 | } |
2480 | ||
2481 | /* fall through */ | |
2482 | ||
2483 | case IBLND_INIT_NOTHING: | |
2484 | break; | |
2485 | } | |
2486 | ||
2487 | if (kiblnd_data.kib_peers != NULL) { | |
2488 | LIBCFS_FREE(kiblnd_data.kib_peers, | |
2489 | sizeof(struct list_head) * | |
2490 | kiblnd_data.kib_peer_hash_size); | |
2491 | } | |
2492 | ||
2493 | if (kiblnd_data.kib_scheds != NULL) | |
2494 | cfs_percpt_free(kiblnd_data.kib_scheds); | |
2495 | ||
2496 | CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n", | |
2497 | atomic_read(&libcfs_kmemory)); | |
2498 | ||
2499 | kiblnd_data.kib_init = IBLND_INIT_NOTHING; | |
2500 | module_put(THIS_MODULE); | |
2501 | } | |
2502 | ||
febe73bd | 2503 | void kiblnd_shutdown(lnet_ni_t *ni) |
d7e09d03 | 2504 | { |
ec3d17c0 MS |
2505 | kib_net_t *net = ni->ni_data; |
2506 | rwlock_t *g_lock = &kiblnd_data.kib_global_lock; | |
2507 | int i; | |
2508 | unsigned long flags; | |
d7e09d03 PT |
2509 | |
2510 | LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL); | |
2511 | ||
2512 | if (net == NULL) | |
2513 | goto out; | |
2514 | ||
2515 | CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n", | |
2516 | atomic_read(&libcfs_kmemory)); | |
2517 | ||
2518 | write_lock_irqsave(g_lock, flags); | |
2519 | net->ibn_shutdown = 1; | |
2520 | write_unlock_irqrestore(g_lock, flags); | |
2521 | ||
2522 | switch (net->ibn_init) { | |
2523 | default: | |
2524 | LBUG(); | |
2525 | ||
2526 | case IBLND_INIT_ALL: | |
2527 | /* nuke all existing peers within this net */ | |
2528 | kiblnd_del_peer(ni, LNET_NID_ANY); | |
2529 | ||
2530 | /* Wait for all peer state to clean up */ | |
2531 | i = 2; | |
2532 | while (atomic_read(&net->ibn_npeers) != 0) { | |
2533 | i++; | |
2534 | CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */ | |
2535 | "%s: waiting for %d peers to disconnect\n", | |
2536 | libcfs_nid2str(ni->ni_nid), | |
2537 | atomic_read(&net->ibn_npeers)); | |
d3caf4d5 PT |
2538 | set_current_state(TASK_UNINTERRUPTIBLE); |
2539 | schedule_timeout(cfs_time_seconds(1)); | |
d7e09d03 PT |
2540 | } |
2541 | ||
2542 | kiblnd_net_fini_pools(net); | |
2543 | ||
2544 | write_lock_irqsave(g_lock, flags); | |
2545 | LASSERT(net->ibn_dev->ibd_nnets > 0); | |
2546 | net->ibn_dev->ibd_nnets--; | |
2547 | list_del(&net->ibn_list); | |
2548 | write_unlock_irqrestore(g_lock, flags); | |
2549 | ||
2550 | /* fall through */ | |
2551 | ||
2552 | case IBLND_INIT_NOTHING: | |
febe73bd | 2553 | LASSERT(atomic_read(&net->ibn_nconns) == 0); |
d7e09d03 PT |
2554 | |
2555 | if (net->ibn_dev != NULL && | |
2556 | net->ibn_dev->ibd_nnets == 0) | |
2557 | kiblnd_destroy_dev(net->ibn_dev); | |
2558 | ||
2559 | break; | |
2560 | } | |
2561 | ||
2562 | CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n", | |
2563 | atomic_read(&libcfs_kmemory)); | |
2564 | ||
2565 | net->ibn_init = IBLND_INIT_NOTHING; | |
2566 | ni->ni_data = NULL; | |
2567 | ||
2568 | LIBCFS_FREE(net, sizeof(*net)); | |
2569 | ||
2570 | out: | |
2571 | if (list_empty(&kiblnd_data.kib_devs)) | |
2572 | kiblnd_base_shutdown(); | |
d7e09d03 PT |
2573 | } |
2574 | ||
febe73bd | 2575 | static int kiblnd_base_startup(void) |
d7e09d03 | 2576 | { |
ec3d17c0 MS |
2577 | struct kib_sched_info *sched; |
2578 | int rc; | |
2579 | int i; | |
d7e09d03 | 2580 | |
febe73bd | 2581 | LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING); |
d7e09d03 PT |
2582 | |
2583 | try_module_get(THIS_MODULE); | |
7a3888a3 GM |
2584 | /* zero pointers, flags etc */ |
2585 | memset(&kiblnd_data, 0, sizeof(kiblnd_data)); | |
d7e09d03 PT |
2586 | |
2587 | rwlock_init(&kiblnd_data.kib_global_lock); | |
2588 | ||
2589 | INIT_LIST_HEAD(&kiblnd_data.kib_devs); | |
2590 | INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs); | |
2591 | ||
2592 | kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE; | |
2593 | LIBCFS_ALLOC(kiblnd_data.kib_peers, | |
ec3d17c0 | 2594 | sizeof(struct list_head) * kiblnd_data.kib_peer_hash_size); |
7a3888a3 | 2595 | if (kiblnd_data.kib_peers == NULL) |
d7e09d03 | 2596 | goto failed; |
d7e09d03 PT |
2597 | for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) |
2598 | INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]); | |
2599 | ||
2600 | spin_lock_init(&kiblnd_data.kib_connd_lock); | |
2601 | INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); | |
2602 | INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); | |
2603 | init_waitqueue_head(&kiblnd_data.kib_connd_waitq); | |
2604 | init_waitqueue_head(&kiblnd_data.kib_failover_waitq); | |
2605 | ||
2606 | kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(), | |
2607 | sizeof(*sched)); | |
2608 | if (kiblnd_data.kib_scheds == NULL) | |
2609 | goto failed; | |
2610 | ||
2611 | cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { | |
ec3d17c0 | 2612 | int nthrs; |
d7e09d03 PT |
2613 | |
2614 | spin_lock_init(&sched->ibs_lock); | |
2615 | INIT_LIST_HEAD(&sched->ibs_conns); | |
2616 | init_waitqueue_head(&sched->ibs_waitq); | |
2617 | ||
2618 | nthrs = cfs_cpt_weight(lnet_cpt_table(), i); | |
2619 | if (*kiblnd_tunables.kib_nscheds > 0) { | |
2620 | nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds); | |
2621 | } else { | |
2622 | /* max to half of CPUs, another half is reserved for | |
2623 | * upper layer modules */ | |
2624 | nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); | |
2625 | } | |
2626 | ||
2627 | sched->ibs_nthreads_max = nthrs; | |
2628 | sched->ibs_cpt = i; | |
2629 | } | |
2630 | ||
2631 | kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR; | |
2632 | ||
2633 | /* lists/ptrs/locks initialised */ | |
2634 | kiblnd_data.kib_init = IBLND_INIT_DATA; | |
2635 | /*****************************************************/ | |
2636 | ||
2637 | rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd"); | |
2638 | if (rc != 0) { | |
2639 | CERROR("Can't spawn o2iblnd connd: %d\n", rc); | |
2640 | goto failed; | |
2641 | } | |
2642 | ||
2643 | if (*kiblnd_tunables.kib_dev_failover != 0) | |
2644 | rc = kiblnd_thread_start(kiblnd_failover_thread, NULL, | |
2645 | "kiblnd_failover"); | |
2646 | ||
2647 | if (rc != 0) { | |
2648 | CERROR("Can't spawn o2iblnd failover thread: %d\n", rc); | |
2649 | goto failed; | |
2650 | } | |
2651 | ||
2652 | /* flag everything initialised */ | |
2653 | kiblnd_data.kib_init = IBLND_INIT_ALL; | |
2654 | /*****************************************************/ | |
2655 | ||
2656 | return 0; | |
2657 | ||
2658 | failed: | |
2659 | kiblnd_base_shutdown(); | |
2660 | return -ENETDOWN; | |
2661 | } | |
2662 | ||
febe73bd | 2663 | static int kiblnd_start_schedulers(struct kib_sched_info *sched) |
d7e09d03 | 2664 | { |
ec3d17c0 MS |
2665 | int rc = 0; |
2666 | int nthrs; | |
2667 | int i; | |
d7e09d03 PT |
2668 | |
2669 | if (sched->ibs_nthreads == 0) { | |
2670 | if (*kiblnd_tunables.kib_nscheds > 0) { | |
2671 | nthrs = sched->ibs_nthreads_max; | |
2672 | } else { | |
2673 | nthrs = cfs_cpt_weight(lnet_cpt_table(), | |
2674 | sched->ibs_cpt); | |
2675 | nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); | |
2676 | nthrs = min(IBLND_N_SCHED_HIGH, nthrs); | |
2677 | } | |
2678 | } else { | |
2679 | LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max); | |
2680 | /* increase one thread if there is new interface */ | |
b6ee3824 | 2681 | nthrs = sched->ibs_nthreads < sched->ibs_nthreads_max; |
d7e09d03 PT |
2682 | } |
2683 | ||
2684 | for (i = 0; i < nthrs; i++) { | |
ec3d17c0 MS |
2685 | long id; |
2686 | char name[20]; | |
7a3888a3 | 2687 | |
d7e09d03 PT |
2688 | id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i); |
2689 | snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld", | |
2690 | KIB_THREAD_CPT(id), KIB_THREAD_TID(id)); | |
2691 | rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name); | |
2692 | if (rc == 0) | |
2693 | continue; | |
2694 | ||
2695 | CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", | |
2696 | sched->ibs_cpt, sched->ibs_nthreads + i, rc); | |
2697 | break; | |
2698 | } | |
2699 | ||
2700 | sched->ibs_nthreads += i; | |
2701 | return rc; | |
2702 | } | |
2703 | ||
7a3888a3 GM |
2704 | static int kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, |
2705 | int ncpts) | |
d7e09d03 | 2706 | { |
ec3d17c0 MS |
2707 | int cpt; |
2708 | int rc; | |
2709 | int i; | |
d7e09d03 PT |
2710 | |
2711 | for (i = 0; i < ncpts; i++) { | |
2712 | struct kib_sched_info *sched; | |
2713 | ||
2714 | cpt = (cpts == NULL) ? i : cpts[i]; | |
2715 | sched = kiblnd_data.kib_scheds[cpt]; | |
2716 | ||
2717 | if (!newdev && sched->ibs_nthreads > 0) | |
2718 | continue; | |
2719 | ||
2720 | rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); | |
2721 | if (rc != 0) { | |
2722 | CERROR("Failed to start scheduler threads for %s\n", | |
2723 | dev->ibd_ifname); | |
2724 | return rc; | |
2725 | } | |
2726 | } | |
2727 | return 0; | |
2728 | } | |
2729 | ||
febe73bd | 2730 | static kib_dev_t *kiblnd_dev_search(char *ifname) |
d7e09d03 | 2731 | { |
ec3d17c0 MS |
2732 | kib_dev_t *alias = NULL; |
2733 | kib_dev_t *dev; | |
2734 | char *colon; | |
2735 | char *colon2; | |
d7e09d03 PT |
2736 | |
2737 | colon = strchr(ifname, ':'); | |
2738 | list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { | |
2739 | if (strcmp(&dev->ibd_ifname[0], ifname) == 0) | |
2740 | return dev; | |
2741 | ||
2742 | if (alias != NULL) | |
2743 | continue; | |
2744 | ||
2745 | colon2 = strchr(dev->ibd_ifname, ':'); | |
2746 | if (colon != NULL) | |
2747 | *colon = 0; | |
2748 | if (colon2 != NULL) | |
2749 | *colon2 = 0; | |
2750 | ||
2751 | if (strcmp(&dev->ibd_ifname[0], ifname) == 0) | |
2752 | alias = dev; | |
2753 | ||
2754 | if (colon != NULL) | |
2755 | *colon = ':'; | |
2756 | if (colon2 != NULL) | |
2757 | *colon2 = ':'; | |
2758 | } | |
2759 | return alias; | |
2760 | } | |
2761 | ||
febe73bd | 2762 | int kiblnd_startup(lnet_ni_t *ni) |
d7e09d03 | 2763 | { |
ec3d17c0 MS |
2764 | char *ifname; |
2765 | kib_dev_t *ibdev = NULL; | |
2766 | kib_net_t *net; | |
2767 | struct timeval tv; | |
2768 | unsigned long flags; | |
2769 | int rc; | |
2770 | int newdev; | |
d7e09d03 | 2771 | |
febe73bd | 2772 | LASSERT(ni->ni_lnd == &the_o2iblnd); |
d7e09d03 PT |
2773 | |
2774 | if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { | |
2775 | rc = kiblnd_base_startup(); | |
2776 | if (rc != 0) | |
2777 | return rc; | |
2778 | } | |
2779 | ||
2780 | LIBCFS_ALLOC(net, sizeof(*net)); | |
2781 | ni->ni_data = net; | |
2782 | if (net == NULL) | |
3247c4e5 | 2783 | goto net_failed; |
d7e09d03 | 2784 | |
d7e09d03 PT |
2785 | do_gettimeofday(&tv); |
2786 | net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; | |
2787 | ||
2788 | ni->ni_peertimeout = *kiblnd_tunables.kib_peertimeout; | |
2789 | ni->ni_maxtxcredits = *kiblnd_tunables.kib_credits; | |
2790 | ni->ni_peertxcredits = *kiblnd_tunables.kib_peertxcredits; | |
2791 | ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits; | |
2792 | ||
2793 | if (ni->ni_interfaces[0] != NULL) { | |
2794 | /* Use the IPoIB interface specified in 'networks=' */ | |
2795 | ||
febe73bd | 2796 | CLASSERT(LNET_MAX_INTERFACES > 1); |
d7e09d03 PT |
2797 | if (ni->ni_interfaces[1] != NULL) { |
2798 | CERROR("Multiple interfaces not supported\n"); | |
2799 | goto failed; | |
2800 | } | |
2801 | ||
2802 | ifname = ni->ni_interfaces[0]; | |
2803 | } else { | |
2804 | ifname = *kiblnd_tunables.kib_default_ipif; | |
2805 | } | |
2806 | ||
2807 | if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { | |
2808 | CERROR("IPoIB interface name too long: %s\n", ifname); | |
2809 | goto failed; | |
2810 | } | |
2811 | ||
2812 | ibdev = kiblnd_dev_search(ifname); | |
2813 | ||
2814 | newdev = ibdev == NULL; | |
2815 | /* hmm...create kib_dev even for alias */ | |
2816 | if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) | |
2817 | ibdev = kiblnd_create_dev(ifname); | |
2818 | ||
2819 | if (ibdev == NULL) | |
2820 | goto failed; | |
2821 | ||
2822 | net->ibn_dev = ibdev; | |
2823 | ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); | |
2824 | ||
2825 | rc = kiblnd_dev_start_threads(ibdev, newdev, | |
2826 | ni->ni_cpts, ni->ni_ncpts); | |
2827 | if (rc != 0) | |
2828 | goto failed; | |
2829 | ||
2830 | rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts); | |
2831 | if (rc != 0) { | |
2832 | CERROR("Failed to initialize NI pools: %d\n", rc); | |
2833 | goto failed; | |
2834 | } | |
2835 | ||
2836 | write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); | |
2837 | ibdev->ibd_nnets++; | |
2838 | list_add_tail(&net->ibn_list, &ibdev->ibd_nets); | |
2839 | write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); | |
2840 | ||
2841 | net->ibn_init = IBLND_INIT_ALL; | |
2842 | ||
2843 | return 0; | |
2844 | ||
2845 | failed: | |
2846 | if (net->ibn_dev == NULL && ibdev != NULL) | |
2847 | kiblnd_destroy_dev(ibdev); | |
2848 | ||
3247c4e5 | 2849 | net_failed: |
d7e09d03 PT |
2850 | kiblnd_shutdown(ni); |
2851 | ||
2852 | CDEBUG(D_NET, "kiblnd_startup failed\n"); | |
2853 | return -ENETDOWN; | |
2854 | } | |
2855 | ||
febe73bd | 2856 | static void __exit kiblnd_module_fini(void) |
d7e09d03 PT |
2857 | { |
2858 | lnet_unregister_lnd(&the_o2iblnd); | |
d7e09d03 PT |
2859 | } |
2860 | ||
febe73bd | 2861 | static int __init kiblnd_module_init(void) |
d7e09d03 | 2862 | { |
ec3d17c0 | 2863 | int rc; |
d7e09d03 | 2864 | |
febe73bd | 2865 | CLASSERT(sizeof(kib_msg_t) <= IBLND_MSG_SIZE); |
7a3888a3 GM |
2866 | CLASSERT(offsetof(kib_msg_t, |
2867 | ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) | |
2868 | <= IBLND_MSG_SIZE); | |
2869 | CLASSERT(offsetof(kib_msg_t, | |
2870 | ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) | |
2871 | <= IBLND_MSG_SIZE); | |
d7e09d03 PT |
2872 | |
2873 | rc = kiblnd_tunables_init(); | |
2874 | if (rc != 0) | |
2875 | return rc; | |
2876 | ||
2877 | lnet_register_lnd(&the_o2iblnd); | |
2878 | ||
2879 | return 0; | |
2880 | } | |
2881 | ||
/* Module metadata. */
MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v2.00");
MODULE_LICENSE("GPL");

/* Hook kiblnd_module_init/kiblnd_module_fini up as the module's init and exit functions. */
module_init(kiblnd_module_init);
module_exit(kiblnd_module_fini);