Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
30 | * Copyright (c) 2011, 2012, Intel Corporation. | |
31 | */ | |
32 | /* | |
33 | * This file is part of Lustre, http://www.lustre.org/ | |
34 | * Lustre is a trademark of Sun Microsystems, Inc. | |
35 | * | |
36 | * lnet/lnet/lib-move.c | |
37 | * | |
38 | * Data movement routines | |
39 | */ | |
40 | ||
41 | #define DEBUG_SUBSYSTEM S_LNET | |
42 | ||
43 | #include <linux/lnet/lib-lnet.h> | |
44 | ||
/* Integer module parameter registered with mode 0444 and described only as
 * "Reserved".  No code visible in this part of the file reads it.
 * NOTE(review): 0444 presumably makes it read-only at runtime -- confirm
 * against the CFS_MODULE_PARM definition. */
static int local_nid_dist_zero = 1;
CFS_MODULE_PARM(local_nid_dist_zero, "i", int, 0444,
		"Reserved");
48 | ||
49 | int | |
af66a6e2 | 50 | lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) |
d7e09d03 PT |
51 | { |
52 | lnet_test_peer_t *tp; | |
53 | struct list_head *el; | |
54 | struct list_head *next; | |
55 | struct list_head cull; | |
56 | ||
af66a6e2 | 57 | LASSERT(the_lnet.ln_init); |
d7e09d03 PT |
58 | |
59 | /* NB: use lnet_net_lock(0) to serialize operations on test peers */ | |
60 | if (threshold != 0) { | |
61 | /* Adding a new entry */ | |
62 | LIBCFS_ALLOC(tp, sizeof(*tp)); | |
63 | if (tp == NULL) | |
64 | return -ENOMEM; | |
65 | ||
66 | tp->tp_nid = nid; | |
67 | tp->tp_threshold = threshold; | |
68 | ||
69 | lnet_net_lock(0); | |
70 | list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers); | |
71 | lnet_net_unlock(0); | |
72 | return 0; | |
73 | } | |
74 | ||
75 | /* removing entries */ | |
76 | INIT_LIST_HEAD(&cull); | |
77 | ||
78 | lnet_net_lock(0); | |
79 | ||
af66a6e2 LN |
80 | list_for_each_safe(el, next, &the_lnet.ln_test_peers) { |
81 | tp = list_entry(el, lnet_test_peer_t, tp_list); | |
d7e09d03 PT |
82 | |
83 | if (tp->tp_threshold == 0 || /* needs culling anyway */ | |
84 | nid == LNET_NID_ANY || /* removing all entries */ | |
9b79ca85 | 85 | tp->tp_nid == nid) { /* matched this one */ |
af66a6e2 LN |
86 | list_del(&tp->tp_list); |
87 | list_add(&tp->tp_list, &cull); | |
d7e09d03 PT |
88 | } |
89 | } | |
90 | ||
91 | lnet_net_unlock(0); | |
92 | ||
af66a6e2 LN |
93 | while (!list_empty(&cull)) { |
94 | tp = list_entry(cull.next, lnet_test_peer_t, tp_list); | |
d7e09d03 | 95 | |
af66a6e2 LN |
96 | list_del(&tp->tp_list); |
97 | LIBCFS_FREE(tp, sizeof(*tp)); | |
d7e09d03 PT |
98 | } |
99 | return 0; | |
100 | } | |
101 | ||
102 | static int | |
af66a6e2 | 103 | fail_peer(lnet_nid_t nid, int outgoing) |
d7e09d03 PT |
104 | { |
105 | lnet_test_peer_t *tp; | |
106 | struct list_head *el; | |
107 | struct list_head *next; | |
108 | struct list_head cull; | |
109 | int fail = 0; | |
110 | ||
af66a6e2 | 111 | INIT_LIST_HEAD(&cull); |
d7e09d03 PT |
112 | |
113 | /* NB: use lnet_net_lock(0) to serialize operations on test peers */ | |
114 | lnet_net_lock(0); | |
115 | ||
af66a6e2 LN |
116 | list_for_each_safe(el, next, &the_lnet.ln_test_peers) { |
117 | tp = list_entry(el, lnet_test_peer_t, tp_list); | |
d7e09d03 PT |
118 | |
119 | if (tp->tp_threshold == 0) { | |
120 | /* zombie entry */ | |
121 | if (outgoing) { | |
122 | /* only cull zombies on outgoing tests, | |
123 | * since we may be at interrupt priority on | |
124 | * incoming messages. */ | |
af66a6e2 LN |
125 | list_del(&tp->tp_list); |
126 | list_add(&tp->tp_list, &cull); | |
d7e09d03 PT |
127 | } |
128 | continue; | |
129 | } | |
130 | ||
131 | if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */ | |
132 | nid == tp->tp_nid) { /* fail this peer */ | |
133 | fail = 1; | |
134 | ||
135 | if (tp->tp_threshold != LNET_MD_THRESH_INF) { | |
136 | tp->tp_threshold--; | |
137 | if (outgoing && | |
138 | tp->tp_threshold == 0) { | |
139 | /* see above */ | |
af66a6e2 LN |
140 | list_del(&tp->tp_list); |
141 | list_add(&tp->tp_list, &cull); | |
d7e09d03 PT |
142 | } |
143 | } | |
144 | break; | |
145 | } | |
146 | } | |
147 | ||
148 | lnet_net_unlock(0); | |
149 | ||
af66a6e2 LN |
150 | while (!list_empty(&cull)) { |
151 | tp = list_entry(cull.next, lnet_test_peer_t, tp_list); | |
152 | list_del(&tp->tp_list); | |
d7e09d03 | 153 | |
af66a6e2 | 154 | LIBCFS_FREE(tp, sizeof(*tp)); |
d7e09d03 PT |
155 | } |
156 | ||
157 | return (fail); | |
158 | } | |
159 | ||
/* Return the sum of iov_len over the first 'niov' entries of 'iov'. */
unsigned int
lnet_iov_nob(unsigned int niov, struct iovec *iov)
{
	unsigned int total = 0;
	unsigned int i;

	for (i = 0; i < niov; i++)
		total += iov[i].iov_len;

	return total;
}
170 | EXPORT_SYMBOL(lnet_iov_nob); | |
171 | ||
/*
 * Copy 'nob' bytes from the fragment list 'siov', starting 'soffset' bytes
 * in, to the fragment list 'diov', starting 'doffset' bytes in.
 *
 * Neither fragment array is modified.  Asserts if either list runs out of
 * fragments before 'nob' bytes have been copied.
 */
void
lnet_copy_iov2iov(unsigned int ndiov, struct iovec *diov, unsigned int doffset,
		  unsigned int nsiov, struct iovec *siov, unsigned int soffset,
		  unsigned int nob)
{
	/* NB diov, siov are READ-ONLY */
	unsigned int chunk;

	if (nob == 0)
		return;

	/* skip complete frags before 'doffset' */
	LASSERT(ndiov > 0);
	while (doffset >= diov->iov_len) {
		doffset -= diov->iov_len;
		diov++;
		ndiov--;
		LASSERT(ndiov > 0);
	}

	/* skip complete frags before 'soffset' */
	LASSERT(nsiov > 0);
	while (soffset >= siov->iov_len) {
		soffset -= siov->iov_len;
		siov++;
		nsiov--;
		LASSERT(nsiov > 0);
	}

	/* nob is non-zero here, so the body runs at least once */
	while (nob > 0) {
		LASSERT(ndiov > 0);
		LASSERT(nsiov > 0);

		/* largest run that fits both current frags and what's left */
		chunk = MIN(diov->iov_len - doffset,
			    siov->iov_len - soffset);
		chunk = MIN(chunk, nob);

		memcpy((char *)diov->iov_base + doffset,
		       (char *)siov->iov_base + soffset, chunk);
		nob -= chunk;

		/* advance within, or past, the destination frag */
		doffset += chunk;
		if (doffset >= diov->iov_len) {
			diov++;
			ndiov--;
			doffset = 0;
		}

		/* advance within, or past, the source frag */
		soffset += chunk;
		if (soffset >= siov->iov_len) {
			siov++;
			nsiov--;
			soffset = 0;
		}
	}
}
229 | EXPORT_SYMBOL(lnet_copy_iov2iov); | |
230 | ||
/*
 * Initialise 'dst' to describe the part of 'src' that starts 'offset' bytes
 * in and is exactly 'len' bytes long.  'src' itself is not modified.
 *
 * Returns the number of 'dst' entries filled in (0 when len == 0).  Asserts
 * if 'src' runs out of fragments or more than 'dst_niov' entries are needed.
 */
int
lnet_extract_iov(int dst_niov, struct iovec *dst,
		 int src_niov, struct iovec *src,
		 unsigned int offset, unsigned int len)
{
	unsigned int avail;
	unsigned int niov;

	/* no data => no frags */
	if (len == 0)
		return 0;

	/* skip initial frags */
	LASSERT(src_niov > 0);
	while (offset >= src->iov_len) {
		offset -= src->iov_len;
		src_niov--;
		src++;
		LASSERT(src_niov > 0);
	}

	for (niov = 1; ; niov++) {
		LASSERT(src_niov > 0);
		LASSERT((int)niov <= dst_niov);

		avail = src->iov_len - offset;
		dst->iov_base = (char *)src->iov_base + offset;

		if (len <= avail) {
			/* the request ends inside this frag */
			dst->iov_len = len;
			return niov;
		}

		/* take the rest of this frag and move on to the next */
		dst->iov_len = avail;
		len -= avail;

		dst++;
		src++;
		src_niov--;
		offset = 0;
	}
}
276 | EXPORT_SYMBOL(lnet_extract_iov); | |
277 | ||
278 | ||
279 | unsigned int | |
af66a6e2 | 280 | lnet_kiov_nob(unsigned int niov, lnet_kiov_t *kiov) |
d7e09d03 PT |
281 | { |
282 | unsigned int nob = 0; | |
283 | ||
284 | while (niov-- > 0) | |
285 | nob += (kiov++)->kiov_len; | |
286 | ||
287 | return (nob); | |
288 | } | |
289 | EXPORT_SYMBOL(lnet_kiov_nob); | |
290 | ||
/*
 * Copy 'nob' bytes from the page fragments 'siov', starting 'soffset' bytes
 * in, to the page fragments 'diov', starting 'doffset' bytes in.
 *
 * Each page is mapped with kmap() when it is first needed and unmapped with
 * kunmap() once its fragment has been consumed or the copy has finished.
 * Asserts that it is not called in interrupt context and that neither
 * fragment array runs out before 'nob' bytes have been copied.
 */
void
lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset,
		    unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset,
		    unsigned int nob)
{
	/* NB diov, siov are READ-ONLY */
	unsigned int this_nob;
	/* current mapped address in each page; NULL means "not mapped" */
	char *daddr = NULL;
	char *saddr = NULL;

	if (nob == 0)
		return;

	LASSERT(!in_interrupt());

	/* skip complete frags before 'doffset' */
	LASSERT(ndiov > 0);
	while (doffset >= diov->kiov_len) {
		doffset -= diov->kiov_len;
		diov++;
		ndiov--;
		LASSERT(ndiov > 0);
	}

	/* skip complete frags before 'soffset' */
	LASSERT(nsiov > 0);
	while (soffset >= siov->kiov_len) {
		soffset -= siov->kiov_len;
		siov++;
		nsiov--;
		LASSERT(nsiov > 0);
	}

	do {
		LASSERT(ndiov > 0);
		LASSERT(nsiov > 0);
		/* largest run that fits both current frags and what's left */
		this_nob = MIN(diov->kiov_len - doffset,
			       siov->kiov_len - soffset);
		this_nob = MIN(this_nob, nob);

		/* map whichever side has just moved on to a new page */
		if (daddr == NULL)
			daddr = ((char *)kmap(diov->kiov_page)) +
				diov->kiov_offset + doffset;
		if (saddr == NULL)
			saddr = ((char *)kmap(siov->kiov_page)) +
				siov->kiov_offset + soffset;

		/* Vanishing risk of kmap deadlock when mapping 2 pages.
		 * However in practice at least one of the kiovs will be mapped
		 * kernel pages and the map/unmap will be NOOPs */

		memcpy(daddr, saddr, this_nob);
		nob -= this_nob;

		if (diov->kiov_len > doffset + this_nob) {
			/* still inside this destination frag */
			daddr += this_nob;
			doffset += this_nob;
		} else {
			/* destination frag consumed: unmap and advance */
			kunmap(diov->kiov_page);
			daddr = NULL;
			diov++;
			ndiov--;
			doffset = 0;
		}

		if (siov->kiov_len > soffset + this_nob) {
			/* still inside this source frag */
			saddr += this_nob;
			soffset += this_nob;
		} else {
			/* source frag consumed: unmap and advance */
			kunmap(siov->kiov_page);
			saddr = NULL;
			siov++;
			nsiov--;
			soffset = 0;
		}
	} while (nob > 0);

	/* drop any mapping left over from a partially consumed frag */
	if (daddr != NULL)
		kunmap(diov->kiov_page);
	if (saddr != NULL)
		kunmap(siov->kiov_page);
}
371 | EXPORT_SYMBOL(lnet_copy_kiov2kiov); | |
372 | ||
/*
 * Copy 'nob' bytes from the page fragments 'kiov', starting 'kiovoffset'
 * bytes in, to the fragments 'iov', starting 'iovoffset' bytes in.
 *
 * Source pages are mapped with kmap() when first needed and unmapped with
 * kunmap() once consumed or when the copy has finished.  Asserts that it is
 * not called in interrupt context and that neither fragment array runs out
 * before 'nob' bytes have been copied.
 */
void
lnet_copy_kiov2iov(unsigned int niov, struct iovec *iov, unsigned int iovoffset,
		   unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
		   unsigned int nob)
{
	/* NB iov, kiov are READ-ONLY */
	unsigned int this_nob;
	/* current mapped address in the source page; NULL means "not mapped" */
	char *addr = NULL;

	if (nob == 0)
		return;

	LASSERT(!in_interrupt());

	/* skip complete frags before 'iovoffset' */
	LASSERT(niov > 0);
	while (iovoffset >= iov->iov_len) {
		iovoffset -= iov->iov_len;
		iov++;
		niov--;
		LASSERT(niov > 0);
	}

	/* skip complete frags before 'kiovoffset' */
	LASSERT(nkiov > 0);
	while (kiovoffset >= kiov->kiov_len) {
		kiovoffset -= kiov->kiov_len;
		kiov++;
		nkiov--;
		LASSERT(nkiov > 0);
	}

	do {
		LASSERT(niov > 0);
		LASSERT(nkiov > 0);
		/* largest run that fits both current frags and what's left */
		this_nob = MIN(iov->iov_len - iovoffset,
			       kiov->kiov_len - kiovoffset);
		this_nob = MIN(this_nob, nob);

		/* map the source page if we have just moved on to it */
		if (addr == NULL)
			addr = ((char *)kmap(kiov->kiov_page)) +
				kiov->kiov_offset + kiovoffset;

		memcpy((char *)iov->iov_base + iovoffset, addr, this_nob);
		nob -= this_nob;

		if (iov->iov_len > iovoffset + this_nob) {
			iovoffset += this_nob;
		} else {
			/* destination frag consumed: advance */
			iov++;
			niov--;
			iovoffset = 0;
		}

		if (kiov->kiov_len > kiovoffset + this_nob) {
			addr += this_nob;
			kiovoffset += this_nob;
		} else {
			/* source frag consumed: unmap and advance */
			kunmap(kiov->kiov_page);
			addr = NULL;
			kiov++;
			nkiov--;
			kiovoffset = 0;
		}

	} while (nob > 0);

	/* drop the mapping left over from a partially consumed frag */
	if (addr != NULL)
		kunmap(kiov->kiov_page);
}
441 | EXPORT_SYMBOL(lnet_copy_kiov2iov); | |
442 | ||
/*
 * Copy 'nob' bytes from the fragments 'iov', starting 'iovoffset' bytes in,
 * to the page fragments 'kiov', starting 'kiovoffset' bytes in.
 *
 * Destination pages are mapped with kmap() when first needed and unmapped
 * with kunmap() once consumed or when the copy has finished.  Asserts that
 * it is not called in interrupt context and that neither fragment array runs
 * out before 'nob' bytes have been copied.
 */
void
lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
		   unsigned int niov, struct iovec *iov, unsigned int iovoffset,
		   unsigned int nob)
{
	/* NB kiov, iov are READ-ONLY */
	unsigned int this_nob;
	/* current mapped address in the dest page; NULL means "not mapped" */
	char *addr = NULL;

	if (nob == 0)
		return;

	LASSERT(!in_interrupt());

	/* skip complete frags before 'kiovoffset' */
	LASSERT(nkiov > 0);
	while (kiovoffset >= kiov->kiov_len) {
		kiovoffset -= kiov->kiov_len;
		kiov++;
		nkiov--;
		LASSERT(nkiov > 0);
	}

	/* skip complete frags before 'iovoffset' */
	LASSERT(niov > 0);
	while (iovoffset >= iov->iov_len) {
		iovoffset -= iov->iov_len;
		iov++;
		niov--;
		LASSERT(niov > 0);
	}

	do {
		LASSERT(nkiov > 0);
		LASSERT(niov > 0);
		/* largest run that fits both current frags and what's left */
		this_nob = MIN(kiov->kiov_len - kiovoffset,
			       iov->iov_len - iovoffset);
		this_nob = MIN(this_nob, nob);

		/* map the destination page if we have just moved on to it */
		if (addr == NULL)
			addr = ((char *)kmap(kiov->kiov_page)) +
				kiov->kiov_offset + kiovoffset;

		memcpy(addr, (char *)iov->iov_base + iovoffset, this_nob);
		nob -= this_nob;

		if (kiov->kiov_len > kiovoffset + this_nob) {
			addr += this_nob;
			kiovoffset += this_nob;
		} else {
			/* destination frag consumed: unmap and advance */
			kunmap(kiov->kiov_page);
			addr = NULL;
			kiov++;
			nkiov--;
			kiovoffset = 0;
		}

		if (iov->iov_len > iovoffset + this_nob) {
			iovoffset += this_nob;
		} else {
			/* source frag consumed: advance */
			iov++;
			niov--;
			iovoffset = 0;
		}
	} while (nob > 0);

	/* drop the mapping left over from a partially consumed frag */
	if (addr != NULL)
		kunmap(kiov->kiov_page);
}
510 | EXPORT_SYMBOL(lnet_copy_iov2kiov); | |
511 | ||
512 | int | |
af66a6e2 | 513 | lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, |
d7e09d03 PT |
514 | int src_niov, lnet_kiov_t *src, |
515 | unsigned int offset, unsigned int len) | |
516 | { | |
517 | /* Initialise 'dst' to the subset of 'src' starting at 'offset', | |
518 | * for exactly 'len' bytes, and return the number of entries. | |
519 | * NB not destructive to 'src' */ | |
520 | unsigned int frag_len; | |
521 | unsigned int niov; | |
522 | ||
523 | if (len == 0) /* no data => */ | |
524 | return (0); /* no frags */ | |
525 | ||
af66a6e2 | 526 | LASSERT(src_niov > 0); |
d7e09d03 PT |
527 | while (offset >= src->kiov_len) { /* skip initial frags */ |
528 | offset -= src->kiov_len; | |
529 | src_niov--; | |
530 | src++; | |
af66a6e2 | 531 | LASSERT(src_niov > 0); |
d7e09d03 PT |
532 | } |
533 | ||
534 | niov = 1; | |
535 | for (;;) { | |
af66a6e2 LN |
536 | LASSERT(src_niov > 0); |
537 | LASSERT((int)niov <= dst_niov); | |
d7e09d03 PT |
538 | |
539 | frag_len = src->kiov_len - offset; | |
540 | dst->kiov_page = src->kiov_page; | |
541 | dst->kiov_offset = src->kiov_offset + offset; | |
542 | ||
543 | if (len <= frag_len) { | |
544 | dst->kiov_len = len; | |
af66a6e2 | 545 | LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE); |
d7e09d03 PT |
546 | return (niov); |
547 | } | |
548 | ||
549 | dst->kiov_len = frag_len; | |
af66a6e2 | 550 | LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE); |
d7e09d03 PT |
551 | |
552 | len -= frag_len; | |
553 | dst++; | |
554 | src++; | |
555 | niov++; | |
556 | src_niov--; | |
557 | offset = 0; | |
558 | } | |
559 | } | |
560 | EXPORT_SYMBOL(lnet_extract_kiov); | |
561 | ||
562 | void | |
563 | lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, | |
564 | unsigned int offset, unsigned int mlen, unsigned int rlen) | |
565 | { | |
566 | unsigned int niov = 0; | |
567 | struct iovec *iov = NULL; | |
568 | lnet_kiov_t *kiov = NULL; | |
569 | int rc; | |
570 | ||
af66a6e2 LN |
571 | LASSERT(!in_interrupt()); |
572 | LASSERT(mlen == 0 || msg != NULL); | |
d7e09d03 PT |
573 | |
574 | if (msg != NULL) { | |
575 | LASSERT(msg->msg_receiving); | |
576 | LASSERT(!msg->msg_sending); | |
577 | LASSERT(rlen == msg->msg_len); | |
578 | LASSERT(mlen <= msg->msg_len); | |
579 | LASSERT(msg->msg_offset == offset); | |
580 | LASSERT(msg->msg_wanted == mlen); | |
581 | ||
582 | msg->msg_receiving = 0; | |
583 | ||
584 | if (mlen != 0) { | |
585 | niov = msg->msg_niov; | |
586 | iov = msg->msg_iov; | |
587 | kiov = msg->msg_kiov; | |
588 | ||
af66a6e2 LN |
589 | LASSERT(niov > 0); |
590 | LASSERT((iov == NULL) != (kiov == NULL)); | |
d7e09d03 PT |
591 | } |
592 | } | |
593 | ||
594 | rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed, | |
595 | niov, iov, kiov, offset, mlen, rlen); | |
596 | if (rc < 0) | |
597 | lnet_finalize(ni, msg, rc); | |
598 | } | |
599 | ||
600 | void | |
601 | lnet_setpayloadbuffer(lnet_msg_t *msg) | |
602 | { | |
603 | lnet_libmd_t *md = msg->msg_md; | |
604 | ||
af66a6e2 LN |
605 | LASSERT(msg->msg_len > 0); |
606 | LASSERT(!msg->msg_routing); | |
607 | LASSERT(md != NULL); | |
608 | LASSERT(msg->msg_niov == 0); | |
609 | LASSERT(msg->msg_iov == NULL); | |
610 | LASSERT(msg->msg_kiov == NULL); | |
d7e09d03 PT |
611 | |
612 | msg->msg_niov = md->md_niov; | |
613 | if ((md->md_options & LNET_MD_KIOV) != 0) | |
614 | msg->msg_kiov = md->md_iov.kiov; | |
615 | else | |
616 | msg->msg_iov = md->md_iov.iov; | |
617 | } | |
618 | ||
619 | void | |
620 | lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, | |
621 | unsigned int offset, unsigned int len) | |
622 | { | |
623 | msg->msg_type = type; | |
624 | msg->msg_target = target; | |
625 | msg->msg_len = len; | |
626 | msg->msg_offset = offset; | |
627 | ||
628 | if (len != 0) | |
629 | lnet_setpayloadbuffer(msg); | |
630 | ||
af66a6e2 | 631 | memset(&msg->msg_hdr, 0, sizeof(msg->msg_hdr)); |
d7e09d03 PT |
632 | msg->msg_hdr.type = cpu_to_le32(type); |
633 | msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); | |
634 | msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); | |
635 | /* src_nid will be set later */ | |
636 | msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); | |
637 | msg->msg_hdr.payload_length = cpu_to_le32(len); | |
638 | } | |
639 | ||
640 | void | |
641 | lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg) | |
642 | { | |
643 | void *priv = msg->msg_private; | |
644 | int rc; | |
645 | ||
af66a6e2 LN |
646 | LASSERT(!in_interrupt()); |
647 | LASSERT(LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || | |
d7e09d03 PT |
648 | (msg->msg_txcredit && msg->msg_peertxcredit)); |
649 | ||
650 | rc = (ni->ni_lnd->lnd_send)(ni, priv, msg); | |
651 | if (rc < 0) | |
652 | lnet_finalize(ni, msg, rc); | |
653 | } | |
654 | ||
655 | int | |
656 | lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg) | |
657 | { | |
658 | int rc; | |
659 | ||
660 | LASSERT(!msg->msg_sending); | |
661 | LASSERT(msg->msg_receiving); | |
662 | LASSERT(!msg->msg_rx_ready_delay); | |
663 | LASSERT(ni->ni_lnd->lnd_eager_recv != NULL); | |
664 | ||
665 | msg->msg_rx_ready_delay = 1; | |
666 | rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, | |
667 | &msg->msg_private); | |
668 | if (rc != 0) { | |
669 | CERROR("recv from %s / send to %s aborted: " | |
670 | "eager_recv failed %d\n", | |
671 | libcfs_nid2str(msg->msg_rxpeer->lp_nid), | |
672 | libcfs_id2str(msg->msg_target), rc); | |
673 | LASSERT(rc < 0); /* required by my callers */ | |
674 | } | |
675 | ||
676 | return rc; | |
677 | } | |
678 | ||
679 | /* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */ | |
680 | void | |
681 | lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp) | |
682 | { | |
683 | cfs_time_t last_alive = 0; | |
684 | ||
685 | LASSERT(lnet_peer_aliveness_enabled(lp)); | |
686 | LASSERT(ni->ni_lnd->lnd_query != NULL); | |
687 | ||
688 | lnet_net_unlock(lp->lp_cpt); | |
689 | (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive); | |
690 | lnet_net_lock(lp->lp_cpt); | |
691 | ||
692 | lp->lp_last_query = cfs_time_current(); | |
693 | ||
694 | if (last_alive != 0) /* NI has updated timestamp */ | |
695 | lp->lp_last_alive = last_alive; | |
696 | } | |
697 | ||
698 | /* NB: always called with lnet_net_lock held */ | |
699 | static inline int | |
af66a6e2 | 700 | lnet_peer_is_alive(lnet_peer_t *lp, cfs_time_t now) |
d7e09d03 PT |
701 | { |
702 | int alive; | |
703 | cfs_time_t deadline; | |
704 | ||
af66a6e2 | 705 | LASSERT(lnet_peer_aliveness_enabled(lp)); |
d7e09d03 PT |
706 | |
707 | /* Trust lnet_notify() if it has more recent aliveness news, but | |
708 | * ignore the initial assumed death (see lnet_peers_start_down()). | |
709 | */ | |
710 | if (!lp->lp_alive && lp->lp_alive_count > 0 && | |
711 | cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive)) | |
712 | return 0; | |
713 | ||
714 | deadline = cfs_time_add(lp->lp_last_alive, | |
715 | cfs_time_seconds(lp->lp_ni->ni_peertimeout)); | |
716 | alive = cfs_time_after(deadline, now); | |
717 | ||
718 | /* Update obsolete lp_alive except for routers assumed to be dead | |
719 | * initially, because router checker would update aliveness in this | |
720 | * case, and moreover lp_last_alive at peer creation is assumed. | |
721 | */ | |
722 | if (alive && !lp->lp_alive && | |
723 | !(lnet_isrouter(lp) && lp->lp_alive_count == 0)) | |
724 | lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); | |
725 | ||
726 | return alive; | |
727 | } | |
728 | ||
729 | ||
730 | /* NB: returns 1 when alive, 0 when dead, negative when error; | |
731 | * may drop the lnet_net_lock */ | |
732 | int | |
af66a6e2 | 733 | lnet_peer_alive_locked(lnet_peer_t *lp) |
d7e09d03 PT |
734 | { |
735 | cfs_time_t now = cfs_time_current(); | |
736 | ||
737 | if (!lnet_peer_aliveness_enabled(lp)) | |
738 | return -ENODEV; | |
739 | ||
740 | if (lnet_peer_is_alive(lp, now)) | |
741 | return 1; | |
742 | ||
743 | /* Peer appears dead, but we should avoid frequent NI queries (at | |
744 | * most once per lnet_queryinterval seconds). */ | |
745 | if (lp->lp_last_query != 0) { | |
746 | static const int lnet_queryinterval = 1; | |
747 | ||
748 | cfs_time_t next_query = | |
749 | cfs_time_add(lp->lp_last_query, | |
750 | cfs_time_seconds(lnet_queryinterval)); | |
751 | ||
752 | if (cfs_time_before(now, next_query)) { | |
753 | if (lp->lp_alive) | |
754 | CWARN("Unexpected aliveness of peer %s: " | |
755 | "%d < %d (%d/%d)\n", | |
756 | libcfs_nid2str(lp->lp_nid), | |
757 | (int)now, (int)next_query, | |
758 | lnet_queryinterval, | |
759 | lp->lp_ni->ni_peertimeout); | |
760 | return 0; | |
761 | } | |
762 | } | |
763 | ||
764 | /* query NI for latest aliveness news */ | |
765 | lnet_ni_query_locked(lp->lp_ni, lp); | |
766 | ||
767 | if (lnet_peer_is_alive(lp, now)) | |
768 | return 1; | |
769 | ||
770 | lnet_notify_locked(lp, 0, 0, lp->lp_last_alive); | |
771 | return 0; | |
772 | } | |
773 | ||
/*
 * Try to send 'msg' to its next hop (msg->msg_txpeer), taking a peer tx
 * credit and an NI tx credit if the message does not hold them yet.
 *
 * Return values are POSITIVE errno-style codes: 0 if the message was sent
 * (do_send != 0) or may be sent (do_send == 0), EAGAIN if it was queued to
 * wait for a credit, EHOSTUNREACH if the peer appears dead.
 *
 * When do_send is non-zero, the net lock for the message's tx CPT is
 * released and re-taken around lnet_ni_send() and lnet_finalize().
 */
int
lnet_post_send_locked(lnet_msg_t *msg, int do_send)
{
	/* lnet_send is going to lnet_net_unlock immediately after this,
	 * so it sets do_send FALSE and I don't do the unlock/send/lock bit.
	 * I return EAGAIN if msg blocked, EHOSTUNREACH if msg_txpeer
	 * appears dead, and 0 if sent or OK to send */
	struct lnet_peer *lp = msg->msg_txpeer;
	struct lnet_ni *ni = lp->lp_ni;
	struct lnet_tx_queue *tq;
	int cpt;

	/* non-lnet_send() callers have checked before */
	LASSERT(!do_send || msg->msg_tx_delayed);
	LASSERT(!msg->msg_receiving);
	LASSERT(msg->msg_tx_committed);

	cpt = msg->msg_tx_cpt;
	tq = ni->ni_tx_queues[cpt];

	/* NB 'lp' is always the next hop */
	/* The aliveness check is skipped when the target PID carries
	 * LNET_PID_USERFLAG; only a result of exactly 0 ("dead") drops the
	 * message, so a negative result from lnet_peer_alive_locked() lets
	 * it through. */
	if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
	    lnet_peer_alive_locked(lp) == 0) {
		the_lnet.ln_counters[cpt]->drop_count++;
		the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
		lnet_net_unlock(cpt);

		CNETERR("Dropping message for %s: peer not alive\n",
			libcfs_id2str(msg->msg_target));
		if (do_send)
			lnet_finalize(ni, msg, -EHOSTUNREACH);

		lnet_net_lock(cpt);
		return EHOSTUNREACH;
	}

	if (!msg->msg_peertxcredit) {
		/* negative credits <=> messages are queued on the peer */
		LASSERT((lp->lp_txcredits < 0) ==
			!list_empty(&lp->lp_txq));

		msg->msg_peertxcredit = 1;
		lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t);
		lp->lp_txcredits--;

		/* track the low-water mark */
		if (lp->lp_txcredits < lp->lp_mintxcredits)
			lp->lp_mintxcredits = lp->lp_txcredits;

		if (lp->lp_txcredits < 0) {
			/* no peer credit left: wait on the peer's queue */
			msg->msg_tx_delayed = 1;
			list_add_tail(&msg->msg_list, &lp->lp_txq);
			return EAGAIN;
		}
	}

	if (!msg->msg_txcredit) {
		/* negative credits <=> messages are queued on the NI */
		LASSERT((tq->tq_credits < 0) ==
			!list_empty(&tq->tq_delayed));

		msg->msg_txcredit = 1;
		tq->tq_credits--;

		/* track the low-water mark */
		if (tq->tq_credits < tq->tq_credits_min)
			tq->tq_credits_min = tq->tq_credits;

		if (tq->tq_credits < 0) {
			/* no NI credit left: wait on the NI's delayed queue */
			msg->msg_tx_delayed = 1;
			list_add_tail(&msg->msg_list, &tq->tq_delayed);
			return EAGAIN;
		}
	}

	if (do_send) {
		/* the LND is called with the net lock released */
		lnet_net_unlock(cpt);
		lnet_ni_send(ni, msg);
		lnet_net_lock(cpt);
	}
	return 0;
}
852 | ||
853 | ||
854 | lnet_rtrbufpool_t * | |
855 | lnet_msg2bufpool(lnet_msg_t *msg) | |
856 | { | |
857 | lnet_rtrbufpool_t *rbp; | |
858 | int cpt; | |
859 | ||
860 | LASSERT(msg->msg_rx_committed); | |
861 | ||
862 | cpt = msg->msg_rx_cpt; | |
863 | rbp = &the_lnet.ln_rtrpools[cpt][0]; | |
864 | ||
865 | LASSERT(msg->msg_len <= LNET_MTU); | |
866 | while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_CACHE_SIZE) { | |
867 | rbp++; | |
868 | LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]); | |
869 | } | |
870 | ||
871 | return rbp; | |
872 | } | |
873 | ||
/*
 * Try to start receiving a routed message: take a peer router credit and a
 * buffer-pool credit if the message does not hold them yet, then attach a
 * router buffer from the pool as the message's kiov.
 *
 * Return values are POSITIVE errno-style codes: 0 if the message was
 * received (do_recv != 0) or may be received (do_recv == 0), EAGAIN if it
 * was queued to wait for a credit.
 *
 * When do_recv is non-zero, the net lock for the message's rx CPT is
 * released and re-taken around lnet_ni_recv().
 */
int
lnet_post_routed_recv_locked(lnet_msg_t *msg, int do_recv)
{
	/* lnet_parse is going to lnet_net_unlock immediately after this, so it
	 * sets do_recv FALSE and I don't do the unlock/send/lock bit.  I
	 * return EAGAIN if msg blocked and 0 if received or OK to receive */
	lnet_peer_t *lp = msg->msg_rxpeer;
	lnet_rtrbufpool_t *rbp;
	lnet_rtrbuf_t *rb;

	LASSERT(msg->msg_iov == NULL);
	LASSERT(msg->msg_kiov == NULL);
	LASSERT(msg->msg_niov == 0);
	LASSERT(msg->msg_routing);
	LASSERT(msg->msg_receiving);
	LASSERT(!msg->msg_sending);

	/* non-lnet_parse callers only receive delayed messages */
	LASSERT(!do_recv || msg->msg_rx_delayed);

	if (!msg->msg_peerrtrcredit) {
		/* negative credits <=> messages are queued on the peer */
		LASSERT((lp->lp_rtrcredits < 0) ==
			!list_empty(&lp->lp_rtrq));

		msg->msg_peerrtrcredit = 1;
		lp->lp_rtrcredits--;
		/* track the low-water mark */
		if (lp->lp_rtrcredits < lp->lp_minrtrcredits)
			lp->lp_minrtrcredits = lp->lp_rtrcredits;

		if (lp->lp_rtrcredits < 0) {
			/* must have checked eager_recv before here */
			LASSERT(msg->msg_rx_ready_delay);
			msg->msg_rx_delayed = 1;
			list_add_tail(&msg->msg_list, &lp->lp_rtrq);
			return EAGAIN;
		}
	}

	/* pick the pool whose buffers are large enough for this message */
	rbp = lnet_msg2bufpool(msg);

	if (!msg->msg_rtrcredit) {
		/* negative credits <=> messages are queued on the pool */
		LASSERT((rbp->rbp_credits < 0) ==
			!list_empty(&rbp->rbp_msgs));

		msg->msg_rtrcredit = 1;
		rbp->rbp_credits--;
		/* track the low-water mark */
		if (rbp->rbp_credits < rbp->rbp_mincredits)
			rbp->rbp_mincredits = rbp->rbp_credits;

		if (rbp->rbp_credits < 0) {
			/* must have checked eager_recv before here */
			LASSERT(msg->msg_rx_ready_delay);
			msg->msg_rx_delayed = 1;
			list_add_tail(&msg->msg_list, &rbp->rbp_msgs);
			return EAGAIN;
		}
	}

	/* take the first free buffer off the pool and attach it to msg */
	LASSERT(!list_empty(&rbp->rbp_bufs));
	rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list);
	list_del(&rb->rb_list);

	msg->msg_niov = rbp->rbp_npages;
	msg->msg_kiov = &rb->rb_kiov[0];

	if (do_recv) {
		int cpt = msg->msg_rx_cpt;

		/* the LND is called with the net lock released */
		lnet_net_unlock(cpt);
		lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1,
			     0, msg->msg_len, msg->msg_len);
		lnet_net_lock(cpt);
	}
	return 0;
}
949 | ||
/*
 * Give back the NI tx credit and the peer tx credit held by 'msg', then
 * drop the message's reference on its tx peer.
 *
 * Each returned credit that leaves the count at or below zero releases the
 * first message waiting on the corresponding queue by calling
 * lnet_post_send_locked(msg2, 1), which releases and re-takes the net lock
 * around the send.
 */
void
lnet_return_tx_credits_locked(lnet_msg_t *msg)
{
	lnet_peer_t *txpeer = msg->msg_txpeer;
	lnet_msg_t *msg2;

	if (msg->msg_txcredit) {
		struct lnet_ni *ni = txpeer->lp_ni;
		struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt];

		/* give back NI txcredits */
		msg->msg_txcredit = 0;

		/* negative credits <=> messages are queued on the NI */
		LASSERT((tq->tq_credits < 0) ==
			!list_empty(&tq->tq_delayed));

		tq->tq_credits++;
		if (tq->tq_credits <= 0) {
			/* a message was waiting for this credit: send it */
			msg2 = list_entry(tq->tq_delayed.next,
					  lnet_msg_t, msg_list);
			list_del(&msg2->msg_list);

			LASSERT(msg2->msg_txpeer->lp_ni == ni);
			LASSERT(msg2->msg_tx_delayed);

			(void) lnet_post_send_locked(msg2, 1);
		}
	}

	if (msg->msg_peertxcredit) {
		/* give back peer txcredits */
		msg->msg_peertxcredit = 0;

		/* negative credits <=> messages are queued on the peer */
		LASSERT((txpeer->lp_txcredits < 0) ==
			!list_empty(&txpeer->lp_txq));

		txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t);
		LASSERT(txpeer->lp_txqnob >= 0);

		txpeer->lp_txcredits++;
		if (txpeer->lp_txcredits <= 0) {
			/* a message was waiting for this credit: send it */
			msg2 = list_entry(txpeer->lp_txq.next,
					  lnet_msg_t, msg_list);
			list_del(&msg2->msg_list);

			LASSERT(msg2->msg_txpeer == txpeer);
			LASSERT(msg2->msg_tx_delayed);

			(void) lnet_post_send_locked(msg2, 1);
		}
	}

	/* NOTE(review): txpeer is dereferenced above whenever a credit flag
	 * is set, before this NULL check -- presumably a credit can only be
	 * held while msg_txpeer is set; confirm against the callers. */
	if (txpeer != NULL) {
		msg->msg_txpeer = NULL;
		lnet_peer_decref_locked(txpeer);
	}
}
1007 | ||
1008 | void | |
1009 | lnet_return_rx_credits_locked(lnet_msg_t *msg) | |
1010 | { | |
1011 | lnet_peer_t *rxpeer = msg->msg_rxpeer; | |
1012 | lnet_msg_t *msg2; | |
1013 | ||
1014 | if (msg->msg_rtrcredit) { | |
1015 | /* give back global router credits */ | |
1016 | lnet_rtrbuf_t *rb; | |
1017 | lnet_rtrbufpool_t *rbp; | |
1018 | ||
1019 | /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays | |
1020 | * there until it gets one allocated, or aborts the wait | |
1021 | * itself */ | |
af66a6e2 | 1022 | LASSERT(msg->msg_kiov != NULL); |
d7e09d03 PT |
1023 | |
1024 | rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]); | |
1025 | rbp = rb->rb_pool; | |
af66a6e2 | 1026 | LASSERT(rbp == lnet_msg2bufpool(msg)); |
d7e09d03 PT |
1027 | |
1028 | msg->msg_kiov = NULL; | |
1029 | msg->msg_rtrcredit = 0; | |
1030 | ||
1031 | LASSERT((rbp->rbp_credits < 0) == | |
1032 | !list_empty(&rbp->rbp_msgs)); | |
1033 | LASSERT((rbp->rbp_credits > 0) == | |
1034 | !list_empty(&rbp->rbp_bufs)); | |
1035 | ||
1036 | list_add(&rb->rb_list, &rbp->rbp_bufs); | |
1037 | rbp->rbp_credits++; | |
1038 | if (rbp->rbp_credits <= 0) { | |
1039 | msg2 = list_entry(rbp->rbp_msgs.next, | |
1040 | lnet_msg_t, msg_list); | |
1041 | list_del(&msg2->msg_list); | |
1042 | ||
1043 | (void) lnet_post_routed_recv_locked(msg2, 1); | |
1044 | } | |
1045 | } | |
1046 | ||
1047 | if (msg->msg_peerrtrcredit) { | |
1048 | /* give back peer router credits */ | |
1049 | msg->msg_peerrtrcredit = 0; | |
1050 | ||
1051 | LASSERT((rxpeer->lp_rtrcredits < 0) == | |
1052 | !list_empty(&rxpeer->lp_rtrq)); | |
1053 | ||
1054 | rxpeer->lp_rtrcredits++; | |
1055 | if (rxpeer->lp_rtrcredits <= 0) { | |
1056 | msg2 = list_entry(rxpeer->lp_rtrq.next, | |
1057 | lnet_msg_t, msg_list); | |
1058 | list_del(&msg2->msg_list); | |
1059 | ||
1060 | (void) lnet_post_routed_recv_locked(msg2, 1); | |
1061 | } | |
1062 | } | |
1063 | if (rxpeer != NULL) { | |
1064 | msg->msg_rxpeer = NULL; | |
1065 | lnet_peer_decref_locked(rxpeer); | |
1066 | } | |
1067 | } | |
1068 | ||
1069 | static int | |
1070 | lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2) | |
1071 | { | |
1072 | lnet_peer_t *p1 = r1->lr_gateway; | |
1073 | lnet_peer_t *p2 = r2->lr_gateway; | |
1074 | ||
1075 | if (r1->lr_hops < r2->lr_hops) | |
1076 | return 1; | |
1077 | ||
1078 | if (r1->lr_hops > r2->lr_hops) | |
1079 | return -1; | |
1080 | ||
1081 | if (p1->lp_txqnob < p2->lp_txqnob) | |
1082 | return 1; | |
1083 | ||
1084 | if (p1->lp_txqnob > p2->lp_txqnob) | |
1085 | return -1; | |
1086 | ||
1087 | if (p1->lp_txcredits > p2->lp_txcredits) | |
1088 | return 1; | |
1089 | ||
1090 | if (p1->lp_txcredits < p2->lp_txcredits) | |
1091 | return -1; | |
1092 | ||
1093 | if (r1->lr_seq - r2->lr_seq <= 0) | |
1094 | return 1; | |
1095 | ||
1096 | return -1; | |
1097 | } | |
1098 | ||
1099 | static lnet_peer_t * | |
1100 | lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid) | |
1101 | { | |
1102 | lnet_remotenet_t *rnet; | |
1103 | lnet_route_t *rtr; | |
1104 | lnet_route_t *rtr_best; | |
1105 | lnet_route_t *rtr_last; | |
1106 | struct lnet_peer *lp_best; | |
1107 | struct lnet_peer *lp; | |
1108 | int rc; | |
1109 | ||
1110 | /* If @rtr_nid is not LNET_NID_ANY, return the gateway with | |
1111 | * rtr_nid nid, otherwise find the best gateway I can use */ | |
1112 | ||
1113 | rnet = lnet_find_net_locked(LNET_NIDNET(target)); | |
1114 | if (rnet == NULL) | |
1115 | return NULL; | |
1116 | ||
1117 | lp_best = NULL; | |
1118 | rtr_best = rtr_last = NULL; | |
1119 | list_for_each_entry(rtr, &rnet->lrn_routes, lr_list) { | |
1120 | lp = rtr->lr_gateway; | |
1121 | ||
1122 | if (!lp->lp_alive || /* gateway is down */ | |
1123 | ((lp->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0 && | |
1124 | rtr->lr_downis != 0)) /* NI to target is down */ | |
1125 | continue; | |
1126 | ||
1127 | if (ni != NULL && lp->lp_ni != ni) | |
1128 | continue; | |
1129 | ||
1130 | if (lp->lp_nid == rtr_nid) /* it's pre-determined router */ | |
1131 | return lp; | |
1132 | ||
1133 | if (lp_best == NULL) { | |
1134 | rtr_best = rtr_last = rtr; | |
1135 | lp_best = lp; | |
1136 | continue; | |
1137 | } | |
1138 | ||
1139 | /* no protection on below fields, but it's harmless */ | |
1140 | if (rtr_last->lr_seq - rtr->lr_seq < 0) | |
1141 | rtr_last = rtr; | |
1142 | ||
1143 | rc = lnet_compare_routes(rtr, rtr_best); | |
1144 | if (rc < 0) | |
1145 | continue; | |
1146 | ||
1147 | rtr_best = rtr; | |
1148 | lp_best = lp; | |
1149 | } | |
1150 | ||
1151 | /* set sequence number on the best router to the latest sequence + 1 | |
1152 | * so we can round-robin all routers, it's race and inaccurate but | |
1153 | * harmless and functional */ | |
1154 | if (rtr_best != NULL) | |
1155 | rtr_best->lr_seq = rtr_last->lr_seq + 1; | |
1156 | return lp_best; | |
1157 | } | |
1158 | ||
1159 | int | |
1160 | lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid) | |
1161 | { | |
1162 | lnet_nid_t dst_nid = msg->msg_target.nid; | |
1163 | struct lnet_ni *src_ni; | |
1164 | struct lnet_ni *local_ni; | |
1165 | struct lnet_peer *lp; | |
1166 | int cpt; | |
1167 | int cpt2; | |
1168 | int rc; | |
1169 | ||
1170 | /* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, | |
1171 | * but we might want to use pre-determined router for ACK/REPLY | |
1172 | * in the future */ | |
1173 | /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ | |
af66a6e2 LN |
1174 | LASSERT(msg->msg_txpeer == NULL); |
1175 | LASSERT(!msg->msg_sending); | |
1176 | LASSERT(!msg->msg_target_is_router); | |
1177 | LASSERT(!msg->msg_receiving); | |
d7e09d03 PT |
1178 | |
1179 | msg->msg_sending = 1; | |
1180 | ||
1181 | LASSERT(!msg->msg_tx_committed); | |
1182 | cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid); | |
1183 | again: | |
1184 | lnet_net_lock(cpt); | |
1185 | ||
1186 | if (the_lnet.ln_shutdown) { | |
1187 | lnet_net_unlock(cpt); | |
1188 | return -ESHUTDOWN; | |
1189 | } | |
1190 | ||
1191 | if (src_nid == LNET_NID_ANY) { | |
1192 | src_ni = NULL; | |
1193 | } else { | |
1194 | src_ni = lnet_nid2ni_locked(src_nid, cpt); | |
1195 | if (src_ni == NULL) { | |
1196 | lnet_net_unlock(cpt); | |
1197 | LCONSOLE_WARN("Can't send to %s: src %s is not a " | |
1198 | "local nid\n", libcfs_nid2str(dst_nid), | |
1199 | libcfs_nid2str(src_nid)); | |
1200 | return -EINVAL; | |
1201 | } | |
af66a6e2 | 1202 | LASSERT(!msg->msg_routing); |
d7e09d03 PT |
1203 | } |
1204 | ||
1205 | /* Is this for someone on a local network? */ | |
1206 | local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt); | |
1207 | ||
1208 | if (local_ni != NULL) { | |
1209 | if (src_ni == NULL) { | |
1210 | src_ni = local_ni; | |
1211 | src_nid = src_ni->ni_nid; | |
1212 | } else if (src_ni == local_ni) { | |
1213 | lnet_ni_decref_locked(local_ni, cpt); | |
1214 | } else { | |
1215 | lnet_ni_decref_locked(local_ni, cpt); | |
1216 | lnet_ni_decref_locked(src_ni, cpt); | |
1217 | lnet_net_unlock(cpt); | |
1218 | LCONSOLE_WARN("No route to %s via from %s\n", | |
1219 | libcfs_nid2str(dst_nid), | |
1220 | libcfs_nid2str(src_nid)); | |
1221 | return -EINVAL; | |
1222 | } | |
1223 | ||
1224 | LASSERT(src_nid != LNET_NID_ANY); | |
1225 | lnet_msg_commit(msg, cpt); | |
1226 | ||
1227 | if (!msg->msg_routing) | |
1228 | msg->msg_hdr.src_nid = cpu_to_le64(src_nid); | |
1229 | ||
1230 | if (src_ni == the_lnet.ln_loni) { | |
1231 | /* No send credit hassles with LOLND */ | |
1232 | lnet_net_unlock(cpt); | |
1233 | lnet_ni_send(src_ni, msg); | |
1234 | ||
1235 | lnet_net_lock(cpt); | |
1236 | lnet_ni_decref_locked(src_ni, cpt); | |
1237 | lnet_net_unlock(cpt); | |
1238 | return 0; | |
1239 | } | |
1240 | ||
1241 | rc = lnet_nid2peer_locked(&lp, dst_nid, cpt); | |
1242 | /* lp has ref on src_ni; lose mine */ | |
1243 | lnet_ni_decref_locked(src_ni, cpt); | |
1244 | if (rc != 0) { | |
1245 | lnet_net_unlock(cpt); | |
1246 | LCONSOLE_WARN("Error %d finding peer %s\n", rc, | |
1247 | libcfs_nid2str(dst_nid)); | |
1248 | /* ENOMEM or shutting down */ | |
1249 | return rc; | |
1250 | } | |
af66a6e2 | 1251 | LASSERT(lp->lp_ni == src_ni); |
d7e09d03 PT |
1252 | } else { |
1253 | /* sending to a remote network */ | |
1254 | lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid); | |
1255 | if (lp == NULL) { | |
1256 | if (src_ni != NULL) | |
1257 | lnet_ni_decref_locked(src_ni, cpt); | |
1258 | lnet_net_unlock(cpt); | |
1259 | ||
1260 | LCONSOLE_WARN("No route to %s via %s " | |
1261 | "(all routers down)\n", | |
1262 | libcfs_id2str(msg->msg_target), | |
1263 | libcfs_nid2str(src_nid)); | |
1264 | return -EHOSTUNREACH; | |
1265 | } | |
1266 | ||
1267 | /* rtr_nid is LNET_NID_ANY or NID of pre-determined router, | |
1268 | * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't | |
1269 | * pre-determined router, this can happen if router table | |
1270 | * was changed when we release the lock */ | |
1271 | if (rtr_nid != lp->lp_nid) { | |
1272 | cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid); | |
1273 | if (cpt2 != cpt) { | |
1274 | if (src_ni != NULL) | |
1275 | lnet_ni_decref_locked(src_ni, cpt); | |
1276 | lnet_net_unlock(cpt); | |
1277 | ||
1278 | rtr_nid = lp->lp_nid; | |
1279 | cpt = cpt2; | |
1280 | goto again; | |
1281 | } | |
1282 | } | |
1283 | ||
1284 | CDEBUG(D_NET, "Best route to %s via %s for %s %d\n", | |
1285 | libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid), | |
1286 | lnet_msgtyp2str(msg->msg_type), msg->msg_len); | |
1287 | ||
1288 | if (src_ni == NULL) { | |
1289 | src_ni = lp->lp_ni; | |
1290 | src_nid = src_ni->ni_nid; | |
1291 | } else { | |
af66a6e2 | 1292 | LASSERT(src_ni == lp->lp_ni); |
d7e09d03 PT |
1293 | lnet_ni_decref_locked(src_ni, cpt); |
1294 | } | |
1295 | ||
1296 | lnet_peer_addref_locked(lp); | |
1297 | ||
1298 | LASSERT(src_nid != LNET_NID_ANY); | |
1299 | lnet_msg_commit(msg, cpt); | |
1300 | ||
1301 | if (!msg->msg_routing) { | |
1302 | /* I'm the source and now I know which NI to send on */ | |
1303 | msg->msg_hdr.src_nid = cpu_to_le64(src_nid); | |
1304 | } | |
1305 | ||
1306 | msg->msg_target_is_router = 1; | |
1307 | msg->msg_target.nid = lp->lp_nid; | |
1308 | msg->msg_target.pid = LUSTRE_SRV_LNET_PID; | |
1309 | } | |
1310 | ||
1311 | /* 'lp' is our best choice of peer */ | |
1312 | ||
af66a6e2 LN |
1313 | LASSERT(!msg->msg_peertxcredit); |
1314 | LASSERT(!msg->msg_txcredit); | |
1315 | LASSERT(msg->msg_txpeer == NULL); | |
d7e09d03 PT |
1316 | |
1317 | msg->msg_txpeer = lp; /* msg takes my ref on lp */ | |
1318 | ||
1319 | rc = lnet_post_send_locked(msg, 0); | |
1320 | lnet_net_unlock(cpt); | |
1321 | ||
1322 | if (rc == EHOSTUNREACH) | |
1323 | return -EHOSTUNREACH; | |
1324 | ||
1325 | if (rc == 0) | |
1326 | lnet_ni_send(src_ni, msg); | |
1327 | ||
1328 | return 0; | |
1329 | } | |
1330 | ||
1331 | static void | |
1332 | lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob) | |
1333 | { | |
1334 | lnet_net_lock(cpt); | |
1335 | the_lnet.ln_counters[cpt]->drop_count++; | |
1336 | the_lnet.ln_counters[cpt]->drop_length += nob; | |
1337 | lnet_net_unlock(cpt); | |
1338 | ||
1339 | lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); | |
1340 | } | |
1341 | ||
1342 | static void | |
1343 | lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg) | |
1344 | { | |
1345 | lnet_hdr_t *hdr = &msg->msg_hdr; | |
1346 | ||
1347 | if (msg->msg_wanted != 0) | |
1348 | lnet_setpayloadbuffer(msg); | |
1349 | ||
1350 | lnet_build_msg_event(msg, LNET_EVENT_PUT); | |
1351 | ||
1352 | /* Must I ACK? If so I'll grab the ack_wmd out of the header and put | |
1353 | * it back into the ACK during lnet_finalize() */ | |
1354 | msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && | |
1355 | (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0); | |
1356 | ||
1357 | lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed, | |
1358 | msg->msg_offset, msg->msg_wanted, hdr->payload_length); | |
1359 | } | |
1360 | ||
1361 | static int | |
1362 | lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) | |
1363 | { | |
1364 | lnet_hdr_t *hdr = &msg->msg_hdr; | |
1365 | struct lnet_match_info info; | |
1366 | int rc; | |
1367 | ||
1368 | /* Convert put fields to host byte order */ | |
1369 | hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits); | |
1370 | hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); | |
1371 | hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); | |
1372 | ||
1373 | info.mi_id.nid = hdr->src_nid; | |
1374 | info.mi_id.pid = hdr->src_pid; | |
1375 | info.mi_opc = LNET_MD_OP_PUT; | |
1376 | info.mi_portal = hdr->msg.put.ptl_index; | |
1377 | info.mi_rlength = hdr->payload_length; | |
1378 | info.mi_roffset = hdr->msg.put.offset; | |
1379 | info.mi_mbits = hdr->msg.put.match_bits; | |
1380 | ||
1381 | msg->msg_rx_ready_delay = ni->ni_lnd->lnd_eager_recv == NULL; | |
1382 | ||
1383 | again: | |
1384 | rc = lnet_ptl_match_md(&info, msg); | |
1385 | switch (rc) { | |
1386 | default: | |
1387 | LBUG(); | |
1388 | ||
1389 | case LNET_MATCHMD_OK: | |
1390 | lnet_recv_put(ni, msg); | |
1391 | return 0; | |
1392 | ||
1393 | case LNET_MATCHMD_NONE: | |
1394 | if (msg->msg_rx_delayed) /* attached on delayed list */ | |
1395 | return 0; | |
1396 | ||
1397 | rc = lnet_ni_eager_recv(ni, msg); | |
1398 | if (rc == 0) | |
1399 | goto again; | |
1400 | /* fall through */ | |
1401 | ||
1402 | case LNET_MATCHMD_DROP: | |
1403 | CNETERR("Dropping PUT from %s portal %d match "LPU64 | |
1404 | " offset %d length %d: %d\n", | |
1405 | libcfs_id2str(info.mi_id), info.mi_portal, | |
1406 | info.mi_mbits, info.mi_roffset, info.mi_rlength, rc); | |
1407 | ||
1408 | return ENOENT; /* +ve: OK but no match */ | |
1409 | } | |
1410 | } | |
1411 | ||
1412 | static int | |
1413 | lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) | |
1414 | { | |
1415 | struct lnet_match_info info; | |
1416 | lnet_hdr_t *hdr = &msg->msg_hdr; | |
1417 | lnet_handle_wire_t reply_wmd; | |
1418 | int rc; | |
1419 | ||
1420 | /* Convert get fields to host byte order */ | |
1421 | hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits); | |
1422 | hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index); | |
1423 | hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); | |
1424 | hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); | |
1425 | ||
1426 | info.mi_id.nid = hdr->src_nid; | |
1427 | info.mi_id.pid = hdr->src_pid; | |
1428 | info.mi_opc = LNET_MD_OP_GET; | |
1429 | info.mi_portal = hdr->msg.get.ptl_index; | |
1430 | info.mi_rlength = hdr->msg.get.sink_length; | |
1431 | info.mi_roffset = hdr->msg.get.src_offset; | |
1432 | info.mi_mbits = hdr->msg.get.match_bits; | |
1433 | ||
1434 | rc = lnet_ptl_match_md(&info, msg); | |
1435 | if (rc == LNET_MATCHMD_DROP) { | |
1436 | CNETERR("Dropping GET from %s portal %d match "LPU64 | |
1437 | " offset %d length %d\n", | |
1438 | libcfs_id2str(info.mi_id), info.mi_portal, | |
1439 | info.mi_mbits, info.mi_roffset, info.mi_rlength); | |
1440 | return ENOENT; /* +ve: OK but no match */ | |
1441 | } | |
1442 | ||
1443 | LASSERT(rc == LNET_MATCHMD_OK); | |
1444 | ||
1445 | lnet_build_msg_event(msg, LNET_EVENT_GET); | |
1446 | ||
1447 | reply_wmd = hdr->msg.get.return_wmd; | |
1448 | ||
1449 | lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id, | |
1450 | msg->msg_offset, msg->msg_wanted); | |
1451 | ||
1452 | msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; | |
1453 | ||
1454 | if (rdma_get) { | |
1455 | /* The LND completes the REPLY from her recv procedure */ | |
1456 | lnet_ni_recv(ni, msg->msg_private, msg, 0, | |
1457 | msg->msg_offset, msg->msg_len, msg->msg_len); | |
1458 | return 0; | |
1459 | } | |
1460 | ||
1461 | lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); | |
1462 | msg->msg_receiving = 0; | |
1463 | ||
1464 | rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY); | |
1465 | if (rc < 0) { | |
1466 | /* didn't get as far as lnet_ni_send() */ | |
1467 | CERROR("%s: Unable to send REPLY for GET from %s: %d\n", | |
1468 | libcfs_nid2str(ni->ni_nid), | |
1469 | libcfs_id2str(info.mi_id), rc); | |
1470 | ||
1471 | lnet_finalize(ni, msg, rc); | |
1472 | } | |
1473 | ||
1474 | return 0; | |
1475 | } | |
1476 | ||
1477 | static int | |
1478 | lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg) | |
1479 | { | |
1480 | void *private = msg->msg_private; | |
1481 | lnet_hdr_t *hdr = &msg->msg_hdr; | |
1482 | lnet_process_id_t src = {0}; | |
1483 | lnet_libmd_t *md; | |
1484 | int rlength; | |
1485 | int mlength; | |
1486 | int cpt; | |
1487 | ||
1488 | cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); | |
1489 | lnet_res_lock(cpt); | |
1490 | ||
1491 | src.nid = hdr->src_nid; | |
1492 | src.pid = hdr->src_pid; | |
1493 | ||
1494 | /* NB handles only looked up by creator (no flips) */ | |
1495 | md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); | |
1496 | if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { | |
1497 | CNETERR("%s: Dropping REPLY from %s for %s " | |
1498 | "MD "LPX64"."LPX64"\n", | |
1499 | libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), | |
1500 | (md == NULL) ? "invalid" : "inactive", | |
1501 | hdr->msg.reply.dst_wmd.wh_interface_cookie, | |
1502 | hdr->msg.reply.dst_wmd.wh_object_cookie); | |
1503 | if (md != NULL && md->md_me != NULL) | |
1504 | CERROR("REPLY MD also attached to portal %d\n", | |
1505 | md->md_me->me_portal); | |
1506 | ||
1507 | lnet_res_unlock(cpt); | |
1508 | return ENOENT; /* +ve: OK but no match */ | |
1509 | } | |
1510 | ||
af66a6e2 | 1511 | LASSERT(md->md_offset == 0); |
d7e09d03 PT |
1512 | |
1513 | rlength = hdr->payload_length; | |
1514 | mlength = MIN(rlength, (int)md->md_length); | |
1515 | ||
1516 | if (mlength < rlength && | |
1517 | (md->md_options & LNET_MD_TRUNCATE) == 0) { | |
1518 | CNETERR("%s: Dropping REPLY from %s length %d " | |
1519 | "for MD "LPX64" would overflow (%d)\n", | |
1520 | libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), | |
1521 | rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, | |
1522 | mlength); | |
1523 | lnet_res_unlock(cpt); | |
1524 | return ENOENT; /* +ve: OK but no match */ | |
1525 | } | |
1526 | ||
1527 | CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md "LPX64"\n", | |
1528 | libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), | |
1529 | mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); | |
1530 | ||
1531 | lnet_msg_attach_md(msg, md, 0, mlength); | |
1532 | ||
1533 | if (mlength != 0) | |
1534 | lnet_setpayloadbuffer(msg); | |
1535 | ||
1536 | lnet_res_unlock(cpt); | |
1537 | ||
1538 | lnet_build_msg_event(msg, LNET_EVENT_REPLY); | |
1539 | ||
1540 | lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength); | |
1541 | return 0; | |
1542 | } | |
1543 | ||
1544 | static int | |
1545 | lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg) | |
1546 | { | |
1547 | lnet_hdr_t *hdr = &msg->msg_hdr; | |
1548 | lnet_process_id_t src = {0}; | |
1549 | lnet_libmd_t *md; | |
1550 | int cpt; | |
1551 | ||
1552 | src.nid = hdr->src_nid; | |
1553 | src.pid = hdr->src_pid; | |
1554 | ||
1555 | /* Convert ack fields to host byte order */ | |
1556 | hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); | |
1557 | hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); | |
1558 | ||
1559 | cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie); | |
1560 | lnet_res_lock(cpt); | |
1561 | ||
1562 | /* NB handles only looked up by creator (no flips) */ | |
1563 | md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd); | |
1564 | if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { | |
1565 | /* Don't moan; this is expected */ | |
1566 | CDEBUG(D_NET, | |
1567 | "%s: Dropping ACK from %s to %s MD "LPX64"."LPX64"\n", | |
1568 | libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), | |
1569 | (md == NULL) ? "invalid" : "inactive", | |
1570 | hdr->msg.ack.dst_wmd.wh_interface_cookie, | |
1571 | hdr->msg.ack.dst_wmd.wh_object_cookie); | |
1572 | if (md != NULL && md->md_me != NULL) | |
1573 | CERROR("Source MD also attached to portal %d\n", | |
1574 | md->md_me->me_portal); | |
1575 | ||
1576 | lnet_res_unlock(cpt); | |
1577 | return ENOENT; /* +ve! */ | |
1578 | } | |
1579 | ||
1580 | CDEBUG(D_NET, "%s: ACK from %s into md "LPX64"\n", | |
1581 | libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), | |
1582 | hdr->msg.ack.dst_wmd.wh_object_cookie); | |
1583 | ||
1584 | lnet_msg_attach_md(msg, md, 0, 0); | |
1585 | ||
1586 | lnet_res_unlock(cpt); | |
1587 | ||
1588 | lnet_build_msg_event(msg, LNET_EVENT_ACK); | |
1589 | ||
1590 | lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len); | |
1591 | return 0; | |
1592 | } | |
1593 | ||
1594 | static int | |
1595 | lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg) | |
1596 | { | |
1597 | int rc = 0; | |
1598 | ||
1599 | if (msg->msg_rxpeer->lp_rtrcredits <= 0 || | |
1600 | lnet_msg2bufpool(msg)->rbp_credits <= 0) { | |
1601 | if (ni->ni_lnd->lnd_eager_recv == NULL) { | |
1602 | msg->msg_rx_ready_delay = 1; | |
1603 | } else { | |
1604 | lnet_net_unlock(msg->msg_rx_cpt); | |
1605 | rc = lnet_ni_eager_recv(ni, msg); | |
1606 | lnet_net_lock(msg->msg_rx_cpt); | |
1607 | } | |
1608 | } | |
1609 | ||
1610 | if (rc == 0) | |
1611 | rc = lnet_post_routed_recv_locked(msg, 0); | |
1612 | return rc; | |
1613 | } | |
1614 | ||
1615 | char * | |
af66a6e2 | 1616 | lnet_msgtyp2str(int type) |
d7e09d03 PT |
1617 | { |
1618 | switch (type) { | |
1619 | case LNET_MSG_ACK: | |
1620 | return ("ACK"); | |
1621 | case LNET_MSG_PUT: | |
1622 | return ("PUT"); | |
1623 | case LNET_MSG_GET: | |
1624 | return ("GET"); | |
1625 | case LNET_MSG_REPLY: | |
1626 | return ("REPLY"); | |
1627 | case LNET_MSG_HELLO: | |
1628 | return ("HELLO"); | |
1629 | default: | |
1630 | return ("<UNKNOWN>"); | |
1631 | } | |
1632 | } | |
1633 | EXPORT_SYMBOL(lnet_msgtyp2str); | |
1634 | ||
1635 | void | |
1636 | lnet_print_hdr(lnet_hdr_t * hdr) | |
1637 | { | |
1638 | lnet_process_id_t src = {0}; | |
1639 | lnet_process_id_t dst = {0}; | |
af66a6e2 | 1640 | char *type_str = lnet_msgtyp2str(hdr->type); |
d7e09d03 PT |
1641 | |
1642 | src.nid = hdr->src_nid; | |
1643 | src.pid = hdr->src_pid; | |
1644 | ||
1645 | dst.nid = hdr->dest_nid; | |
1646 | dst.pid = hdr->dest_pid; | |
1647 | ||
1648 | CWARN("P3 Header at %p of type %s\n", hdr, type_str); | |
1649 | CWARN(" From %s\n", libcfs_id2str(src)); | |
1650 | CWARN(" To %s\n", libcfs_id2str(dst)); | |
1651 | ||
1652 | switch (hdr->type) { | |
1653 | default: | |
1654 | break; | |
1655 | ||
1656 | case LNET_MSG_PUT: | |
1657 | CWARN(" Ptl index %d, ack md "LPX64"."LPX64", " | |
1658 | "match bits "LPU64"\n", | |
1659 | hdr->msg.put.ptl_index, | |
1660 | hdr->msg.put.ack_wmd.wh_interface_cookie, | |
1661 | hdr->msg.put.ack_wmd.wh_object_cookie, | |
1662 | hdr->msg.put.match_bits); | |
1663 | CWARN(" Length %d, offset %d, hdr data "LPX64"\n", | |
1664 | hdr->payload_length, hdr->msg.put.offset, | |
1665 | hdr->msg.put.hdr_data); | |
1666 | break; | |
1667 | ||
1668 | case LNET_MSG_GET: | |
1669 | CWARN(" Ptl index %d, return md "LPX64"."LPX64", " | |
1670 | "match bits "LPU64"\n", hdr->msg.get.ptl_index, | |
1671 | hdr->msg.get.return_wmd.wh_interface_cookie, | |
1672 | hdr->msg.get.return_wmd.wh_object_cookie, | |
1673 | hdr->msg.get.match_bits); | |
1674 | CWARN(" Length %d, src offset %d\n", | |
1675 | hdr->msg.get.sink_length, | |
1676 | hdr->msg.get.src_offset); | |
1677 | break; | |
1678 | ||
1679 | case LNET_MSG_ACK: | |
1680 | CWARN(" dst md "LPX64"."LPX64", " | |
1681 | "manipulated length %d\n", | |
1682 | hdr->msg.ack.dst_wmd.wh_interface_cookie, | |
1683 | hdr->msg.ack.dst_wmd.wh_object_cookie, | |
1684 | hdr->msg.ack.mlength); | |
1685 | break; | |
1686 | ||
1687 | case LNET_MSG_REPLY: | |
1688 | CWARN(" dst md "LPX64"."LPX64", " | |
1689 | "length %d\n", | |
1690 | hdr->msg.reply.dst_wmd.wh_interface_cookie, | |
1691 | hdr->msg.reply.dst_wmd.wh_object_cookie, | |
1692 | hdr->payload_length); | |
1693 | } | |
1694 | ||
1695 | } | |
1696 | ||
1697 | int | |
1698 | lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, | |
1699 | void *private, int rdma_req) | |
1700 | { | |
1701 | int rc = 0; | |
1702 | int cpt; | |
1703 | int for_me; | |
1704 | struct lnet_msg *msg; | |
1705 | lnet_pid_t dest_pid; | |
1706 | lnet_nid_t dest_nid; | |
1707 | lnet_nid_t src_nid; | |
1708 | __u32 payload_length; | |
1709 | __u32 type; | |
1710 | ||
af66a6e2 | 1711 | LASSERT(!in_interrupt()); |
d7e09d03 PT |
1712 | |
1713 | type = le32_to_cpu(hdr->type); | |
1714 | src_nid = le64_to_cpu(hdr->src_nid); | |
1715 | dest_nid = le64_to_cpu(hdr->dest_nid); | |
1716 | dest_pid = le32_to_cpu(hdr->dest_pid); | |
1717 | payload_length = le32_to_cpu(hdr->payload_length); | |
1718 | ||
1719 | for_me = (ni->ni_nid == dest_nid); | |
1720 | cpt = lnet_cpt_of_nid(from_nid); | |
1721 | ||
1722 | switch (type) { | |
1723 | case LNET_MSG_ACK: | |
1724 | case LNET_MSG_GET: | |
1725 | if (payload_length > 0) { | |
1726 | CERROR("%s, src %s: bad %s payload %d (0 expected)\n", | |
1727 | libcfs_nid2str(from_nid), | |
1728 | libcfs_nid2str(src_nid), | |
1729 | lnet_msgtyp2str(type), payload_length); | |
1730 | return -EPROTO; | |
1731 | } | |
1732 | break; | |
1733 | ||
1734 | case LNET_MSG_PUT: | |
1735 | case LNET_MSG_REPLY: | |
1736 | if (payload_length > (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) { | |
1737 | CERROR("%s, src %s: bad %s payload %d " | |
1738 | "(%d max expected)\n", | |
1739 | libcfs_nid2str(from_nid), | |
1740 | libcfs_nid2str(src_nid), | |
1741 | lnet_msgtyp2str(type), | |
1742 | payload_length, | |
1743 | for_me ? LNET_MAX_PAYLOAD : LNET_MTU); | |
1744 | return -EPROTO; | |
1745 | } | |
1746 | break; | |
1747 | ||
1748 | default: | |
1749 | CERROR("%s, src %s: Bad message type 0x%x\n", | |
1750 | libcfs_nid2str(from_nid), | |
1751 | libcfs_nid2str(src_nid), type); | |
1752 | return -EPROTO; | |
1753 | } | |
1754 | ||
1755 | if (the_lnet.ln_routing && | |
1756 | ni->ni_last_alive != cfs_time_current_sec()) { | |
1757 | lnet_ni_lock(ni); | |
1758 | ||
1759 | /* NB: so far here is the only place to set NI status to "up */ | |
1760 | ni->ni_last_alive = cfs_time_current_sec(); | |
1761 | if (ni->ni_status != NULL && | |
1762 | ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) | |
1763 | ni->ni_status->ns_status = LNET_NI_STATUS_UP; | |
1764 | lnet_ni_unlock(ni); | |
1765 | } | |
1766 | ||
1767 | /* Regard a bad destination NID as a protocol error. Senders should | |
1768 | * know what they're doing; if they don't they're misconfigured, buggy | |
1769 | * or malicious so we chop them off at the knees :) */ | |
1770 | ||
1771 | if (!for_me) { | |
1772 | if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) { | |
1773 | /* should have gone direct */ | |
af66a6e2 | 1774 | CERROR("%s, src %s: Bad dest nid %s " |
d7e09d03 PT |
1775 | "(should have been sent direct)\n", |
1776 | libcfs_nid2str(from_nid), | |
1777 | libcfs_nid2str(src_nid), | |
1778 | libcfs_nid2str(dest_nid)); | |
1779 | return -EPROTO; | |
1780 | } | |
1781 | ||
1782 | if (lnet_islocalnid(dest_nid)) { | |
1783 | /* dest is another local NI; sender should have used | |
1784 | * this node's NID on its own network */ | |
af66a6e2 | 1785 | CERROR("%s, src %s: Bad dest nid %s " |
d7e09d03 PT |
1786 | "(it's my nid but on a different network)\n", |
1787 | libcfs_nid2str(from_nid), | |
1788 | libcfs_nid2str(src_nid), | |
1789 | libcfs_nid2str(dest_nid)); | |
1790 | return -EPROTO; | |
1791 | } | |
1792 | ||
1793 | if (rdma_req && type == LNET_MSG_GET) { | |
af66a6e2 | 1794 | CERROR("%s, src %s: Bad optimized GET for %s " |
d7e09d03 PT |
1795 | "(final destination must be me)\n", |
1796 | libcfs_nid2str(from_nid), | |
1797 | libcfs_nid2str(src_nid), | |
1798 | libcfs_nid2str(dest_nid)); | |
1799 | return -EPROTO; | |
1800 | } | |
1801 | ||
1802 | if (!the_lnet.ln_routing) { | |
af66a6e2 | 1803 | CERROR("%s, src %s: Dropping message for %s " |
d7e09d03 PT |
1804 | "(routing not enabled)\n", |
1805 | libcfs_nid2str(from_nid), | |
1806 | libcfs_nid2str(src_nid), | |
1807 | libcfs_nid2str(dest_nid)); | |
1808 | goto drop; | |
1809 | } | |
1810 | } | |
1811 | ||
1812 | /* Message looks OK; we're not going to return an error, so we MUST | |
1813 | * call back lnd_recv() come what may... */ | |
1814 | ||
af66a6e2 | 1815 | if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ |
9b79ca85 | 1816 | fail_peer(src_nid, 0)) { /* shall we now? */ |
d7e09d03 PT |
1817 | CERROR("%s, src %s: Dropping %s to simulate failure\n", |
1818 | libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), | |
1819 | lnet_msgtyp2str(type)); | |
1820 | goto drop; | |
1821 | } | |
1822 | ||
1823 | msg = lnet_msg_alloc(); | |
1824 | if (msg == NULL) { | |
1825 | CERROR("%s, src %s: Dropping %s (out of memory)\n", | |
1826 | libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), | |
1827 | lnet_msgtyp2str(type)); | |
1828 | goto drop; | |
1829 | } | |
1830 | ||
1831 | /* msg zeroed in lnet_msg_alloc; i.e. flags all clear, pointers NULL etc */ | |
1832 | ||
1833 | msg->msg_type = type; | |
1834 | msg->msg_private = private; | |
1835 | msg->msg_receiving = 1; | |
1836 | msg->msg_len = msg->msg_wanted = payload_length; | |
1837 | msg->msg_offset = 0; | |
1838 | msg->msg_hdr = *hdr; | |
1839 | /* for building message event */ | |
1840 | msg->msg_from = from_nid; | |
1841 | if (!for_me) { | |
1842 | msg->msg_target.pid = dest_pid; | |
1843 | msg->msg_target.nid = dest_nid; | |
1844 | msg->msg_routing = 1; | |
1845 | ||
1846 | } else { | |
1847 | /* convert common msg->hdr fields to host byteorder */ | |
1848 | msg->msg_hdr.type = type; | |
1849 | msg->msg_hdr.src_nid = src_nid; | |
1850 | msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid); | |
1851 | msg->msg_hdr.dest_nid = dest_nid; | |
1852 | msg->msg_hdr.dest_pid = dest_pid; | |
1853 | msg->msg_hdr.payload_length = payload_length; | |
1854 | } | |
1855 | ||
1856 | lnet_net_lock(cpt); | |
1857 | rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt); | |
1858 | if (rc != 0) { | |
1859 | lnet_net_unlock(cpt); | |
1860 | CERROR("%s, src %s: Dropping %s " | |
1861 | "(error %d looking up sender)\n", | |
1862 | libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), | |
1863 | lnet_msgtyp2str(type), rc); | |
1864 | lnet_msg_free(msg); | |
1865 | goto drop; | |
1866 | } | |
1867 | ||
1868 | lnet_msg_commit(msg, cpt); | |
1869 | ||
1870 | if (!for_me) { | |
1871 | rc = lnet_parse_forward_locked(ni, msg); | |
1872 | lnet_net_unlock(cpt); | |
1873 | ||
1874 | if (rc < 0) | |
1875 | goto free_drop; | |
1876 | if (rc == 0) { | |
1877 | lnet_ni_recv(ni, msg->msg_private, msg, 0, | |
1878 | 0, payload_length, payload_length); | |
1879 | } | |
1880 | return 0; | |
1881 | } | |
1882 | ||
1883 | lnet_net_unlock(cpt); | |
1884 | ||
1885 | switch (type) { | |
1886 | case LNET_MSG_ACK: | |
1887 | rc = lnet_parse_ack(ni, msg); | |
1888 | break; | |
1889 | case LNET_MSG_PUT: | |
1890 | rc = lnet_parse_put(ni, msg); | |
1891 | break; | |
1892 | case LNET_MSG_GET: | |
1893 | rc = lnet_parse_get(ni, msg, rdma_req); | |
1894 | break; | |
1895 | case LNET_MSG_REPLY: | |
1896 | rc = lnet_parse_reply(ni, msg); | |
1897 | break; | |
1898 | default: | |
1899 | LASSERT(0); | |
1900 | rc = -EPROTO; | |
1901 | goto free_drop; /* prevent an unused label if !kernel */ | |
1902 | } | |
1903 | ||
1904 | if (rc == 0) | |
1905 | return 0; | |
1906 | ||
af66a6e2 | 1907 | LASSERT(rc == ENOENT); |
d7e09d03 PT |
1908 | |
1909 | free_drop: | |
1910 | LASSERT(msg->msg_md == NULL); | |
1911 | lnet_finalize(ni, msg, rc); | |
1912 | ||
1913 | drop: | |
1914 | lnet_drop_message(ni, cpt, private, payload_length); | |
1915 | return 0; | |
1916 | } | |
1917 | EXPORT_SYMBOL(lnet_parse); | |
1918 | ||
1919 | void | |
1920 | lnet_drop_delayed_msg_list(struct list_head *head, char *reason) | |
1921 | { | |
1922 | while (!list_empty(head)) { | |
1923 | lnet_process_id_t id = {0}; | |
1924 | lnet_msg_t *msg; | |
1925 | ||
1926 | msg = list_entry(head->next, lnet_msg_t, msg_list); | |
1927 | list_del(&msg->msg_list); | |
1928 | ||
1929 | id.nid = msg->msg_hdr.src_nid; | |
1930 | id.pid = msg->msg_hdr.src_pid; | |
1931 | ||
1932 | LASSERT(msg->msg_md == NULL); | |
1933 | LASSERT(msg->msg_rx_delayed); | |
1934 | LASSERT(msg->msg_rxpeer != NULL); | |
1935 | LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); | |
1936 | ||
1937 | CWARN("Dropping delayed PUT from %s portal %d match "LPU64 | |
1938 | " offset %d length %d: %s\n", | |
1939 | libcfs_id2str(id), | |
1940 | msg->msg_hdr.msg.put.ptl_index, | |
1941 | msg->msg_hdr.msg.put.match_bits, | |
1942 | msg->msg_hdr.msg.put.offset, | |
1943 | msg->msg_hdr.payload_length, reason); | |
1944 | ||
1945 | /* NB I can't drop msg's ref on msg_rxpeer until after I've | |
1946 | * called lnet_drop_message(), so I just hang onto msg as well | |
1947 | * until that's done */ | |
1948 | ||
1949 | lnet_drop_message(msg->msg_rxpeer->lp_ni, | |
1950 | msg->msg_rxpeer->lp_cpt, | |
1951 | msg->msg_private, msg->msg_len); | |
1952 | /* | |
1953 | * NB: message will not generate event because w/o attached MD, | |
1954 | * but we still should give error code so lnet_msg_decommit() | |
1955 | * can skip counters operations and other checks. | |
1956 | */ | |
1957 | lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT); | |
1958 | } | |
1959 | } | |
1960 | ||
1961 | void | |
1962 | lnet_recv_delayed_msg_list(struct list_head *head) | |
1963 | { | |
1964 | while (!list_empty(head)) { | |
1965 | lnet_msg_t *msg; | |
1966 | lnet_process_id_t id; | |
1967 | ||
1968 | msg = list_entry(head->next, lnet_msg_t, msg_list); | |
1969 | list_del(&msg->msg_list); | |
1970 | ||
1971 | /* md won't disappear under me, since each msg | |
1972 | * holds a ref on it */ | |
1973 | ||
1974 | id.nid = msg->msg_hdr.src_nid; | |
1975 | id.pid = msg->msg_hdr.src_pid; | |
1976 | ||
1977 | LASSERT(msg->msg_rx_delayed); | |
1978 | LASSERT(msg->msg_md != NULL); | |
1979 | LASSERT(msg->msg_rxpeer != NULL); | |
1980 | LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); | |
1981 | ||
1982 | CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " | |
1983 | "match "LPU64" offset %d length %d.\n", | |
1984 | libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index, | |
1985 | msg->msg_hdr.msg.put.match_bits, | |
1986 | msg->msg_hdr.msg.put.offset, | |
1987 | msg->msg_hdr.payload_length); | |
1988 | ||
1989 | lnet_recv_put(msg->msg_rxpeer->lp_ni, msg); | |
1990 | } | |
1991 | } | |
1992 | ||
1993 | /** | |
1994 | * Initiate an asynchronous PUT operation. | |
1995 | * | |
1996 | * There are several events associated with a PUT: completion of the send on | |
1997 | * the initiator node (LNET_EVENT_SEND), and when the send completes | |
1998 | * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating | |
1999 | * that the operation was accepted by the target. The event LNET_EVENT_PUT is | |
2000 | * used at the target node to indicate the completion of incoming data | |
2001 | * delivery. | |
2002 | * | |
2003 | * The local events will be logged in the EQ associated with the MD pointed to | |
2004 | * by \a mdh handle. Using a MD without an associated EQ results in these | |
2005 | * events being discarded. In this case, the caller must have another | |
2006 | * mechanism (e.g., a higher level protocol) for determining when it is safe | |
2007 | * to modify the memory region associated with the MD. | |
2008 | * | |
2009 | * Note that LNet does not guarantee the order of LNET_EVENT_SEND and | |
2010 | * LNET_EVENT_ACK, though intuitively ACK should happen after SEND. | |
2011 | * | |
2012 | * \param self Indicates the NID of a local interface through which to send | |
2013 | * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself. | |
2014 | * \param mdh A handle for the MD that describes the memory to be sent. The MD | |
2015 | * must be "free floating" (See LNetMDBind()). | |
2016 | * \param ack Controls whether an acknowledgment is requested. | |
2017 | * Acknowledgments are only sent when they are requested by the initiating | |
2018 | * process and the target MD enables them. | |
2019 | * \param target A process identifier for the target process. | |
2020 | * \param portal The index in the \a target's portal table. | |
2021 | * \param match_bits The match bits to use for MD selection at the target | |
2022 | * process. | |
2023 | * \param offset The offset into the target MD (only used when the target | |
2024 | * MD has the LNET_MD_MANAGE_REMOTE option set). | |
2025 | * \param hdr_data 64 bits of user data that can be included in the message | |
2026 | * header. This data is written to an event queue entry at the target if an | |
2027 | * EQ is present on the matching MD. | |
2028 | * | |
2029 | * \retval 0 Success, and only in this case events will be generated | |
2030 | * and logged to EQ (if it exists). | |
2031 | * \retval -EIO Simulated failure. | |
2032 | * \retval -ENOMEM Memory allocation failure. | |
2033 | * \retval -ENOENT Invalid MD object. | |
2034 | * | |
2035 | * \see lnet_event_t::hdr_data and lnet_event_kind_t. | |
2036 | */ | |
2037 | int | |
2038 | LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, | |
2039 | lnet_process_id_t target, unsigned int portal, | |
2040 | __u64 match_bits, unsigned int offset, | |
2041 | __u64 hdr_data) | |
2042 | { | |
2043 | struct lnet_msg *msg; | |
2044 | struct lnet_libmd *md; | |
2045 | int cpt; | |
2046 | int rc; | |
2047 | ||
af66a6e2 LN |
2048 | LASSERT(the_lnet.ln_init); |
2049 | LASSERT(the_lnet.ln_refcount > 0); | |
d7e09d03 | 2050 | |
af66a6e2 | 2051 | if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ |
9b79ca85 | 2052 | fail_peer(target.nid, 1)) { /* shall we now? */ |
d7e09d03 PT |
2053 | CERROR("Dropping PUT to %s: simulated failure\n", |
2054 | libcfs_id2str(target)); | |
2055 | return -EIO; | |
2056 | } | |
2057 | ||
2058 | msg = lnet_msg_alloc(); | |
2059 | if (msg == NULL) { | |
2060 | CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n", | |
2061 | libcfs_id2str(target)); | |
2062 | return -ENOMEM; | |
2063 | } | |
2064 | msg->msg_vmflush = !!memory_pressure_get(); | |
2065 | ||
2066 | cpt = lnet_cpt_of_cookie(mdh.cookie); | |
2067 | lnet_res_lock(cpt); | |
2068 | ||
2069 | md = lnet_handle2md(&mdh); | |
2070 | if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { | |
2071 | CERROR("Dropping PUT ("LPU64":%d:%s): MD (%d) invalid\n", | |
2072 | match_bits, portal, libcfs_id2str(target), | |
2073 | md == NULL ? -1 : md->md_threshold); | |
2074 | if (md != NULL && md->md_me != NULL) | |
2075 | CERROR("Source MD also attached to portal %d\n", | |
2076 | md->md_me->me_portal); | |
2077 | lnet_res_unlock(cpt); | |
2078 | ||
2079 | lnet_msg_free(msg); | |
2080 | return -ENOENT; | |
2081 | } | |
2082 | ||
2083 | CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target)); | |
2084 | ||
2085 | lnet_msg_attach_md(msg, md, 0, 0); | |
2086 | ||
2087 | lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length); | |
2088 | ||
2089 | msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); | |
2090 | msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); | |
2091 | msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); | |
2092 | msg->msg_hdr.msg.put.hdr_data = hdr_data; | |
2093 | ||
2094 | /* NB handles only looked up by creator (no flips) */ | |
2095 | if (ack == LNET_ACK_REQ) { | |
2096 | msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = | |
2097 | the_lnet.ln_interface_cookie; | |
2098 | msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = | |
2099 | md->md_lh.lh_cookie; | |
2100 | } else { | |
2101 | msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = | |
2102 | LNET_WIRE_HANDLE_COOKIE_NONE; | |
2103 | msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = | |
2104 | LNET_WIRE_HANDLE_COOKIE_NONE; | |
2105 | } | |
2106 | ||
2107 | lnet_res_unlock(cpt); | |
2108 | ||
2109 | lnet_build_msg_event(msg, LNET_EVENT_SEND); | |
2110 | ||
2111 | rc = lnet_send(self, msg, LNET_NID_ANY); | |
2112 | if (rc != 0) { | |
af66a6e2 | 2113 | CNETERR("Error sending PUT to %s: %d\n", |
d7e09d03 | 2114 | libcfs_id2str(target), rc); |
af66a6e2 | 2115 | lnet_finalize(NULL, msg, rc); |
d7e09d03 PT |
2116 | } |
2117 | ||
2118 | /* completion will be signalled by an event */ | |
2119 | return 0; | |
2120 | } | |
2121 | EXPORT_SYMBOL(LNetPut); | |
2122 | ||
2123 | lnet_msg_t * | |
af66a6e2 | 2124 | lnet_create_reply_msg(lnet_ni_t *ni, lnet_msg_t *getmsg) |
d7e09d03 PT |
2125 | { |
2126 | /* The LND can DMA direct to the GET md (i.e. no REPLY msg). This | |
2127 | * returns a msg for the LND to pass to lnet_finalize() when the sink | |
2128 | * data has been received. | |
2129 | * | |
2130 | * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when | |
2131 | * lnet_finalize() is called on it, so the LND must call this first */ | |
2132 | ||
2133 | struct lnet_msg *msg = lnet_msg_alloc(); | |
2134 | struct lnet_libmd *getmd = getmsg->msg_md; | |
2135 | lnet_process_id_t peer_id = getmsg->msg_target; | |
2136 | int cpt; | |
2137 | ||
2138 | LASSERT(!getmsg->msg_target_is_router); | |
2139 | LASSERT(!getmsg->msg_routing); | |
2140 | ||
2141 | cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie); | |
2142 | lnet_res_lock(cpt); | |
2143 | ||
af66a6e2 | 2144 | LASSERT(getmd->md_refcount > 0); |
d7e09d03 PT |
2145 | |
2146 | if (msg == NULL) { | |
af66a6e2 | 2147 | CERROR("%s: Dropping REPLY from %s: can't allocate msg\n", |
d7e09d03 PT |
2148 | libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id)); |
2149 | goto drop; | |
2150 | } | |
2151 | ||
2152 | if (getmd->md_threshold == 0) { | |
af66a6e2 | 2153 | CERROR("%s: Dropping REPLY from %s for inactive MD %p\n", |
d7e09d03 PT |
2154 | libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), |
2155 | getmd); | |
2156 | lnet_res_unlock(cpt); | |
2157 | goto drop; | |
2158 | } | |
2159 | ||
2160 | LASSERT(getmd->md_offset == 0); | |
2161 | ||
2162 | CDEBUG(D_NET, "%s: Reply from %s md %p\n", | |
2163 | libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd); | |
2164 | ||
2165 | /* setup information for lnet_build_msg_event */ | |
2166 | msg->msg_from = peer_id.nid; | |
2167 | msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ | |
2168 | msg->msg_hdr.src_nid = peer_id.nid; | |
2169 | msg->msg_hdr.payload_length = getmd->md_length; | |
2170 | msg->msg_receiving = 1; /* required by lnet_msg_attach_md */ | |
2171 | ||
2172 | lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length); | |
2173 | lnet_res_unlock(cpt); | |
2174 | ||
2175 | cpt = lnet_cpt_of_nid(peer_id.nid); | |
2176 | ||
2177 | lnet_net_lock(cpt); | |
2178 | lnet_msg_commit(msg, cpt); | |
2179 | lnet_net_unlock(cpt); | |
2180 | ||
2181 | lnet_build_msg_event(msg, LNET_EVENT_REPLY); | |
2182 | ||
2183 | return msg; | |
2184 | ||
2185 | drop: | |
2186 | cpt = lnet_cpt_of_nid(peer_id.nid); | |
2187 | ||
2188 | lnet_net_lock(cpt); | |
2189 | the_lnet.ln_counters[cpt]->drop_count++; | |
2190 | the_lnet.ln_counters[cpt]->drop_length += getmd->md_length; | |
2191 | lnet_net_unlock(cpt); | |
2192 | ||
2193 | if (msg != NULL) | |
2194 | lnet_msg_free(msg); | |
2195 | ||
2196 | return NULL; | |
2197 | } | |
2198 | EXPORT_SYMBOL(lnet_create_reply_msg); | |
2199 | ||
2200 | void | |
2201 | lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len) | |
2202 | { | |
2203 | /* Set the REPLY length, now the RDMA that elides the REPLY message has | |
2204 | * completed and I know it. */ | |
af66a6e2 LN |
2205 | LASSERT(reply != NULL); |
2206 | LASSERT(reply->msg_type == LNET_MSG_GET); | |
2207 | LASSERT(reply->msg_ev.type == LNET_EVENT_REPLY); | |
d7e09d03 PT |
2208 | |
2209 | /* NB I trusted my peer to RDMA. If she tells me she's written beyond | |
2210 | * the end of my buffer, I might as well be dead. */ | |
af66a6e2 | 2211 | LASSERT(len <= reply->msg_ev.mlength); |
d7e09d03 PT |
2212 | |
2213 | reply->msg_ev.mlength = len; | |
2214 | } | |
2215 | EXPORT_SYMBOL(lnet_set_reply_msg_len); | |
2216 | ||
2217 | /** | |
2218 | * Initiate an asynchronous GET operation. | |
2219 | * | |
2220 | * On the initiator node, an LNET_EVENT_SEND is logged when the GET request | |
2221 | * is sent, and an LNET_EVENT_REPLY is logged when the data returned from | |
2222 | * the target node in the REPLY has been written to local MD. | |
2223 | * | |
2224 | * On the target node, an LNET_EVENT_GET is logged when the GET request | |
2225 | * arrives and is accepted into a MD. | |
2226 | * | |
2227 | * \param self,target,portal,match_bits,offset See the discussion in LNetPut(). | |
2228 | * \param mdh A handle for the MD that describes the memory into which the | |
2229 | * requested data will be received. The MD must be "free floating" (See LNetMDBind()). | |
2230 | * | |
2231 | * \retval 0 Success, and only in this case events will be generated | |
2232 | * and logged to EQ (if it exists) of the MD. | |
2233 | * \retval -EIO Simulated failure. | |
2234 | * \retval -ENOMEM Memory allocation failure. | |
2235 | * \retval -ENOENT Invalid MD object. | |
2236 | */ | |
2237 | int | |
2238 | LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, | |
2239 | lnet_process_id_t target, unsigned int portal, | |
2240 | __u64 match_bits, unsigned int offset) | |
2241 | { | |
2242 | struct lnet_msg *msg; | |
2243 | struct lnet_libmd *md; | |
2244 | int cpt; | |
2245 | int rc; | |
2246 | ||
af66a6e2 LN |
2247 | LASSERT(the_lnet.ln_init); |
2248 | LASSERT(the_lnet.ln_refcount > 0); | |
d7e09d03 | 2249 | |
af66a6e2 | 2250 | if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ |
9b79ca85 | 2251 | fail_peer(target.nid, 1)) { /* shall we now? */ |
d7e09d03 PT |
2252 | CERROR("Dropping GET to %s: simulated failure\n", |
2253 | libcfs_id2str(target)); | |
2254 | return -EIO; | |
2255 | } | |
2256 | ||
2257 | msg = lnet_msg_alloc(); | |
2258 | if (msg == NULL) { | |
2259 | CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n", | |
2260 | libcfs_id2str(target)); | |
2261 | return -ENOMEM; | |
2262 | } | |
2263 | ||
2264 | cpt = lnet_cpt_of_cookie(mdh.cookie); | |
2265 | lnet_res_lock(cpt); | |
2266 | ||
2267 | md = lnet_handle2md(&mdh); | |
2268 | if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { | |
2269 | CERROR("Dropping GET ("LPU64":%d:%s): MD (%d) invalid\n", | |
2270 | match_bits, portal, libcfs_id2str(target), | |
2271 | md == NULL ? -1 : md->md_threshold); | |
2272 | if (md != NULL && md->md_me != NULL) | |
2273 | CERROR("REPLY MD also attached to portal %d\n", | |
2274 | md->md_me->me_portal); | |
2275 | ||
2276 | lnet_res_unlock(cpt); | |
2277 | ||
2278 | lnet_msg_free(msg); | |
2279 | ||
2280 | return -ENOENT; | |
2281 | } | |
2282 | ||
2283 | CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target)); | |
2284 | ||
2285 | lnet_msg_attach_md(msg, md, 0, 0); | |
2286 | ||
2287 | lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0); | |
2288 | ||
2289 | msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits); | |
2290 | msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal); | |
2291 | msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset); | |
2292 | msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length); | |
2293 | ||
2294 | /* NB handles only looked up by creator (no flips) */ | |
2295 | msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = | |
2296 | the_lnet.ln_interface_cookie; | |
2297 | msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = | |
2298 | md->md_lh.lh_cookie; | |
2299 | ||
2300 | lnet_res_unlock(cpt); | |
2301 | ||
2302 | lnet_build_msg_event(msg, LNET_EVENT_SEND); | |
2303 | ||
2304 | rc = lnet_send(self, msg, LNET_NID_ANY); | |
2305 | if (rc < 0) { | |
af66a6e2 | 2306 | CNETERR("Error sending GET to %s: %d\n", |
d7e09d03 | 2307 | libcfs_id2str(target), rc); |
af66a6e2 | 2308 | lnet_finalize(NULL, msg, rc); |
d7e09d03 PT |
2309 | } |
2310 | ||
2311 | /* completion will be signalled by an event */ | |
2312 | return 0; | |
2313 | } | |
2314 | EXPORT_SYMBOL(LNetGet); | |
2315 | ||
2316 | /** | |
2317 | * Calculate distance to node at \a dstnid. | |
2318 | * | |
2319 | * \param dstnid Target NID. | |
2320 | * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid | |
2321 | * is saved here. | |
2322 | * \param orderp If not NULL, order of the route to reach \a dstnid is saved | |
2323 | * here. | |
2324 | * | |
2325 | * \retval 0 If \a dstnid belongs to a local interface, and reserved option | |
2326 | * local_nid_dist_zero is set, which is the default. | |
2327 | * \retval positives Distance to target NID, i.e. number of hops plus one. | |
2328 | * \retval -EHOSTUNREACH If \a dstnid is not reachable. | |
2329 | */ | |
2330 | int | |
2331 | LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) | |
2332 | { | |
2333 | struct list_head *e; | |
2334 | struct lnet_ni *ni; | |
2335 | lnet_remotenet_t *rnet; | |
2336 | __u32 dstnet = LNET_NIDNET(dstnid); | |
2337 | int hops; | |
2338 | int cpt; | |
2339 | __u32 order = 2; | |
2340 | struct list_head *rn_list; | |
2341 | ||
2342 | /* if !local_nid_dist_zero, I don't return a distance of 0 ever | |
2343 | * (when lustre sees a distance of 0, it substitutes 0@lo), so I | |
2344 | * keep order 0 free for 0@lo and order 1 free for a local NID | |
2345 | * match */ | |
2346 | ||
af66a6e2 LN |
2347 | LASSERT(the_lnet.ln_init); |
2348 | LASSERT(the_lnet.ln_refcount > 0); | |
d7e09d03 PT |
2349 | |
2350 | cpt = lnet_net_lock_current(); | |
2351 | ||
af66a6e2 | 2352 | list_for_each(e, &the_lnet.ln_nis) { |
d7e09d03 PT |
2353 | ni = list_entry(e, lnet_ni_t, ni_list); |
2354 | ||
2355 | if (ni->ni_nid == dstnid) { | |
2356 | if (srcnidp != NULL) | |
2357 | *srcnidp = dstnid; | |
2358 | if (orderp != NULL) { | |
2359 | if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) | |
2360 | *orderp = 0; | |
2361 | else | |
2362 | *orderp = 1; | |
2363 | } | |
2364 | lnet_net_unlock(cpt); | |
2365 | ||
2366 | return local_nid_dist_zero ? 0 : 1; | |
2367 | } | |
2368 | ||
2369 | if (LNET_NIDNET(ni->ni_nid) == dstnet) { | |
2370 | if (srcnidp != NULL) | |
2371 | *srcnidp = ni->ni_nid; | |
2372 | if (orderp != NULL) | |
2373 | *orderp = order; | |
2374 | lnet_net_unlock(cpt); | |
2375 | return 1; | |
2376 | } | |
2377 | ||
2378 | order++; | |
2379 | } | |
2380 | ||
2381 | rn_list = lnet_net2rnethash(dstnet); | |
2382 | list_for_each(e, rn_list) { | |
2383 | rnet = list_entry(e, lnet_remotenet_t, lrn_list); | |
2384 | ||
2385 | if (rnet->lrn_net == dstnet) { | |
2386 | lnet_route_t *route; | |
2387 | lnet_route_t *shortest = NULL; | |
2388 | ||
af66a6e2 | 2389 | LASSERT(!list_empty(&rnet->lrn_routes)); |
d7e09d03 PT |
2390 | |
2391 | list_for_each_entry(route, &rnet->lrn_routes, | |
2392 | lr_list) { | |
2393 | if (shortest == NULL || | |
2394 | route->lr_hops < shortest->lr_hops) | |
2395 | shortest = route; | |
2396 | } | |
2397 | ||
af66a6e2 | 2398 | LASSERT(shortest != NULL); |
d7e09d03 PT |
2399 | hops = shortest->lr_hops; |
2400 | if (srcnidp != NULL) | |
2401 | *srcnidp = shortest->lr_gateway->lp_ni->ni_nid; | |
2402 | if (orderp != NULL) | |
2403 | *orderp = order; | |
2404 | lnet_net_unlock(cpt); | |
2405 | return hops + 1; | |
2406 | } | |
2407 | order++; | |
2408 | } | |
2409 | ||
2410 | lnet_net_unlock(cpt); | |
2411 | return -EHOSTUNREACH; | |
2412 | } | |
2413 | EXPORT_SYMBOL(LNetDist); | |
2414 | ||
2415 | /** | |
2416 | * Set the number of asynchronous messages expected from a target process. | |
2417 | * | |
2418 | * This function is only meaningful for userspace callers. It's a no-op when | |
2419 | * called from kernel. | |
2420 | * | |
2421 | * Asynchronous messages are those that can come from a target when the | |
2422 | * userspace process is not waiting for IO to complete; e.g., AST callbacks | |
2423 | * from Lustre servers. Specifying the expected number of such messages | |
2424 | * allows them to be eagerly received when user process is not running in | |
2425 | * LNet; otherwise network errors may occur. | |
2426 | * | |
2427 | * \param id Process ID of the target process. | |
2428 | * \param nasync Number of asynchronous messages expected from the target. | |
2429 | * | |
2430 | * \return 0 on success, and an error code otherwise. | |
2431 | */ | |
2432 | int | |
2433 | LNetSetAsync(lnet_process_id_t id, int nasync) | |
2434 | { | |
2435 | return 0; | |
2436 | } | |
2437 | EXPORT_SYMBOL(LNetSetAsync); |