Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
30 | * Copyright (c) 2011, 2012, Intel Corporation. | |
31 | */ | |
32 | /* | |
33 | * This file is part of Lustre, http://www.lustre.org/ | |
34 | * Lustre is a trademark of Sun Microsystems, Inc. | |
35 | */ | |
36 | ||
37 | #include "socklnd.h" | |
38 | ||
d7e09d03 | 39 | int |
978b9b35 | 40 | ksocknal_lib_get_conn_addrs(ksock_conn_t *conn) |
d7e09d03 | 41 | { |
1ad6a73e JS |
42 | int rc = lnet_sock_getaddr(conn->ksnc_sock, 1, &conn->ksnc_ipaddr, |
43 | &conn->ksnc_port); | |
d7e09d03 PT |
44 | |
45 | /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ | |
978b9b35 | 46 | LASSERT(!conn->ksnc_closing); |
d7e09d03 PT |
47 | |
48 | if (rc != 0) { | |
978b9b35 | 49 | CERROR("Error %d getting sock peer IP\n", rc); |
d7e09d03 PT |
50 | return rc; |
51 | } | |
52 | ||
1ad6a73e | 53 | rc = lnet_sock_getaddr(conn->ksnc_sock, 0, &conn->ksnc_myipaddr, NULL); |
d7e09d03 | 54 | if (rc != 0) { |
978b9b35 | 55 | CERROR("Error %d getting sock local IP\n", rc); |
d7e09d03 PT |
56 | return rc; |
57 | } | |
58 | ||
59 | return 0; | |
60 | } | |
61 | ||
62 | int | |
63 | ksocknal_lib_zc_capable(ksock_conn_t *conn) | |
64 | { | |
97d10d0a | 65 | int caps = conn->ksnc_sock->sk->sk_route_caps; |
d7e09d03 PT |
66 | |
67 | if (conn->ksnc_proto == &ksocknal_protocol_v1x) | |
68 | return 0; | |
69 | ||
4420cfd3 JS |
70 | /* |
71 | * ZC if the socket supports scatter/gather and doesn't need software | |
72 | * checksums | |
73 | */ | |
a188222b | 74 | return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_CSUM_MASK) != 0); |
d7e09d03 PT |
75 | } |
76 | ||
/*
 * Send the iovec fragments of @tx on @conn's socket.
 * Returns the sendmsg() result: bytes sent, or a negative errno.
 */
int
ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx)
{
	struct socket *sock = conn->ksnc_sock;
	int nob;
	int rc;

	/* Checksum lazily, on the first send attempt only */
	if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */
	    conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */
	    tx->tx_nob == tx->tx_resid && /* first sending */
	    tx->tx_msg.ksm_csum == 0) /* not checksummed */
		ksocknal_lib_csum_tx(tx);

	/*
	 * NB we can't trust socket ops to either consume our iovs
	 * or leave them alone.
	 */
	{
#if SOCKNAL_SINGLE_FRAG_TX
		struct kvec scratch;
		struct kvec *scratchiov = &scratch;
		unsigned int niov = 1;
#else
		/* per-scheduler scratch area avoids a per-send allocation */
		struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
		unsigned int niov = tx->tx_niov;
#endif
		struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
		int i;

		/* Copy the tx iovs into scratch; the socket may modify them */
		for (nob = i = 0; i < niov; i++) {
			scratchiov[i] = tx->tx_iov[i];
			nob += scratchiov[i].iov_len;
		}

		/* MSG_MORE if this send can't complete the message */
		if (!list_empty(&conn->ksnc_tx_queue) ||
		    nob < tx->tx_resid)
			msg.msg_flags |= MSG_MORE;

		rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob);
	}
	return rc;
}
119 | ||
/*
 * Send the page (kiov) fragments of @tx, using sendpage() zero copy when
 * the message carries a ZC cookie, otherwise kmap + sendmsg.
 * Returns bytes sent or a negative errno.
 */
int
ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx)
{
	struct socket *sock = conn->ksnc_sock;
	lnet_kiov_t *kiov = tx->tx_kiov;
	int rc;
	int nob;

	/* Not NOOP message */
	LASSERT(tx->tx_lnetmsg != NULL);

	/*
	 * NB we can't trust socket ops to either consume our iovs
	 * or leave them alone.
	 */
	if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
		/* Zero copy is enabled: send only the first fragment;
		 * the caller retries for the rest */
		struct sock *sk = sock->sk;
		struct page *page = kiov->kiov_page;
		int offset = kiov->kiov_offset;
		int fragsize = kiov->kiov_len;
		int msgflg = MSG_DONTWAIT;

		CDEBUG(D_NET, "page %p + offset %x for %d\n",
		       page, offset, kiov->kiov_len);

		if (!list_empty(&conn->ksnc_tx_queue) ||
		    fragsize < tx->tx_resid)
			msgflg |= MSG_MORE;

		/* Prefer the protocol's own sendpage; fall back to TCP's */
		if (sk->sk_prot->sendpage != NULL) {
			rc = sk->sk_prot->sendpage(sk, page,
						   offset, fragsize, msgflg);
		} else {
			rc = tcp_sendpage(sk, page, offset, fragsize, msgflg);
		}
	} else {
#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
		struct kvec scratch;
		struct kvec *scratchiov = &scratch;
		unsigned int niov = 1;
#else
#ifdef CONFIG_HIGHMEM
#warning "XXX risk of kmap deadlock on multiple frags..."
#endif
		struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
		unsigned int niov = tx->tx_nkiov;
#endif
		struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
		int i;

		/* kmap every fragment before the send... */
		for (nob = i = 0; i < niov; i++) {
			scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
						 kiov[i].kiov_offset;
			nob += scratchiov[i].iov_len = kiov[i].kiov_len;
		}

		if (!list_empty(&conn->ksnc_tx_queue) ||
		    nob < tx->tx_resid)
			msg.msg_flags |= MSG_MORE;

		rc = kernel_sendmsg(sock, &msg, (struct kvec *)scratchiov, niov, nob);

		/* ...and kunmap them all afterwards, even on error */
		for (i = 0; i < niov; i++)
			kunmap(kiov[i].kiov_page);
	}
	return rc;
}
188 | ||
189 | void | |
978b9b35 | 190 | ksocknal_lib_eager_ack(ksock_conn_t *conn) |
d7e09d03 | 191 | { |
97d10d0a | 192 | int opt = 1; |
d7e09d03 PT |
193 | struct socket *sock = conn->ksnc_sock; |
194 | ||
4420cfd3 JS |
195 | /* |
196 | * Remind the socket to ACK eagerly. If I don't, the socket might | |
d7e09d03 PT |
197 | * think I'm about to send something it could piggy-back the ACK |
198 | * on, introducing delay in completing zero-copy sends in my | |
4420cfd3 JS |
199 | * peer. |
200 | */ | |
c314c319 JS |
201 | kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, (char *)&opt, |
202 | sizeof(opt)); | |
d7e09d03 PT |
203 | } |
204 | ||
/*
 * Receive into the connection's pending iovec fragments and fold the
 * received bytes into the running message checksum (V2.x only).
 * Returns bytes received or a negative errno.
 */
int
ksocknal_lib_recv_iov(ksock_conn_t *conn)
{
#if SOCKNAL_SINGLE_FRAG_RX
	struct kvec scratch;
	struct kvec *scratchiov = &scratch;
	unsigned int niov = 1;
#else
	struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
	unsigned int niov = conn->ksnc_rx_niov;
#endif
	struct kvec *iov = conn->ksnc_rx_iov;
	struct msghdr msg = {
		.msg_flags = 0
	};
	int nob;
	int i;
	int rc;
	int fragnob;
	int sum;
	__u32 saved_csum;

	/*
	 * NB we can't trust socket ops to either consume our iovs
	 * or leave them alone.
	 */
	LASSERT(niov > 0);

	/* Copy rx iovs into scratch; the socket may modify them */
	for (nob = i = 0; i < niov; i++) {
		scratchiov[i] = iov[i];
		nob += scratchiov[i].iov_len;
	}
	LASSERT(nob <= conn->ksnc_rx_nob_wanted);

	rc = kernel_recvmsg(conn->ksnc_sock, &msg, scratchiov, niov, nob,
			    MSG_DONTWAIT);

	/* saved_csum != 0 means a V2.x message with checksumming enabled */
	saved_csum = 0;
	if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
		saved_csum = conn->ksnc_msg.ksm_csum;
		conn->ksnc_msg.ksm_csum = 0;
	}

	if (saved_csum != 0) {
		/* accumulate checksum over exactly the rc bytes received,
		 * walking the ORIGINAL iovs (scratch may have been consumed) */
		for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
			LASSERT(i < niov);

			fragnob = iov[i].iov_len;
			if (fragnob > sum)
				fragnob = sum;

			conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
							   iov[i].iov_base, fragnob);
		}
		conn->ksnc_msg.ksm_csum = saved_csum;
	}

	return rc;
}
265 | ||
/*
 * Undo a ksocknal_lib_kiov_vmap().  vunmap() is documented to be a no-op
 * for a NULL address, so the failed-vmap case needs no explicit guard.
 */
static void
ksocknal_lib_kiov_vunmap(void *addr)
{
	vunmap(addr);
}
274 | ||
/*
 * Try to vmap the receive pages into one contiguous kernel mapping so a
 * multi-fragment receive becomes a single iov.  Returns the vmap address
 * (caller must ksocknal_lib_kiov_vunmap() it) and fills *iov, or NULL if
 * vmapping is disabled, not worthwhile, or the pages aren't contiguous.
 */
static void *
ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,
		       struct kvec *iov, struct page **pages)
{
	void *addr;
	int nob;
	int i;

	if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL)
		return NULL;

	LASSERT(niov <= LNET_MAX_IOV);

	/* Not worth the vmap overhead below the tunable fragment count */
	if (niov < 2 ||
	    niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags)
		return NULL;

	for (nob = i = 0; i < niov; i++) {
		/* Fragments must tile pages exactly: only the first may
		 * start mid-page and only the last may end mid-page */
		if ((kiov[i].kiov_offset != 0 && i > 0) ||
		    (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE && i < niov - 1))
			return NULL;

		pages[i] = kiov[i].kiov_page;
		nob += kiov[i].kiov_len;
	}

	addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL);
	if (addr == NULL)
		return NULL;

	/* Single iov spanning all fragments in the new mapping */
	iov->iov_base = addr + kiov[0].kiov_offset;
	iov->iov_len = nob;

	return addr;
}
310 | ||
/*
 * Receive into the connection's pending page (kiov) fragments, preferring
 * a single vmap'd iov and falling back to per-page kmap.  Folds received
 * bytes into the message checksum when one is expected.
 * Returns bytes received or a negative errno.
 */
int
ksocknal_lib_recv_kiov(ksock_conn_t *conn)
{
#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
	struct kvec scratch;
	struct kvec *scratchiov = &scratch;
	struct page **pages = NULL;
	unsigned int niov = 1;
#else
#ifdef CONFIG_HIGHMEM
#warning "XXX risk of kmap deadlock on multiple frags..."
#endif
	struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
	struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs;
	unsigned int niov = conn->ksnc_rx_nkiov;
#endif
	lnet_kiov_t *kiov = conn->ksnc_rx_kiov;
	struct msghdr msg = {
		.msg_flags = 0
	};
	int nob;
	int i;
	int rc;
	void *base;
	void *addr;
	int sum;
	int fragnob;
	int n;

	/*
	 * NB we can't trust socket ops to either consume our iovs
	 * or leave them alone.
	 */
	addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages);
	if (addr != NULL) {
		/* vmap succeeded: one contiguous iov covers everything */
		nob = scratchiov[0].iov_len;
		n = 1;

	} else {
		/* fall back to kmapping each fragment individually */
		for (nob = i = 0; i < niov; i++) {
			nob += scratchiov[i].iov_len = kiov[i].kiov_len;
			scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
						 kiov[i].kiov_offset;
		}
		n = niov;
	}

	LASSERT(nob <= conn->ksnc_rx_nob_wanted);

	rc = kernel_recvmsg(conn->ksnc_sock, &msg, (struct kvec *)scratchiov,
			    n, nob, MSG_DONTWAIT);

	if (conn->ksnc_msg.ksm_csum != 0) {
		for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
			LASSERT(i < niov);

			/*
			 * Dang! have to kmap again because I have nowhere to
			 * stash the mapped address. But by doing it while the
			 * page is still mapped, the kernel just bumps the map
			 * count and returns me the address it stashed.
			 */
			base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
			fragnob = kiov[i].kiov_len;
			if (fragnob > sum)
				fragnob = sum;

			conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
							   base, fragnob);

			kunmap(kiov[i].kiov_page);
		}
	}

	/* release whichever mapping strategy was used above */
	if (addr != NULL) {
		ksocknal_lib_kiov_vunmap(addr);
	} else {
		for (i = 0; i < niov; i++)
			kunmap(kiov[i].kiov_page);
	}

	return rc;
}
394 | ||
395 | void | |
396 | ksocknal_lib_csum_tx(ksock_tx_t *tx) | |
397 | { | |
97d10d0a MS |
398 | int i; |
399 | __u32 csum; | |
400 | void *base; | |
d7e09d03 | 401 | |
f351bad2 | 402 | LASSERT(tx->tx_iov[0].iov_base == &tx->tx_msg); |
d7e09d03 PT |
403 | LASSERT(tx->tx_conn != NULL); |
404 | LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x); | |
405 | ||
406 | tx->tx_msg.ksm_csum = 0; | |
407 | ||
f351bad2 | 408 | csum = ksocknal_csum(~0, tx->tx_iov[0].iov_base, |
d7e09d03 PT |
409 | tx->tx_iov[0].iov_len); |
410 | ||
411 | if (tx->tx_kiov != NULL) { | |
412 | for (i = 0; i < tx->tx_nkiov; i++) { | |
413 | base = kmap(tx->tx_kiov[i].kiov_page) + | |
414 | tx->tx_kiov[i].kiov_offset; | |
415 | ||
416 | csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len); | |
417 | ||
418 | kunmap(tx->tx_kiov[i].kiov_page); | |
419 | } | |
420 | } else { | |
421 | for (i = 1; i < tx->tx_niov; i++) | |
422 | csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base, | |
423 | tx->tx_iov[i].iov_len); | |
424 | } | |
425 | ||
426 | if (*ksocknal_tunables.ksnd_inject_csum_error) { | |
427 | csum++; | |
428 | *ksocknal_tunables.ksnd_inject_csum_error = 0; | |
429 | } | |
430 | ||
431 | tx->tx_msg.ksm_csum = csum; | |
432 | } | |
433 | ||
434 | int | |
978b9b35 | 435 | ksocknal_lib_get_conn_tunables(ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) |
d7e09d03 | 436 | { |
d7e09d03 | 437 | struct socket *sock = conn->ksnc_sock; |
97d10d0a MS |
438 | int len; |
439 | int rc; | |
d7e09d03 PT |
440 | |
441 | rc = ksocknal_connsock_addref(conn); | |
442 | if (rc != 0) { | |
978b9b35 | 443 | LASSERT(conn->ksnc_closing); |
d7e09d03 | 444 | *txmem = *rxmem = *nagle = 0; |
71397095 | 445 | return -ESHUTDOWN; |
d7e09d03 PT |
446 | } |
447 | ||
1ad6a73e | 448 | rc = lnet_sock_getbuf(sock, txmem, rxmem); |
d7e09d03 PT |
449 | if (rc == 0) { |
450 | len = sizeof(*nagle); | |
80db2734 | 451 | rc = kernel_getsockopt(sock, SOL_TCP, TCP_NODELAY, |
c314c319 | 452 | (char *)nagle, &len); |
d7e09d03 PT |
453 | } |
454 | ||
455 | ksocknal_connsock_decref(conn); | |
456 | ||
457 | if (rc == 0) | |
458 | *nagle = !*nagle; | |
459 | else | |
460 | *txmem = *rxmem = *nagle = 0; | |
461 | ||
71397095 | 462 | return rc; |
d7e09d03 PT |
463 | } |
464 | ||
465 | int | |
978b9b35 | 466 | ksocknal_lib_setup_sock(struct socket *sock) |
d7e09d03 | 467 | { |
97d10d0a MS |
468 | int rc; |
469 | int option; | |
470 | int keep_idle; | |
471 | int keep_intvl; | |
472 | int keep_count; | |
473 | int do_keepalive; | |
474 | struct linger linger; | |
d7e09d03 PT |
475 | |
476 | sock->sk->sk_allocation = GFP_NOFS; | |
477 | ||
4420cfd3 JS |
478 | /* |
479 | * Ensure this socket aborts active sends immediately when we close | |
480 | * it. | |
481 | */ | |
d7e09d03 PT |
482 | linger.l_onoff = 0; |
483 | linger.l_linger = 0; | |
484 | ||
c314c319 JS |
485 | rc = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, (char *)&linger, |
486 | sizeof(linger)); | |
d7e09d03 | 487 | if (rc != 0) { |
978b9b35 | 488 | CERROR("Can't set SO_LINGER: %d\n", rc); |
71397095 | 489 | return rc; |
d7e09d03 PT |
490 | } |
491 | ||
492 | option = -1; | |
c314c319 JS |
493 | rc = kernel_setsockopt(sock, SOL_TCP, TCP_LINGER2, (char *)&option, |
494 | sizeof(option)); | |
d7e09d03 | 495 | if (rc != 0) { |
978b9b35 | 496 | CERROR("Can't set SO_LINGER2: %d\n", rc); |
71397095 | 497 | return rc; |
d7e09d03 PT |
498 | } |
499 | ||
500 | if (!*ksocknal_tunables.ksnd_nagle) { | |
501 | option = 1; | |
502 | ||
80db2734 | 503 | rc = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, |
c314c319 | 504 | (char *)&option, sizeof(option)); |
d7e09d03 | 505 | if (rc != 0) { |
978b9b35 | 506 | CERROR("Can't disable nagle: %d\n", rc); |
71397095 | 507 | return rc; |
d7e09d03 PT |
508 | } |
509 | } | |
510 | ||
1ad6a73e JS |
511 | rc = lnet_sock_setbuf(sock, *ksocknal_tunables.ksnd_tx_buffer_size, |
512 | *ksocknal_tunables.ksnd_rx_buffer_size); | |
d7e09d03 | 513 | if (rc != 0) { |
978b9b35 | 514 | CERROR("Can't set buffer tx %d, rx %d buffers: %d\n", |
c314c319 JS |
515 | *ksocknal_tunables.ksnd_tx_buffer_size, |
516 | *ksocknal_tunables.ksnd_rx_buffer_size, rc); | |
71397095 | 517 | return rc; |
d7e09d03 PT |
518 | } |
519 | ||
520 | /* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ | |
521 | ||
522 | /* snapshot tunables */ | |
523 | keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; | |
524 | keep_count = *ksocknal_tunables.ksnd_keepalive_count; | |
525 | keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; | |
526 | ||
527 | do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); | |
528 | ||
529 | option = (do_keepalive ? 1 : 0); | |
c314c319 JS |
530 | rc = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&option, |
531 | sizeof(option)); | |
d7e09d03 | 532 | if (rc != 0) { |
978b9b35 | 533 | CERROR("Can't set SO_KEEPALIVE: %d\n", rc); |
71397095 | 534 | return rc; |
d7e09d03 PT |
535 | } |
536 | ||
537 | if (!do_keepalive) | |
71397095 | 538 | return 0; |
d7e09d03 | 539 | |
c314c319 JS |
540 | rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, (char *)&keep_idle, |
541 | sizeof(keep_idle)); | |
d7e09d03 | 542 | if (rc != 0) { |
978b9b35 | 543 | CERROR("Can't set TCP_KEEPIDLE: %d\n", rc); |
71397095 | 544 | return rc; |
d7e09d03 PT |
545 | } |
546 | ||
80db2734 | 547 | rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, |
c314c319 | 548 | (char *)&keep_intvl, sizeof(keep_intvl)); |
d7e09d03 | 549 | if (rc != 0) { |
978b9b35 | 550 | CERROR("Can't set TCP_KEEPINTVL: %d\n", rc); |
71397095 | 551 | return rc; |
d7e09d03 PT |
552 | } |
553 | ||
c314c319 JS |
554 | rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, (char *)&keep_count, |
555 | sizeof(keep_count)); | |
d7e09d03 | 556 | if (rc != 0) { |
978b9b35 | 557 | CERROR("Can't set TCP_KEEPCNT: %d\n", rc); |
71397095 | 558 | return rc; |
d7e09d03 PT |
559 | } |
560 | ||
71397095 | 561 | return 0; |
d7e09d03 PT |
562 | } |
563 | ||
/*
 * Force any buffered data on @conn out onto the wire by momentarily
 * disabling Nagle.  No-op if the connection is already shutting down.
 */
void
ksocknal_lib_push_conn(ksock_conn_t *conn)
{
	struct sock *sk;
	struct tcp_sock *tp;
	int nonagle;
	int val = 1;
	int rc;

	rc = ksocknal_connsock_addref(conn);
	if (rc != 0) /* being shut down */
		return;

	sk = conn->ksnc_sock->sk;
	tp = tcp_sk(sk);

	/* Temporarily force nonagle so the TCP_NODELAY setsockopt below
	 * pushes pending data, then restore the saved value */
	lock_sock(sk);
	nonagle = tp->nonagle;
	tp->nonagle = 1;
	release_sock(sk);

	rc = kernel_setsockopt(conn->ksnc_sock, SOL_TCP, TCP_NODELAY,
			       (char *)&val, sizeof(val));
	LASSERT(rc == 0);

	lock_sock(sk);
	tp->nonagle = nonagle;
	release_sock(sk);

	ksocknal_connsock_decref(conn);
}
595 | ||
d7e09d03 PT |
596 | /* |
597 | * socket call back in Linux | |
598 | */ | |
/*
 * sk_data_ready callback: schedule a read on the owning conn, or chain to
 * the socket's original callback if the conn has already been detached.
 */
static void
ksocknal_data_ready(struct sock *sk)
{
	ksock_conn_t *conn;

	/* interleave correctly with closing sockets... */
	LASSERT(!in_irq());
	read_lock(&ksocknal_data.ksnd_global_lock);

	conn = sk->sk_user_data;
	if (conn == NULL) { /* raced with ksocknal_terminate_conn */
		/* terminate_conn restored the saved callback before
		 * clearing sk_user_data, so this can't recurse */
		LASSERT(sk->sk_data_ready != &ksocknal_data_ready);
		sk->sk_data_ready(sk);
	} else
		ksocknal_read_callback(conn);

	read_unlock(&ksocknal_data.ksnd_global_lock);
}
617 | ||
618 | static void | |
978b9b35 | 619 | ksocknal_write_space(struct sock *sk) |
d7e09d03 | 620 | { |
97d10d0a MS |
621 | ksock_conn_t *conn; |
622 | int wspace; | |
623 | int min_wpace; | |
d7e09d03 PT |
624 | |
625 | /* interleave correctly with closing sockets... */ | |
626 | LASSERT(!in_irq()); | |
627 | read_lock(&ksocknal_data.ksnd_global_lock); | |
628 | ||
629 | conn = sk->sk_user_data; | |
12c41f00 JH |
630 | wspace = sk_stream_wspace(sk); |
631 | min_wpace = sk_stream_min_wspace(sk); | |
d7e09d03 PT |
632 | |
633 | CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", | |
634 | sk, wspace, min_wpace, conn, | |
635 | (conn == NULL) ? "" : (conn->ksnc_tx_ready ? | |
636 | " ready" : " blocked"), | |
637 | (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? | |
638 | " scheduled" : " idle"), | |
978b9b35 | 639 | (conn == NULL) ? "" : (list_empty(&conn->ksnc_tx_queue) ? |
d7e09d03 PT |
640 | " empty" : " queued")); |
641 | ||
642 | if (conn == NULL) { /* raced with ksocknal_terminate_conn */ | |
978b9b35 HE |
643 | LASSERT(sk->sk_write_space != &ksocknal_write_space); |
644 | sk->sk_write_space(sk); | |
d7e09d03 PT |
645 | |
646 | read_unlock(&ksocknal_data.ksnd_global_lock); | |
647 | return; | |
648 | } | |
649 | ||
650 | if (wspace >= min_wpace) { /* got enough space */ | |
651 | ksocknal_write_callback(conn); | |
652 | ||
4420cfd3 JS |
653 | /* |
654 | * Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the | |
d7e09d03 | 655 | * ENOMEM check in ksocknal_transmit is race-free (think about |
4420cfd3 JS |
656 | * it). |
657 | */ | |
978b9b35 | 658 | clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
d7e09d03 PT |
659 | } |
660 | ||
661 | read_unlock(&ksocknal_data.ksnd_global_lock); | |
662 | } | |
663 | ||
664 | void | |
665 | ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn) | |
666 | { | |
667 | conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; | |
668 | conn->ksnc_saved_write_space = sock->sk->sk_write_space; | |
669 | } | |
670 | ||
671 | void | |
672 | ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) | |
673 | { | |
674 | sock->sk->sk_user_data = conn; | |
675 | sock->sk->sk_data_ready = ksocknal_data_ready; | |
676 | sock->sk->sk_write_space = ksocknal_write_space; | |
677 | return; | |
678 | } | |
679 | ||
680 | void | |
681 | ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) | |
682 | { | |
4420cfd3 JS |
683 | /* |
684 | * Remove conn's network callbacks. | |
d7e09d03 | 685 | * NB I _have_ to restore the callback, rather than storing a noop, |
4420cfd3 JS |
686 | * since the socket could survive past this module being unloaded!! |
687 | */ | |
d7e09d03 PT |
688 | sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; |
689 | sock->sk->sk_write_space = conn->ksnc_saved_write_space; | |
690 | ||
4420cfd3 JS |
691 | /* |
692 | * A callback could be in progress already; they hold a read lock | |
d7e09d03 | 693 | * on ksnd_global_lock (to serialise with me) and NOOP if |
4420cfd3 JS |
694 | * sk_user_data is NULL. |
695 | */ | |
d7e09d03 PT |
696 | sock->sk->sk_user_data = NULL; |
697 | ||
698 | return ; | |
699 | } | |
700 | ||
/*
 * Decide whether a short/failed send was caused by memory pressure.
 * Returns -ENOMEM (caller should retry after a timeout) when the socket
 * did NOT fill and write_space will not reschedule us; 0 otherwise.
 */
int
ksocknal_lib_memory_pressure(ksock_conn_t *conn)
{
	int rc = 0;
	ksock_sched_t *sched;

	sched = conn->ksnc_scheduler;
	spin_lock_bh(&sched->kss_lock);

	if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) &&
	    !conn->ksnc_tx_ready) {
		/*
		 * SOCK_NOSPACE is set when the socket fills
		 * and cleared in the write_space callback
		 * (which also sets ksnc_tx_ready). If
		 * SOCK_NOSPACE and ksnc_tx_ready are BOTH
		 * zero, I didn't fill the socket and
		 * write_space won't reschedule me, so I
		 * return -ENOMEM to get my caller to retry
		 * after a timeout
		 */
		rc = -ENOMEM;
	}

	spin_unlock_bh(&sched->kss_lock);

	return rc;
}