1 /*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 /*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50 #include <linux/interrupt.h>
51 #include <linux/slab.h>
52 #include <linux/prefetch.h>
53 #include <linux/sunrpc/addr.h>
54 #include <linux/bitops.h>
55
56 #include "xprt_rdma.h"
57
58 /*
59 * Globals/Macros
60 */
61
62 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
63 # define RPCDBG_FACILITY RPCDBG_TRANS
64 #endif
65
66 /*
67 * internal functions
68 */
69
70 /*
71 * Handle replies in tasklet context, using a single, global list.
72 * The tasklet function simply walks the list and invokes the reply
73 * handler for each reply queued on it.
74 */
75
76 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
77 static LIST_HEAD(rpcrdma_tasklets_g);
78
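/*
 * Drain the global reply list: dequeue each completed rpcrdma_rep and
 * hand it to the RPC/RDMA reply handler. The lock is dropped around
 * each callout so that new replies can be queued concurrently.
 */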
79 static void
80 rpcrdma_run_tasklet(unsigned long data)
81 {
82 struct rpcrdma_rep *rep;
83 unsigned long flags;
84
85 	data = data;	/* the tasklet's data argument is unused */
86 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
87 while (!list_empty(&rpcrdma_tasklets_g)) {
88 rep = list_entry(rpcrdma_tasklets_g.next,
89 struct rpcrdma_rep, rr_list);
90 list_del(&rep->rr_list);
91 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
92
93 rpcrdma_reply_handler(rep);
94
95 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
96 }
97 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
98 }
99
100 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
101
102 static const char * const async_event[] = {
103 "CQ error",
104 "QP fatal error",
105 "QP request error",
106 "QP access error",
107 "communication established",
108 "send queue drained",
109 "path migration successful",
110 "path mig error",
111 "device fatal error",
112 "port active",
113 "port error",
114 "LID change",
115 "P_key change",
116 "SM change",
117 "SRQ error",
118 "SRQ limit reached",
119 "last WQE reached",
120 "client reregister",
121 "GID change",
122 };
123
124 #define ASYNC_MSG(status) \
125 ((status) < ARRAY_SIZE(async_event) ? \
126 async_event[(status)] : "unknown async error")
127
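/*
 * Splice a list of completed receives onto the global reply list and
 * schedule the tasklet that processes them.
 */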
128 static void
129 rpcrdma_schedule_tasklet(struct list_head *sched_list)
130 {
131 unsigned long flags;
132
133 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
134 list_splice_tail(sched_list, &rpcrdma_tasklets_g);
135 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
136 tasklet_schedule(&rpcrdma_tasklet_g);
137 }
138
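/*
 * QP async event handler: log the event and, if the endpoint was
 * connected, mark it failed and wake up anyone waiting on it.
 */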
139 static void
140 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
141 {
142 struct rpcrdma_ep *ep = context;
143
144 pr_err("RPC: %s: %s on device %s ep %p\n",
145 __func__, ASYNC_MSG(event->event),
146 event->device->name, context);
147 if (ep->rep_connected == 1) {
148 ep->rep_connected = -EIO;
149 rpcrdma_conn_func(ep);
150 wake_up_all(&ep->rep_connect_wait);
151 }
152 }
153
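/*
 * CQ async event handler: handled the same way as QP async errors above.
 */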
154 static void
155 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
156 {
157 struct rpcrdma_ep *ep = context;
158
159 pr_err("RPC: %s: %s on device %s ep %p\n",
160 __func__, ASYNC_MSG(event->event),
161 event->device->name, context);
162 if (ep->rep_connected == 1) {
163 ep->rep_connected = -EIO;
164 rpcrdma_conn_func(ep);
165 wake_up_all(&ep->rep_connect_wait);
166 }
167 }
168
169 static const char * const wc_status[] = {
170 "success",
171 "local length error",
172 "local QP operation error",
173 "local EE context operation error",
174 "local protection error",
175 "WR flushed",
176 "memory management operation error",
177 "bad response error",
178 "local access error",
179 "remote invalid request error",
180 "remote access error",
181 "remote operation error",
182 "transport retry counter exceeded",
183 "RNR retry counter exceeded",
184 "local RDD violation error",
185 "remove invalid RD request",
186 "operation aborted",
187 "invalid EE context number",
188 "invalid EE context state",
189 "fatal error",
190 "response timeout error",
191 "general error",
192 };
193
194 #define COMPLETION_MSG(status) \
195 ((status) < ARRAY_SIZE(wc_status) ? \
196 wc_status[(status)] : "unexpected completion error")
197
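/*
 * Handle a single send-side completion. Completions whose wr_id is
 * RPCRDMA_IGNORE_COMPLETION (plain RPC sends) are only logged when
 * they fail; all other wr_ids carry a pointer to an rpcrdma_mw, whose
 * send-completion method is invoked.
 */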
198 static void
199 rpcrdma_sendcq_process_wc(struct ib_wc *wc)
200 {
201 /* WARNING: Only wr_id and status are reliable at this point */
202 if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
203 if (wc->status != IB_WC_SUCCESS &&
204 wc->status != IB_WC_WR_FLUSH_ERR)
205 pr_err("RPC: %s: SEND: %s\n",
206 __func__, COMPLETION_MSG(wc->status));
207 } else {
208 struct rpcrdma_mw *r;
209
210 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
211 r->mw_sendcompletion(wc);
212 }
213 }
214
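/*
 * Poll the send CQ in batches of RPCRDMA_POLLSIZE entries until the
 * CQ is empty or the polling budget is exhausted.
 */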
215 static int
216 rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
217 {
218 struct ib_wc *wcs;
219 int budget, count, rc;
220
221 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
222 do {
223 wcs = ep->rep_send_wcs;
224
225 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
226 if (rc <= 0)
227 return rc;
228
229 count = rc;
230 while (count-- > 0)
231 rpcrdma_sendcq_process_wc(wcs++);
232 } while (rc == RPCRDMA_POLLSIZE && --budget);
233 return 0;
234 }
235
236 /*
237 * Handle send, fast_reg_mr, and local_inv completions.
238 *
239 * Send events are typically suppressed and thus do not result
240 * in an upcall. Occasionally one is signaled, however. This
241 * prevents the provider's completion queue from wrapping and
242 * losing a completion.
243 */
244 static void
245 rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
246 {
247 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
248 int rc;
249
250 rc = rpcrdma_sendcq_poll(cq, ep);
251 if (rc) {
252 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
253 __func__, rc);
254 return;
255 }
256
257 rc = ib_req_notify_cq(cq,
258 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
259 if (rc == 0)
260 return;
261 if (rc < 0) {
262 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
263 __func__, rc);
264 return;
265 }
266
267 rpcrdma_sendcq_poll(cq, ep);
268 }
269
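/*
 * Handle a single receive completion: on success, record the received
 * length, sync the receive buffer for the CPU, and queue the rep for
 * the reply tasklet. Failed receives are queued with rr_len set to ~0U.
 */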
270 static void
271 rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
272 {
273 struct rpcrdma_rep *rep =
274 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
275
276 /* WARNING: Only wr_id and status are reliable at this point */
277 if (wc->status != IB_WC_SUCCESS)
278 goto out_fail;
279
280 /* status == SUCCESS means all fields in wc are trustworthy */
281 if (wc->opcode != IB_WC_RECV)
282 return;
283
284 dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
285 __func__, rep, wc->byte_len);
286
287 rep->rr_len = wc->byte_len;
288 ib_dma_sync_single_for_cpu(rep->rr_device,
289 rdmab_addr(rep->rr_rdmabuf),
290 rep->rr_len, DMA_FROM_DEVICE);
291 prefetch(rdmab_to_msg(rep->rr_rdmabuf));
292
293 out_schedule:
294 list_add_tail(&rep->rr_list, sched_list);
295 return;
296 out_fail:
297 if (wc->status != IB_WC_WR_FLUSH_ERR)
298 pr_err("RPC: %s: rep %p: %s\n",
299 __func__, rep, COMPLETION_MSG(wc->status));
300 rep->rr_len = ~0U;
301 goto out_schedule;
302 }
303
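/*
 * Poll the receive CQ in batches, then hand all completed receives to
 * the reply tasklet in a single scheduling call.
 */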
304 static int
305 rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
306 {
307 struct list_head sched_list;
308 struct ib_wc *wcs;
309 int budget, count, rc;
310
311 INIT_LIST_HEAD(&sched_list);
312 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
313 do {
314 wcs = ep->rep_recv_wcs;
315
316 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
317 if (rc <= 0)
318 goto out_schedule;
319
320 count = rc;
321 while (count-- > 0)
322 rpcrdma_recvcq_process_wc(wcs++, &sched_list);
323 } while (rc == RPCRDMA_POLLSIZE && --budget);
324 rc = 0;
325
326 out_schedule:
327 rpcrdma_schedule_tasklet(&sched_list);
328 return rc;
329 }
330
331 /*
332 * Handle receive completions.
333 *
334 * It is reentrant but processes single events in order to maintain
335 * the ordering of receives, which preserves the server's credit accounting.
336 *
337 * It is the responsibility of the scheduled tasklet to return
338 * recv buffers to the pool. NOTE: this affects synchronization of
339 * connection shutdown. That is, the structures required for
340 * the completion of the reply handler must remain intact until
341 * all memory has been reclaimed.
342 */
343 static void
344 rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
345 {
346 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
347 int rc;
348
349 rc = rpcrdma_recvcq_poll(cq, ep);
350 if (rc) {
351 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
352 __func__, rc);
353 return;
354 }
355
356 rc = ib_req_notify_cq(cq,
357 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
358 if (rc == 0)
359 return;
360 if (rc < 0) {
361 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
362 __func__, rc);
363 return;
364 }
365
366 rpcrdma_recvcq_poll(cq, ep);
367 }
368
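/*
 * Drain both completion queues one entry at a time, processing each
 * completion, so that nothing is left queued across a disconnect or
 * reconnect.
 */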
369 static void
370 rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
371 {
372 struct ib_wc wc;
373 LIST_HEAD(sched_list);
374
375 while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
376 rpcrdma_recvcq_process_wc(&wc, &sched_list);
377 if (!list_empty(&sched_list))
378 rpcrdma_schedule_tasklet(&sched_list);
379 while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
380 rpcrdma_sendcq_process_wc(&wc);
381 }
382
383 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
384 static const char * const conn[] = {
385 "address resolved",
386 "address error",
387 "route resolved",
388 "route error",
389 "connect request",
390 "connect response",
391 "connect error",
392 "unreachable",
393 "rejected",
394 "established",
395 "disconnected",
396 "device removal",
397 "multicast join",
398 "multicast error",
399 "address change",
400 "timewait exit",
401 };
402
403 #define CONNECTION_MSG(status) \
404 ((status) < ARRAY_SIZE(conn) ? \
405 conn[(status)] : "unrecognized connection error")
406 #endif
407
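/*
 * RDMA CM event handler. Address and route resolution results are
 * reported through ia->ri_async_rc; connection state changes update
 * ep->rep_connected and wake up the connect waiters.
 */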
408 static int
409 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
410 {
411 struct rpcrdma_xprt *xprt = id->context;
412 struct rpcrdma_ia *ia = &xprt->rx_ia;
413 struct rpcrdma_ep *ep = &xprt->rx_ep;
414 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
415 struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
416 #endif
417 struct ib_qp_attr *attr = &ia->ri_qp_attr;
418 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
419 int connstate = 0;
420
421 switch (event->event) {
422 case RDMA_CM_EVENT_ADDR_RESOLVED:
423 case RDMA_CM_EVENT_ROUTE_RESOLVED:
424 ia->ri_async_rc = 0;
425 complete(&ia->ri_done);
426 break;
427 case RDMA_CM_EVENT_ADDR_ERROR:
428 ia->ri_async_rc = -EHOSTUNREACH;
429 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
430 __func__, ep);
431 complete(&ia->ri_done);
432 break;
433 case RDMA_CM_EVENT_ROUTE_ERROR:
434 ia->ri_async_rc = -ENETUNREACH;
435 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
436 __func__, ep);
437 complete(&ia->ri_done);
438 break;
439 case RDMA_CM_EVENT_ESTABLISHED:
440 connstate = 1;
441 ib_query_qp(ia->ri_id->qp, attr,
442 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
443 iattr);
444 dprintk("RPC: %s: %d responder resources"
445 " (%d initiator)\n",
446 __func__, attr->max_dest_rd_atomic,
447 attr->max_rd_atomic);
448 goto connected;
449 case RDMA_CM_EVENT_CONNECT_ERROR:
450 connstate = -ENOTCONN;
451 goto connected;
452 case RDMA_CM_EVENT_UNREACHABLE:
453 connstate = -ENETDOWN;
454 goto connected;
455 case RDMA_CM_EVENT_REJECTED:
456 connstate = -ECONNREFUSED;
457 goto connected;
458 case RDMA_CM_EVENT_DISCONNECTED:
459 connstate = -ECONNABORTED;
460 goto connected;
461 case RDMA_CM_EVENT_DEVICE_REMOVAL:
462 connstate = -ENODEV;
463 connected:
464 dprintk("RPC: %s: %sconnected\n",
465 __func__, connstate > 0 ? "" : "dis");
466 ep->rep_connected = connstate;
467 rpcrdma_conn_func(ep);
468 wake_up_all(&ep->rep_connect_wait);
469 /*FALLTHROUGH*/
470 default:
471 dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
472 __func__, sap, rpc_get_port(sap), ep,
473 CONNECTION_MSG(event->event));
474 break;
475 }
476
477 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
478 if (connstate == 1) {
479 int ird = attr->max_dest_rd_atomic;
480 int tird = ep->rep_remote_cma.responder_resources;
481
482 pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
483 sap, rpc_get_port(sap),
484 ia->ri_device->name,
485 ia->ri_ops->ro_displayname,
486 xprt->rx_buf.rb_max_requests,
487 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
488 } else if (connstate < 0) {
489 pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
490 sap, rpc_get_port(sap), connstate);
491 }
492 #endif
493
494 return 0;
495 }
496
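/*
 * Create an rdma_cm_id for this transport and synchronously resolve
 * the server's address and route, waiting up to RDMA_RESOLVE_TIMEOUT
 * for each step. Returns the new id, or an ERR_PTR on failure.
 */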
497 static struct rdma_cm_id *
498 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
499 struct rpcrdma_ia *ia, struct sockaddr *addr)
500 {
501 struct rdma_cm_id *id;
502 int rc;
503
504 init_completion(&ia->ri_done);
505
506 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
507 if (IS_ERR(id)) {
508 rc = PTR_ERR(id);
509 dprintk("RPC: %s: rdma_create_id() failed %i\n",
510 __func__, rc);
511 return id;
512 }
513
514 ia->ri_async_rc = -ETIMEDOUT;
515 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
516 if (rc) {
517 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
518 __func__, rc);
519 goto out;
520 }
521 wait_for_completion_interruptible_timeout(&ia->ri_done,
522 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
523 rc = ia->ri_async_rc;
524 if (rc)
525 goto out;
526
527 ia->ri_async_rc = -ETIMEDOUT;
528 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
529 if (rc) {
530 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
531 __func__, rc);
532 goto out;
533 }
534 wait_for_completion_interruptible_timeout(&ia->ri_done,
535 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
536 rc = ia->ri_async_rc;
537 if (rc)
538 goto out;
539
540 return id;
541
542 out:
543 rdma_destroy_id(id);
544 return ERR_PTR(rc);
545 }
546
547 /*
548 * Drain any CQ prior to teardown.
549 */
550 static void
551 rpcrdma_clean_cq(struct ib_cq *cq)
552 {
553 struct ib_wc wc;
554 int count = 0;
555
556 while (1 == ib_poll_cq(cq, 1, &wc))
557 ++count;
558
559 if (count)
560 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
561 __func__, count, wc.opcode);
562 }
563
564 /*
565 * Exported functions.
566 */
567
568 /*
569 * Open and initialize an Interface Adapter.
570 * o initializes fields of struct rpcrdma_ia, including
571 * interface and provider attributes and protection zone.
572 */
573 int
574 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
575 {
576 int rc, mem_priv;
577 struct rpcrdma_ia *ia = &xprt->rx_ia;
578 struct ib_device_attr *devattr = &ia->ri_devattr;
579
580 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
581 if (IS_ERR(ia->ri_id)) {
582 rc = PTR_ERR(ia->ri_id);
583 goto out1;
584 }
585 ia->ri_device = ia->ri_id->device;
586
587 ia->ri_pd = ib_alloc_pd(ia->ri_device);
588 if (IS_ERR(ia->ri_pd)) {
589 rc = PTR_ERR(ia->ri_pd);
590 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
591 __func__, rc);
592 goto out2;
593 }
594
595 rc = ib_query_device(ia->ri_device, devattr);
596 if (rc) {
597 dprintk("RPC: %s: ib_query_device failed %d\n",
598 __func__, rc);
599 goto out3;
600 }
601
602 if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
603 ia->ri_have_dma_lkey = 1;
604 ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
605 }
606
607 if (memreg == RPCRDMA_FRMR) {
608 /* Requires both frmr reg and local dma lkey */
609 if (((devattr->device_cap_flags &
610 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
611 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
612 (devattr->max_fast_reg_page_list_len == 0)) {
613 dprintk("RPC: %s: FRMR registration "
614 "not supported by HCA\n", __func__);
615 memreg = RPCRDMA_MTHCAFMR;
616 }
617 }
618 if (memreg == RPCRDMA_MTHCAFMR) {
619 if (!ia->ri_device->alloc_fmr) {
620 dprintk("RPC: %s: MTHCAFMR registration "
621 "not supported by HCA\n", __func__);
622 memreg = RPCRDMA_ALLPHYSICAL;
623 }
624 }
625
626 /*
627 * Optionally obtain an underlying physical identity mapping in
628 * order to do a memory window-based bind. This base registration
629 * is protected from remote access - that is enabled only by binding
630 * for the specific bytes targeted during each RPC operation, and
631 * revoked after the corresponding completion similar to a storage
632 * adapter.
633 */
634 switch (memreg) {
635 case RPCRDMA_FRMR:
636 ia->ri_ops = &rpcrdma_frwr_memreg_ops;
637 break;
638 case RPCRDMA_ALLPHYSICAL:
639 ia->ri_ops = &rpcrdma_physical_memreg_ops;
640 mem_priv = IB_ACCESS_LOCAL_WRITE |
641 IB_ACCESS_REMOTE_WRITE |
642 IB_ACCESS_REMOTE_READ;
643 goto register_setup;
644 case RPCRDMA_MTHCAFMR:
645 ia->ri_ops = &rpcrdma_fmr_memreg_ops;
646 if (ia->ri_have_dma_lkey)
647 break;
648 mem_priv = IB_ACCESS_LOCAL_WRITE;
649 register_setup:
650 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
651 if (IS_ERR(ia->ri_bind_mem)) {
652 printk(KERN_ALERT "%s: ib_get_dma_mr for "
653 "phys register failed with %lX\n",
654 __func__, PTR_ERR(ia->ri_bind_mem));
655 rc = -ENOMEM;
656 goto out3;
657 }
658 break;
659 default:
660 printk(KERN_ERR "RPC: Unsupported memory "
661 "registration mode: %d\n", memreg);
662 rc = -ENOMEM;
663 goto out3;
664 }
665 dprintk("RPC: %s: memory registration strategy is '%s'\n",
666 __func__, ia->ri_ops->ro_displayname);
667
668 /* Else will do memory reg/dereg for each chunk */
669 ia->ri_memreg_strategy = memreg;
670
671 rwlock_init(&ia->ri_qplock);
672 return 0;
673
674 out3:
675 ib_dealloc_pd(ia->ri_pd);
676 ia->ri_pd = NULL;
677 out2:
678 rdma_destroy_id(ia->ri_id);
679 ia->ri_id = NULL;
680 out1:
681 return rc;
682 }
683
684 /*
685 * Clean up/close an IA.
686 * o if event handles and PD have been initialized, free them.
687 * o close the IA
688 */
689 void
690 rpcrdma_ia_close(struct rpcrdma_ia *ia)
691 {
692 int rc;
693
694 dprintk("RPC: %s: entering\n", __func__);
695 if (ia->ri_bind_mem != NULL) {
696 rc = ib_dereg_mr(ia->ri_bind_mem);
697 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
698 __func__, rc);
699 }
700
701 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
702 if (ia->ri_id->qp)
703 rdma_destroy_qp(ia->ri_id);
704 rdma_destroy_id(ia->ri_id);
705 ia->ri_id = NULL;
706 }
707
708 /* If the pd is still busy, xprtrdma missed freeing a resource */
709 if (ia->ri_pd && !IS_ERR(ia->ri_pd))
710 WARN_ON(ib_dealloc_pd(ia->ri_pd));
711 }
712
713 /*
714 * Create unconnected endpoint.
715 */
716 int
717 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
718 struct rpcrdma_create_data_internal *cdata)
719 {
720 struct ib_device_attr *devattr = &ia->ri_devattr;
721 struct ib_cq *sendcq, *recvcq;
722 int rc, err;
723
724 /* check provider's send/recv wr limits */
725 if (cdata->max_requests > devattr->max_qp_wr)
726 cdata->max_requests = devattr->max_qp_wr;
727
728 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
729 ep->rep_attr.qp_context = ep;
730 ep->rep_attr.srq = NULL;
731 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
732 rc = ia->ri_ops->ro_open(ia, ep, cdata);
733 if (rc)
734 return rc;
735 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
736 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
737 ep->rep_attr.cap.max_recv_sge = 1;
738 ep->rep_attr.cap.max_inline_data = 0;
739 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
740 ep->rep_attr.qp_type = IB_QPT_RC;
741 ep->rep_attr.port_num = ~0;
742
743 if (cdata->padding) {
744 ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
745 GFP_KERNEL);
746 if (IS_ERR(ep->rep_padbuf))
747 return PTR_ERR(ep->rep_padbuf);
748 } else
749 ep->rep_padbuf = NULL;
750
751 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
752 "iovs: send %d recv %d\n",
753 __func__,
754 ep->rep_attr.cap.max_send_wr,
755 ep->rep_attr.cap.max_recv_wr,
756 ep->rep_attr.cap.max_send_sge,
757 ep->rep_attr.cap.max_recv_sge);
758
759 /* set trigger for requesting send completion */
760 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
761 if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
762 ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
763 else if (ep->rep_cqinit <= 2)
764 ep->rep_cqinit = 0;
765 INIT_CQCOUNT(ep);
766 init_waitqueue_head(&ep->rep_connect_wait);
767 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
768
769 sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall,
770 rpcrdma_cq_async_error_upcall, ep,
771 ep->rep_attr.cap.max_send_wr + 1, 0);
772 if (IS_ERR(sendcq)) {
773 rc = PTR_ERR(sendcq);
774 dprintk("RPC: %s: failed to create send CQ: %i\n",
775 __func__, rc);
776 goto out1;
777 }
778
779 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
780 if (rc) {
781 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
782 __func__, rc);
783 goto out2;
784 }
785
786 recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall,
787 rpcrdma_cq_async_error_upcall, ep,
788 ep->rep_attr.cap.max_recv_wr + 1, 0);
789 if (IS_ERR(recvcq)) {
790 rc = PTR_ERR(recvcq);
791 dprintk("RPC: %s: failed to create recv CQ: %i\n",
792 __func__, rc);
793 goto out2;
794 }
795
796 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
797 if (rc) {
798 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
799 __func__, rc);
800 ib_destroy_cq(recvcq);
801 goto out2;
802 }
803
804 ep->rep_attr.send_cq = sendcq;
805 ep->rep_attr.recv_cq = recvcq;
806
807 /* Initialize cma parameters */
808
809 /* RPC/RDMA does not use private data */
810 ep->rep_remote_cma.private_data = NULL;
811 ep->rep_remote_cma.private_data_len = 0;
812
813 /* Client offers RDMA Read but does not initiate */
814 ep->rep_remote_cma.initiator_depth = 0;
815 if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */
816 ep->rep_remote_cma.responder_resources = 32;
817 else
818 ep->rep_remote_cma.responder_resources =
819 devattr->max_qp_rd_atom;
820
821 ep->rep_remote_cma.retry_count = 7;
822 ep->rep_remote_cma.flow_control = 0;
823 ep->rep_remote_cma.rnr_retry_count = 0;
824
825 return 0;
826
827 out2:
828 err = ib_destroy_cq(sendcq);
829 if (err)
830 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
831 __func__, err);
832 out1:
833 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
834 return rc;
835 }
836
837 /*
838 * rpcrdma_ep_destroy
839 *
840 * Disconnect and destroy endpoint. After this, the only
841 * valid operations on the ep are to free it (if dynamically
842 * allocated) or re-create it.
843 */
844 void
845 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
846 {
847 int rc;
848
849 dprintk("RPC: %s: entering, connected is %d\n",
850 __func__, ep->rep_connected);
851
852 cancel_delayed_work_sync(&ep->rep_connect_worker);
853
854 if (ia->ri_id->qp) {
855 rpcrdma_ep_disconnect(ep, ia);
856 rdma_destroy_qp(ia->ri_id);
857 ia->ri_id->qp = NULL;
858 }
859
860 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
861
862 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
863 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
864 if (rc)
865 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
866 __func__, rc);
867
868 rpcrdma_clean_cq(ep->rep_attr.send_cq);
869 rc = ib_destroy_cq(ep->rep_attr.send_cq);
870 if (rc)
871 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
872 __func__, rc);
873 }
874
875 /*
876 * Connect unconnected endpoint.
877 */
878 int
879 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
880 {
881 struct rdma_cm_id *id, *old;
882 int rc = 0;
883 int retry_count = 0;
884
885 if (ep->rep_connected != 0) {
886 struct rpcrdma_xprt *xprt;
887 retry:
888 dprintk("RPC: %s: reconnecting...\n", __func__);
889
890 rpcrdma_ep_disconnect(ep, ia);
891 rpcrdma_flush_cqs(ep);
892
893 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
894 id = rpcrdma_create_id(xprt, ia,
895 (struct sockaddr *)&xprt->rx_data.addr);
896 if (IS_ERR(id)) {
897 rc = -EHOSTUNREACH;
898 goto out;
899 }
900 /* TEMP TEMP TEMP - fail if new device:
901 * Deregister/remarshal *all* requests!
902 * Close and recreate adapter, pd, etc!
903 * Re-determine all attributes still sane!
904 * More stuff I haven't thought of!
905 * Rrrgh!
906 */
907 if (ia->ri_device != id->device) {
908 printk("RPC: %s: can't reconnect on "
909 "different device!\n", __func__);
910 rdma_destroy_id(id);
911 rc = -ENETUNREACH;
912 goto out;
913 }
914 /* END TEMP */
915 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
916 if (rc) {
917 dprintk("RPC: %s: rdma_create_qp failed %i\n",
918 __func__, rc);
919 rdma_destroy_id(id);
920 rc = -ENETUNREACH;
921 goto out;
922 }
923
924 write_lock(&ia->ri_qplock);
925 old = ia->ri_id;
926 ia->ri_id = id;
927 write_unlock(&ia->ri_qplock);
928
929 rdma_destroy_qp(old);
930 rdma_destroy_id(old);
931 } else {
932 dprintk("RPC: %s: connecting...\n", __func__);
933 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
934 if (rc) {
935 dprintk("RPC: %s: rdma_create_qp failed %i\n",
936 __func__, rc);
937 /* do not update ep->rep_connected */
938 return -ENETUNREACH;
939 }
940 }
941
942 ep->rep_connected = 0;
943
944 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
945 if (rc) {
946 dprintk("RPC: %s: rdma_connect() failed with %i\n",
947 __func__, rc);
948 goto out;
949 }
950
951 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
952
953 /*
954 * Check state. A non-peer reject indicates no listener
955 * (ECONNREFUSED), which may be a transient state. All
956 	 * others indicate a transport condition that has already
957 	 * undergone a best-effort recovery attempt.
958 */
959 if (ep->rep_connected == -ECONNREFUSED &&
960 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
961 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
962 goto retry;
963 }
964 if (ep->rep_connected <= 0) {
965 /* Sometimes, the only way to reliably connect to remote
966 		 * CMs is to use the same nonzero values for ORD and IRD. */
967 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
968 (ep->rep_remote_cma.responder_resources == 0 ||
969 ep->rep_remote_cma.initiator_depth !=
970 ep->rep_remote_cma.responder_resources)) {
971 if (ep->rep_remote_cma.responder_resources == 0)
972 ep->rep_remote_cma.responder_resources = 1;
973 ep->rep_remote_cma.initiator_depth =
974 ep->rep_remote_cma.responder_resources;
975 goto retry;
976 }
977 rc = ep->rep_connected;
978 } else {
979 dprintk("RPC: %s: connected\n", __func__);
980 }
981
982 out:
983 if (rc)
984 ep->rep_connected = rc;
985 return rc;
986 }
987
988 /*
989 * rpcrdma_ep_disconnect
990 *
991 * This is separate from destroy to facilitate the ability
992 * to reconnect without recreating the endpoint.
993 *
994 * This call is not reentrant, and must not be made in parallel
995 * on the same endpoint.
996 */
997 void
998 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
999 {
1000 int rc;
1001
1002 rpcrdma_flush_cqs(ep);
1003 rc = rdma_disconnect(ia->ri_id);
1004 if (!rc) {
1005 /* returns without wait if not connected */
1006 wait_event_interruptible(ep->rep_connect_wait,
1007 ep->rep_connected != 1);
1008 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
1009 (ep->rep_connected == 1) ? "still " : "dis");
1010 } else {
1011 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
1012 ep->rep_connected = rc;
1013 }
1014 }
1015
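/*
 * Allocate an rpcrdma_req and point it at the transport's buffer pool.
 */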
1016 static struct rpcrdma_req *
1017 rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
1018 {
1019 struct rpcrdma_req *req;
1020
1021 req = kzalloc(sizeof(*req), GFP_KERNEL);
1022 if (req == NULL)
1023 return ERR_PTR(-ENOMEM);
1024
1025 req->rl_buffer = &r_xprt->rx_buf;
1026 return req;
1027 }
1028
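/*
 * Allocate an rpcrdma_rep along with a registered receive buffer sized
 * to the inline reply threshold.
 */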
1029 static struct rpcrdma_rep *
1030 rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
1031 {
1032 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1033 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1034 struct rpcrdma_rep *rep;
1035 int rc;
1036
1037 rc = -ENOMEM;
1038 rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1039 if (rep == NULL)
1040 goto out;
1041
1042 rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
1043 GFP_KERNEL);
1044 if (IS_ERR(rep->rr_rdmabuf)) {
1045 rc = PTR_ERR(rep->rr_rdmabuf);
1046 goto out_free;
1047 }
1048
1049 rep->rr_device = ia->ri_device;
1050 rep->rr_rxprt = r_xprt;
1051 return rep;
1052
1053 out_free:
1054 kfree(rep);
1055 out:
1056 return ERR_PTR(rc);
1057 }
1058
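/*
 * Allocate the transport's buffer pools: rb_max_requests rpcrdma_req
 * structures, the same number of rpcrdma_rep structures, and whatever
 * resources the memory registration ops require.
 */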
1059 int
1060 rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1061 {
1062 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1063 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1064 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1065 char *p;
1066 size_t len;
1067 int i, rc;
1068
1069 buf->rb_max_requests = cdata->max_requests;
1070 spin_lock_init(&buf->rb_lock);
1071
1072 /* Need to allocate:
1073 * 1. arrays for send and recv pointers
1074 * 2. arrays of struct rpcrdma_req to fill in pointers
1075 * 3. array of struct rpcrdma_rep for replies
1076 * Send/recv buffers in req/rep need to be registered
1077 */
1078 len = buf->rb_max_requests *
1079 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1080
1081 p = kzalloc(len, GFP_KERNEL);
1082 if (p == NULL) {
1083 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1084 __func__, len);
1085 rc = -ENOMEM;
1086 goto out;
1087 }
1088 buf->rb_pool = p; /* for freeing it later */
1089
1090 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1091 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1092 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1093 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1094
1095 rc = ia->ri_ops->ro_init(r_xprt);
1096 if (rc)
1097 goto out;
1098
1099 for (i = 0; i < buf->rb_max_requests; i++) {
1100 struct rpcrdma_req *req;
1101 struct rpcrdma_rep *rep;
1102
1103 req = rpcrdma_create_req(r_xprt);
1104 if (IS_ERR(req)) {
1105 dprintk("RPC: %s: request buffer %d alloc"
1106 " failed\n", __func__, i);
1107 rc = PTR_ERR(req);
1108 goto out;
1109 }
1110 buf->rb_send_bufs[i] = req;
1111
1112 rep = rpcrdma_create_rep(r_xprt);
1113 if (IS_ERR(rep)) {
1114 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1115 __func__, i);
1116 rc = PTR_ERR(rep);
1117 goto out;
1118 }
1119 buf->rb_recv_bufs[i] = rep;
1120 }
1121
1122 return 0;
1123 out:
1124 rpcrdma_buffer_destroy(buf);
1125 return rc;
1126 }
1127
1128 static void
1129 rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
1130 {
1131 if (!rep)
1132 return;
1133
1134 rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
1135 kfree(rep);
1136 }
1137
1138 static void
1139 rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
1140 {
1141 if (!req)
1142 return;
1143
1144 rpcrdma_free_regbuf(ia, req->rl_sendbuf);
1145 rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
1146 kfree(req);
1147 }
1148
1149 void
1150 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1151 {
1152 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1153 int i;
1154
1155 /* clean up in reverse order from create
1156 * 1. recv mr memory (mr free, then kfree)
1157 * 2. send mr memory (mr free, then kfree)
1158 * 3. MWs
1159 */
1160 dprintk("RPC: %s: entering\n", __func__);
1161
1162 for (i = 0; i < buf->rb_max_requests; i++) {
1163 if (buf->rb_recv_bufs)
1164 rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
1165 if (buf->rb_send_bufs)
1166 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
1167 }
1168
1169 ia->ri_ops->ro_destroy(buf);
1170
1171 kfree(buf->rb_pool);
1172 }
1173
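/*
 * rpcrdma_get_mw and rpcrdma_put_mw take and return MWs from the
 * buffer pool's rb_mws list under rb_lock.
 */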
1174 struct rpcrdma_mw *
1175 rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
1176 {
1177 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1178 struct rpcrdma_mw *mw = NULL;
1179 unsigned long flags;
1180
1181 spin_lock_irqsave(&buf->rb_lock, flags);
1182 if (!list_empty(&buf->rb_mws)) {
1183 mw = list_first_entry(&buf->rb_mws,
1184 struct rpcrdma_mw, mw_list);
1185 list_del_init(&mw->mw_list);
1186 }
1187 spin_unlock_irqrestore(&buf->rb_lock, flags);
1188
1189 if (!mw)
1190 pr_err("RPC: %s: no MWs available\n", __func__);
1191 return mw;
1192 }
1193
1194 void
1195 rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
1196 {
1197 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1198 unsigned long flags;
1199
1200 spin_lock_irqsave(&buf->rb_lock, flags);
1201 list_add_tail(&mw->mw_list, &buf->rb_mws);
1202 spin_unlock_irqrestore(&buf->rb_lock, flags);
1203 }
1204
1205 static void
1206 rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1207 {
1208 buf->rb_send_bufs[--buf->rb_send_index] = req;
1209 req->rl_niovs = 0;
1210 if (req->rl_reply) {
1211 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1212 req->rl_reply = NULL;
1213 }
1214 }
1215
1216 /*
1217 * Get a set of request/reply buffers.
1218 *
1219 * Reply buffer (if needed) is attached to send buffer upon return.
1220 * Rule:
1221 * rb_send_index and rb_recv_index MUST always be pointing to the
1222 * *next* available buffer (non-NULL). They are incremented after
1223 * removing buffers, and decremented *before* returning them.
1224 */
1225 struct rpcrdma_req *
1226 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1227 {
1228 struct rpcrdma_req *req;
1229 unsigned long flags;
1230
1231 spin_lock_irqsave(&buffers->rb_lock, flags);
1232
1233 if (buffers->rb_send_index == buffers->rb_max_requests) {
1234 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1235 dprintk("RPC: %s: out of request buffers\n", __func__);
1237 		return NULL;
1237 }
1238
1239 req = buffers->rb_send_bufs[buffers->rb_send_index];
1240 if (buffers->rb_send_index < buffers->rb_recv_index) {
1241 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1242 __func__,
1243 buffers->rb_recv_index - buffers->rb_send_index);
1244 req->rl_reply = NULL;
1245 } else {
1246 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1247 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1248 }
1249 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1250
1251 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1252 return req;
1253 }
1254
1255 /*
1256 * Put request/reply buffers back into pool.
1257 * Pre-decrement counter/array index.
1258 */
1259 void
1260 rpcrdma_buffer_put(struct rpcrdma_req *req)
1261 {
1262 struct rpcrdma_buffer *buffers = req->rl_buffer;
1263 unsigned long flags;
1264
1265 spin_lock_irqsave(&buffers->rb_lock, flags);
1266 rpcrdma_buffer_put_sendbuf(req, buffers);
1267 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1268 }
1269
1270 /*
1271 * Recover reply buffers from pool.
1272 * This happens when recovering from error conditions.
1273 * Post-increment counter/array index.
1274 */
1275 void
1276 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1277 {
1278 struct rpcrdma_buffer *buffers = req->rl_buffer;
1279 unsigned long flags;
1280
1281 spin_lock_irqsave(&buffers->rb_lock, flags);
1282 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1283 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1284 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1285 }
1286 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1287 }
1288
1289 /*
1290 * Put reply buffers back into pool when not attached to
1291 * request. This happens in error conditions.
1292 */
1293 void
1294 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1295 {
1296 struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
1297 unsigned long flags;
1298
1299 spin_lock_irqsave(&buffers->rb_lock, flags);
1300 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1301 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1302 }
1303
1304 /*
1305 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1306 */
1307
1308 void
1309 rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
1310 {
1311 dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
1312 seg->mr_offset,
1313 (unsigned long long)seg->mr_dma, seg->mr_dmalen);
1314 }
1315
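/*
 * DMA-map a kmalloc'd buffer and supply an lkey for it: the device's
 * local DMA lkey if available, otherwise the DMA MR set up at IA open
 * time, otherwise a physical MR registered here for LOCAL_WRITE.
 */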
1316 static int
1317 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1318 struct ib_mr **mrp, struct ib_sge *iov)
1319 {
1320 struct ib_phys_buf ipb;
1321 struct ib_mr *mr;
1322 int rc;
1323
1324 /*
1325 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1326 */
1327 iov->addr = ib_dma_map_single(ia->ri_device,
1328 va, len, DMA_BIDIRECTIONAL);
1329 if (ib_dma_mapping_error(ia->ri_device, iov->addr))
1330 return -ENOMEM;
1331
1332 iov->length = len;
1333
1334 if (ia->ri_have_dma_lkey) {
1335 *mrp = NULL;
1336 iov->lkey = ia->ri_dma_lkey;
1337 return 0;
1338 } else if (ia->ri_bind_mem != NULL) {
1339 *mrp = NULL;
1340 iov->lkey = ia->ri_bind_mem->lkey;
1341 return 0;
1342 }
1343
1344 ipb.addr = iov->addr;
1345 ipb.size = iov->length;
1346 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1347 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1348
1349 dprintk("RPC: %s: phys convert: 0x%llx "
1350 "registered 0x%llx length %d\n",
1351 __func__, (unsigned long long)ipb.addr,
1352 (unsigned long long)iov->addr, len);
1353
1354 if (IS_ERR(mr)) {
1355 *mrp = NULL;
1356 rc = PTR_ERR(mr);
1357 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1358 } else {
1359 *mrp = mr;
1360 iov->lkey = mr->lkey;
1361 rc = 0;
1362 }
1363
1364 return rc;
1365 }
1366
1367 static int
1368 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1369 struct ib_mr *mr, struct ib_sge *iov)
1370 {
1371 int rc;
1372
1373 ib_dma_unmap_single(ia->ri_device,
1374 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1375
1376 	if (mr == NULL)
1377 return 0;
1378
1379 rc = ib_dereg_mr(mr);
1380 if (rc)
1381 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1382 return rc;
1383 }
1384
1385 /**
1386 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
1387 * @ia: controlling rpcrdma_ia
1388 * @size: size of buffer to be allocated, in bytes
1389 * @flags: GFP flags
1390 *
1391 * Returns pointer to private header of an area of internally
1392 * registered memory, or an ERR_PTR. The registered buffer follows
1393 * the end of the private header.
1394 *
1395 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
1396 * receiving the payload of RDMA RECV operations. regbufs are not
1397 * used for RDMA READ/WRITE operations, thus are registered only for
1398 * LOCAL access.
1399 */
1400 struct rpcrdma_regbuf *
1401 rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
1402 {
1403 struct rpcrdma_regbuf *rb;
1404 int rc;
1405
1406 rc = -ENOMEM;
1407 rb = kmalloc(sizeof(*rb) + size, flags);
1408 if (rb == NULL)
1409 goto out;
1410
1411 rb->rg_size = size;
1412 rb->rg_owner = NULL;
1413 rc = rpcrdma_register_internal(ia, rb->rg_base, size,
1414 &rb->rg_mr, &rb->rg_iov);
1415 if (rc)
1416 goto out_free;
1417
1418 return rb;
1419
1420 out_free:
1421 kfree(rb);
1422 out:
1423 return ERR_PTR(rc);
1424 }
1425
1426 /**
1427 * rpcrdma_free_regbuf - deregister and free registered buffer
1428 * @ia: controlling rpcrdma_ia
1429 * @rb: regbuf to be deregistered and freed
1430 */
1431 void
1432 rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
1433 {
1434 if (rb) {
1435 rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
1436 kfree(rb);
1437 }
1438 }
1439
1440 /*
1441 * Prepost any receive buffer, then post send.
1442 *
1443 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1444 */
1445 int
1446 rpcrdma_ep_post(struct rpcrdma_ia *ia,
1447 struct rpcrdma_ep *ep,
1448 struct rpcrdma_req *req)
1449 {
1450 struct ib_send_wr send_wr, *send_wr_fail;
1451 struct rpcrdma_rep *rep = req->rl_reply;
1452 int rc;
1453
1454 if (rep) {
1455 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1456 if (rc)
1457 goto out;
1458 req->rl_reply = NULL;
1459 }
1460
1461 send_wr.next = NULL;
1462 send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
1463 send_wr.sg_list = req->rl_send_iov;
1464 send_wr.num_sge = req->rl_niovs;
1465 send_wr.opcode = IB_WR_SEND;
1466 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1467 ib_dma_sync_single_for_device(ia->ri_device,
1468 req->rl_send_iov[3].addr,
1469 req->rl_send_iov[3].length,
1470 DMA_TO_DEVICE);
1471 ib_dma_sync_single_for_device(ia->ri_device,
1472 req->rl_send_iov[1].addr,
1473 req->rl_send_iov[1].length,
1474 DMA_TO_DEVICE);
1475 ib_dma_sync_single_for_device(ia->ri_device,
1476 req->rl_send_iov[0].addr,
1477 req->rl_send_iov[0].length,
1478 DMA_TO_DEVICE);
1479
1480 if (DECR_CQCOUNT(ep) > 0)
1481 send_wr.send_flags = 0;
1482 else { /* Provider must take a send completion every now and then */
1483 INIT_CQCOUNT(ep);
1484 send_wr.send_flags = IB_SEND_SIGNALED;
1485 }
1486
1487 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1488 if (rc)
1489 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1490 rc);
1491 out:
1492 return rc;
1493 }
1494
1495 /*
1496 * (Re)post a receive buffer.
1497 */
1498 int
1499 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1500 struct rpcrdma_ep *ep,
1501 struct rpcrdma_rep *rep)
1502 {
1503 struct ib_recv_wr recv_wr, *recv_wr_fail;
1504 int rc;
1505
1506 recv_wr.next = NULL;
1507 recv_wr.wr_id = (u64) (unsigned long) rep;
1508 recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
1509 recv_wr.num_sge = 1;
1510
1511 ib_dma_sync_single_for_cpu(ia->ri_device,
1512 rdmab_addr(rep->rr_rdmabuf),
1513 rdmab_length(rep->rr_rdmabuf),
1514 DMA_BIDIRECTIONAL);
1515
1516 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1517
1518 if (rc)
1519 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1520 rc);
1521 return rc;
1522 }
1523
1524 /* How many chunk list items fit within our inline buffers?
1525 */
1526 unsigned int
1527 rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
1528 {
1529 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1530 int bytes, segments;
1531
1532 bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
1533 bytes -= RPCRDMA_HDRLEN_MIN;
1534 if (bytes < sizeof(struct rpcrdma_segment) * 2) {
1535 pr_warn("RPC: %s: inline threshold too small\n",
1536 __func__);
1537 return 0;
1538 }
1539
1540 segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
1541 dprintk("RPC: %s: max chunk list size = %d segments\n",
1542 __func__, segments);
1543 return segments;
1544 }