net/sunrpc/xprtrdma/verbs.c
1 /*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 /*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50 #include <linux/interrupt.h>
51 #include <linux/slab.h>
52 #include <linux/prefetch.h>
53 #include <linux/sunrpc/addr.h>
54 #include <asm/bitops.h>
55
56 #include "xprt_rdma.h"
57
58 /*
59 * Globals/Macros
60 */
61
62 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
63 # define RPCDBG_FACILITY RPCDBG_TRANS
64 #endif
65
66 static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
67 static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
68
69 /*
70 * internal functions
71 */
72
73 /*
74 * handle replies in tasklet context, using a single, global list
75 * rdma tasklet function -- just turn around and call the func
76 * for all replies on the list
77 */
78
79 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
80 static LIST_HEAD(rpcrdma_tasklets_g);
81
82 static void
83 rpcrdma_run_tasklet(unsigned long data)
84 {
85 struct rpcrdma_rep *rep;
86 void (*func)(struct rpcrdma_rep *);
87 unsigned long flags;
88
89 	data = data;	/* tasklet data argument is unused */
90 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
91 while (!list_empty(&rpcrdma_tasklets_g)) {
92 rep = list_entry(rpcrdma_tasklets_g.next,
93 struct rpcrdma_rep, rr_list);
94 list_del(&rep->rr_list);
95 func = rep->rr_func;
96 rep->rr_func = NULL;
97 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
98
99 if (func)
100 func(rep);
101 else
102 rpcrdma_recv_buffer_put(rep);
103
104 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
105 }
106 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
107 }
108
109 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
110
111 static const char * const async_event[] = {
112 "CQ error",
113 "QP fatal error",
114 "QP request error",
115 "QP access error",
116 "communication established",
117 "send queue drained",
118 "path migration successful",
119 "path mig error",
120 "device fatal error",
121 "port active",
122 "port error",
123 "LID change",
124 "P_key change",
125 "SM change",
126 "SRQ error",
127 "SRQ limit reached",
128 "last WQE reached",
129 "client reregister",
130 "GID change",
131 };
132
133 #define ASYNC_MSG(status) \
134 ((status) < ARRAY_SIZE(async_event) ? \
135 async_event[(status)] : "unknown async error")
136
137 static void
138 rpcrdma_schedule_tasklet(struct list_head *sched_list)
139 {
140 unsigned long flags;
141
142 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
143 list_splice_tail(sched_list, &rpcrdma_tasklets_g);
144 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
145 tasklet_schedule(&rpcrdma_tasklet_g);
146 }
147
148 static void
149 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
150 {
151 struct rpcrdma_ep *ep = context;
152
153 pr_err("RPC: %s: %s on device %s ep %p\n",
154 __func__, ASYNC_MSG(event->event),
155 event->device->name, context);
156 if (ep->rep_connected == 1) {
157 ep->rep_connected = -EIO;
158 rpcrdma_conn_func(ep);
159 wake_up_all(&ep->rep_connect_wait);
160 }
161 }
162
163 static void
164 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
165 {
166 struct rpcrdma_ep *ep = context;
167
168 pr_err("RPC: %s: %s on device %s ep %p\n",
169 __func__, ASYNC_MSG(event->event),
170 event->device->name, context);
171 if (ep->rep_connected == 1) {
172 ep->rep_connected = -EIO;
173 rpcrdma_conn_func(ep);
174 wake_up_all(&ep->rep_connect_wait);
175 }
176 }
177
178 static const char * const wc_status[] = {
179 "success",
180 "local length error",
181 "local QP operation error",
182 "local EE context operation error",
183 "local protection error",
184 "WR flushed",
185 "memory management operation error",
186 "bad response error",
187 "local access error",
188 "remote invalid request error",
189 "remote access error",
190 "remote operation error",
191 "transport retry counter exceeded",
192 "RNR retrycounter exceeded",
193 "local RDD violation error",
194 "remove invalid RD request",
195 "operation aborted",
196 "invalid EE context number",
197 "invalid EE context state",
198 "fatal error",
199 "response timeout error",
200 "general error",
201 };
202
203 #define COMPLETION_MSG(status) \
204 ((status) < ARRAY_SIZE(wc_status) ? \
205 wc_status[(status)] : "unexpected completion error")
206
207 static void
208 rpcrdma_sendcq_process_wc(struct ib_wc *wc)
209 {
210 if (likely(wc->status == IB_WC_SUCCESS))
211 return;
212
213 /* WARNING: Only wr_id and status are reliable at this point */
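	/* A plain RDMA SEND is posted with wr_id 0 (see rpcrdma_ep_post);
	 * FASTREG and LOCAL_INV WRs instead carry a pointer to their
	 * rpcrdma_mw, which is marked stale below when the WR completes
	 * in error.
	 */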
214 if (wc->wr_id == 0ULL) {
215 if (wc->status != IB_WC_WR_FLUSH_ERR)
216 pr_err("RPC: %s: SEND: %s\n",
217 __func__, COMPLETION_MSG(wc->status));
218 } else {
219 struct rpcrdma_mw *r;
220
221 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
222 r->r.frmr.fr_state = FRMR_IS_STALE;
223 pr_err("RPC: %s: frmr %p (stale): %s\n",
224 __func__, r, COMPLETION_MSG(wc->status));
225 }
226 }
227
228 static int
229 rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
230 {
231 struct ib_wc *wcs;
232 int budget, count, rc;
233
234 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
235 do {
236 wcs = ep->rep_send_wcs;
237
238 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
239 if (rc <= 0)
240 return rc;
241
242 count = rc;
243 while (count-- > 0)
244 rpcrdma_sendcq_process_wc(wcs++);
245 } while (rc == RPCRDMA_POLLSIZE && --budget);
246 return 0;
247 }
248
249 /*
250 * Handle send, fast_reg_mr, and local_inv completions.
251 *
252 * Send events are typically suppressed and thus do not result
253 * in an upcall. Occasionally one is signaled, however. This
254 * prevents the provider's completion queue from wrapping and
255 * losing a completion.
256 */
257 static void
258 rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
259 {
260 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
261 int rc;
262
263 rc = rpcrdma_sendcq_poll(cq, ep);
264 if (rc) {
265 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
266 __func__, rc);
267 return;
268 }
269
270 rc = ib_req_notify_cq(cq,
271 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
272 if (rc == 0)
273 return;
274 if (rc < 0) {
275 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
276 __func__, rc);
277 return;
278 }
279
280 rpcrdma_sendcq_poll(cq, ep);
281 }
282
283 static void
284 rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
285 {
286 struct rpcrdma_rep *rep =
287 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
288
289 /* WARNING: Only wr_id and status are reliable at this point */
290 if (wc->status != IB_WC_SUCCESS)
291 goto out_fail;
292
293 /* status == SUCCESS means all fields in wc are trustworthy */
294 if (wc->opcode != IB_WC_RECV)
295 return;
296
297 dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
298 __func__, rep, wc->byte_len);
299
300 rep->rr_len = wc->byte_len;
301 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
302 rdmab_addr(rep->rr_rdmabuf),
303 rep->rr_len, DMA_FROM_DEVICE);
304 prefetch(rdmab_to_msg(rep->rr_rdmabuf));
305
306 out_schedule:
307 list_add_tail(&rep->rr_list, sched_list);
308 return;
309 out_fail:
310 if (wc->status != IB_WC_WR_FLUSH_ERR)
311 pr_err("RPC: %s: rep %p: %s\n",
312 __func__, rep, COMPLETION_MSG(wc->status));
313 rep->rr_len = ~0U;
314 goto out_schedule;
315 }
316
317 static int
318 rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
319 {
320 struct list_head sched_list;
321 struct ib_wc *wcs;
322 int budget, count, rc;
323
324 INIT_LIST_HEAD(&sched_list);
325 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
326 do {
327 wcs = ep->rep_recv_wcs;
328
329 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
330 if (rc <= 0)
331 goto out_schedule;
332
333 count = rc;
334 while (count-- > 0)
335 rpcrdma_recvcq_process_wc(wcs++, &sched_list);
336 } while (rc == RPCRDMA_POLLSIZE && --budget);
337 rc = 0;
338
339 out_schedule:
340 rpcrdma_schedule_tasklet(&sched_list);
341 return rc;
342 }
343
344 /*
345 * Handle receive completions.
346 *
347 * It is reentrant but processes single events in order to maintain
348 * the ordering of receives, which preserves the server's credit accounting.
349 *
350 * It is the responsibility of the scheduled tasklet to return
351 * recv buffers to the pool. NOTE: this affects synchronization of
352 * connection shutdown. That is, the structures required for
353 * the completion of the reply handler must remain intact until
354 * all memory has been reclaimed.
355 */
356 static void
357 rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
358 {
359 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
360 int rc;
361
362 rc = rpcrdma_recvcq_poll(cq, ep);
363 if (rc) {
364 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
365 __func__, rc);
366 return;
367 }
368
369 rc = ib_req_notify_cq(cq,
370 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
371 if (rc == 0)
372 return;
373 if (rc < 0) {
374 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
375 __func__, rc);
376 return;
377 }
378
379 rpcrdma_recvcq_poll(cq, ep);
380 }
381
382 static void
383 rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
384 {
385 struct ib_wc wc;
386 LIST_HEAD(sched_list);
387
388 while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
389 rpcrdma_recvcq_process_wc(&wc, &sched_list);
390 if (!list_empty(&sched_list))
391 rpcrdma_schedule_tasklet(&sched_list);
392 while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
393 rpcrdma_sendcq_process_wc(&wc);
394 }
395
396 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
397 static const char * const conn[] = {
398 "address resolved",
399 "address error",
400 "route resolved",
401 "route error",
402 "connect request",
403 "connect response",
404 "connect error",
405 "unreachable",
406 "rejected",
407 "established",
408 "disconnected",
409 "device removal",
410 "multicast join",
411 "multicast error",
412 "address change",
413 "timewait exit",
414 };
415
416 #define CONNECTION_MSG(status) \
417 ((status) < ARRAY_SIZE(conn) ? \
418 conn[(status)] : "unrecognized connection error")
419 #endif
420
421 static int
422 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
423 {
424 struct rpcrdma_xprt *xprt = id->context;
425 struct rpcrdma_ia *ia = &xprt->rx_ia;
426 struct rpcrdma_ep *ep = &xprt->rx_ep;
427 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
428 struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
429 #endif
430 struct ib_qp_attr *attr = &ia->ri_qp_attr;
431 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
432 int connstate = 0;
433
434 switch (event->event) {
435 case RDMA_CM_EVENT_ADDR_RESOLVED:
436 case RDMA_CM_EVENT_ROUTE_RESOLVED:
437 ia->ri_async_rc = 0;
438 complete(&ia->ri_done);
439 break;
440 case RDMA_CM_EVENT_ADDR_ERROR:
441 ia->ri_async_rc = -EHOSTUNREACH;
442 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
443 __func__, ep);
444 complete(&ia->ri_done);
445 break;
446 case RDMA_CM_EVENT_ROUTE_ERROR:
447 ia->ri_async_rc = -ENETUNREACH;
448 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
449 __func__, ep);
450 complete(&ia->ri_done);
451 break;
452 case RDMA_CM_EVENT_ESTABLISHED:
453 connstate = 1;
454 ib_query_qp(ia->ri_id->qp, attr,
455 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
456 iattr);
457 dprintk("RPC: %s: %d responder resources"
458 " (%d initiator)\n",
459 __func__, attr->max_dest_rd_atomic,
460 attr->max_rd_atomic);
461 goto connected;
462 case RDMA_CM_EVENT_CONNECT_ERROR:
463 connstate = -ENOTCONN;
464 goto connected;
465 case RDMA_CM_EVENT_UNREACHABLE:
466 connstate = -ENETDOWN;
467 goto connected;
468 case RDMA_CM_EVENT_REJECTED:
469 connstate = -ECONNREFUSED;
470 goto connected;
471 case RDMA_CM_EVENT_DISCONNECTED:
472 connstate = -ECONNABORTED;
473 goto connected;
474 case RDMA_CM_EVENT_DEVICE_REMOVAL:
475 connstate = -ENODEV;
476 connected:
477 dprintk("RPC: %s: %sconnected\n",
478 __func__, connstate > 0 ? "" : "dis");
479 ep->rep_connected = connstate;
480 rpcrdma_conn_func(ep);
481 wake_up_all(&ep->rep_connect_wait);
482 /*FALLTHROUGH*/
483 default:
484 dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
485 __func__, sap, rpc_get_port(sap), ep,
486 CONNECTION_MSG(event->event));
487 break;
488 }
489
490 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
491 if (connstate == 1) {
492 int ird = attr->max_dest_rd_atomic;
493 int tird = ep->rep_remote_cma.responder_resources;
494
495 pr_info("rpcrdma: connection to %pIS:%u on %s, memreg %d slots %d ird %d%s\n",
496 sap, rpc_get_port(sap),
497 ia->ri_id->device->name,
498 ia->ri_memreg_strategy,
499 xprt->rx_buf.rb_max_requests,
500 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
501 } else if (connstate < 0) {
502 pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
503 sap, rpc_get_port(sap), connstate);
504 }
505 #endif
506
507 return 0;
508 }
509
510 static struct rdma_cm_id *
511 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
512 struct rpcrdma_ia *ia, struct sockaddr *addr)
513 {
514 struct rdma_cm_id *id;
515 int rc;
516
517 init_completion(&ia->ri_done);
518
519 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
520 if (IS_ERR(id)) {
521 rc = PTR_ERR(id);
522 dprintk("RPC: %s: rdma_create_id() failed %i\n",
523 __func__, rc);
524 return id;
525 }
526
527 ia->ri_async_rc = -ETIMEDOUT;
528 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
529 if (rc) {
530 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
531 __func__, rc);
532 goto out;
533 }
534 wait_for_completion_interruptible_timeout(&ia->ri_done,
535 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
536 rc = ia->ri_async_rc;
537 if (rc)
538 goto out;
539
540 ia->ri_async_rc = -ETIMEDOUT;
541 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
542 if (rc) {
543 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
544 __func__, rc);
545 goto out;
546 }
547 wait_for_completion_interruptible_timeout(&ia->ri_done,
548 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
549 rc = ia->ri_async_rc;
550 if (rc)
551 goto out;
552
553 return id;
554
555 out:
556 rdma_destroy_id(id);
557 return ERR_PTR(rc);
558 }
559
560 /*
561 * Drain any cq, prior to teardown.
562 */
563 static void
564 rpcrdma_clean_cq(struct ib_cq *cq)
565 {
566 struct ib_wc wc;
567 int count = 0;
568
569 while (1 == ib_poll_cq(cq, 1, &wc))
570 ++count;
571
572 if (count)
573 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
574 __func__, count, wc.opcode);
575 }
576
577 /*
578 * Exported functions.
579 */
580
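/* A rough sketch of how the transport code uses these entry points
 * (not a formal API contract): setup calls rpcrdma_ia_open(), then
 * rpcrdma_ep_create() and rpcrdma_buffer_create(), and finally
 * rpcrdma_ep_connect().  For each RPC, rpcrdma_buffer_get() supplies
 * req/rep buffers and MWs, rpcrdma_register_external() maps chunks,
 * and rpcrdma_ep_post() posts the SEND (with an optional RECV).
 * Teardown runs in the reverse order of setup.
 */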
581 /*
582 * Open and initialize an Interface Adapter.
583 * o initializes fields of struct rpcrdma_ia, including
584 * interface and provider attributes and protection zone.
585 */
586 int
587 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
588 {
589 int rc, mem_priv;
590 struct rpcrdma_ia *ia = &xprt->rx_ia;
591 struct ib_device_attr *devattr = &ia->ri_devattr;
592
593 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
594 if (IS_ERR(ia->ri_id)) {
595 rc = PTR_ERR(ia->ri_id);
596 goto out1;
597 }
598
599 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
600 if (IS_ERR(ia->ri_pd)) {
601 rc = PTR_ERR(ia->ri_pd);
602 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
603 __func__, rc);
604 goto out2;
605 }
606
607 rc = ib_query_device(ia->ri_id->device, devattr);
608 if (rc) {
609 dprintk("RPC: %s: ib_query_device failed %d\n",
610 __func__, rc);
611 goto out3;
612 }
613
614 if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
615 ia->ri_have_dma_lkey = 1;
616 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
617 }
618
619 if (memreg == RPCRDMA_FRMR) {
620 /* Requires both frmr reg and local dma lkey */
621 if ((devattr->device_cap_flags &
622 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
623 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
624 dprintk("RPC: %s: FRMR registration "
625 "not supported by HCA\n", __func__);
626 memreg = RPCRDMA_MTHCAFMR;
627 } else {
628 /* Mind the ia limit on FRMR page list depth */
629 ia->ri_max_frmr_depth = min_t(unsigned int,
630 RPCRDMA_MAX_DATA_SEGS,
631 devattr->max_fast_reg_page_list_len);
632 }
633 }
634 if (memreg == RPCRDMA_MTHCAFMR) {
635 if (!ia->ri_id->device->alloc_fmr) {
636 dprintk("RPC: %s: MTHCAFMR registration "
637 "not supported by HCA\n", __func__);
638 memreg = RPCRDMA_ALLPHYSICAL;
639 }
640 }
641
642 /*
643 * Optionally obtain an underlying physical identity mapping in
644 * order to do a memory window-based bind. This base registration
645 * is protected from remote access - that is enabled only by binding
646 * for the specific bytes targeted during each RPC operation, and
647 * revoked after the corresponding completion similar to a storage
648 * adapter.
649 */
650 switch (memreg) {
651 case RPCRDMA_FRMR:
652 break;
653 case RPCRDMA_ALLPHYSICAL:
654 mem_priv = IB_ACCESS_LOCAL_WRITE |
655 IB_ACCESS_REMOTE_WRITE |
656 IB_ACCESS_REMOTE_READ;
657 goto register_setup;
658 case RPCRDMA_MTHCAFMR:
659 if (ia->ri_have_dma_lkey)
660 break;
661 mem_priv = IB_ACCESS_LOCAL_WRITE;
662 register_setup:
663 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
664 if (IS_ERR(ia->ri_bind_mem)) {
665 printk(KERN_ALERT "%s: ib_get_dma_mr for "
666 "phys register failed with %lX\n",
667 __func__, PTR_ERR(ia->ri_bind_mem));
668 rc = -ENOMEM;
669 goto out3;
670 }
671 break;
672 default:
673 printk(KERN_ERR "RPC: Unsupported memory "
674 "registration mode: %d\n", memreg);
675 rc = -ENOMEM;
676 goto out3;
677 }
678 dprintk("RPC: %s: memory registration strategy is %d\n",
679 __func__, memreg);
680
681 /* Else will do memory reg/dereg for each chunk */
682 ia->ri_memreg_strategy = memreg;
683
684 rwlock_init(&ia->ri_qplock);
685 return 0;
686
687 out3:
688 ib_dealloc_pd(ia->ri_pd);
689 ia->ri_pd = NULL;
690 out2:
691 rdma_destroy_id(ia->ri_id);
692 ia->ri_id = NULL;
693 out1:
694 return rc;
695 }
696
697 /*
698 * Clean up/close an IA.
699 * o if event handles and PD have been initialized, free them.
700 * o close the IA
701 */
702 void
703 rpcrdma_ia_close(struct rpcrdma_ia *ia)
704 {
705 int rc;
706
707 dprintk("RPC: %s: entering\n", __func__);
708 if (ia->ri_bind_mem != NULL) {
709 rc = ib_dereg_mr(ia->ri_bind_mem);
710 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
711 __func__, rc);
712 }
713 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
714 if (ia->ri_id->qp)
715 rdma_destroy_qp(ia->ri_id);
716 rdma_destroy_id(ia->ri_id);
717 ia->ri_id = NULL;
718 }
719 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
720 rc = ib_dealloc_pd(ia->ri_pd);
721 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
722 __func__, rc);
723 }
724 }
725
726 /*
727 * Create unconnected endpoint.
728 */
729 int
730 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
731 struct rpcrdma_create_data_internal *cdata)
732 {
733 struct ib_device_attr *devattr = &ia->ri_devattr;
734 struct ib_cq *sendcq, *recvcq;
735 int rc, err;
736
737 /* check provider's send/recv wr limits */
738 if (cdata->max_requests > devattr->max_qp_wr)
739 cdata->max_requests = devattr->max_qp_wr;
740
741 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
742 ep->rep_attr.qp_context = ep;
743 /* send_cq and recv_cq initialized below */
744 ep->rep_attr.srq = NULL;
745 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
746 switch (ia->ri_memreg_strategy) {
747 case RPCRDMA_FRMR: {
748 int depth = 7;
749
750 /* Add room for frmr register and invalidate WRs.
751 * 1. FRMR reg WR for head
752 * 2. FRMR invalidate WR for head
753 * 3. N FRMR reg WRs for pagelist
754 * 4. N FRMR invalidate WRs for pagelist
755 * 5. FRMR reg WR for tail
756 * 6. FRMR invalidate WR for tail
757 * 7. The RDMA_SEND WR
758 */
759
760 /* Calculate N if the device max FRMR depth is smaller than
761 * RPCRDMA_MAX_DATA_SEGS.
762 */
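		/* Illustrative example (assumed values): with
		 * RPCRDMA_MAX_DATA_SEGS of 64 and ri_max_frmr_depth of 16,
		 * delta starts at 48, the loop below runs three times, and
		 * depth becomes 7 + 6 = 13 WRs per RPC.
		 */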
763 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
764 int delta = RPCRDMA_MAX_DATA_SEGS -
765 ia->ri_max_frmr_depth;
766
767 do {
768 depth += 2; /* FRMR reg + invalidate */
769 delta -= ia->ri_max_frmr_depth;
770 } while (delta > 0);
771
772 }
773 ep->rep_attr.cap.max_send_wr *= depth;
774 if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
775 cdata->max_requests = devattr->max_qp_wr / depth;
776 if (!cdata->max_requests)
777 return -EINVAL;
778 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
779 depth;
780 }
781 break;
782 }
783 default:
784 break;
785 }
786 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
787 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
788 ep->rep_attr.cap.max_recv_sge = 1;
789 ep->rep_attr.cap.max_inline_data = 0;
790 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
791 ep->rep_attr.qp_type = IB_QPT_RC;
792 ep->rep_attr.port_num = ~0;
793
794 if (cdata->padding) {
795 ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
796 GFP_KERNEL);
797 if (IS_ERR(ep->rep_padbuf))
798 return PTR_ERR(ep->rep_padbuf);
799 } else
800 ep->rep_padbuf = NULL;
801
802 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
803 "iovs: send %d recv %d\n",
804 __func__,
805 ep->rep_attr.cap.max_send_wr,
806 ep->rep_attr.cap.max_recv_wr,
807 ep->rep_attr.cap.max_send_sge,
808 ep->rep_attr.cap.max_recv_sge);
809
810 /* set trigger for requesting send completion */
811 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
812 if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
813 ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
814 else if (ep->rep_cqinit <= 2)
815 ep->rep_cqinit = 0;
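	/* In other words: roughly one SEND in every rep_cqinit posts is
	 * flagged IB_SEND_SIGNALED by rpcrdma_ep_post(), so the send CQ is
	 * drained periodically even though most completions are suppressed.
	 */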
816 INIT_CQCOUNT(ep);
817 init_waitqueue_head(&ep->rep_connect_wait);
818 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
819
820 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
821 rpcrdma_cq_async_error_upcall, ep,
822 ep->rep_attr.cap.max_send_wr + 1, 0);
823 if (IS_ERR(sendcq)) {
824 rc = PTR_ERR(sendcq);
825 dprintk("RPC: %s: failed to create send CQ: %i\n",
826 __func__, rc);
827 goto out1;
828 }
829
830 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
831 if (rc) {
832 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
833 __func__, rc);
834 goto out2;
835 }
836
837 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
838 rpcrdma_cq_async_error_upcall, ep,
839 ep->rep_attr.cap.max_recv_wr + 1, 0);
840 if (IS_ERR(recvcq)) {
841 rc = PTR_ERR(recvcq);
842 dprintk("RPC: %s: failed to create recv CQ: %i\n",
843 __func__, rc);
844 goto out2;
845 }
846
847 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
848 if (rc) {
849 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
850 __func__, rc);
851 ib_destroy_cq(recvcq);
852 goto out2;
853 }
854
855 ep->rep_attr.send_cq = sendcq;
856 ep->rep_attr.recv_cq = recvcq;
857
858 /* Initialize cma parameters */
859
860 /* RPC/RDMA does not use private data */
861 ep->rep_remote_cma.private_data = NULL;
862 ep->rep_remote_cma.private_data_len = 0;
863
864 /* Client offers RDMA Read but does not initiate */
865 ep->rep_remote_cma.initiator_depth = 0;
866 if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */
867 ep->rep_remote_cma.responder_resources = 32;
868 else
869 ep->rep_remote_cma.responder_resources =
870 devattr->max_qp_rd_atom;
871
872 ep->rep_remote_cma.retry_count = 7;
873 ep->rep_remote_cma.flow_control = 0;
874 ep->rep_remote_cma.rnr_retry_count = 0;
875
876 return 0;
877
878 out2:
879 err = ib_destroy_cq(sendcq);
880 if (err)
881 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
882 __func__, err);
883 out1:
884 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
885 return rc;
886 }
887
888 /*
889 * rpcrdma_ep_destroy
890 *
891 * Disconnect and destroy endpoint. After this, the only
892 * valid operations on the ep are to free it (if dynamically
893 * allocated) or re-create it.
894 */
895 void
896 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
897 {
898 int rc;
899
900 dprintk("RPC: %s: entering, connected is %d\n",
901 __func__, ep->rep_connected);
902
903 cancel_delayed_work_sync(&ep->rep_connect_worker);
904
905 if (ia->ri_id->qp) {
906 rpcrdma_ep_disconnect(ep, ia);
907 rdma_destroy_qp(ia->ri_id);
908 ia->ri_id->qp = NULL;
909 }
910
911 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
912
913 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
914 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
915 if (rc)
916 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
917 __func__, rc);
918
919 rpcrdma_clean_cq(ep->rep_attr.send_cq);
920 rc = ib_destroy_cq(ep->rep_attr.send_cq);
921 if (rc)
922 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
923 __func__, rc);
924 }
925
926 /*
927 * Connect unconnected endpoint.
928 */
929 int
930 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
931 {
932 struct rdma_cm_id *id, *old;
933 int rc = 0;
934 int retry_count = 0;
935
936 if (ep->rep_connected != 0) {
937 struct rpcrdma_xprt *xprt;
938 retry:
939 dprintk("RPC: %s: reconnecting...\n", __func__);
940
941 rpcrdma_ep_disconnect(ep, ia);
942 rpcrdma_flush_cqs(ep);
943
944 switch (ia->ri_memreg_strategy) {
945 case RPCRDMA_FRMR:
946 rpcrdma_reset_frmrs(ia);
947 break;
948 case RPCRDMA_MTHCAFMR:
949 rpcrdma_reset_fmrs(ia);
950 break;
951 case RPCRDMA_ALLPHYSICAL:
952 break;
953 default:
954 rc = -EIO;
955 goto out;
956 }
957
958 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
959 id = rpcrdma_create_id(xprt, ia,
960 (struct sockaddr *)&xprt->rx_data.addr);
961 if (IS_ERR(id)) {
962 rc = -EHOSTUNREACH;
963 goto out;
964 }
965 /* TEMP TEMP TEMP - fail if new device:
966 * Deregister/remarshal *all* requests!
967 * Close and recreate adapter, pd, etc!
968 * Re-determine all attributes still sane!
969 * More stuff I haven't thought of!
970 * Rrrgh!
971 */
972 if (ia->ri_id->device != id->device) {
973 printk("RPC: %s: can't reconnect on "
974 "different device!\n", __func__);
975 rdma_destroy_id(id);
976 rc = -ENETUNREACH;
977 goto out;
978 }
979 /* END TEMP */
980 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
981 if (rc) {
982 dprintk("RPC: %s: rdma_create_qp failed %i\n",
983 __func__, rc);
984 rdma_destroy_id(id);
985 rc = -ENETUNREACH;
986 goto out;
987 }
988
989 write_lock(&ia->ri_qplock);
990 old = ia->ri_id;
991 ia->ri_id = id;
992 write_unlock(&ia->ri_qplock);
993
994 rdma_destroy_qp(old);
995 rdma_destroy_id(old);
996 } else {
997 dprintk("RPC: %s: connecting...\n", __func__);
998 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
999 if (rc) {
1000 dprintk("RPC: %s: rdma_create_qp failed %i\n",
1001 __func__, rc);
1002 /* do not update ep->rep_connected */
1003 return -ENETUNREACH;
1004 }
1005 }
1006
1007 ep->rep_connected = 0;
1008
1009 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
1010 if (rc) {
1011 dprintk("RPC: %s: rdma_connect() failed with %i\n",
1012 __func__, rc);
1013 goto out;
1014 }
1015
1016 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
1017
1018 /*
1019 * Check state. A non-peer reject indicates no listener
1020 * (ECONNREFUSED), which may be a transient state. All
1021 	 * others indicate a transport condition for which a best-effort
1022 	 * recovery attempt has already been made.
1023 */
1024 if (ep->rep_connected == -ECONNREFUSED &&
1025 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
1026 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
1027 goto retry;
1028 }
1029 if (ep->rep_connected <= 0) {
1030 /* Sometimes, the only way to reliably connect to remote
1031 * CMs is to use same nonzero values for ORD and IRD. */
1032 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
1033 (ep->rep_remote_cma.responder_resources == 0 ||
1034 ep->rep_remote_cma.initiator_depth !=
1035 ep->rep_remote_cma.responder_resources)) {
1036 if (ep->rep_remote_cma.responder_resources == 0)
1037 ep->rep_remote_cma.responder_resources = 1;
1038 ep->rep_remote_cma.initiator_depth =
1039 ep->rep_remote_cma.responder_resources;
1040 goto retry;
1041 }
1042 rc = ep->rep_connected;
1043 } else {
1044 dprintk("RPC: %s: connected\n", __func__);
1045 }
1046
1047 out:
1048 if (rc)
1049 ep->rep_connected = rc;
1050 return rc;
1051 }
1052
1053 /*
1054 * rpcrdma_ep_disconnect
1055 *
1056 * This is separate from destroy to facilitate the ability
1057 * to reconnect without recreating the endpoint.
1058 *
1059 * This call is not reentrant, and must not be made in parallel
1060 * on the same endpoint.
1061 */
1062 void
1063 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
1064 {
1065 int rc;
1066
1067 rpcrdma_flush_cqs(ep);
1068 rc = rdma_disconnect(ia->ri_id);
1069 if (!rc) {
1070 /* returns without wait if not connected */
1071 wait_event_interruptible(ep->rep_connect_wait,
1072 ep->rep_connected != 1);
1073 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
1074 (ep->rep_connected == 1) ? "still " : "dis");
1075 } else {
1076 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
1077 ep->rep_connected = rc;
1078 }
1079 }
1080
1081 static struct rpcrdma_req *
1082 rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
1083 {
1084 struct rpcrdma_req *req;
1085
1086 req = kzalloc(sizeof(*req), GFP_KERNEL);
1087 if (req == NULL)
1088 return ERR_PTR(-ENOMEM);
1089
1090 req->rl_buffer = &r_xprt->rx_buf;
1091 return req;
1092 }
1093
1094 static struct rpcrdma_rep *
1095 rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
1096 {
1097 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1098 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1099 struct rpcrdma_rep *rep;
1100 int rc;
1101
1102 rc = -ENOMEM;
1103 rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1104 if (rep == NULL)
1105 goto out;
1106
1107 rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
1108 GFP_KERNEL);
1109 if (IS_ERR(rep->rr_rdmabuf)) {
1110 rc = PTR_ERR(rep->rr_rdmabuf);
1111 goto out_free;
1112 }
1113
1114 rep->rr_buffer = &r_xprt->rx_buf;
1115 return rep;
1116
1117 out_free:
1118 kfree(rep);
1119 out:
1120 return ERR_PTR(rc);
1121 }
1122
1123 static int
1124 rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1125 {
1126 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1127 struct ib_fmr_attr fmr_attr = {
1128 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1129 .max_maps = 1,
1130 .page_shift = PAGE_SHIFT
1131 };
1132 struct rpcrdma_mw *r;
1133 int i, rc;
1134
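	/* Allocate one MW per segment per request, plus one extra
	 * request's worth of MWs as slack.
	 */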
1135 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1136 	dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
1137
1138 while (i--) {
1139 r = kzalloc(sizeof(*r), GFP_KERNEL);
1140 if (r == NULL)
1141 return -ENOMEM;
1142
1143 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1144 if (IS_ERR(r->r.fmr)) {
1145 rc = PTR_ERR(r->r.fmr);
1146 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1147 __func__, rc);
1148 goto out_free;
1149 }
1150
1151 list_add(&r->mw_list, &buf->rb_mws);
1152 list_add(&r->mw_all, &buf->rb_all);
1153 }
1154 return 0;
1155
1156 out_free:
1157 kfree(r);
1158 return rc;
1159 }
1160
1161 static int
1162 rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1163 {
1164 struct rpcrdma_frmr *f;
1165 struct rpcrdma_mw *r;
1166 int i, rc;
1167
1168 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1169 	dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
1170
1171 while (i--) {
1172 r = kzalloc(sizeof(*r), GFP_KERNEL);
1173 if (r == NULL)
1174 return -ENOMEM;
1175 f = &r->r.frmr;
1176
1177 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1178 ia->ri_max_frmr_depth);
1179 if (IS_ERR(f->fr_mr)) {
1180 rc = PTR_ERR(f->fr_mr);
1181 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1182 "failed %i\n", __func__, rc);
1183 goto out_free;
1184 }
1185
1186 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1187 ia->ri_max_frmr_depth);
1188 if (IS_ERR(f->fr_pgl)) {
1189 rc = PTR_ERR(f->fr_pgl);
1190 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1191 "failed %i\n", __func__, rc);
1192
1193 ib_dereg_mr(f->fr_mr);
1194 goto out_free;
1195 }
1196
1197 list_add(&r->mw_list, &buf->rb_mws);
1198 list_add(&r->mw_all, &buf->rb_all);
1199 }
1200
1201 return 0;
1202
1203 out_free:
1204 kfree(r);
1205 return rc;
1206 }
1207
1208 int
1209 rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1210 {
1211 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1212 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1213 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1214 char *p;
1215 size_t len;
1216 int i, rc;
1217
1218 buf->rb_max_requests = cdata->max_requests;
1219 spin_lock_init(&buf->rb_lock);
1220
1221 /* Need to allocate:
1222 * 1. arrays for send and recv pointers
1223 * 2. arrays of struct rpcrdma_req to fill in pointers
1224 * 3. array of struct rpcrdma_rep for replies
1225 * Send/recv buffers in req/rep need to be registered
1226 */
1227 len = buf->rb_max_requests *
1228 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1229
1230 p = kzalloc(len, GFP_KERNEL);
1231 if (p == NULL) {
1232 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1233 __func__, len);
1234 rc = -ENOMEM;
1235 goto out;
1236 }
1237 buf->rb_pool = p; /* for freeing it later */
1238
1239 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1240 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1241 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1242 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1243
1244 INIT_LIST_HEAD(&buf->rb_mws);
1245 INIT_LIST_HEAD(&buf->rb_all);
1246 switch (ia->ri_memreg_strategy) {
1247 case RPCRDMA_FRMR:
1248 rc = rpcrdma_init_frmrs(ia, buf);
1249 if (rc)
1250 goto out;
1251 break;
1252 case RPCRDMA_MTHCAFMR:
1253 rc = rpcrdma_init_fmrs(ia, buf);
1254 if (rc)
1255 goto out;
1256 break;
1257 default:
1258 break;
1259 }
1260
1261 for (i = 0; i < buf->rb_max_requests; i++) {
1262 struct rpcrdma_req *req;
1263 struct rpcrdma_rep *rep;
1264
1265 req = rpcrdma_create_req(r_xprt);
1266 if (IS_ERR(req)) {
1267 dprintk("RPC: %s: request buffer %d alloc"
1268 " failed\n", __func__, i);
1269 rc = PTR_ERR(req);
1270 goto out;
1271 }
1272 buf->rb_send_bufs[i] = req;
1273
1274 rep = rpcrdma_create_rep(r_xprt);
1275 if (IS_ERR(rep)) {
1276 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1277 __func__, i);
1278 rc = PTR_ERR(rep);
1279 goto out;
1280 }
1281 buf->rb_recv_bufs[i] = rep;
1282 }
1283
1284 return 0;
1285 out:
1286 rpcrdma_buffer_destroy(buf);
1287 return rc;
1288 }
1289
1290 static void
1291 rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
1292 {
1293 if (!rep)
1294 return;
1295
1296 rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
1297 kfree(rep);
1298 }
1299
1300 static void
1301 rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
1302 {
1303 if (!req)
1304 return;
1305
1306 rpcrdma_free_regbuf(ia, req->rl_sendbuf);
1307 rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
1308 kfree(req);
1309 }
1310
1311 static void
1312 rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1313 {
1314 struct rpcrdma_mw *r;
1315 int rc;
1316
1317 while (!list_empty(&buf->rb_all)) {
1318 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1319 list_del(&r->mw_all);
1320 list_del(&r->mw_list);
1321
1322 rc = ib_dealloc_fmr(r->r.fmr);
1323 if (rc)
1324 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1325 __func__, rc);
1326
1327 kfree(r);
1328 }
1329 }
1330
1331 static void
1332 rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1333 {
1334 struct rpcrdma_mw *r;
1335 int rc;
1336
1337 while (!list_empty(&buf->rb_all)) {
1338 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1339 list_del(&r->mw_all);
1340 list_del(&r->mw_list);
1341
1342 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1343 if (rc)
1344 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1345 __func__, rc);
1346 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1347
1348 kfree(r);
1349 }
1350 }
1351
1352 void
1353 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1354 {
1355 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1356 int i;
1357
1358 /* clean up in reverse order from create
1359 * 1. recv mr memory (mr free, then kfree)
1360 * 2. send mr memory (mr free, then kfree)
1361 * 3. MWs
1362 */
1363 dprintk("RPC: %s: entering\n", __func__);
1364
1365 for (i = 0; i < buf->rb_max_requests; i++) {
1366 if (buf->rb_recv_bufs)
1367 rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
1368 if (buf->rb_send_bufs)
1369 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
1370 }
1371
1372 switch (ia->ri_memreg_strategy) {
1373 case RPCRDMA_FRMR:
1374 rpcrdma_destroy_frmrs(buf);
1375 break;
1376 case RPCRDMA_MTHCAFMR:
1377 rpcrdma_destroy_fmrs(buf);
1378 break;
1379 default:
1380 break;
1381 }
1382
1383 kfree(buf->rb_pool);
1384 }
1385
1386 /* After a disconnect, unmap all FMRs.
1387 *
1388 * This is invoked only in the transport connect worker in order
1389 * to serialize with rpcrdma_register_fmr_external().
1390 */
1391 static void
1392 rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
1393 {
1394 struct rpcrdma_xprt *r_xprt =
1395 container_of(ia, struct rpcrdma_xprt, rx_ia);
1396 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1397 struct list_head *pos;
1398 struct rpcrdma_mw *r;
1399 LIST_HEAD(l);
1400 int rc;
1401
1402 list_for_each(pos, &buf->rb_all) {
1403 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1404
1405 INIT_LIST_HEAD(&l);
1406 list_add(&r->r.fmr->list, &l);
1407 rc = ib_unmap_fmr(&l);
1408 if (rc)
1409 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
1410 __func__, rc);
1411 }
1412 }
1413
1414 /* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1415 * an unusable state. Find FRMRs in this state and dereg / reg
1416 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1417 * also torn down.
1418 *
1419 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1420 *
1421 * This is invoked only in the transport connect worker in order
1422 * to serialize with rpcrdma_register_frmr_external().
1423 */
1424 static void
1425 rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1426 {
1427 struct rpcrdma_xprt *r_xprt =
1428 container_of(ia, struct rpcrdma_xprt, rx_ia);
1429 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1430 struct list_head *pos;
1431 struct rpcrdma_mw *r;
1432 int rc;
1433
1434 list_for_each(pos, &buf->rb_all) {
1435 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1436
1437 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1438 continue;
1439
1440 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1441 if (rc)
1442 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1443 __func__, rc);
1444 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1445
1446 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1447 ia->ri_max_frmr_depth);
1448 if (IS_ERR(r->r.frmr.fr_mr)) {
1449 rc = PTR_ERR(r->r.frmr.fr_mr);
1450 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1451 " failed %i\n", __func__, rc);
1452 continue;
1453 }
1454 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1455 ia->ri_id->device,
1456 ia->ri_max_frmr_depth);
1457 if (IS_ERR(r->r.frmr.fr_pgl)) {
1458 rc = PTR_ERR(r->r.frmr.fr_pgl);
1459 dprintk("RPC: %s: "
1460 "ib_alloc_fast_reg_page_list "
1461 "failed %i\n", __func__, rc);
1462
1463 ib_dereg_mr(r->r.frmr.fr_mr);
1464 continue;
1465 }
1466 r->r.frmr.fr_state = FRMR_IS_INVALID;
1467 }
1468 }
1469
1470 /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1471 * some req segments uninitialized.
1472 */
1473 static void
1474 rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1475 {
1476 if (*mw) {
1477 list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
1478 *mw = NULL;
1479 }
1480 }
1481
1482 /* Cycle MWs back in reverse order, and "spin" them.
1483 * This delays and scrambles reuse as much as possible.
1484 */
1485 static void
1486 rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1487 {
1488 struct rpcrdma_mr_seg *seg = req->rl_segments;
1489 struct rpcrdma_mr_seg *seg1 = seg;
1490 int i;
1491
1492 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
1493 rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
1494 rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
1495 }
1496
1497 static void
1498 rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1499 {
1500 buf->rb_send_bufs[--buf->rb_send_index] = req;
1501 req->rl_niovs = 0;
1502 if (req->rl_reply) {
1503 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1504 req->rl_reply->rr_func = NULL;
1505 req->rl_reply = NULL;
1506 }
1507 }
1508
1509 /* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
1510 * Redo only the ib_post_send().
1511 */
1512 static void
1513 rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1514 {
1515 struct rpcrdma_xprt *r_xprt =
1516 container_of(ia, struct rpcrdma_xprt, rx_ia);
1517 struct ib_send_wr invalidate_wr, *bad_wr;
1518 int rc;
1519
1520 dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
1521
1522 /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
1523 r->r.frmr.fr_state = FRMR_IS_INVALID;
1524
1525 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1526 invalidate_wr.wr_id = (unsigned long)(void *)r;
1527 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1528 invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1529 DECR_CQCOUNT(&r_xprt->rx_ep);
1530
1531 dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
1532 __func__, r, r->r.frmr.fr_mr->rkey);
1533
1534 read_lock(&ia->ri_qplock);
1535 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1536 read_unlock(&ia->ri_qplock);
1537 if (rc) {
1538 /* Force rpcrdma_buffer_get() to retry */
1539 r->r.frmr.fr_state = FRMR_IS_STALE;
1540 dprintk("RPC: %s: ib_post_send failed, %i\n",
1541 __func__, rc);
1542 }
1543 }
1544
1545 static void
1546 rpcrdma_retry_flushed_linv(struct list_head *stale,
1547 struct rpcrdma_buffer *buf)
1548 {
1549 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1550 struct list_head *pos;
1551 struct rpcrdma_mw *r;
1552 unsigned long flags;
1553
1554 list_for_each(pos, stale) {
1555 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1556 rpcrdma_retry_local_inv(r, ia);
1557 }
1558
1559 spin_lock_irqsave(&buf->rb_lock, flags);
1560 list_splice_tail(stale, &buf->rb_mws);
1561 spin_unlock_irqrestore(&buf->rb_lock, flags);
1562 }
1563
1564 static struct rpcrdma_req *
1565 rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1566 struct list_head *stale)
1567 {
1568 struct rpcrdma_mw *r;
1569 int i;
1570
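	/* Fill req->rl_segments[] from the highest index downward; the
	 * req is fully provisioned once index 0 receives an MW.  Stale
	 * FRMRs are diverted onto the caller's stale list for recovery.
	 */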
1571 i = RPCRDMA_MAX_SEGS - 1;
1572 while (!list_empty(&buf->rb_mws)) {
1573 r = list_entry(buf->rb_mws.next,
1574 struct rpcrdma_mw, mw_list);
1575 list_del(&r->mw_list);
1576 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1577 list_add(&r->mw_list, stale);
1578 continue;
1579 }
1580 req->rl_segments[i].rl_mw = r;
1581 if (unlikely(i-- == 0))
1582 return req; /* Success */
1583 }
1584
1585 /* Not enough entries on rb_mws for this req */
1586 rpcrdma_buffer_put_sendbuf(req, buf);
1587 rpcrdma_buffer_put_mrs(req, buf);
1588 return NULL;
1589 }
1590
1591 static struct rpcrdma_req *
1592 rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1593 {
1594 struct rpcrdma_mw *r;
1595 int i;
1596
1597 i = RPCRDMA_MAX_SEGS - 1;
1598 while (!list_empty(&buf->rb_mws)) {
1599 r = list_entry(buf->rb_mws.next,
1600 struct rpcrdma_mw, mw_list);
1601 list_del(&r->mw_list);
1602 req->rl_segments[i].rl_mw = r;
1603 if (unlikely(i-- == 0))
1604 return req; /* Success */
1605 }
1606
1607 /* Not enough entries on rb_mws for this req */
1608 rpcrdma_buffer_put_sendbuf(req, buf);
1609 rpcrdma_buffer_put_mrs(req, buf);
1610 return NULL;
1611 }
1612
1613 /*
1614 * Get a set of request/reply buffers.
1615 *
1616 * Reply buffer (if needed) is attached to send buffer upon return.
1617 * Rule:
1618 * rb_send_index and rb_recv_index MUST always be pointing to the
1619 * *next* available buffer (non-NULL). They are incremented after
1620 * removing buffers, and decremented *before* returning them.
1621 */
1622 struct rpcrdma_req *
1623 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1624 {
1625 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1626 struct list_head stale;
1627 struct rpcrdma_req *req;
1628 unsigned long flags;
1629
1630 spin_lock_irqsave(&buffers->rb_lock, flags);
1631 if (buffers->rb_send_index == buffers->rb_max_requests) {
1632 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1633 dprintk("RPC: %s: out of request buffers\n", __func__);
1634 		return NULL;
1635 }
1636
1637 req = buffers->rb_send_bufs[buffers->rb_send_index];
1638 if (buffers->rb_send_index < buffers->rb_recv_index) {
1639 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1640 __func__,
1641 buffers->rb_recv_index - buffers->rb_send_index);
1642 req->rl_reply = NULL;
1643 } else {
1644 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1645 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1646 }
1647 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1648
1649 INIT_LIST_HEAD(&stale);
1650 switch (ia->ri_memreg_strategy) {
1651 case RPCRDMA_FRMR:
1652 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1653 break;
1654 case RPCRDMA_MTHCAFMR:
1655 req = rpcrdma_buffer_get_fmrs(req, buffers);
1656 break;
1657 default:
1658 break;
1659 }
1660 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1661 if (!list_empty(&stale))
1662 rpcrdma_retry_flushed_linv(&stale, buffers);
1663 return req;
1664 }
1665
1666 /*
1667 * Put request/reply buffers back into pool.
1668 * Pre-decrement counter/array index.
1669 */
1670 void
1671 rpcrdma_buffer_put(struct rpcrdma_req *req)
1672 {
1673 struct rpcrdma_buffer *buffers = req->rl_buffer;
1674 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1675 unsigned long flags;
1676
1677 spin_lock_irqsave(&buffers->rb_lock, flags);
1678 rpcrdma_buffer_put_sendbuf(req, buffers);
1679 switch (ia->ri_memreg_strategy) {
1680 case RPCRDMA_FRMR:
1681 case RPCRDMA_MTHCAFMR:
1682 rpcrdma_buffer_put_mrs(req, buffers);
1683 break;
1684 default:
1685 break;
1686 }
1687 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1688 }
1689
1690 /*
1691 * Recover reply buffers from pool.
1692 * This happens when recovering from error conditions.
1693 * Post-increment counter/array index.
1694 */
1695 void
1696 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1697 {
1698 struct rpcrdma_buffer *buffers = req->rl_buffer;
1699 unsigned long flags;
1700
1701 spin_lock_irqsave(&buffers->rb_lock, flags);
1702 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1703 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1704 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1705 }
1706 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1707 }
1708
1709 /*
1710 * Put reply buffers back into pool when not attached to
1711 * request. This happens in error conditions.
1712 */
1713 void
1714 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1715 {
1716 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1717 unsigned long flags;
1718
1719 rep->rr_func = NULL;
1720 spin_lock_irqsave(&buffers->rb_lock, flags);
1721 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1722 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1723 }
1724
1725 /*
1726 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1727 */
1728
1729 static int
1730 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1731 struct ib_mr **mrp, struct ib_sge *iov)
1732 {
1733 struct ib_phys_buf ipb;
1734 struct ib_mr *mr;
1735 int rc;
1736
1737 /*
1738 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1739 */
1740 iov->addr = ib_dma_map_single(ia->ri_id->device,
1741 va, len, DMA_BIDIRECTIONAL);
1742 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1743 return -ENOMEM;
1744
1745 iov->length = len;
1746
1747 if (ia->ri_have_dma_lkey) {
1748 *mrp = NULL;
1749 iov->lkey = ia->ri_dma_lkey;
1750 return 0;
1751 } else if (ia->ri_bind_mem != NULL) {
1752 *mrp = NULL;
1753 iov->lkey = ia->ri_bind_mem->lkey;
1754 return 0;
1755 }
1756
1757 ipb.addr = iov->addr;
1758 ipb.size = iov->length;
1759 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1760 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1761
1762 dprintk("RPC: %s: phys convert: 0x%llx "
1763 "registered 0x%llx length %d\n",
1764 __func__, (unsigned long long)ipb.addr,
1765 (unsigned long long)iov->addr, len);
1766
1767 if (IS_ERR(mr)) {
1768 *mrp = NULL;
1769 rc = PTR_ERR(mr);
1770 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1771 } else {
1772 *mrp = mr;
1773 iov->lkey = mr->lkey;
1774 rc = 0;
1775 }
1776
1777 return rc;
1778 }
1779
1780 static int
1781 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1782 struct ib_mr *mr, struct ib_sge *iov)
1783 {
1784 int rc;
1785
1786 ib_dma_unmap_single(ia->ri_id->device,
1787 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1788
1789 if (NULL == mr)
1790 return 0;
1791
1792 rc = ib_dereg_mr(mr);
1793 if (rc)
1794 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1795 return rc;
1796 }
1797
1798 /**
1799 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
1800 * @ia: controlling rpcrdma_ia
1801 * @size: size of buffer to be allocated, in bytes
1802 * @flags: GFP flags
1803 *
1804 * Returns pointer to private header of an area of internally
1805 * registered memory, or an ERR_PTR. The registered buffer follows
1806 * the end of the private header.
1807 *
1808 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
1809 * receiving the payload of RDMA RECV operations. regbufs are not
1810 * used for RDMA READ/WRITE operations, thus are registered only for
1811 * LOCAL access.
1812 */
1813 struct rpcrdma_regbuf *
1814 rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
1815 {
1816 struct rpcrdma_regbuf *rb;
1817 int rc;
1818
1819 rc = -ENOMEM;
1820 rb = kmalloc(sizeof(*rb) + size, flags);
1821 if (rb == NULL)
1822 goto out;
1823
1824 rb->rg_size = size;
1825 rb->rg_owner = NULL;
1826 rc = rpcrdma_register_internal(ia, rb->rg_base, size,
1827 &rb->rg_mr, &rb->rg_iov);
1828 if (rc)
1829 goto out_free;
1830
1831 return rb;
1832
1833 out_free:
1834 kfree(rb);
1835 out:
1836 return ERR_PTR(rc);
1837 }
1838
1839 /**
1840 * rpcrdma_free_regbuf - deregister and free registered buffer
1841 * @ia: controlling rpcrdma_ia
1842 * @rb: regbuf to be deregistered and freed
1843 */
1844 void
1845 rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
1846 {
1847 if (rb) {
1848 rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
1849 kfree(rb);
1850 }
1851 }
1852
1853 /*
1854 * Wrappers for chunk registration, shared by read/write chunk code.
1855 */
1856
1857 static void
1858 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1859 {
1860 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1861 seg->mr_dmalen = seg->mr_len;
1862 if (seg->mr_page)
1863 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1864 seg->mr_page, offset_in_page(seg->mr_offset),
1865 seg->mr_dmalen, seg->mr_dir);
1866 else
1867 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1868 seg->mr_offset,
1869 seg->mr_dmalen, seg->mr_dir);
1870 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1871 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1872 __func__,
1873 (unsigned long long)seg->mr_dma,
1874 seg->mr_offset, seg->mr_dmalen);
1875 }
1876 }
1877
1878 static void
1879 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1880 {
1881 if (seg->mr_page)
1882 ib_dma_unmap_page(ia->ri_id->device,
1883 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1884 else
1885 ib_dma_unmap_single(ia->ri_id->device,
1886 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1887 }
1888
1889 static int
1890 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1891 int *nsegs, int writing, struct rpcrdma_ia *ia,
1892 struct rpcrdma_xprt *r_xprt)
1893 {
1894 struct rpcrdma_mr_seg *seg1 = seg;
1895 struct rpcrdma_mw *mw = seg1->rl_mw;
1896 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1897 struct ib_mr *mr = frmr->fr_mr;
1898 struct ib_send_wr fastreg_wr, *bad_wr;
1899 u8 key;
1900 int len, pageoff;
1901 int i, rc;
1902 int seg_len;
1903 u64 pa;
1904 int page_no;
1905
1906 pageoff = offset_in_page(seg1->mr_offset);
1907 seg1->mr_offset -= pageoff; /* start of page */
1908 seg1->mr_len += pageoff;
1909 len = -pageoff;
1910 if (*nsegs > ia->ri_max_frmr_depth)
1911 *nsegs = ia->ri_max_frmr_depth;
1912 for (page_no = i = 0; i < *nsegs;) {
1913 rpcrdma_map_one(ia, seg, writing);
1914 pa = seg->mr_dma;
1915 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1916 frmr->fr_pgl->page_list[page_no++] = pa;
1917 pa += PAGE_SIZE;
1918 }
1919 len += seg->mr_len;
1920 ++seg;
1921 ++i;
1922 /* Check for holes */
1923 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1924 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1925 break;
1926 }
1927 dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n",
1928 __func__, mw, i, len);
1929
1930 frmr->fr_state = FRMR_IS_VALID;
1931
1932 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1933 fastreg_wr.wr_id = (unsigned long)(void *)mw;
1934 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1935 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
1936 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1937 fastreg_wr.wr.fast_reg.page_list_len = page_no;
1938 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1939 fastreg_wr.wr.fast_reg.length = len;
1940
1941 /* Bump the key */
1942 key = (u8)(mr->rkey & 0x000000FF);
1943 ib_update_fast_reg_key(mr, ++key);
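	/* Incrementing the low-order byte of the rkey gives this
	 * registration a fresh key, so a peer still holding the previous
	 * rkey cannot touch the memory mapped by the new FASTREG.
	 */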
1944
1945 fastreg_wr.wr.fast_reg.access_flags = (writing ?
1946 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1947 IB_ACCESS_REMOTE_READ);
1948 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
1949 DECR_CQCOUNT(&r_xprt->rx_ep);
1950
1951 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
1952 if (rc) {
1953 dprintk("RPC: %s: failed ib_post_send for register,"
1954 " status %i\n", __func__, rc);
1955 ib_update_fast_reg_key(mr, --key);
1956 goto out_err;
1957 } else {
1958 seg1->mr_rkey = mr->rkey;
1959 seg1->mr_base = seg1->mr_dma + pageoff;
1960 seg1->mr_nsegs = i;
1961 seg1->mr_len = len;
1962 }
1963 *nsegs = i;
1964 return 0;
1965 out_err:
1966 frmr->fr_state = FRMR_IS_INVALID;
1967 while (i--)
1968 rpcrdma_unmap_one(ia, --seg);
1969 return rc;
1970 }
1971
1972 static int
1973 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1974 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1975 {
1976 struct rpcrdma_mr_seg *seg1 = seg;
1977 struct ib_send_wr invalidate_wr, *bad_wr;
1978 int rc;
1979
1980 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1981
1982 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1983 invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
1984 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1985 invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
1986 DECR_CQCOUNT(&r_xprt->rx_ep);
1987
1988 read_lock(&ia->ri_qplock);
1989 while (seg1->mr_nsegs--)
1990 rpcrdma_unmap_one(ia, seg++);
1991 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1992 read_unlock(&ia->ri_qplock);
1993 if (rc) {
1994 /* Force rpcrdma_buffer_get() to retry */
1995 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
1996 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1997 " status %i\n", __func__, rc);
1998 }
1999 return rc;
2000 }
2001
2002 static int
2003 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
2004 int *nsegs, int writing, struct rpcrdma_ia *ia)
2005 {
2006 struct rpcrdma_mr_seg *seg1 = seg;
2007 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
2008 int len, pageoff, i, rc;
2009
2010 pageoff = offset_in_page(seg1->mr_offset);
2011 seg1->mr_offset -= pageoff; /* start of page */
2012 seg1->mr_len += pageoff;
2013 len = -pageoff;
2014 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
2015 *nsegs = RPCRDMA_MAX_DATA_SEGS;
2016 for (i = 0; i < *nsegs;) {
2017 rpcrdma_map_one(ia, seg, writing);
2018 physaddrs[i] = seg->mr_dma;
2019 len += seg->mr_len;
2020 ++seg;
2021 ++i;
2022 /* Check for holes */
2023 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
2024 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
2025 break;
2026 }
2027 rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
2028 if (rc) {
2029 dprintk("RPC: %s: failed ib_map_phys_fmr "
2030 "%u@0x%llx+%i (%d)... status %i\n", __func__,
2031 len, (unsigned long long)seg1->mr_dma,
2032 pageoff, i, rc);
2033 while (i--)
2034 rpcrdma_unmap_one(ia, --seg);
2035 } else {
2036 seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
2037 seg1->mr_base = seg1->mr_dma + pageoff;
2038 seg1->mr_nsegs = i;
2039 seg1->mr_len = len;
2040 }
2041 *nsegs = i;
2042 return rc;
2043 }
2044
2045 static int
2046 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
2047 struct rpcrdma_ia *ia)
2048 {
2049 struct rpcrdma_mr_seg *seg1 = seg;
2050 LIST_HEAD(l);
2051 int rc;
2052
2053 list_add(&seg1->rl_mw->r.fmr->list, &l);
2054 rc = ib_unmap_fmr(&l);
2055 read_lock(&ia->ri_qplock);
2056 while (seg1->mr_nsegs--)
2057 rpcrdma_unmap_one(ia, seg++);
2058 read_unlock(&ia->ri_qplock);
2059 if (rc)
2060 dprintk("RPC: %s: failed ib_unmap_fmr,"
2061 " status %i\n", __func__, rc);
2062 return rc;
2063 }
2064
2065 int
2066 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
2067 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
2068 {
2069 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2070 int rc = 0;
2071
2072 switch (ia->ri_memreg_strategy) {
2073
2074 case RPCRDMA_ALLPHYSICAL:
2075 rpcrdma_map_one(ia, seg, writing);
2076 seg->mr_rkey = ia->ri_bind_mem->rkey;
2077 seg->mr_base = seg->mr_dma;
2078 seg->mr_nsegs = 1;
2079 nsegs = 1;
2080 break;
2081
2082 /* Registration using frmr registration */
2083 case RPCRDMA_FRMR:
2084 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
2085 break;
2086
2087 /* Registration using fmr memory registration */
2088 case RPCRDMA_MTHCAFMR:
2089 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
2090 break;
2091
2092 default:
2093 return -EIO;
2094 }
2095 if (rc)
2096 return rc;
2097
2098 return nsegs;
2099 }
2100
2101 int
2102 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
2103 struct rpcrdma_xprt *r_xprt)
2104 {
2105 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2106 int nsegs = seg->mr_nsegs, rc;
2107
2108 switch (ia->ri_memreg_strategy) {
2109
2110 case RPCRDMA_ALLPHYSICAL:
2111 read_lock(&ia->ri_qplock);
2112 rpcrdma_unmap_one(ia, seg);
2113 read_unlock(&ia->ri_qplock);
2114 break;
2115
2116 case RPCRDMA_FRMR:
2117 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
2118 break;
2119
2120 case RPCRDMA_MTHCAFMR:
2121 rc = rpcrdma_deregister_fmr_external(seg, ia);
2122 break;
2123
2124 default:
2125 break;
2126 }
2127 return nsegs;
2128 }
2129
2130 /*
2131 * Prepost any receive buffer, then post send.
2132 *
2133 * Receive buffer is donated to hardware, reclaimed upon recv completion.
2134 */
2135 int
2136 rpcrdma_ep_post(struct rpcrdma_ia *ia,
2137 struct rpcrdma_ep *ep,
2138 struct rpcrdma_req *req)
2139 {
2140 struct ib_send_wr send_wr, *send_wr_fail;
2141 struct rpcrdma_rep *rep = req->rl_reply;
2142 int rc;
2143
2144 if (rep) {
2145 rc = rpcrdma_ep_post_recv(ia, ep, rep);
2146 if (rc)
2147 goto out;
2148 req->rl_reply = NULL;
2149 }
2150
2151 send_wr.next = NULL;
2152 send_wr.wr_id = 0ULL; /* no send cookie */
2153 send_wr.sg_list = req->rl_send_iov;
2154 send_wr.num_sge = req->rl_niovs;
2155 send_wr.opcode = IB_WR_SEND;
2156 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
2157 ib_dma_sync_single_for_device(ia->ri_id->device,
2158 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
2159 DMA_TO_DEVICE);
2160 ib_dma_sync_single_for_device(ia->ri_id->device,
2161 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
2162 DMA_TO_DEVICE);
2163 ib_dma_sync_single_for_device(ia->ri_id->device,
2164 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
2165 DMA_TO_DEVICE);
2166
2167 if (DECR_CQCOUNT(ep) > 0)
2168 send_wr.send_flags = 0;
2169 else { /* Provider must take a send completion every now and then */
2170 INIT_CQCOUNT(ep);
2171 send_wr.send_flags = IB_SEND_SIGNALED;
2172 }
2173
2174 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
2175 if (rc)
2176 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
2177 rc);
2178 out:
2179 return rc;
2180 }
2181
2182 /*
2183 * (Re)post a receive buffer.
2184 */
2185 int
2186 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2187 struct rpcrdma_ep *ep,
2188 struct rpcrdma_rep *rep)
2189 {
2190 struct ib_recv_wr recv_wr, *recv_wr_fail;
2191 int rc;
2192
2193 recv_wr.next = NULL;
2194 recv_wr.wr_id = (u64) (unsigned long) rep;
2195 recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
2196 recv_wr.num_sge = 1;
2197
2198 ib_dma_sync_single_for_cpu(ia->ri_id->device,
2199 rdmab_addr(rep->rr_rdmabuf),
2200 rdmab_length(rep->rr_rdmabuf),
2201 DMA_BIDIRECTIONAL);
2202
2203 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
2204
2205 if (rc)
2206 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
2207 rc);
2208 return rc;
2209 }
2210
2211 /* Physical mapping means one Read/Write list entry per-page.
2212 * All list entries must fit within an inline buffer
2213 *
2214 * NB: The server must return a Write list for NFS READ,
2215 * which has the same constraint. Factor in the inline
2216 * rsize as well.
2217 */
2218 static size_t
2219 rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
2220 {
2221 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2222 unsigned int inline_size, pages;
2223
2224 inline_size = min_t(unsigned int,
2225 cdata->inline_wsize, cdata->inline_rsize);
2226 inline_size -= RPCRDMA_HDRLEN_MIN;
2227 pages = inline_size / sizeof(struct rpcrdma_segment);
2228 return pages << PAGE_SHIFT;
2229 }
2230
2231 static size_t
2232 rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2233 {
2234 return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2235 }
2236
2237 size_t
2238 rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2239 {
2240 size_t result;
2241
2242 switch (r_xprt->rx_ia.ri_memreg_strategy) {
2243 case RPCRDMA_ALLPHYSICAL:
2244 result = rpcrdma_physical_max_payload(r_xprt);
2245 break;
2246 default:
2247 result = rpcrdma_mr_max_payload(r_xprt);
2248 }
2249 return result;
2250 }