xprtrdma: Prevent infinite loop in rpcrdma_ep_create()
net/sunrpc/xprtrdma/verbs.c
1 /*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 /*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50 #include <linux/interrupt.h>
51 #include <linux/slab.h>
52 #include <linux/prefetch.h>
53 #include <linux/sunrpc/addr.h>
54 #include <asm/bitops.h>
55
56 #include "xprt_rdma.h"
57
58 /*
59 * Globals/Macros
60 */
61
62 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
63 # define RPCDBG_FACILITY RPCDBG_TRANS
64 #endif
65
66 static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
67 static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
68
69 /*
70 * internal functions
71 */
72
73 /*
74 * handle replies in tasklet context, using a single, global list
75 * rdma tasklet function -- just turn around and call the func
76 * for all replies on the list
77 */
78
79 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
80 static LIST_HEAD(rpcrdma_tasklets_g);
81
82 static void
83 rpcrdma_run_tasklet(unsigned long data)
84 {
85 struct rpcrdma_rep *rep;
86 void (*func)(struct rpcrdma_rep *);
87 unsigned long flags;
88
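/* The tasklet argument is unused here; the self-assignment below simply
 * marks it as intentionally ignored.
 */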
89 data = data;
90 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
91 while (!list_empty(&rpcrdma_tasklets_g)) {
92 rep = list_entry(rpcrdma_tasklets_g.next,
93 struct rpcrdma_rep, rr_list);
94 list_del(&rep->rr_list);
95 func = rep->rr_func;
96 rep->rr_func = NULL;
97 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
98
99 if (func)
100 func(rep);
101 else
102 rpcrdma_recv_buffer_put(rep);
103
104 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
105 }
106 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
107 }
108
109 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
110
111 static const char * const async_event[] = {
112 "CQ error",
113 "QP fatal error",
114 "QP request error",
115 "QP access error",
116 "communication established",
117 "send queue drained",
118 "path migration successful",
119 "path mig error",
120 "device fatal error",
121 "port active",
122 "port error",
123 "LID change",
124 "P_key change",
125 "SM change",
126 "SRQ error",
127 "SRQ limit reached",
128 "last WQE reached",
129 "client reregister",
130 "GID change",
131 };
132
133 #define ASYNC_MSG(status) \
134 ((status) < ARRAY_SIZE(async_event) ? \
135 async_event[(status)] : "unknown async error")
136
137 static void
138 rpcrdma_schedule_tasklet(struct list_head *sched_list)
139 {
140 unsigned long flags;
141
142 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
143 list_splice_tail(sched_list, &rpcrdma_tasklets_g);
144 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
145 tasklet_schedule(&rpcrdma_tasklet_g);
146 }
147
148 static void
149 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
150 {
151 struct rpcrdma_ep *ep = context;
152
153 pr_err("RPC: %s: %s on device %s ep %p\n",
154 __func__, ASYNC_MSG(event->event),
155 event->device->name, context);
156 if (ep->rep_connected == 1) {
157 ep->rep_connected = -EIO;
158 rpcrdma_conn_func(ep);
159 wake_up_all(&ep->rep_connect_wait);
160 }
161 }
162
163 static void
164 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
165 {
166 struct rpcrdma_ep *ep = context;
167
168 pr_err("RPC: %s: %s on device %s ep %p\n",
169 __func__, ASYNC_MSG(event->event),
170 event->device->name, context);
171 if (ep->rep_connected == 1) {
172 ep->rep_connected = -EIO;
173 rpcrdma_conn_func(ep);
174 wake_up_all(&ep->rep_connect_wait);
175 }
176 }
177
178 static const char * const wc_status[] = {
179 "success",
180 "local length error",
181 "local QP operation error",
182 "local EE context operation error",
183 "local protection error",
184 "WR flushed",
185 "memory management operation error",
186 "bad response error",
187 "local access error",
188 "remote invalid request error",
189 "remote access error",
190 "remote operation error",
191 "transport retry counter exceeded",
192 "RNR retrycounter exceeded",
193 "local RDD violation error",
194 "remove invalid RD request",
195 "operation aborted",
196 "invalid EE context number",
197 "invalid EE context state",
198 "fatal error",
199 "response timeout error",
200 "general error",
201 };
202
203 #define COMPLETION_MSG(status) \
204 ((status) < ARRAY_SIZE(wc_status) ? \
205 wc_status[(status)] : "unexpected completion error")
206
207 static void
208 rpcrdma_sendcq_process_wc(struct ib_wc *wc)
209 {
210 if (likely(wc->status == IB_WC_SUCCESS))
211 return;
212
213 /* WARNING: Only wr_id and status are reliable at this point */
214 if (wc->wr_id == 0ULL) {
215 if (wc->status != IB_WC_WR_FLUSH_ERR)
216 pr_err("RPC: %s: SEND: %s\n",
217 __func__, COMPLETION_MSG(wc->status));
218 } else {
219 struct rpcrdma_mw *r;
220
221 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
222 r->r.frmr.fr_state = FRMR_IS_STALE;
223 pr_err("RPC: %s: frmr %p (stale): %s\n",
224 __func__, r, COMPLETION_MSG(wc->status));
225 }
226 }
227
228 static int
229 rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
230 {
231 struct ib_wc *wcs;
232 int budget, count, rc;
233
234 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
235 do {
236 wcs = ep->rep_send_wcs;
237
238 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
239 if (rc <= 0)
240 return rc;
241
242 count = rc;
243 while (count-- > 0)
244 rpcrdma_sendcq_process_wc(wcs++);
245 } while (rc == RPCRDMA_POLLSIZE && --budget);
246 return 0;
247 }
248
249 /*
250 * Handle send, fast_reg_mr, and local_inv completions.
251 *
252 * Send events are typically suppressed and thus do not result
253 * in an upcall. Occasionally one is signaled, however. This
254 * prevents the provider's completion queue from wrapping and
255 * losing a completion.
256 */
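/* The upcall below polls the CQ, re-arms it with IB_CQ_REPORT_MISSED_EVENTS,
 * and then polls once more, so completions that arrive between the final
 * poll and re-arming are not lost.
 */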
257 static void
258 rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
259 {
260 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
261 int rc;
262
263 rc = rpcrdma_sendcq_poll(cq, ep);
264 if (rc) {
265 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
266 __func__, rc);
267 return;
268 }
269
270 rc = ib_req_notify_cq(cq,
271 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
272 if (rc == 0)
273 return;
274 if (rc < 0) {
275 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
276 __func__, rc);
277 return;
278 }
279
280 rpcrdma_sendcq_poll(cq, ep);
281 }
282
283 static void
284 rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
285 {
286 struct rpcrdma_rep *rep =
287 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
288
289 /* WARNING: Only wr_id and status are reliable at this point */
290 if (wc->status != IB_WC_SUCCESS)
291 goto out_fail;
292
293 /* status == SUCCESS means all fields in wc are trustworthy */
294 if (wc->opcode != IB_WC_RECV)
295 return;
296
297 dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
298 __func__, rep, wc->byte_len);
299
300 rep->rr_len = wc->byte_len;
301 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
302 rdmab_addr(rep->rr_rdmabuf),
303 rep->rr_len, DMA_FROM_DEVICE);
304 prefetch(rdmab_to_msg(rep->rr_rdmabuf));
305
306 out_schedule:
307 list_add_tail(&rep->rr_list, sched_list);
308 return;
309 out_fail:
310 if (wc->status != IB_WC_WR_FLUSH_ERR)
311 pr_err("RPC: %s: rep %p: %s\n",
312 __func__, rep, COMPLETION_MSG(wc->status));
313 rep->rr_len = ~0U;
314 goto out_schedule;
315 }
316
317 static int
318 rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
319 {
320 struct list_head sched_list;
321 struct ib_wc *wcs;
322 int budget, count, rc;
323
324 INIT_LIST_HEAD(&sched_list);
325 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
326 do {
327 wcs = ep->rep_recv_wcs;
328
329 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
330 if (rc <= 0)
331 goto out_schedule;
332
333 count = rc;
334 while (count-- > 0)
335 rpcrdma_recvcq_process_wc(wcs++, &sched_list);
336 } while (rc == RPCRDMA_POLLSIZE && --budget);
337 rc = 0;
338
339 out_schedule:
340 rpcrdma_schedule_tasklet(&sched_list);
341 return rc;
342 }
343
344 /*
345 * Handle receive completions.
346 *
347 * It is reentrant but processes single events in order to maintain
348 * ordering of receives to keep server credits.
349 *
350 * It is the responsibility of the scheduled tasklet to return
351 * recv buffers to the pool. NOTE: this affects synchronization of
352 * connection shutdown. That is, the structures required for
353 * the completion of the reply handler must remain intact until
354 * all memory has been reclaimed.
355 */
356 static void
357 rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
358 {
359 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
360 int rc;
361
362 rc = rpcrdma_recvcq_poll(cq, ep);
363 if (rc) {
364 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
365 __func__, rc);
366 return;
367 }
368
369 rc = ib_req_notify_cq(cq,
370 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
371 if (rc == 0)
372 return;
373 if (rc < 0) {
374 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
375 __func__, rc);
376 return;
377 }
378
379 rpcrdma_recvcq_poll(cq, ep);
380 }
381
382 static void
383 rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
384 {
385 struct ib_wc wc;
386 LIST_HEAD(sched_list);
387
388 while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
389 rpcrdma_recvcq_process_wc(&wc, &sched_list);
390 if (!list_empty(&sched_list))
391 rpcrdma_schedule_tasklet(&sched_list);
392 while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
393 rpcrdma_sendcq_process_wc(&wc);
394 }
395
396 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
397 static const char * const conn[] = {
398 "address resolved",
399 "address error",
400 "route resolved",
401 "route error",
402 "connect request",
403 "connect response",
404 "connect error",
405 "unreachable",
406 "rejected",
407 "established",
408 "disconnected",
409 "device removal",
410 "multicast join",
411 "multicast error",
412 "address change",
413 "timewait exit",
414 };
415
416 #define CONNECTION_MSG(status) \
417 ((status) < ARRAY_SIZE(conn) ? \
418 conn[(status)] : "unrecognized connection error")
419 #endif
420
421 static int
422 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
423 {
424 struct rpcrdma_xprt *xprt = id->context;
425 struct rpcrdma_ia *ia = &xprt->rx_ia;
426 struct rpcrdma_ep *ep = &xprt->rx_ep;
427 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
428 struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
429 #endif
430 struct ib_qp_attr *attr = &ia->ri_qp_attr;
431 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
432 int connstate = 0;
433
434 switch (event->event) {
435 case RDMA_CM_EVENT_ADDR_RESOLVED:
436 case RDMA_CM_EVENT_ROUTE_RESOLVED:
437 ia->ri_async_rc = 0;
438 complete(&ia->ri_done);
439 break;
440 case RDMA_CM_EVENT_ADDR_ERROR:
441 ia->ri_async_rc = -EHOSTUNREACH;
442 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
443 __func__, ep);
444 complete(&ia->ri_done);
445 break;
446 case RDMA_CM_EVENT_ROUTE_ERROR:
447 ia->ri_async_rc = -ENETUNREACH;
448 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
449 __func__, ep);
450 complete(&ia->ri_done);
451 break;
452 case RDMA_CM_EVENT_ESTABLISHED:
453 connstate = 1;
454 ib_query_qp(ia->ri_id->qp, attr,
455 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
456 iattr);
457 dprintk("RPC: %s: %d responder resources"
458 " (%d initiator)\n",
459 __func__, attr->max_dest_rd_atomic,
460 attr->max_rd_atomic);
461 goto connected;
462 case RDMA_CM_EVENT_CONNECT_ERROR:
463 connstate = -ENOTCONN;
464 goto connected;
465 case RDMA_CM_EVENT_UNREACHABLE:
466 connstate = -ENETDOWN;
467 goto connected;
468 case RDMA_CM_EVENT_REJECTED:
469 connstate = -ECONNREFUSED;
470 goto connected;
471 case RDMA_CM_EVENT_DISCONNECTED:
472 connstate = -ECONNABORTED;
473 goto connected;
474 case RDMA_CM_EVENT_DEVICE_REMOVAL:
475 connstate = -ENODEV;
476 connected:
477 dprintk("RPC: %s: %sconnected\n",
478 __func__, connstate > 0 ? "" : "dis");
479 ep->rep_connected = connstate;
480 rpcrdma_conn_func(ep);
481 wake_up_all(&ep->rep_connect_wait);
482 /*FALLTHROUGH*/
483 default:
484 dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
485 __func__, sap, rpc_get_port(sap), ep,
486 CONNECTION_MSG(event->event));
487 break;
488 }
489
490 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
491 if (connstate == 1) {
492 int ird = attr->max_dest_rd_atomic;
493 int tird = ep->rep_remote_cma.responder_resources;
494
495 pr_info("rpcrdma: connection to %pIS:%u on %s, memreg %d slots %d ird %d%s\n",
496 sap, rpc_get_port(sap),
497 ia->ri_id->device->name,
498 ia->ri_memreg_strategy,
499 xprt->rx_buf.rb_max_requests,
500 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
501 } else if (connstate < 0) {
502 pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
503 sap, rpc_get_port(sap), connstate);
504 }
505 #endif
506
507 return 0;
508 }
509
510 static struct rdma_cm_id *
511 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
512 struct rpcrdma_ia *ia, struct sockaddr *addr)
513 {
514 struct rdma_cm_id *id;
515 int rc;
516
517 init_completion(&ia->ri_done);
518
519 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
520 if (IS_ERR(id)) {
521 rc = PTR_ERR(id);
522 dprintk("RPC: %s: rdma_create_id() failed %i\n",
523 __func__, rc);
524 return id;
525 }
526
527 ia->ri_async_rc = -ETIMEDOUT;
528 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
529 if (rc) {
530 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
531 __func__, rc);
532 goto out;
533 }
534 wait_for_completion_interruptible_timeout(&ia->ri_done,
535 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
536 rc = ia->ri_async_rc;
537 if (rc)
538 goto out;
539
540 ia->ri_async_rc = -ETIMEDOUT;
541 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
542 if (rc) {
543 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
544 __func__, rc);
545 goto out;
546 }
547 wait_for_completion_interruptible_timeout(&ia->ri_done,
548 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
549 rc = ia->ri_async_rc;
550 if (rc)
551 goto out;
552
553 return id;
554
555 out:
556 rdma_destroy_id(id);
557 return ERR_PTR(rc);
558 }
559
560 /*
561 * Drain any cq, prior to teardown.
562 */
563 static void
564 rpcrdma_clean_cq(struct ib_cq *cq)
565 {
566 struct ib_wc wc;
567 int count = 0;
568
569 while (1 == ib_poll_cq(cq, 1, &wc))
570 ++count;
571
572 if (count)
573 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
574 __func__, count, wc.opcode);
575 }
576
577 /*
578 * Exported functions.
579 */
580
581 /*
582 * Open and initialize an Interface Adapter.
583 * o initializes fields of struct rpcrdma_ia, including
584 * interface and provider attributes and protection zone.
585 */
586 int
587 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
588 {
589 int rc, mem_priv;
590 struct rpcrdma_ia *ia = &xprt->rx_ia;
591 struct ib_device_attr *devattr = &ia->ri_devattr;
592
593 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
594 if (IS_ERR(ia->ri_id)) {
595 rc = PTR_ERR(ia->ri_id);
596 goto out1;
597 }
598
599 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
600 if (IS_ERR(ia->ri_pd)) {
601 rc = PTR_ERR(ia->ri_pd);
602 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
603 __func__, rc);
604 goto out2;
605 }
606
607 rc = ib_query_device(ia->ri_id->device, devattr);
608 if (rc) {
609 dprintk("RPC: %s: ib_query_device failed %d\n",
610 __func__, rc);
611 goto out3;
612 }
613
614 if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
615 ia->ri_have_dma_lkey = 1;
616 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
617 }
618
619 if (memreg == RPCRDMA_FRMR) {
620 /* Requires both frmr reg and local dma lkey */
621 if (((devattr->device_cap_flags &
622 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
623 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
624 (devattr->max_fast_reg_page_list_len == 0)) {
625 dprintk("RPC: %s: FRMR registration "
626 "not supported by HCA\n", __func__);
627 memreg = RPCRDMA_MTHCAFMR;
628 } else {
629 /* Mind the ia limit on FRMR page list depth */
630 ia->ri_max_frmr_depth = min_t(unsigned int,
631 RPCRDMA_MAX_DATA_SEGS,
632 devattr->max_fast_reg_page_list_len);
633 }
634 }
635 if (memreg == RPCRDMA_MTHCAFMR) {
636 if (!ia->ri_id->device->alloc_fmr) {
637 dprintk("RPC: %s: MTHCAFMR registration "
638 "not supported by HCA\n", __func__);
639 memreg = RPCRDMA_ALLPHYSICAL;
640 }
641 }
642
643 /*
644 * Optionally obtain an underlying physical identity mapping in
645 * order to do a memory window-based bind. This base registration
646 * is protected from remote access - that is enabled only by binding
647 * for the specific bytes targeted during each RPC operation, and
648 * revoked after the corresponding completion similar to a storage
649 * adapter.
650 */
651 switch (memreg) {
652 case RPCRDMA_FRMR:
653 break;
654 case RPCRDMA_ALLPHYSICAL:
655 mem_priv = IB_ACCESS_LOCAL_WRITE |
656 IB_ACCESS_REMOTE_WRITE |
657 IB_ACCESS_REMOTE_READ;
658 goto register_setup;
659 case RPCRDMA_MTHCAFMR:
660 if (ia->ri_have_dma_lkey)
661 break;
662 mem_priv = IB_ACCESS_LOCAL_WRITE;
663 register_setup:
664 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
665 if (IS_ERR(ia->ri_bind_mem)) {
666 printk(KERN_ALERT "%s: ib_get_dma_mr for "
667 "phys register failed with %lX\n",
668 __func__, PTR_ERR(ia->ri_bind_mem));
669 rc = -ENOMEM;
670 goto out3;
671 }
672 break;
673 default:
674 printk(KERN_ERR "RPC: Unsupported memory "
675 "registration mode: %d\n", memreg);
676 rc = -ENOMEM;
677 goto out3;
678 }
679 dprintk("RPC: %s: memory registration strategy is %d\n",
680 __func__, memreg);
681
682 /* Else will do memory reg/dereg for each chunk */
683 ia->ri_memreg_strategy = memreg;
684
685 rwlock_init(&ia->ri_qplock);
686 return 0;
687
688 out3:
689 ib_dealloc_pd(ia->ri_pd);
690 ia->ri_pd = NULL;
691 out2:
692 rdma_destroy_id(ia->ri_id);
693 ia->ri_id = NULL;
694 out1:
695 return rc;
696 }
697
698 /*
699 * Clean up/close an IA.
700 * o if event handles and PD have been initialized, free them.
701 * o close the IA
702 */
703 void
704 rpcrdma_ia_close(struct rpcrdma_ia *ia)
705 {
706 int rc;
707
708 dprintk("RPC: %s: entering\n", __func__);
709 if (ia->ri_bind_mem != NULL) {
710 rc = ib_dereg_mr(ia->ri_bind_mem);
711 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
712 __func__, rc);
713 }
714 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
715 if (ia->ri_id->qp)
716 rdma_destroy_qp(ia->ri_id);
717 rdma_destroy_id(ia->ri_id);
718 ia->ri_id = NULL;
719 }
720 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
721 rc = ib_dealloc_pd(ia->ri_pd);
722 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
723 __func__, rc);
724 }
725 }
726
727 /*
728 * Create unconnected endpoint.
729 */
730 int
731 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
732 struct rpcrdma_create_data_internal *cdata)
733 {
734 struct ib_device_attr *devattr = &ia->ri_devattr;
735 struct ib_cq *sendcq, *recvcq;
736 int rc, err;
737
738 /* check provider's send/recv wr limits */
739 if (cdata->max_requests > devattr->max_qp_wr)
740 cdata->max_requests = devattr->max_qp_wr;
741
742 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
743 ep->rep_attr.qp_context = ep;
744 /* send_cq and recv_cq initialized below */
745 ep->rep_attr.srq = NULL;
746 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
747 switch (ia->ri_memreg_strategy) {
748 case RPCRDMA_FRMR: {
749 int depth = 7;
750
751 /* Add room for frmr register and invalidate WRs.
752 * 1. FRMR reg WR for head
753 * 2. FRMR invalidate WR for head
754 * 3. N FRMR reg WRs for pagelist
755 * 4. N FRMR invalidate WRs for pagelist
756 * 5. FRMR reg WR for tail
757 * 6. FRMR invalidate WR for tail
758 * 7. The RDMA_SEND WR
759 */
760
761 /* Calculate N if the device max FRMR depth is smaller than
762 * RPCRDMA_MAX_DATA_SEGS.
763 */
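/* Worked example (illustrative values only): if RPCRDMA_MAX_DATA_SEGS
 * were 64 and ri_max_frmr_depth were 16, delta would start at 48 and
 * the loop below would run three times, leaving depth = 7 + 3 * 2 = 13.
 */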
764 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
765 int delta = RPCRDMA_MAX_DATA_SEGS -
766 ia->ri_max_frmr_depth;
767
768 do {
769 depth += 2; /* FRMR reg + invalidate */
770 delta -= ia->ri_max_frmr_depth;
771 } while (delta > 0);
772
773 }
774 ep->rep_attr.cap.max_send_wr *= depth;
775 if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
776 cdata->max_requests = devattr->max_qp_wr / depth;
777 if (!cdata->max_requests)
778 return -EINVAL;
779 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
780 depth;
781 }
782 break;
783 }
784 default:
785 break;
786 }
787 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
788 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
789 ep->rep_attr.cap.max_recv_sge = 1;
790 ep->rep_attr.cap.max_inline_data = 0;
791 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
792 ep->rep_attr.qp_type = IB_QPT_RC;
793 ep->rep_attr.port_num = ~0;
794
795 if (cdata->padding) {
796 ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
797 GFP_KERNEL);
798 if (IS_ERR(ep->rep_padbuf))
799 return PTR_ERR(ep->rep_padbuf);
800 } else
801 ep->rep_padbuf = NULL;
802
803 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
804 "iovs: send %d recv %d\n",
805 __func__,
806 ep->rep_attr.cap.max_send_wr,
807 ep->rep_attr.cap.max_recv_wr,
808 ep->rep_attr.cap.max_send_sge,
809 ep->rep_attr.cap.max_recv_sge);
810
811 /* set trigger for requesting send completion */
812 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
813 if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
814 ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
815 else if (ep->rep_cqinit <= 2)
816 ep->rep_cqinit = 0;
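/* Illustrative example: with max_send_wr == 128, rep_cqinit starts at 63
 * and is clamped if it exceeds RPCRDMA_MAX_UNSIGNALED_SENDS, so a send
 * completion is still requested every rep_cqinit sends or so (see the
 * DECR_CQCOUNT / INIT_CQCOUNT logic in rpcrdma_ep_post()).
 */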
817 INIT_CQCOUNT(ep);
818 init_waitqueue_head(&ep->rep_connect_wait);
819 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
820
821 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
822 rpcrdma_cq_async_error_upcall, ep,
823 ep->rep_attr.cap.max_send_wr + 1, 0);
824 if (IS_ERR(sendcq)) {
825 rc = PTR_ERR(sendcq);
826 dprintk("RPC: %s: failed to create send CQ: %i\n",
827 __func__, rc);
828 goto out1;
829 }
830
831 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
832 if (rc) {
833 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
834 __func__, rc);
835 goto out2;
836 }
837
838 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
839 rpcrdma_cq_async_error_upcall, ep,
840 ep->rep_attr.cap.max_recv_wr + 1, 0);
841 if (IS_ERR(recvcq)) {
842 rc = PTR_ERR(recvcq);
843 dprintk("RPC: %s: failed to create recv CQ: %i\n",
844 __func__, rc);
845 goto out2;
846 }
847
848 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
849 if (rc) {
850 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
851 __func__, rc);
852 ib_destroy_cq(recvcq);
853 goto out2;
854 }
855
856 ep->rep_attr.send_cq = sendcq;
857 ep->rep_attr.recv_cq = recvcq;
858
859 /* Initialize cma parameters */
860
861 /* RPC/RDMA does not use private data */
862 ep->rep_remote_cma.private_data = NULL;
863 ep->rep_remote_cma.private_data_len = 0;
864
865 /* Client offers RDMA Read but does not initiate */
866 ep->rep_remote_cma.initiator_depth = 0;
867 if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */
868 ep->rep_remote_cma.responder_resources = 32;
869 else
870 ep->rep_remote_cma.responder_resources =
871 devattr->max_qp_rd_atom;
872
873 ep->rep_remote_cma.retry_count = 7;
874 ep->rep_remote_cma.flow_control = 0;
875 ep->rep_remote_cma.rnr_retry_count = 0;
876
877 return 0;
878
879 out2:
880 err = ib_destroy_cq(sendcq);
881 if (err)
882 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
883 __func__, err);
884 out1:
885 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
886 return rc;
887 }
888
889 /*
890 * rpcrdma_ep_destroy
891 *
892 * Disconnect and destroy endpoint. After this, the only
893 * valid operations on the ep are to free it (if dynamically
894 * allocated) or re-create it.
895 */
896 void
897 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
898 {
899 int rc;
900
901 dprintk("RPC: %s: entering, connected is %d\n",
902 __func__, ep->rep_connected);
903
904 cancel_delayed_work_sync(&ep->rep_connect_worker);
905
906 if (ia->ri_id->qp) {
907 rpcrdma_ep_disconnect(ep, ia);
908 rdma_destroy_qp(ia->ri_id);
909 ia->ri_id->qp = NULL;
910 }
911
912 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
913
914 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
915 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
916 if (rc)
917 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
918 __func__, rc);
919
920 rpcrdma_clean_cq(ep->rep_attr.send_cq);
921 rc = ib_destroy_cq(ep->rep_attr.send_cq);
922 if (rc)
923 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
924 __func__, rc);
925 }
926
927 /*
928 * Connect unconnected endpoint.
929 */
930 int
931 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
932 {
933 struct rdma_cm_id *id, *old;
934 int rc = 0;
935 int retry_count = 0;
936
937 if (ep->rep_connected != 0) {
938 struct rpcrdma_xprt *xprt;
939 retry:
940 dprintk("RPC: %s: reconnecting...\n", __func__);
941
942 rpcrdma_ep_disconnect(ep, ia);
943 rpcrdma_flush_cqs(ep);
944
945 switch (ia->ri_memreg_strategy) {
946 case RPCRDMA_FRMR:
947 rpcrdma_reset_frmrs(ia);
948 break;
949 case RPCRDMA_MTHCAFMR:
950 rpcrdma_reset_fmrs(ia);
951 break;
952 case RPCRDMA_ALLPHYSICAL:
953 break;
954 default:
955 rc = -EIO;
956 goto out;
957 }
958
959 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
960 id = rpcrdma_create_id(xprt, ia,
961 (struct sockaddr *)&xprt->rx_data.addr);
962 if (IS_ERR(id)) {
963 rc = -EHOSTUNREACH;
964 goto out;
965 }
966 /* TEMP TEMP TEMP - fail if new device:
967 * Deregister/remarshal *all* requests!
968 * Close and recreate adapter, pd, etc!
969 * Re-determine all attributes still sane!
970 * More stuff I haven't thought of!
971 * Rrrgh!
972 */
973 if (ia->ri_id->device != id->device) {
974 printk("RPC: %s: can't reconnect on "
975 "different device!\n", __func__);
976 rdma_destroy_id(id);
977 rc = -ENETUNREACH;
978 goto out;
979 }
980 /* END TEMP */
981 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
982 if (rc) {
983 dprintk("RPC: %s: rdma_create_qp failed %i\n",
984 __func__, rc);
985 rdma_destroy_id(id);
986 rc = -ENETUNREACH;
987 goto out;
988 }
989
990 write_lock(&ia->ri_qplock);
991 old = ia->ri_id;
992 ia->ri_id = id;
993 write_unlock(&ia->ri_qplock);
994
995 rdma_destroy_qp(old);
996 rdma_destroy_id(old);
997 } else {
998 dprintk("RPC: %s: connecting...\n", __func__);
999 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
1000 if (rc) {
1001 dprintk("RPC: %s: rdma_create_qp failed %i\n",
1002 __func__, rc);
1003 /* do not update ep->rep_connected */
1004 return -ENETUNREACH;
1005 }
1006 }
1007
1008 ep->rep_connected = 0;
1009
1010 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
1011 if (rc) {
1012 dprintk("RPC: %s: rdma_connect() failed with %i\n",
1013 __func__, rc);
1014 goto out;
1015 }
1016
1017 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
1018
1019 /*
1020 * Check state. A non-peer reject indicates no listener
1021 * (ECONNREFUSED), which may be a transient state. All
1022 * others indicate a transport condition which has already
1023 * undergone a best-effort recovery attempt.
1024 */
1025 if (ep->rep_connected == -ECONNREFUSED &&
1026 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
1027 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
1028 goto retry;
1029 }
1030 if (ep->rep_connected <= 0) {
1031 /* Sometimes, the only way to reliably connect to remote
1032 * CMs is to use the same nonzero values for ORD and IRD. */
1033 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
1034 (ep->rep_remote_cma.responder_resources == 0 ||
1035 ep->rep_remote_cma.initiator_depth !=
1036 ep->rep_remote_cma.responder_resources)) {
1037 if (ep->rep_remote_cma.responder_resources == 0)
1038 ep->rep_remote_cma.responder_resources = 1;
1039 ep->rep_remote_cma.initiator_depth =
1040 ep->rep_remote_cma.responder_resources;
1041 goto retry;
1042 }
1043 rc = ep->rep_connected;
1044 } else {
1045 dprintk("RPC: %s: connected\n", __func__);
1046 }
1047
1048 out:
1049 if (rc)
1050 ep->rep_connected = rc;
1051 return rc;
1052 }
1053
1054 /*
1055 * rpcrdma_ep_disconnect
1056 *
1057 * This is separate from destroy to facilitate the ability
1058 * to reconnect without recreating the endpoint.
1059 *
1060 * This call is not reentrant, and must not be made in parallel
1061 * on the same endpoint.
1062 */
1063 void
1064 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
1065 {
1066 int rc;
1067
1068 rpcrdma_flush_cqs(ep);
1069 rc = rdma_disconnect(ia->ri_id);
1070 if (!rc) {
1071 /* returns without wait if not connected */
1072 wait_event_interruptible(ep->rep_connect_wait,
1073 ep->rep_connected != 1);
1074 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
1075 (ep->rep_connected == 1) ? "still " : "dis");
1076 } else {
1077 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
1078 ep->rep_connected = rc;
1079 }
1080 }
1081
1082 static struct rpcrdma_req *
1083 rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
1084 {
1085 struct rpcrdma_req *req;
1086
1087 req = kzalloc(sizeof(*req), GFP_KERNEL);
1088 if (req == NULL)
1089 return ERR_PTR(-ENOMEM);
1090
1091 req->rl_buffer = &r_xprt->rx_buf;
1092 return req;
1093 }
1094
1095 static struct rpcrdma_rep *
1096 rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
1097 {
1098 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1099 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1100 struct rpcrdma_rep *rep;
1101 int rc;
1102
1103 rc = -ENOMEM;
1104 rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1105 if (rep == NULL)
1106 goto out;
1107
1108 rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
1109 GFP_KERNEL);
1110 if (IS_ERR(rep->rr_rdmabuf)) {
1111 rc = PTR_ERR(rep->rr_rdmabuf);
1112 goto out_free;
1113 }
1114
1115 rep->rr_buffer = &r_xprt->rx_buf;
1116 return rep;
1117
1118 out_free:
1119 kfree(rep);
1120 out:
1121 return ERR_PTR(rc);
1122 }
1123
1124 static int
1125 rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1126 {
1127 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1128 struct ib_fmr_attr fmr_attr = {
1129 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1130 .max_maps = 1,
1131 .page_shift = PAGE_SHIFT
1132 };
1133 struct rpcrdma_mw *r;
1134 int i, rc;
1135
1136 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1137 dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);
1138
1139 while (i--) {
1140 r = kzalloc(sizeof(*r), GFP_KERNEL);
1141 if (r == NULL)
1142 return -ENOMEM;
1143
1144 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1145 if (IS_ERR(r->r.fmr)) {
1146 rc = PTR_ERR(r->r.fmr);
1147 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1148 __func__, rc);
1149 goto out_free;
1150 }
1151
1152 list_add(&r->mw_list, &buf->rb_mws);
1153 list_add(&r->mw_all, &buf->rb_all);
1154 }
1155 return 0;
1156
1157 out_free:
1158 kfree(r);
1159 return rc;
1160 }
1161
1162 static int
1163 rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1164 {
1165 struct rpcrdma_frmr *f;
1166 struct rpcrdma_mw *r;
1167 int i, rc;
1168
1169 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1170 dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, i);
1171
1172 while (i--) {
1173 r = kzalloc(sizeof(*r), GFP_KERNEL);
1174 if (r == NULL)
1175 return -ENOMEM;
1176 f = &r->r.frmr;
1177
1178 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1179 ia->ri_max_frmr_depth);
1180 if (IS_ERR(f->fr_mr)) {
1181 rc = PTR_ERR(f->fr_mr);
1182 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1183 "failed %i\n", __func__, rc);
1184 goto out_free;
1185 }
1186
1187 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1188 ia->ri_max_frmr_depth);
1189 if (IS_ERR(f->fr_pgl)) {
1190 rc = PTR_ERR(f->fr_pgl);
1191 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1192 "failed %i\n", __func__, rc);
1193
1194 ib_dereg_mr(f->fr_mr);
1195 goto out_free;
1196 }
1197
1198 list_add(&r->mw_list, &buf->rb_mws);
1199 list_add(&r->mw_all, &buf->rb_all);
1200 }
1201
1202 return 0;
1203
1204 out_free:
1205 kfree(r);
1206 return rc;
1207 }
1208
1209 int
1210 rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1211 {
1212 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1213 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1214 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1215 char *p;
1216 size_t len;
1217 int i, rc;
1218
1219 buf->rb_max_requests = cdata->max_requests;
1220 spin_lock_init(&buf->rb_lock);
1221
1222 /* Need to allocate:
1223 * 1. arrays for send and recv pointers
1224 * 2. arrays of struct rpcrdma_req to fill in pointers
1225 * 3. array of struct rpcrdma_rep for replies
1226 * Send/recv buffers in req/rep need to be registered
1227 */
1228 len = buf->rb_max_requests *
1229 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1230
1231 p = kzalloc(len, GFP_KERNEL);
1232 if (p == NULL) {
1233 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1234 __func__, len);
1235 rc = -ENOMEM;
1236 goto out;
1237 }
1238 buf->rb_pool = p; /* for freeing it later */
1239
1240 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1241 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1242 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1243 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1244
1245 INIT_LIST_HEAD(&buf->rb_mws);
1246 INIT_LIST_HEAD(&buf->rb_all);
1247 switch (ia->ri_memreg_strategy) {
1248 case RPCRDMA_FRMR:
1249 rc = rpcrdma_init_frmrs(ia, buf);
1250 if (rc)
1251 goto out;
1252 break;
1253 case RPCRDMA_MTHCAFMR:
1254 rc = rpcrdma_init_fmrs(ia, buf);
1255 if (rc)
1256 goto out;
1257 break;
1258 default:
1259 break;
1260 }
1261
1262 for (i = 0; i < buf->rb_max_requests; i++) {
1263 struct rpcrdma_req *req;
1264 struct rpcrdma_rep *rep;
1265
1266 req = rpcrdma_create_req(r_xprt);
1267 if (IS_ERR(req)) {
1268 dprintk("RPC: %s: request buffer %d alloc"
1269 " failed\n", __func__, i);
1270 rc = PTR_ERR(req);
1271 goto out;
1272 }
1273 buf->rb_send_bufs[i] = req;
1274
1275 rep = rpcrdma_create_rep(r_xprt);
1276 if (IS_ERR(rep)) {
1277 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1278 __func__, i);
1279 rc = PTR_ERR(rep);
1280 goto out;
1281 }
1282 buf->rb_recv_bufs[i] = rep;
1283 }
1284
1285 return 0;
1286 out:
1287 rpcrdma_buffer_destroy(buf);
1288 return rc;
1289 }
1290
1291 static void
1292 rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
1293 {
1294 if (!rep)
1295 return;
1296
1297 rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
1298 kfree(rep);
1299 }
1300
1301 static void
1302 rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
1303 {
1304 if (!req)
1305 return;
1306
1307 rpcrdma_free_regbuf(ia, req->rl_sendbuf);
1308 rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
1309 kfree(req);
1310 }
1311
1312 static void
1313 rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1314 {
1315 struct rpcrdma_mw *r;
1316 int rc;
1317
1318 while (!list_empty(&buf->rb_all)) {
1319 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1320 list_del(&r->mw_all);
1321 list_del(&r->mw_list);
1322
1323 rc = ib_dealloc_fmr(r->r.fmr);
1324 if (rc)
1325 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1326 __func__, rc);
1327
1328 kfree(r);
1329 }
1330 }
1331
1332 static void
1333 rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1334 {
1335 struct rpcrdma_mw *r;
1336 int rc;
1337
1338 while (!list_empty(&buf->rb_all)) {
1339 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1340 list_del(&r->mw_all);
1341 list_del(&r->mw_list);
1342
1343 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1344 if (rc)
1345 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1346 __func__, rc);
1347 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1348
1349 kfree(r);
1350 }
1351 }
1352
1353 void
1354 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1355 {
1356 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1357 int i;
1358
1359 /* clean up in reverse order from create
1360 * 1. recv mr memory (mr free, then kfree)
1361 * 2. send mr memory (mr free, then kfree)
1362 * 3. MWs
1363 */
1364 dprintk("RPC: %s: entering\n", __func__);
1365
1366 for (i = 0; i < buf->rb_max_requests; i++) {
1367 if (buf->rb_recv_bufs)
1368 rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
1369 if (buf->rb_send_bufs)
1370 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
1371 }
1372
1373 switch (ia->ri_memreg_strategy) {
1374 case RPCRDMA_FRMR:
1375 rpcrdma_destroy_frmrs(buf);
1376 break;
1377 case RPCRDMA_MTHCAFMR:
1378 rpcrdma_destroy_fmrs(buf);
1379 break;
1380 default:
1381 break;
1382 }
1383
1384 kfree(buf->rb_pool);
1385 }
1386
1387 /* After a disconnect, unmap all FMRs.
1388 *
1389 * This is invoked only in the transport connect worker in order
1390 * to serialize with rpcrdma_register_fmr_external().
1391 */
1392 static void
1393 rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
1394 {
1395 struct rpcrdma_xprt *r_xprt =
1396 container_of(ia, struct rpcrdma_xprt, rx_ia);
1397 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1398 struct list_head *pos;
1399 struct rpcrdma_mw *r;
1400 LIST_HEAD(l);
1401 int rc;
1402
1403 list_for_each(pos, &buf->rb_all) {
1404 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1405
1406 INIT_LIST_HEAD(&l);
1407 list_add(&r->r.fmr->list, &l);
1408 rc = ib_unmap_fmr(&l);
1409 if (rc)
1410 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
1411 __func__, rc);
1412 }
1413 }
1414
1415 /* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1416 * an unusable state. Find FRMRs in this state and dereg / reg
1417 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1418 * also torn down.
1419 *
1420 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1421 *
1422 * This is invoked only in the transport connect worker in order
1423 * to serialize with rpcrdma_register_frmr_external().
1424 */
1425 static void
1426 rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1427 {
1428 struct rpcrdma_xprt *r_xprt =
1429 container_of(ia, struct rpcrdma_xprt, rx_ia);
1430 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1431 struct list_head *pos;
1432 struct rpcrdma_mw *r;
1433 int rc;
1434
1435 list_for_each(pos, &buf->rb_all) {
1436 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1437
1438 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1439 continue;
1440
1441 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1442 if (rc)
1443 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1444 __func__, rc);
1445 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1446
1447 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1448 ia->ri_max_frmr_depth);
1449 if (IS_ERR(r->r.frmr.fr_mr)) {
1450 rc = PTR_ERR(r->r.frmr.fr_mr);
1451 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1452 " failed %i\n", __func__, rc);
1453 continue;
1454 }
1455 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1456 ia->ri_id->device,
1457 ia->ri_max_frmr_depth);
1458 if (IS_ERR(r->r.frmr.fr_pgl)) {
1459 rc = PTR_ERR(r->r.frmr.fr_pgl);
1460 dprintk("RPC: %s: "
1461 "ib_alloc_fast_reg_page_list "
1462 "failed %i\n", __func__, rc);
1463
1464 ib_dereg_mr(r->r.frmr.fr_mr);
1465 continue;
1466 }
1467 r->r.frmr.fr_state = FRMR_IS_INVALID;
1468 }
1469 }
1470
1471 /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1472 * some req segments uninitialized.
1473 */
1474 static void
1475 rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1476 {
1477 if (*mw) {
1478 list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
1479 *mw = NULL;
1480 }
1481 }
1482
1483 /* Cycle mw's back in reverse order, and "spin" them.
1484 * This delays and scrambles reuse as much as possible.
1485 */
1486 static void
1487 rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1488 {
1489 struct rpcrdma_mr_seg *seg = req->rl_segments;
1490 struct rpcrdma_mr_seg *seg1 = seg;
1491 int i;
1492
1493 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
1494 rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
1495 rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
1496 }
1497
1498 static void
1499 rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1500 {
1501 buf->rb_send_bufs[--buf->rb_send_index] = req;
1502 req->rl_niovs = 0;
1503 if (req->rl_reply) {
1504 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1505 req->rl_reply->rr_func = NULL;
1506 req->rl_reply = NULL;
1507 }
1508 }
1509
1510 /* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
1511 * Redo only the ib_post_send().
1512 */
1513 static void
1514 rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1515 {
1516 struct rpcrdma_xprt *r_xprt =
1517 container_of(ia, struct rpcrdma_xprt, rx_ia);
1518 struct ib_send_wr invalidate_wr, *bad_wr;
1519 int rc;
1520
1521 dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
1522
1523 /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
1524 r->r.frmr.fr_state = FRMR_IS_INVALID;
1525
1526 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1527 invalidate_wr.wr_id = (unsigned long)(void *)r;
1528 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1529 invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1530 DECR_CQCOUNT(&r_xprt->rx_ep);
1531
1532 dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
1533 __func__, r, r->r.frmr.fr_mr->rkey);
1534
1535 read_lock(&ia->ri_qplock);
1536 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1537 read_unlock(&ia->ri_qplock);
1538 if (rc) {
1539 /* Force rpcrdma_buffer_get() to retry */
1540 r->r.frmr.fr_state = FRMR_IS_STALE;
1541 dprintk("RPC: %s: ib_post_send failed, %i\n",
1542 __func__, rc);
1543 }
1544 }
1545
1546 static void
1547 rpcrdma_retry_flushed_linv(struct list_head *stale,
1548 struct rpcrdma_buffer *buf)
1549 {
1550 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1551 struct list_head *pos;
1552 struct rpcrdma_mw *r;
1553 unsigned long flags;
1554
1555 list_for_each(pos, stale) {
1556 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1557 rpcrdma_retry_local_inv(r, ia);
1558 }
1559
1560 spin_lock_irqsave(&buf->rb_lock, flags);
1561 list_splice_tail(stale, &buf->rb_mws);
1562 spin_unlock_irqrestore(&buf->rb_lock, flags);
1563 }
1564
1565 static struct rpcrdma_req *
1566 rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1567 struct list_head *stale)
1568 {
1569 struct rpcrdma_mw *r;
1570 int i;
1571
1572 i = RPCRDMA_MAX_SEGS - 1;
1573 while (!list_empty(&buf->rb_mws)) {
1574 r = list_entry(buf->rb_mws.next,
1575 struct rpcrdma_mw, mw_list);
1576 list_del(&r->mw_list);
1577 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1578 list_add(&r->mw_list, stale);
1579 continue;
1580 }
1581 req->rl_segments[i].rl_mw = r;
1582 if (unlikely(i-- == 0))
1583 return req; /* Success */
1584 }
1585
1586 /* Not enough entries on rb_mws for this req */
1587 rpcrdma_buffer_put_sendbuf(req, buf);
1588 rpcrdma_buffer_put_mrs(req, buf);
1589 return NULL;
1590 }
1591
1592 static struct rpcrdma_req *
1593 rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1594 {
1595 struct rpcrdma_mw *r;
1596 int i;
1597
1598 i = RPCRDMA_MAX_SEGS - 1;
1599 while (!list_empty(&buf->rb_mws)) {
1600 r = list_entry(buf->rb_mws.next,
1601 struct rpcrdma_mw, mw_list);
1602 list_del(&r->mw_list);
1603 req->rl_segments[i].rl_mw = r;
1604 if (unlikely(i-- == 0))
1605 return req; /* Success */
1606 }
1607
1608 /* Not enough entries on rb_mws for this req */
1609 rpcrdma_buffer_put_sendbuf(req, buf);
1610 rpcrdma_buffer_put_mrs(req, buf);
1611 return NULL;
1612 }
1613
1614 /*
1615 * Get a set of request/reply buffers.
1616 *
1617 * Reply buffer (if needed) is attached to send buffer upon return.
1618 * Rule:
1619 * rb_send_index and rb_recv_index MUST always be pointing to the
1620 * *next* available buffer (non-NULL). They are incremented after
1621 * removing buffers, and decremented *before* returning them.
1622 */
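/* Illustrative walk-through: with rb_send_index == rb_recv_index == 0 and
 * all buffers free, one call hands out rb_send_bufs[0] and rb_recv_bufs[0],
 * NULLs both slots, and leaves both indexes at 1; rpcrdma_buffer_put()
 * later decrements them and restores the pointers.
 */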
1623 struct rpcrdma_req *
1624 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1625 {
1626 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1627 struct list_head stale;
1628 struct rpcrdma_req *req;
1629 unsigned long flags;
1630
1631 spin_lock_irqsave(&buffers->rb_lock, flags);
1632 if (buffers->rb_send_index == buffers->rb_max_requests) {
1633 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1634 dprintk("RPC: %s: out of request buffers\n", __func__);
1635 return ((struct rpcrdma_req *)NULL);
1636 }
1637
1638 req = buffers->rb_send_bufs[buffers->rb_send_index];
1639 if (buffers->rb_send_index < buffers->rb_recv_index) {
1640 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1641 __func__,
1642 buffers->rb_recv_index - buffers->rb_send_index);
1643 req->rl_reply = NULL;
1644 } else {
1645 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1646 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1647 }
1648 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1649
1650 INIT_LIST_HEAD(&stale);
1651 switch (ia->ri_memreg_strategy) {
1652 case RPCRDMA_FRMR:
1653 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1654 break;
1655 case RPCRDMA_MTHCAFMR:
1656 req = rpcrdma_buffer_get_fmrs(req, buffers);
1657 break;
1658 default:
1659 break;
1660 }
1661 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1662 if (!list_empty(&stale))
1663 rpcrdma_retry_flushed_linv(&stale, buffers);
1664 return req;
1665 }
1666
1667 /*
1668 * Put request/reply buffers back into pool.
1669 * Pre-decrement counter/array index.
1670 */
1671 void
1672 rpcrdma_buffer_put(struct rpcrdma_req *req)
1673 {
1674 struct rpcrdma_buffer *buffers = req->rl_buffer;
1675 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1676 unsigned long flags;
1677
1678 spin_lock_irqsave(&buffers->rb_lock, flags);
1679 rpcrdma_buffer_put_sendbuf(req, buffers);
1680 switch (ia->ri_memreg_strategy) {
1681 case RPCRDMA_FRMR:
1682 case RPCRDMA_MTHCAFMR:
1683 rpcrdma_buffer_put_mrs(req, buffers);
1684 break;
1685 default:
1686 break;
1687 }
1688 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1689 }
1690
1691 /*
1692 * Recover reply buffers from pool.
1693 * This happens when recovering from error conditions.
1694 * Post-increment counter/array index.
1695 */
1696 void
1697 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1698 {
1699 struct rpcrdma_buffer *buffers = req->rl_buffer;
1700 unsigned long flags;
1701
1702 spin_lock_irqsave(&buffers->rb_lock, flags);
1703 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1704 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1705 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1706 }
1707 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1708 }
1709
1710 /*
1711 * Put reply buffers back into pool when not attached to
1712 * request. This happens in error conditions.
1713 */
1714 void
1715 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1716 {
1717 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1718 unsigned long flags;
1719
1720 rep->rr_func = NULL;
1721 spin_lock_irqsave(&buffers->rb_lock, flags);
1722 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1723 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1724 }
1725
1726 /*
1727 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1728 */
1729
1730 static int
1731 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1732 struct ib_mr **mrp, struct ib_sge *iov)
1733 {
1734 struct ib_phys_buf ipb;
1735 struct ib_mr *mr;
1736 int rc;
1737
1738 /*
1739 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1740 */
1741 iov->addr = ib_dma_map_single(ia->ri_id->device,
1742 va, len, DMA_BIDIRECTIONAL);
1743 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1744 return -ENOMEM;
1745
1746 iov->length = len;
1747
1748 if (ia->ri_have_dma_lkey) {
1749 *mrp = NULL;
1750 iov->lkey = ia->ri_dma_lkey;
1751 return 0;
1752 } else if (ia->ri_bind_mem != NULL) {
1753 *mrp = NULL;
1754 iov->lkey = ia->ri_bind_mem->lkey;
1755 return 0;
1756 }
1757
1758 ipb.addr = iov->addr;
1759 ipb.size = iov->length;
1760 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1761 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1762
1763 dprintk("RPC: %s: phys convert: 0x%llx "
1764 "registered 0x%llx length %d\n",
1765 __func__, (unsigned long long)ipb.addr,
1766 (unsigned long long)iov->addr, len);
1767
1768 if (IS_ERR(mr)) {
1769 *mrp = NULL;
1770 rc = PTR_ERR(mr);
1771 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1772 } else {
1773 *mrp = mr;
1774 iov->lkey = mr->lkey;
1775 rc = 0;
1776 }
1777
1778 return rc;
1779 }
1780
1781 static int
1782 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1783 struct ib_mr *mr, struct ib_sge *iov)
1784 {
1785 int rc;
1786
1787 ib_dma_unmap_single(ia->ri_id->device,
1788 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1789
1790 if (NULL == mr)
1791 return 0;
1792
1793 rc = ib_dereg_mr(mr);
1794 if (rc)
1795 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1796 return rc;
1797 }
1798
1799 /**
1800 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
1801 * @ia: controlling rpcrdma_ia
1802 * @size: size of buffer to be allocated, in bytes
1803 * @flags: GFP flags
1804 *
1805 * Returns pointer to private header of an area of internally
1806 * registered memory, or an ERR_PTR. The registered buffer follows
1807 * the end of the private header.
1808 *
1809 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
1810 * receiving the payload of RDMA RECV operations. regbufs are not
1811 * used for RDMA READ/WRITE operations, thus are registered only for
1812 * LOCAL access.
1813 */
1814 struct rpcrdma_regbuf *
1815 rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
1816 {
1817 struct rpcrdma_regbuf *rb;
1818 int rc;
1819
1820 rc = -ENOMEM;
1821 rb = kmalloc(sizeof(*rb) + size, flags);
1822 if (rb == NULL)
1823 goto out;
1824
1825 rb->rg_size = size;
1826 rb->rg_owner = NULL;
1827 rc = rpcrdma_register_internal(ia, rb->rg_base, size,
1828 &rb->rg_mr, &rb->rg_iov);
1829 if (rc)
1830 goto out_free;
1831
1832 return rb;
1833
1834 out_free:
1835 kfree(rb);
1836 out:
1837 return ERR_PTR(rc);
1838 }
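/* For example, rpcrdma_create_rep() above uses this helper to set up a
 * DMA-mapped receive buffer of cdata->inline_rsize bytes, and
 * rpcrdma_ep_create() uses it for the optional padding buffer.
 */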
1839
1840 /**
1841 * rpcrdma_free_regbuf - deregister and free registered buffer
1842 * @ia: controlling rpcrdma_ia
1843 * @rb: regbuf to be deregistered and freed
1844 */
1845 void
1846 rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
1847 {
1848 if (rb) {
1849 rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
1850 kfree(rb);
1851 }
1852 }
1853
1854 /*
1855 * Wrappers for chunk registration, shared by read/write chunk code.
1856 */
1857
1858 static void
1859 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1860 {
1861 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1862 seg->mr_dmalen = seg->mr_len;
1863 if (seg->mr_page)
1864 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1865 seg->mr_page, offset_in_page(seg->mr_offset),
1866 seg->mr_dmalen, seg->mr_dir);
1867 else
1868 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1869 seg->mr_offset,
1870 seg->mr_dmalen, seg->mr_dir);
1871 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1872 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1873 __func__,
1874 (unsigned long long)seg->mr_dma,
1875 seg->mr_offset, seg->mr_dmalen);
1876 }
1877 }
1878
1879 static void
1880 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1881 {
1882 if (seg->mr_page)
1883 ib_dma_unmap_page(ia->ri_id->device,
1884 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1885 else
1886 ib_dma_unmap_single(ia->ri_id->device,
1887 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1888 }
1889
1890 static int
1891 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1892 int *nsegs, int writing, struct rpcrdma_ia *ia,
1893 struct rpcrdma_xprt *r_xprt)
1894 {
1895 struct rpcrdma_mr_seg *seg1 = seg;
1896 struct rpcrdma_mw *mw = seg1->rl_mw;
1897 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1898 struct ib_mr *mr = frmr->fr_mr;
1899 struct ib_send_wr fastreg_wr, *bad_wr;
1900 u8 key;
1901 int len, pageoff;
1902 int i, rc;
1903 int seg_len;
1904 u64 pa;
1905 int page_no;
1906
1907 pageoff = offset_in_page(seg1->mr_offset);
1908 seg1->mr_offset -= pageoff; /* start of page */
1909 seg1->mr_len += pageoff;
1910 len = -pageoff;
1911 if (*nsegs > ia->ri_max_frmr_depth)
1912 *nsegs = ia->ri_max_frmr_depth;
1913 for (page_no = i = 0; i < *nsegs;) {
1914 rpcrdma_map_one(ia, seg, writing);
1915 pa = seg->mr_dma;
1916 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1917 frmr->fr_pgl->page_list[page_no++] = pa;
1918 pa += PAGE_SIZE;
1919 }
1920 len += seg->mr_len;
1921 ++seg;
1922 ++i;
1923 /* Check for holes */
1924 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1925 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1926 break;
1927 }
1928 dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n",
1929 __func__, mw, i, len);
1930
1931 frmr->fr_state = FRMR_IS_VALID;
1932
1933 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1934 fastreg_wr.wr_id = (unsigned long)(void *)mw;
1935 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1936 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
1937 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1938 fastreg_wr.wr.fast_reg.page_list_len = page_no;
1939 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1940 fastreg_wr.wr.fast_reg.length = len;
1941
1942 /* Bump the key */
1943 key = (u8)(mr->rkey & 0x000000FF);
1944 ib_update_fast_reg_key(mr, ++key);
1945
1946 fastreg_wr.wr.fast_reg.access_flags = (writing ?
1947 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1948 IB_ACCESS_REMOTE_READ);
1949 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
1950 DECR_CQCOUNT(&r_xprt->rx_ep);
1951
1952 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
1953 if (rc) {
1954 dprintk("RPC: %s: failed ib_post_send for register,"
1955 " status %i\n", __func__, rc);
1956 ib_update_fast_reg_key(mr, --key);
1957 goto out_err;
1958 } else {
1959 seg1->mr_rkey = mr->rkey;
1960 seg1->mr_base = seg1->mr_dma + pageoff;
1961 seg1->mr_nsegs = i;
1962 seg1->mr_len = len;
1963 }
1964 *nsegs = i;
1965 return 0;
1966 out_err:
1967 frmr->fr_state = FRMR_IS_INVALID;
1968 while (i--)
1969 rpcrdma_unmap_one(ia, --seg);
1970 return rc;
1971 }
1972
1973 static int
1974 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1975 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1976 {
1977 struct rpcrdma_mr_seg *seg1 = seg;
1978 struct ib_send_wr invalidate_wr, *bad_wr;
1979 int rc;
1980
1981 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1982
1983 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1984 invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
1985 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1986 invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
1987 DECR_CQCOUNT(&r_xprt->rx_ep);
1988
1989 read_lock(&ia->ri_qplock);
1990 while (seg1->mr_nsegs--)
1991 rpcrdma_unmap_one(ia, seg++);
1992 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1993 read_unlock(&ia->ri_qplock);
1994 if (rc) {
1995 /* Force rpcrdma_buffer_get() to retry */
1996 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
1997 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1998 " status %i\n", __func__, rc);
1999 }
2000 return rc;
2001 }
2002
2003 static int
2004 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
2005 int *nsegs, int writing, struct rpcrdma_ia *ia)
2006 {
2007 struct rpcrdma_mr_seg *seg1 = seg;
2008 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
2009 int len, pageoff, i, rc;
2010
2011 pageoff = offset_in_page(seg1->mr_offset);
2012 seg1->mr_offset -= pageoff; /* start of page */
2013 seg1->mr_len += pageoff;
2014 len = -pageoff;
2015 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
2016 *nsegs = RPCRDMA_MAX_DATA_SEGS;
2017 for (i = 0; i < *nsegs;) {
2018 rpcrdma_map_one(ia, seg, writing);
2019 physaddrs[i] = seg->mr_dma;
2020 len += seg->mr_len;
2021 ++seg;
2022 ++i;
2023 /* Check for holes */
2024 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
2025 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
2026 break;
2027 }
2028 rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
2029 if (rc) {
2030 dprintk("RPC: %s: failed ib_map_phys_fmr "
2031 "%u@0x%llx+%i (%d)... status %i\n", __func__,
2032 len, (unsigned long long)seg1->mr_dma,
2033 pageoff, i, rc);
2034 while (i--)
2035 rpcrdma_unmap_one(ia, --seg);
2036 } else {
2037 seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
2038 seg1->mr_base = seg1->mr_dma + pageoff;
2039 seg1->mr_nsegs = i;
2040 seg1->mr_len = len;
2041 }
2042 *nsegs = i;
2043 return rc;
2044 }
2045
2046 static int
2047 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
2048 struct rpcrdma_ia *ia)
2049 {
2050 struct rpcrdma_mr_seg *seg1 = seg;
2051 LIST_HEAD(l);
2052 int rc;
2053
2054 list_add(&seg1->rl_mw->r.fmr->list, &l);
2055 rc = ib_unmap_fmr(&l);
2056 read_lock(&ia->ri_qplock);
2057 while (seg1->mr_nsegs--)
2058 rpcrdma_unmap_one(ia, seg++);
2059 read_unlock(&ia->ri_qplock);
2060 if (rc)
2061 dprintk("RPC: %s: failed ib_unmap_fmr,"
2062 " status %i\n", __func__, rc);
2063 return rc;
2064 }
2065
2066 int
2067 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
2068 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
2069 {
2070 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2071 int rc = 0;
2072
2073 switch (ia->ri_memreg_strategy) {
2074
2075 case RPCRDMA_ALLPHYSICAL:
2076 rpcrdma_map_one(ia, seg, writing);
2077 seg->mr_rkey = ia->ri_bind_mem->rkey;
2078 seg->mr_base = seg->mr_dma;
2079 seg->mr_nsegs = 1;
2080 nsegs = 1;
2081 break;
2082
2083 /* Registration using FRMRs (fast registration memory regions) */
2084 case RPCRDMA_FRMR:
2085 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
2086 break;
2087
2088 /* Registration using FMRs (fast memory regions) */
2089 case RPCRDMA_MTHCAFMR:
2090 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
2091 break;
2092
2093 default:
2094 return -EIO;
2095 }
2096 if (rc)
2097 return rc;
2098
2099 return nsegs;
2100 }
2101
2102 int
2103 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
2104 struct rpcrdma_xprt *r_xprt)
2105 {
2106 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2107 int nsegs = seg->mr_nsegs, rc;
2108
2109 switch (ia->ri_memreg_strategy) {
2110
2111 case RPCRDMA_ALLPHYSICAL:
2112 read_lock(&ia->ri_qplock);
2113 rpcrdma_unmap_one(ia, seg);
2114 read_unlock(&ia->ri_qplock);
2115 break;
2116
2117 case RPCRDMA_FRMR:
2118 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
2119 break;
2120
2121 case RPCRDMA_MTHCAFMR:
2122 rc = rpcrdma_deregister_fmr_external(seg, ia);
2123 break;
2124
2125 default:
2126 break;
2127 }
2128 return nsegs;
2129 }
2130
2131 /*
2132 * Prepost any receive buffer, then post send.
2133 *
2134 * Receive buffer is donated to hardware, reclaimed upon recv completion.
2135 */
2136 int
2137 rpcrdma_ep_post(struct rpcrdma_ia *ia,
2138 struct rpcrdma_ep *ep,
2139 struct rpcrdma_req *req)
2140 {
2141 struct ib_send_wr send_wr, *send_wr_fail;
2142 struct rpcrdma_rep *rep = req->rl_reply;
2143 int rc;
2144
2145 if (rep) {
2146 rc = rpcrdma_ep_post_recv(ia, ep, rep);
2147 if (rc)
2148 goto out;
2149 req->rl_reply = NULL;
2150 }
2151
2152 send_wr.next = NULL;
2153 send_wr.wr_id = 0ULL; /* no send cookie */
2154 send_wr.sg_list = req->rl_send_iov;
2155 send_wr.num_sge = req->rl_niovs;
2156 send_wr.opcode = IB_WR_SEND;
2157 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
2158 ib_dma_sync_single_for_device(ia->ri_id->device,
2159 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
2160 DMA_TO_DEVICE);
2161 ib_dma_sync_single_for_device(ia->ri_id->device,
2162 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
2163 DMA_TO_DEVICE);
2164 ib_dma_sync_single_for_device(ia->ri_id->device,
2165 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
2166 DMA_TO_DEVICE);
2167
2168 if (DECR_CQCOUNT(ep) > 0)
2169 send_wr.send_flags = 0;
2170 else { /* Provider must take a send completion every now and then */
2171 INIT_CQCOUNT(ep);
2172 send_wr.send_flags = IB_SEND_SIGNALED;
2173 }
2174
2175 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
2176 if (rc)
2177 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
2178 rc);
2179 out:
2180 return rc;
2181 }
2182
2183 /*
2184 * (Re)post a receive buffer.
2185 */
2186 int
2187 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2188 struct rpcrdma_ep *ep,
2189 struct rpcrdma_rep *rep)
2190 {
2191 struct ib_recv_wr recv_wr, *recv_wr_fail;
2192 int rc;
2193
2194 recv_wr.next = NULL;
2195 recv_wr.wr_id = (u64) (unsigned long) rep;
2196 recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
2197 recv_wr.num_sge = 1;
2198
2199 ib_dma_sync_single_for_cpu(ia->ri_id->device,
2200 rdmab_addr(rep->rr_rdmabuf),
2201 rdmab_length(rep->rr_rdmabuf),
2202 DMA_BIDIRECTIONAL);
2203
2204 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
2205
2206 if (rc)
2207 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
2208 rc);
2209 return rc;
2210 }
2211
2212 /* Physical mapping means one Read/Write list entry per-page.
2213 * All list entries must fit within an inline buffer
2214 *
2215 * NB: The server must return a Write list for NFS READ,
2216 * which has the same constraint. Factor in the inline
2217 * rsize as well.
2218 */
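/* Rough illustration (assumed sizes, not taken from this file): with
 * 1024-byte inline buffers, a 28-byte header minimum, and 16-byte
 * rpcrdma_segment entries, (1024 - 28) / 16 = 62 list entries, i.e.
 * 62 pages of payload.
 */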
2219 static size_t
2220 rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
2221 {
2222 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2223 unsigned int inline_size, pages;
2224
2225 inline_size = min_t(unsigned int,
2226 cdata->inline_wsize, cdata->inline_rsize);
2227 inline_size -= RPCRDMA_HDRLEN_MIN;
2228 pages = inline_size / sizeof(struct rpcrdma_segment);
2229 return pages << PAGE_SHIFT;
2230 }
2231
2232 static size_t
2233 rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2234 {
2235 return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2236 }
2237
2238 size_t
2239 rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2240 {
2241 size_t result;
2242
2243 switch (r_xprt->rx_ia.ri_memreg_strategy) {
2244 case RPCRDMA_ALLPHYSICAL:
2245 result = rpcrdma_physical_max_payload(r_xprt);
2246 break;
2247 default:
2248 result = rpcrdma_mr_max_payload(r_xprt);
2249 }
2250 return result;
2251 }