xprtrdma: Add vector of ops for each memory registration strategy
net/sunrpc/xprtrdma/verbs.c
1 /*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 /*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50 #include <linux/interrupt.h>
51 #include <linux/slab.h>
52 #include <linux/prefetch.h>
53 #include <linux/sunrpc/addr.h>
54 #include <asm/bitops.h>
55
56 #include "xprt_rdma.h"
57
58 /*
59 * Globals/Macros
60 */
61
62 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
63 # define RPCDBG_FACILITY RPCDBG_TRANS
64 #endif
65
66 static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
67 static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
68
69 /*
70 * internal functions
71 */
72
73 /*
74 * handle replies in tasklet context, using a single, global list
75 * rdma tasklet function -- just turn around and call the func
76 * for all replies on the list
77 */
78
79 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
80 static LIST_HEAD(rpcrdma_tasklets_g);
81
82 static void
83 rpcrdma_run_tasklet(unsigned long data)
84 {
85 struct rpcrdma_rep *rep;
86 void (*func)(struct rpcrdma_rep *);
87 unsigned long flags;
88
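	/* The tasklet argument is unused; the self-assignment below
	 * exists only to quiet "unused parameter" warnings.
	 */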
89 data = data;
90 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
91 while (!list_empty(&rpcrdma_tasklets_g)) {
92 rep = list_entry(rpcrdma_tasklets_g.next,
93 struct rpcrdma_rep, rr_list);
94 list_del(&rep->rr_list);
95 func = rep->rr_func;
96 rep->rr_func = NULL;
97 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
98
99 if (func)
100 func(rep);
101 else
102 rpcrdma_recv_buffer_put(rep);
103
104 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
105 }
106 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
107 }
108
109 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
110
111 static const char * const async_event[] = {
112 "CQ error",
113 "QP fatal error",
114 "QP request error",
115 "QP access error",
116 "communication established",
117 "send queue drained",
118 "path migration successful",
119 "path mig error",
120 "device fatal error",
121 "port active",
122 "port error",
123 "LID change",
124 "P_key change",
125 "SM change",
126 "SRQ error",
127 "SRQ limit reached",
128 "last WQE reached",
129 "client reregister",
130 "GID change",
131 };
132
133 #define ASYNC_MSG(status) \
134 ((status) < ARRAY_SIZE(async_event) ? \
135 async_event[(status)] : "unknown async error")
136
137 static void
138 rpcrdma_schedule_tasklet(struct list_head *sched_list)
139 {
140 unsigned long flags;
141
142 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
143 list_splice_tail(sched_list, &rpcrdma_tasklets_g);
144 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
145 tasklet_schedule(&rpcrdma_tasklet_g);
146 }
147
148 static void
149 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
150 {
151 struct rpcrdma_ep *ep = context;
152
153 pr_err("RPC: %s: %s on device %s ep %p\n",
154 __func__, ASYNC_MSG(event->event),
155 event->device->name, context);
156 if (ep->rep_connected == 1) {
157 ep->rep_connected = -EIO;
158 rpcrdma_conn_func(ep);
159 wake_up_all(&ep->rep_connect_wait);
160 }
161 }
162
163 static void
164 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
165 {
166 struct rpcrdma_ep *ep = context;
167
168 pr_err("RPC: %s: %s on device %s ep %p\n",
169 __func__, ASYNC_MSG(event->event),
170 event->device->name, context);
171 if (ep->rep_connected == 1) {
172 ep->rep_connected = -EIO;
173 rpcrdma_conn_func(ep);
174 wake_up_all(&ep->rep_connect_wait);
175 }
176 }
177
178 static const char * const wc_status[] = {
179 "success",
180 "local length error",
181 "local QP operation error",
182 "local EE context operation error",
183 "local protection error",
184 "WR flushed",
185 "memory management operation error",
186 "bad response error",
187 "local access error",
188 "remote invalid request error",
189 "remote access error",
190 "remote operation error",
191 "transport retry counter exceeded",
 192         "RNR retry counter exceeded",
193 "local RDD violation error",
 194         "remote invalid RD request",
195 "operation aborted",
196 "invalid EE context number",
197 "invalid EE context state",
198 "fatal error",
199 "response timeout error",
200 "general error",
201 };
202
203 #define COMPLETION_MSG(status) \
204 ((status) < ARRAY_SIZE(wc_status) ? \
205 wc_status[(status)] : "unexpected completion error")
206
207 static void
208 rpcrdma_sendcq_process_wc(struct ib_wc *wc)
209 {
210 if (likely(wc->status == IB_WC_SUCCESS))
211 return;
212
213 /* WARNING: Only wr_id and status are reliable at this point */
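	/* A zero wr_id marks a plain RDMA SEND, which is posted without
	 * a completion cookie (see rpcrdma_ep_post). A non-zero wr_id
	 * carries the rpcrdma_mw used for a FAST_REG_MR or LOCAL_INV WR.
	 */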
214 if (wc->wr_id == 0ULL) {
215 if (wc->status != IB_WC_WR_FLUSH_ERR)
216 pr_err("RPC: %s: SEND: %s\n",
217 __func__, COMPLETION_MSG(wc->status));
218 } else {
219 struct rpcrdma_mw *r;
220
221 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
222 r->r.frmr.fr_state = FRMR_IS_STALE;
223 pr_err("RPC: %s: frmr %p (stale): %s\n",
224 __func__, r, COMPLETION_MSG(wc->status));
225 }
226 }
227
228 static int
229 rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
230 {
231 struct ib_wc *wcs;
232 int budget, count, rc;
233
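	/* Reap at most RPCRDMA_WC_BUDGET completions per upcall,
	 * RPCRDMA_POLLSIZE at a time, so one busy CQ cannot
	 * monopolize this context indefinitely.
	 */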
234 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
235 do {
236 wcs = ep->rep_send_wcs;
237
238 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
239 if (rc <= 0)
240 return rc;
241
242 count = rc;
243 while (count-- > 0)
244 rpcrdma_sendcq_process_wc(wcs++);
245 } while (rc == RPCRDMA_POLLSIZE && --budget);
246 return 0;
247 }
248
249 /*
250 * Handle send, fast_reg_mr, and local_inv completions.
251 *
252 * Send events are typically suppressed and thus do not result
253 * in an upcall. Occasionally one is signaled, however. This
254 * prevents the provider's completion queue from wrapping and
255 * losing a completion.
256 */
257 static void
258 rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
259 {
260 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
261 int rc;
262
263 rc = rpcrdma_sendcq_poll(cq, ep);
264 if (rc) {
265 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
266 __func__, rc);
267 return;
268 }
269
270 rc = ib_req_notify_cq(cq,
271 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
272 if (rc == 0)
273 return;
274 if (rc < 0) {
275 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
276 __func__, rc);
277 return;
278 }
279
280 rpcrdma_sendcq_poll(cq, ep);
281 }
282
283 static void
284 rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
285 {
286 struct rpcrdma_rep *rep =
287 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
288
289 /* WARNING: Only wr_id and status are reliable at this point */
290 if (wc->status != IB_WC_SUCCESS)
291 goto out_fail;
292
293 /* status == SUCCESS means all fields in wc are trustworthy */
294 if (wc->opcode != IB_WC_RECV)
295 return;
296
297 dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
298 __func__, rep, wc->byte_len);
299
300 rep->rr_len = wc->byte_len;
301 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
302 rdmab_addr(rep->rr_rdmabuf),
303 rep->rr_len, DMA_FROM_DEVICE);
304 prefetch(rdmab_to_msg(rep->rr_rdmabuf));
305
306 out_schedule:
307 list_add_tail(&rep->rr_list, sched_list);
308 return;
309 out_fail:
310 if (wc->status != IB_WC_WR_FLUSH_ERR)
311 pr_err("RPC: %s: rep %p: %s\n",
312 __func__, rep, COMPLETION_MSG(wc->status));
313 rep->rr_len = ~0U;
314 goto out_schedule;
315 }
316
317 static int
318 rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
319 {
320 struct list_head sched_list;
321 struct ib_wc *wcs;
322 int budget, count, rc;
323
324 INIT_LIST_HEAD(&sched_list);
325 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
326 do {
327 wcs = ep->rep_recv_wcs;
328
329 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
330 if (rc <= 0)
331 goto out_schedule;
332
333 count = rc;
334 while (count-- > 0)
335 rpcrdma_recvcq_process_wc(wcs++, &sched_list);
336 } while (rc == RPCRDMA_POLLSIZE && --budget);
337 rc = 0;
338
339 out_schedule:
340 rpcrdma_schedule_tasklet(&sched_list);
341 return rc;
342 }
343
344 /*
345 * Handle receive completions.
346 *
347 * It is reentrant but processes single events in order to maintain
 348  * the ordering of receives, on which server credit accounting depends.
349 *
350 * It is the responsibility of the scheduled tasklet to return
351 * recv buffers to the pool. NOTE: this affects synchronization of
352 * connection shutdown. That is, the structures required for
353 * the completion of the reply handler must remain intact until
354 * all memory has been reclaimed.
355 */
356 static void
357 rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
358 {
359 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
360 int rc;
361
362 rc = rpcrdma_recvcq_poll(cq, ep);
363 if (rc) {
364 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
365 __func__, rc);
366 return;
367 }
368
369 rc = ib_req_notify_cq(cq,
370 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
371 if (rc == 0)
372 return;
373 if (rc < 0) {
374 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
375 __func__, rc);
376 return;
377 }
378
379 rpcrdma_recvcq_poll(cq, ep);
380 }
381
382 static void
383 rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
384 {
385 struct ib_wc wc;
386 LIST_HEAD(sched_list);
387
388 while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
389 rpcrdma_recvcq_process_wc(&wc, &sched_list);
390 if (!list_empty(&sched_list))
391 rpcrdma_schedule_tasklet(&sched_list);
392 while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
393 rpcrdma_sendcq_process_wc(&wc);
394 }
395
396 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
397 static const char * const conn[] = {
398 "address resolved",
399 "address error",
400 "route resolved",
401 "route error",
402 "connect request",
403 "connect response",
404 "connect error",
405 "unreachable",
406 "rejected",
407 "established",
408 "disconnected",
409 "device removal",
410 "multicast join",
411 "multicast error",
412 "address change",
413 "timewait exit",
414 };
415
416 #define CONNECTION_MSG(status) \
417 ((status) < ARRAY_SIZE(conn) ? \
418 conn[(status)] : "unrecognized connection error")
419 #endif
420
421 static int
422 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
423 {
424 struct rpcrdma_xprt *xprt = id->context;
425 struct rpcrdma_ia *ia = &xprt->rx_ia;
426 struct rpcrdma_ep *ep = &xprt->rx_ep;
427 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
428 struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
429 #endif
430 struct ib_qp_attr *attr = &ia->ri_qp_attr;
431 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
432 int connstate = 0;
433
434 switch (event->event) {
435 case RDMA_CM_EVENT_ADDR_RESOLVED:
436 case RDMA_CM_EVENT_ROUTE_RESOLVED:
437 ia->ri_async_rc = 0;
438 complete(&ia->ri_done);
439 break;
440 case RDMA_CM_EVENT_ADDR_ERROR:
441 ia->ri_async_rc = -EHOSTUNREACH;
442 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
443 __func__, ep);
444 complete(&ia->ri_done);
445 break;
446 case RDMA_CM_EVENT_ROUTE_ERROR:
447 ia->ri_async_rc = -ENETUNREACH;
448 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
449 __func__, ep);
450 complete(&ia->ri_done);
451 break;
452 case RDMA_CM_EVENT_ESTABLISHED:
453 connstate = 1;
454 ib_query_qp(ia->ri_id->qp, attr,
455 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
456 iattr);
457 dprintk("RPC: %s: %d responder resources"
458 " (%d initiator)\n",
459 __func__, attr->max_dest_rd_atomic,
460 attr->max_rd_atomic);
461 goto connected;
462 case RDMA_CM_EVENT_CONNECT_ERROR:
463 connstate = -ENOTCONN;
464 goto connected;
465 case RDMA_CM_EVENT_UNREACHABLE:
466 connstate = -ENETDOWN;
467 goto connected;
468 case RDMA_CM_EVENT_REJECTED:
469 connstate = -ECONNREFUSED;
470 goto connected;
471 case RDMA_CM_EVENT_DISCONNECTED:
472 connstate = -ECONNABORTED;
473 goto connected;
474 case RDMA_CM_EVENT_DEVICE_REMOVAL:
475 connstate = -ENODEV;
476 connected:
477 dprintk("RPC: %s: %sconnected\n",
478 __func__, connstate > 0 ? "" : "dis");
479 ep->rep_connected = connstate;
480 rpcrdma_conn_func(ep);
481 wake_up_all(&ep->rep_connect_wait);
482 /*FALLTHROUGH*/
483 default:
484 dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
485 __func__, sap, rpc_get_port(sap), ep,
486 CONNECTION_MSG(event->event));
487 break;
488 }
489
490 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
491 if (connstate == 1) {
492 int ird = attr->max_dest_rd_atomic;
493 int tird = ep->rep_remote_cma.responder_resources;
494
495 pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
496 sap, rpc_get_port(sap),
497 ia->ri_id->device->name,
498 ia->ri_ops->ro_displayname,
499 xprt->rx_buf.rb_max_requests,
500 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
501 } else if (connstate < 0) {
502 pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
503 sap, rpc_get_port(sap), connstate);
504 }
505 #endif
506
507 return 0;
508 }
509
510 static struct rdma_cm_id *
511 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
512 struct rpcrdma_ia *ia, struct sockaddr *addr)
513 {
514 struct rdma_cm_id *id;
515 int rc;
516
517 init_completion(&ia->ri_done);
518
519 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
520 if (IS_ERR(id)) {
521 rc = PTR_ERR(id);
522 dprintk("RPC: %s: rdma_create_id() failed %i\n",
523 __func__, rc);
524 return id;
525 }
526
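	/* ri_async_rc is preset to -ETIMEDOUT so that, if the wait below
	 * expires before the CM upcall runs, the caller sees a timeout
	 * error rather than stale status.
	 */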
527 ia->ri_async_rc = -ETIMEDOUT;
528 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
529 if (rc) {
530 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
531 __func__, rc);
532 goto out;
533 }
534 wait_for_completion_interruptible_timeout(&ia->ri_done,
535 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
536 rc = ia->ri_async_rc;
537 if (rc)
538 goto out;
539
540 ia->ri_async_rc = -ETIMEDOUT;
541 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
542 if (rc) {
543 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
544 __func__, rc);
545 goto out;
546 }
547 wait_for_completion_interruptible_timeout(&ia->ri_done,
548 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
549 rc = ia->ri_async_rc;
550 if (rc)
551 goto out;
552
553 return id;
554
555 out:
556 rdma_destroy_id(id);
557 return ERR_PTR(rc);
558 }
559
560 /*
561 * Drain any cq, prior to teardown.
562 */
563 static void
564 rpcrdma_clean_cq(struct ib_cq *cq)
565 {
566 struct ib_wc wc;
567 int count = 0;
568
569 while (1 == ib_poll_cq(cq, 1, &wc))
570 ++count;
571
572 if (count)
573 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
574 __func__, count, wc.opcode);
575 }
576
577 /*
578 * Exported functions.
579 */
580
581 /*
582 * Open and initialize an Interface Adapter.
583 * o initializes fields of struct rpcrdma_ia, including
584 * interface and provider attributes and protection zone.
585 */
586 int
587 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
588 {
589 int rc, mem_priv;
590 struct rpcrdma_ia *ia = &xprt->rx_ia;
591 struct ib_device_attr *devattr = &ia->ri_devattr;
592
593 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
594 if (IS_ERR(ia->ri_id)) {
595 rc = PTR_ERR(ia->ri_id);
596 goto out1;
597 }
598
599 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
600 if (IS_ERR(ia->ri_pd)) {
601 rc = PTR_ERR(ia->ri_pd);
602 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
603 __func__, rc);
604 goto out2;
605 }
606
607 rc = ib_query_device(ia->ri_id->device, devattr);
608 if (rc) {
609 dprintk("RPC: %s: ib_query_device failed %d\n",
610 __func__, rc);
611 goto out3;
612 }
613
614 if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
615 ia->ri_have_dma_lkey = 1;
616 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
617 }
618
619 if (memreg == RPCRDMA_FRMR) {
620 /* Requires both frmr reg and local dma lkey */
621 if (((devattr->device_cap_flags &
622 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
623 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
624 (devattr->max_fast_reg_page_list_len == 0)) {
625 dprintk("RPC: %s: FRMR registration "
626 "not supported by HCA\n", __func__);
627 memreg = RPCRDMA_MTHCAFMR;
628 } else {
629 /* Mind the ia limit on FRMR page list depth */
630 ia->ri_max_frmr_depth = min_t(unsigned int,
631 RPCRDMA_MAX_DATA_SEGS,
632 devattr->max_fast_reg_page_list_len);
633 }
634 }
635 if (memreg == RPCRDMA_MTHCAFMR) {
636 if (!ia->ri_id->device->alloc_fmr) {
637 dprintk("RPC: %s: MTHCAFMR registration "
638 "not supported by HCA\n", __func__);
639 memreg = RPCRDMA_ALLPHYSICAL;
640 }
641 }
642
643 /*
644 * Optionally obtain an underlying physical identity mapping in
645 * order to do a memory window-based bind. This base registration
646 * is protected from remote access - that is enabled only by binding
647 * for the specific bytes targeted during each RPC operation, and
648 * revoked after the corresponding completion similar to a storage
649 * adapter.
650 */
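	/* Each arm of this switch also installs the per-strategy vector
	 * of memory registration ops (ia->ri_ops), which supplies the
	 * displayname reported below.
	 */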
651 switch (memreg) {
652 case RPCRDMA_FRMR:
653 ia->ri_ops = &rpcrdma_frwr_memreg_ops;
654 break;
655 case RPCRDMA_ALLPHYSICAL:
656 ia->ri_ops = &rpcrdma_physical_memreg_ops;
657 mem_priv = IB_ACCESS_LOCAL_WRITE |
658 IB_ACCESS_REMOTE_WRITE |
659 IB_ACCESS_REMOTE_READ;
660 goto register_setup;
661 case RPCRDMA_MTHCAFMR:
662 ia->ri_ops = &rpcrdma_fmr_memreg_ops;
663 if (ia->ri_have_dma_lkey)
664 break;
665 mem_priv = IB_ACCESS_LOCAL_WRITE;
666 register_setup:
667 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
668 if (IS_ERR(ia->ri_bind_mem)) {
669 printk(KERN_ALERT "%s: ib_get_dma_mr for "
670 "phys register failed with %lX\n",
671 __func__, PTR_ERR(ia->ri_bind_mem));
672 rc = -ENOMEM;
673 goto out3;
674 }
675 break;
676 default:
677 printk(KERN_ERR "RPC: Unsupported memory "
678 "registration mode: %d\n", memreg);
679 rc = -ENOMEM;
680 goto out3;
681 }
682 dprintk("RPC: %s: memory registration strategy is '%s'\n",
683 __func__, ia->ri_ops->ro_displayname);
684
685 /* Else will do memory reg/dereg for each chunk */
686 ia->ri_memreg_strategy = memreg;
687
688 rwlock_init(&ia->ri_qplock);
689 return 0;
690
691 out3:
692 ib_dealloc_pd(ia->ri_pd);
693 ia->ri_pd = NULL;
694 out2:
695 rdma_destroy_id(ia->ri_id);
696 ia->ri_id = NULL;
697 out1:
698 return rc;
699 }
700
701 /*
702 * Clean up/close an IA.
703 * o if event handles and PD have been initialized, free them.
704 * o close the IA
705 */
706 void
707 rpcrdma_ia_close(struct rpcrdma_ia *ia)
708 {
709 int rc;
710
711 dprintk("RPC: %s: entering\n", __func__);
712 if (ia->ri_bind_mem != NULL) {
713 rc = ib_dereg_mr(ia->ri_bind_mem);
714 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
715 __func__, rc);
716 }
717 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
718 if (ia->ri_id->qp)
719 rdma_destroy_qp(ia->ri_id);
720 rdma_destroy_id(ia->ri_id);
721 ia->ri_id = NULL;
722 }
723 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
724 rc = ib_dealloc_pd(ia->ri_pd);
725 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
726 __func__, rc);
727 }
728 }
729
730 /*
731 * Create unconnected endpoint.
732 */
733 int
734 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
735 struct rpcrdma_create_data_internal *cdata)
736 {
737 struct ib_device_attr *devattr = &ia->ri_devattr;
738 struct ib_cq *sendcq, *recvcq;
739 int rc, err;
740
741 /* check provider's send/recv wr limits */
742 if (cdata->max_requests > devattr->max_qp_wr)
743 cdata->max_requests = devattr->max_qp_wr;
744
745 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
746 ep->rep_attr.qp_context = ep;
747 /* send_cq and recv_cq initialized below */
748 ep->rep_attr.srq = NULL;
749 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
750 switch (ia->ri_memreg_strategy) {
751 case RPCRDMA_FRMR: {
752 int depth = 7;
753
754 /* Add room for frmr register and invalidate WRs.
755 * 1. FRMR reg WR for head
756 * 2. FRMR invalidate WR for head
757 * 3. N FRMR reg WRs for pagelist
758 * 4. N FRMR invalidate WRs for pagelist
759 * 5. FRMR reg WR for tail
760 * 6. FRMR invalidate WR for tail
761 * 7. The RDMA_SEND WR
762 */
763
764 /* Calculate N if the device max FRMR depth is smaller than
765 * RPCRDMA_MAX_DATA_SEGS.
766 */
767 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
768 int delta = RPCRDMA_MAX_DATA_SEGS -
769 ia->ri_max_frmr_depth;
770
771 do {
772 depth += 2; /* FRMR reg + invalidate */
773 delta -= ia->ri_max_frmr_depth;
774 } while (delta > 0);
775
776 }
777 ep->rep_attr.cap.max_send_wr *= depth;
778 if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
779 cdata->max_requests = devattr->max_qp_wr / depth;
780 if (!cdata->max_requests)
781 return -EINVAL;
782 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
783 depth;
784 }
785 break;
786 }
787 default:
788 break;
789 }
790 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
791 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
792 ep->rep_attr.cap.max_recv_sge = 1;
793 ep->rep_attr.cap.max_inline_data = 0;
794 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
795 ep->rep_attr.qp_type = IB_QPT_RC;
796 ep->rep_attr.port_num = ~0;
797
798 if (cdata->padding) {
799 ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
800 GFP_KERNEL);
801 if (IS_ERR(ep->rep_padbuf))
802 return PTR_ERR(ep->rep_padbuf);
803 } else
804 ep->rep_padbuf = NULL;
805
806 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
807 "iovs: send %d recv %d\n",
808 __func__,
809 ep->rep_attr.cap.max_send_wr,
810 ep->rep_attr.cap.max_recv_wr,
811 ep->rep_attr.cap.max_send_sge,
812 ep->rep_attr.cap.max_recv_sge);
813
814 /* set trigger for requesting send completion */
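	/* rep_cqinit is the reload value for a countdown kept in the ep:
	 * rpcrdma_ep_post signals a send completion only when the count
	 * reaches zero, suppressing most send upcalls while preventing
	 * the send CQ from overflowing.
	 */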
815 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
816 if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
817 ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
818 else if (ep->rep_cqinit <= 2)
819 ep->rep_cqinit = 0;
820 INIT_CQCOUNT(ep);
821 init_waitqueue_head(&ep->rep_connect_wait);
822 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
823
824 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
825 rpcrdma_cq_async_error_upcall, ep,
826 ep->rep_attr.cap.max_send_wr + 1, 0);
827 if (IS_ERR(sendcq)) {
828 rc = PTR_ERR(sendcq);
829 dprintk("RPC: %s: failed to create send CQ: %i\n",
830 __func__, rc);
831 goto out1;
832 }
833
834 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
835 if (rc) {
836 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
837 __func__, rc);
838 goto out2;
839 }
840
841 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
842 rpcrdma_cq_async_error_upcall, ep,
843 ep->rep_attr.cap.max_recv_wr + 1, 0);
844 if (IS_ERR(recvcq)) {
845 rc = PTR_ERR(recvcq);
846 dprintk("RPC: %s: failed to create recv CQ: %i\n",
847 __func__, rc);
848 goto out2;
849 }
850
851 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
852 if (rc) {
853 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
854 __func__, rc);
855 ib_destroy_cq(recvcq);
856 goto out2;
857 }
858
859 ep->rep_attr.send_cq = sendcq;
860 ep->rep_attr.recv_cq = recvcq;
861
862 /* Initialize cma parameters */
863
864 /* RPC/RDMA does not use private data */
865 ep->rep_remote_cma.private_data = NULL;
866 ep->rep_remote_cma.private_data_len = 0;
867
868 /* Client offers RDMA Read but does not initiate */
869 ep->rep_remote_cma.initiator_depth = 0;
870 if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */
871 ep->rep_remote_cma.responder_resources = 32;
872 else
873 ep->rep_remote_cma.responder_resources =
874 devattr->max_qp_rd_atom;
875
876 ep->rep_remote_cma.retry_count = 7;
877 ep->rep_remote_cma.flow_control = 0;
878 ep->rep_remote_cma.rnr_retry_count = 0;
879
880 return 0;
881
882 out2:
883 err = ib_destroy_cq(sendcq);
884 if (err)
885 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
886 __func__, err);
887 out1:
888 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
889 return rc;
890 }
891
892 /*
893 * rpcrdma_ep_destroy
894 *
895 * Disconnect and destroy endpoint. After this, the only
896 * valid operations on the ep are to free it (if dynamically
897 * allocated) or re-create it.
898 */
899 void
900 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
901 {
902 int rc;
903
904 dprintk("RPC: %s: entering, connected is %d\n",
905 __func__, ep->rep_connected);
906
907 cancel_delayed_work_sync(&ep->rep_connect_worker);
908
909 if (ia->ri_id->qp) {
910 rpcrdma_ep_disconnect(ep, ia);
911 rdma_destroy_qp(ia->ri_id);
912 ia->ri_id->qp = NULL;
913 }
914
915 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
916
917 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
918 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
919 if (rc)
920 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
921 __func__, rc);
922
923 rpcrdma_clean_cq(ep->rep_attr.send_cq);
924 rc = ib_destroy_cq(ep->rep_attr.send_cq);
925 if (rc)
926 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
927 __func__, rc);
928 }
929
930 /*
931 * Connect unconnected endpoint.
932 */
933 int
934 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
935 {
936 struct rdma_cm_id *id, *old;
937 int rc = 0;
938 int retry_count = 0;
939
940 if (ep->rep_connected != 0) {
941 struct rpcrdma_xprt *xprt;
942 retry:
943 dprintk("RPC: %s: reconnecting...\n", __func__);
944
945 rpcrdma_ep_disconnect(ep, ia);
946 rpcrdma_flush_cqs(ep);
947
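		/* MRs registered on the old connection may have been left
		 * unusable by flushed work requests; reset them before
		 * reconnecting (see rpcrdma_reset_frmrs and
		 * rpcrdma_reset_fmrs below).
		 */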
948 switch (ia->ri_memreg_strategy) {
949 case RPCRDMA_FRMR:
950 rpcrdma_reset_frmrs(ia);
951 break;
952 case RPCRDMA_MTHCAFMR:
953 rpcrdma_reset_fmrs(ia);
954 break;
955 case RPCRDMA_ALLPHYSICAL:
956 break;
957 default:
958 rc = -EIO;
959 goto out;
960 }
961
962 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
963 id = rpcrdma_create_id(xprt, ia,
964 (struct sockaddr *)&xprt->rx_data.addr);
965 if (IS_ERR(id)) {
966 rc = -EHOSTUNREACH;
967 goto out;
968 }
969 /* TEMP TEMP TEMP - fail if new device:
970 * Deregister/remarshal *all* requests!
971 * Close and recreate adapter, pd, etc!
972 * Re-determine all attributes still sane!
973 * More stuff I haven't thought of!
974 * Rrrgh!
975 */
976 if (ia->ri_id->device != id->device) {
977 printk("RPC: %s: can't reconnect on "
978 "different device!\n", __func__);
979 rdma_destroy_id(id);
980 rc = -ENETUNREACH;
981 goto out;
982 }
983 /* END TEMP */
984 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
985 if (rc) {
986 dprintk("RPC: %s: rdma_create_qp failed %i\n",
987 __func__, rc);
988 rdma_destroy_id(id);
989 rc = -ENETUNREACH;
990 goto out;
991 }
992
993 write_lock(&ia->ri_qplock);
994 old = ia->ri_id;
995 ia->ri_id = id;
996 write_unlock(&ia->ri_qplock);
997
998 rdma_destroy_qp(old);
999 rdma_destroy_id(old);
1000 } else {
1001 dprintk("RPC: %s: connecting...\n", __func__);
1002 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
1003 if (rc) {
1004 dprintk("RPC: %s: rdma_create_qp failed %i\n",
1005 __func__, rc);
1006 /* do not update ep->rep_connected */
1007 return -ENETUNREACH;
1008 }
1009 }
1010
1011 ep->rep_connected = 0;
1012
1013 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
1014 if (rc) {
1015 dprintk("RPC: %s: rdma_connect() failed with %i\n",
1016 __func__, rc);
1017 goto out;
1018 }
1019
1020 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
1021
1022 /*
1023 * Check state. A non-peer reject indicates no listener
1024 * (ECONNREFUSED), which may be a transient state. All
1025 * others indicate a transport condition which has already
1026  * undergone best-effort recovery.
1027 */
1028 if (ep->rep_connected == -ECONNREFUSED &&
1029 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
1030 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
1031 goto retry;
1032 }
1033 if (ep->rep_connected <= 0) {
1034 /* Sometimes, the only way to reliably connect to remote
1035 * CMs is to use same nonzero values for ORD and IRD. */
1036 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
1037 (ep->rep_remote_cma.responder_resources == 0 ||
1038 ep->rep_remote_cma.initiator_depth !=
1039 ep->rep_remote_cma.responder_resources)) {
1040 if (ep->rep_remote_cma.responder_resources == 0)
1041 ep->rep_remote_cma.responder_resources = 1;
1042 ep->rep_remote_cma.initiator_depth =
1043 ep->rep_remote_cma.responder_resources;
1044 goto retry;
1045 }
1046 rc = ep->rep_connected;
1047 } else {
1048 dprintk("RPC: %s: connected\n", __func__);
1049 }
1050
1051 out:
1052 if (rc)
1053 ep->rep_connected = rc;
1054 return rc;
1055 }
1056
1057 /*
1058 * rpcrdma_ep_disconnect
1059 *
1060 * This is separate from destroy to facilitate the ability
1061 * to reconnect without recreating the endpoint.
1062 *
1063 * This call is not reentrant, and must not be made in parallel
1064 * on the same endpoint.
1065 */
1066 void
1067 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
1068 {
1069 int rc;
1070
1071 rpcrdma_flush_cqs(ep);
1072 rc = rdma_disconnect(ia->ri_id);
1073 if (!rc) {
1074 /* returns without wait if not connected */
1075 wait_event_interruptible(ep->rep_connect_wait,
1076 ep->rep_connected != 1);
1077 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
1078 (ep->rep_connected == 1) ? "still " : "dis");
1079 } else {
1080 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
1081 ep->rep_connected = rc;
1082 }
1083 }
1084
1085 static struct rpcrdma_req *
1086 rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
1087 {
1088 struct rpcrdma_req *req;
1089
1090 req = kzalloc(sizeof(*req), GFP_KERNEL);
1091 if (req == NULL)
1092 return ERR_PTR(-ENOMEM);
1093
1094 req->rl_buffer = &r_xprt->rx_buf;
1095 return req;
1096 }
1097
1098 static struct rpcrdma_rep *
1099 rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
1100 {
1101 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1102 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1103 struct rpcrdma_rep *rep;
1104 int rc;
1105
1106 rc = -ENOMEM;
1107 rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1108 if (rep == NULL)
1109 goto out;
1110
1111 rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
1112 GFP_KERNEL);
1113 if (IS_ERR(rep->rr_rdmabuf)) {
1114 rc = PTR_ERR(rep->rr_rdmabuf);
1115 goto out_free;
1116 }
1117
1118 rep->rr_buffer = &r_xprt->rx_buf;
1119 return rep;
1120
1121 out_free:
1122 kfree(rep);
1123 out:
1124 return ERR_PTR(rc);
1125 }
1126
1127 static int
1128 rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1129 {
1130 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1131 struct ib_fmr_attr fmr_attr = {
1132 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1133 .max_maps = 1,
1134 .page_shift = PAGE_SHIFT
1135 };
1136 struct rpcrdma_mw *r;
1137 int i, rc;
1138
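	/* Allocate enough MWs for every segment of every request,
	 * plus one spare request's worth.
	 */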
1139 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1140         dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);
1141
1142 while (i--) {
1143 r = kzalloc(sizeof(*r), GFP_KERNEL);
1144 if (r == NULL)
1145 return -ENOMEM;
1146
1147 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1148 if (IS_ERR(r->r.fmr)) {
1149 rc = PTR_ERR(r->r.fmr);
1150 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1151 __func__, rc);
1152 goto out_free;
1153 }
1154
1155 list_add(&r->mw_list, &buf->rb_mws);
1156 list_add(&r->mw_all, &buf->rb_all);
1157 }
1158 return 0;
1159
1160 out_free:
1161 kfree(r);
1162 return rc;
1163 }
1164
1165 static int
1166 rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1167 {
1168 struct rpcrdma_frmr *f;
1169 struct rpcrdma_mw *r;
1170 int i, rc;
1171
1172 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1173         dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, i);
1174
1175 while (i--) {
1176 r = kzalloc(sizeof(*r), GFP_KERNEL);
1177 if (r == NULL)
1178 return -ENOMEM;
1179 f = &r->r.frmr;
1180
1181 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1182 ia->ri_max_frmr_depth);
1183 if (IS_ERR(f->fr_mr)) {
1184 rc = PTR_ERR(f->fr_mr);
1185 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1186 "failed %i\n", __func__, rc);
1187 goto out_free;
1188 }
1189
1190 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1191 ia->ri_max_frmr_depth);
1192 if (IS_ERR(f->fr_pgl)) {
1193 rc = PTR_ERR(f->fr_pgl);
1194 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1195 "failed %i\n", __func__, rc);
1196
1197 ib_dereg_mr(f->fr_mr);
1198 goto out_free;
1199 }
1200
1201 list_add(&r->mw_list, &buf->rb_mws);
1202 list_add(&r->mw_all, &buf->rb_all);
1203 }
1204
1205 return 0;
1206
1207 out_free:
1208 kfree(r);
1209 return rc;
1210 }
1211
1212 int
1213 rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1214 {
1215 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1216 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1217 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1218 char *p;
1219 size_t len;
1220 int i, rc;
1221
1222 buf->rb_max_requests = cdata->max_requests;
1223 spin_lock_init(&buf->rb_lock);
1224
1225 /* Need to allocate:
1226 * 1. arrays for send and recv pointers
1227 * 2. arrays of struct rpcrdma_req to fill in pointers
1228 * 3. array of struct rpcrdma_rep for replies
1229 * Send/recv buffers in req/rep need to be registered
1230 */
1231 len = buf->rb_max_requests *
1232 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1233
1234 p = kzalloc(len, GFP_KERNEL);
1235 if (p == NULL) {
1236 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1237 __func__, len);
1238 rc = -ENOMEM;
1239 goto out;
1240 }
1241 buf->rb_pool = p; /* for freeing it later */
1242
1243 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1244 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1245 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1246 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1247
1248 INIT_LIST_HEAD(&buf->rb_mws);
1249 INIT_LIST_HEAD(&buf->rb_all);
1250 switch (ia->ri_memreg_strategy) {
1251 case RPCRDMA_FRMR:
1252 rc = rpcrdma_init_frmrs(ia, buf);
1253 if (rc)
1254 goto out;
1255 break;
1256 case RPCRDMA_MTHCAFMR:
1257 rc = rpcrdma_init_fmrs(ia, buf);
1258 if (rc)
1259 goto out;
1260 break;
1261 default:
1262 break;
1263 }
1264
1265 for (i = 0; i < buf->rb_max_requests; i++) {
1266 struct rpcrdma_req *req;
1267 struct rpcrdma_rep *rep;
1268
1269 req = rpcrdma_create_req(r_xprt);
1270 if (IS_ERR(req)) {
1271 dprintk("RPC: %s: request buffer %d alloc"
1272 " failed\n", __func__, i);
1273 rc = PTR_ERR(req);
1274 goto out;
1275 }
1276 buf->rb_send_bufs[i] = req;
1277
1278 rep = rpcrdma_create_rep(r_xprt);
1279 if (IS_ERR(rep)) {
1280 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1281 __func__, i);
1282 rc = PTR_ERR(rep);
1283 goto out;
1284 }
1285 buf->rb_recv_bufs[i] = rep;
1286 }
1287
1288 return 0;
1289 out:
1290 rpcrdma_buffer_destroy(buf);
1291 return rc;
1292 }
1293
1294 static void
1295 rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
1296 {
1297 if (!rep)
1298 return;
1299
1300 rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
1301 kfree(rep);
1302 }
1303
1304 static void
1305 rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
1306 {
1307 if (!req)
1308 return;
1309
1310 rpcrdma_free_regbuf(ia, req->rl_sendbuf);
1311 rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
1312 kfree(req);
1313 }
1314
1315 static void
1316 rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1317 {
1318 struct rpcrdma_mw *r;
1319 int rc;
1320
1321 while (!list_empty(&buf->rb_all)) {
1322 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1323 list_del(&r->mw_all);
1324 list_del(&r->mw_list);
1325
1326 rc = ib_dealloc_fmr(r->r.fmr);
1327 if (rc)
1328 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1329 __func__, rc);
1330
1331 kfree(r);
1332 }
1333 }
1334
1335 static void
1336 rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1337 {
1338 struct rpcrdma_mw *r;
1339 int rc;
1340
1341 while (!list_empty(&buf->rb_all)) {
1342 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1343 list_del(&r->mw_all);
1344 list_del(&r->mw_list);
1345
1346 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1347 if (rc)
1348 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1349 __func__, rc);
1350 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1351
1352 kfree(r);
1353 }
1354 }
1355
1356 void
1357 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1358 {
1359 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1360 int i;
1361
1362 /* clean up in reverse order from create
1363 * 1. recv mr memory (mr free, then kfree)
1364 * 2. send mr memory (mr free, then kfree)
1365 * 3. MWs
1366 */
1367 dprintk("RPC: %s: entering\n", __func__);
1368
1369 for (i = 0; i < buf->rb_max_requests; i++) {
1370 if (buf->rb_recv_bufs)
1371 rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
1372 if (buf->rb_send_bufs)
1373 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
1374 }
1375
1376 switch (ia->ri_memreg_strategy) {
1377 case RPCRDMA_FRMR:
1378 rpcrdma_destroy_frmrs(buf);
1379 break;
1380 case RPCRDMA_MTHCAFMR:
1381 rpcrdma_destroy_fmrs(buf);
1382 break;
1383 default:
1384 break;
1385 }
1386
1387 kfree(buf->rb_pool);
1388 }
1389
1390 /* After a disconnect, unmap all FMRs.
1391 *
1392 * This is invoked only in the transport connect worker in order
1393 * to serialize with rpcrdma_register_fmr_external().
1394 */
1395 static void
1396 rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
1397 {
1398 struct rpcrdma_xprt *r_xprt =
1399 container_of(ia, struct rpcrdma_xprt, rx_ia);
1400 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1401 struct list_head *pos;
1402 struct rpcrdma_mw *r;
1403 LIST_HEAD(l);
1404 int rc;
1405
1406 list_for_each(pos, &buf->rb_all) {
1407 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1408
1409 INIT_LIST_HEAD(&l);
1410 list_add(&r->r.fmr->list, &l);
1411 rc = ib_unmap_fmr(&l);
1412 if (rc)
1413 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
1414 __func__, rc);
1415 }
1416 }
1417
1418 /* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1419 * an unusable state. Find FRMRs in this state and dereg / reg
1420 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1421 * also torn down.
1422 *
1423 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1424 *
1425 * This is invoked only in the transport connect worker in order
1426 * to serialize with rpcrdma_register_frmr_external().
1427 */
1428 static void
1429 rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1430 {
1431 struct rpcrdma_xprt *r_xprt =
1432 container_of(ia, struct rpcrdma_xprt, rx_ia);
1433 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1434 struct list_head *pos;
1435 struct rpcrdma_mw *r;
1436 int rc;
1437
1438 list_for_each(pos, &buf->rb_all) {
1439 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1440
1441 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1442 continue;
1443
1444 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1445 if (rc)
1446 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1447 __func__, rc);
1448 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1449
1450 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1451 ia->ri_max_frmr_depth);
1452 if (IS_ERR(r->r.frmr.fr_mr)) {
1453 rc = PTR_ERR(r->r.frmr.fr_mr);
1454 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1455 " failed %i\n", __func__, rc);
1456 continue;
1457 }
1458 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1459 ia->ri_id->device,
1460 ia->ri_max_frmr_depth);
1461 if (IS_ERR(r->r.frmr.fr_pgl)) {
1462 rc = PTR_ERR(r->r.frmr.fr_pgl);
1463 dprintk("RPC: %s: "
1464 "ib_alloc_fast_reg_page_list "
1465 "failed %i\n", __func__, rc);
1466
1467 ib_dereg_mr(r->r.frmr.fr_mr);
1468 continue;
1469 }
1470 r->r.frmr.fr_state = FRMR_IS_INVALID;
1471 }
1472 }
1473
1474 /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1475 * some req segments uninitialized.
1476 */
1477 static void
1478 rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1479 {
1480 if (*mw) {
1481 list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
1482 *mw = NULL;
1483 }
1484 }
1485
1486 /* Cycle mw's back in reverse order, and "spin" them.
1487 * This delays and scrambles reuse as much as possible.
1488 */
1489 static void
1490 rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1491 {
1492 struct rpcrdma_mr_seg *seg = req->rl_segments;
1493 struct rpcrdma_mr_seg *seg1 = seg;
1494 int i;
1495
1496 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
1497 rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
1498 rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
1499 }
1500
1501 static void
1502 rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1503 {
1504 buf->rb_send_bufs[--buf->rb_send_index] = req;
1505 req->rl_niovs = 0;
1506 if (req->rl_reply) {
1507 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1508 req->rl_reply->rr_func = NULL;
1509 req->rl_reply = NULL;
1510 }
1511 }
1512
1513 /* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
1514 * Redo only the ib_post_send().
1515 */
1516 static void
1517 rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1518 {
1519 struct rpcrdma_xprt *r_xprt =
1520 container_of(ia, struct rpcrdma_xprt, rx_ia);
1521 struct ib_send_wr invalidate_wr, *bad_wr;
1522 int rc;
1523
1524 dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
1525
1526 /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
1527 r->r.frmr.fr_state = FRMR_IS_INVALID;
1528
1529 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1530 invalidate_wr.wr_id = (unsigned long)(void *)r;
1531 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1532 invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1533 DECR_CQCOUNT(&r_xprt->rx_ep);
1534
1535 dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
1536 __func__, r, r->r.frmr.fr_mr->rkey);
1537
1538 read_lock(&ia->ri_qplock);
1539 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1540 read_unlock(&ia->ri_qplock);
1541 if (rc) {
1542 /* Force rpcrdma_buffer_get() to retry */
1543 r->r.frmr.fr_state = FRMR_IS_STALE;
1544 dprintk("RPC: %s: ib_post_send failed, %i\n",
1545 __func__, rc);
1546 }
1547 }
1548
1549 static void
1550 rpcrdma_retry_flushed_linv(struct list_head *stale,
1551 struct rpcrdma_buffer *buf)
1552 {
1553 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1554 struct list_head *pos;
1555 struct rpcrdma_mw *r;
1556 unsigned long flags;
1557
1558 list_for_each(pos, stale) {
1559 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1560 rpcrdma_retry_local_inv(r, ia);
1561 }
1562
1563 spin_lock_irqsave(&buf->rb_lock, flags);
1564 list_splice_tail(stale, &buf->rb_mws);
1565 spin_unlock_irqrestore(&buf->rb_lock, flags);
1566 }
1567
1568 static struct rpcrdma_req *
1569 rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1570 struct list_head *stale)
1571 {
1572 struct rpcrdma_mw *r;
1573 int i;
1574
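	/* Fill req->rl_segments[] from the last slot toward the first.
	 * The request is returned only if every slot receives an MW;
	 * otherwise the request and any MWs already taken go back to
	 * the pool.
	 */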
1575 i = RPCRDMA_MAX_SEGS - 1;
1576 while (!list_empty(&buf->rb_mws)) {
1577 r = list_entry(buf->rb_mws.next,
1578 struct rpcrdma_mw, mw_list);
1579 list_del(&r->mw_list);
1580 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1581 list_add(&r->mw_list, stale);
1582 continue;
1583 }
1584 req->rl_segments[i].rl_mw = r;
1585 if (unlikely(i-- == 0))
1586 return req; /* Success */
1587 }
1588
1589 /* Not enough entries on rb_mws for this req */
1590 rpcrdma_buffer_put_sendbuf(req, buf);
1591 rpcrdma_buffer_put_mrs(req, buf);
1592 return NULL;
1593 }
1594
1595 static struct rpcrdma_req *
1596 rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1597 {
1598 struct rpcrdma_mw *r;
1599 int i;
1600
1601 i = RPCRDMA_MAX_SEGS - 1;
1602 while (!list_empty(&buf->rb_mws)) {
1603 r = list_entry(buf->rb_mws.next,
1604 struct rpcrdma_mw, mw_list);
1605 list_del(&r->mw_list);
1606 req->rl_segments[i].rl_mw = r;
1607 if (unlikely(i-- == 0))
1608 return req; /* Success */
1609 }
1610
1611 /* Not enough entries on rb_mws for this req */
1612 rpcrdma_buffer_put_sendbuf(req, buf);
1613 rpcrdma_buffer_put_mrs(req, buf);
1614 return NULL;
1615 }
1616
1617 /*
1618 * Get a set of request/reply buffers.
1619 *
1620 * Reply buffer (if needed) is attached to send buffer upon return.
1621 * Rule:
1622 * rb_send_index and rb_recv_index MUST always be pointing to the
1623 * *next* available buffer (non-NULL). They are incremented after
1624 * removing buffers, and decremented *before* returning them.
1625 */
1626 struct rpcrdma_req *
1627 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1628 {
1629 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1630 struct list_head stale;
1631 struct rpcrdma_req *req;
1632 unsigned long flags;
1633
1634 spin_lock_irqsave(&buffers->rb_lock, flags);
1635 if (buffers->rb_send_index == buffers->rb_max_requests) {
1636 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1637 dprintk("RPC: %s: out of request buffers\n", __func__);
1638 return ((struct rpcrdma_req *)NULL);
1639 }
1640
1641 req = buffers->rb_send_bufs[buffers->rb_send_index];
1642 if (buffers->rb_send_index < buffers->rb_recv_index) {
1643 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1644 __func__,
1645 buffers->rb_recv_index - buffers->rb_send_index);
1646 req->rl_reply = NULL;
1647 } else {
1648 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1649 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1650 }
1651 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1652
1653 INIT_LIST_HEAD(&stale);
1654 switch (ia->ri_memreg_strategy) {
1655 case RPCRDMA_FRMR:
1656 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1657 break;
1658 case RPCRDMA_MTHCAFMR:
1659 req = rpcrdma_buffer_get_fmrs(req, buffers);
1660 break;
1661 default:
1662 break;
1663 }
1664 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1665 if (!list_empty(&stale))
1666 rpcrdma_retry_flushed_linv(&stale, buffers);
1667 return req;
1668 }
1669
1670 /*
1671 * Put request/reply buffers back into pool.
1672 * Pre-decrement counter/array index.
1673 */
1674 void
1675 rpcrdma_buffer_put(struct rpcrdma_req *req)
1676 {
1677 struct rpcrdma_buffer *buffers = req->rl_buffer;
1678 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1679 unsigned long flags;
1680
1681 spin_lock_irqsave(&buffers->rb_lock, flags);
1682 rpcrdma_buffer_put_sendbuf(req, buffers);
1683 switch (ia->ri_memreg_strategy) {
1684 case RPCRDMA_FRMR:
1685 case RPCRDMA_MTHCAFMR:
1686 rpcrdma_buffer_put_mrs(req, buffers);
1687 break;
1688 default:
1689 break;
1690 }
1691 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1692 }
1693
1694 /*
1695 * Recover reply buffers from pool.
1696 * This happens when recovering from error conditions.
1697 * Post-increment counter/array index.
1698 */
1699 void
1700 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1701 {
1702 struct rpcrdma_buffer *buffers = req->rl_buffer;
1703 unsigned long flags;
1704
1705 spin_lock_irqsave(&buffers->rb_lock, flags);
1706 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1707 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1708 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1709 }
1710 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1711 }
1712
1713 /*
1714 * Put reply buffers back into pool when not attached to
1715 * request. This happens in error conditions.
1716 */
1717 void
1718 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1719 {
1720 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1721 unsigned long flags;
1722
1723 rep->rr_func = NULL;
1724 spin_lock_irqsave(&buffers->rb_lock, flags);
1725 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1726 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1727 }
1728
1729 /*
1730 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1731 */
1732
1733 static int
1734 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1735 struct ib_mr **mrp, struct ib_sge *iov)
1736 {
1737 struct ib_phys_buf ipb;
1738 struct ib_mr *mr;
1739 int rc;
1740
1741 /*
1742 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1743 */
1744 iov->addr = ib_dma_map_single(ia->ri_id->device,
1745 va, len, DMA_BIDIRECTIONAL);
1746 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1747 return -ENOMEM;
1748
1749 iov->length = len;
1750
1751 if (ia->ri_have_dma_lkey) {
1752 *mrp = NULL;
1753 iov->lkey = ia->ri_dma_lkey;
1754 return 0;
1755 } else if (ia->ri_bind_mem != NULL) {
1756 *mrp = NULL;
1757 iov->lkey = ia->ri_bind_mem->lkey;
1758 return 0;
1759 }
1760
1761 ipb.addr = iov->addr;
1762 ipb.size = iov->length;
1763 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1764 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1765
1766 dprintk("RPC: %s: phys convert: 0x%llx "
1767 "registered 0x%llx length %d\n",
1768 __func__, (unsigned long long)ipb.addr,
1769 (unsigned long long)iov->addr, len);
1770
1771 if (IS_ERR(mr)) {
1772 *mrp = NULL;
1773 rc = PTR_ERR(mr);
1774 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1775 } else {
1776 *mrp = mr;
1777 iov->lkey = mr->lkey;
1778 rc = 0;
1779 }
1780
1781 return rc;
1782 }
1783
1784 static int
1785 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1786 struct ib_mr *mr, struct ib_sge *iov)
1787 {
1788 int rc;
1789
1790 ib_dma_unmap_single(ia->ri_id->device,
1791 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1792
1793 if (NULL == mr)
1794 return 0;
1795
1796 rc = ib_dereg_mr(mr);
1797 if (rc)
1798 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1799 return rc;
1800 }
1801
1802 /**
1803 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
1804 * @ia: controlling rpcrdma_ia
1805 * @size: size of buffer to be allocated, in bytes
1806 * @flags: GFP flags
1807 *
1808 * Returns pointer to private header of an area of internally
1809 * registered memory, or an ERR_PTR. The registered buffer follows
1810 * the end of the private header.
1811 *
1812 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
1813 * receiving the payload of RDMA RECV operations. regbufs are not
1814 * used for RDMA READ/WRITE operations, thus are registered only for
1815 * LOCAL access.
1816 */
1817 struct rpcrdma_regbuf *
1818 rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
1819 {
1820 struct rpcrdma_regbuf *rb;
1821 int rc;
1822
1823 rc = -ENOMEM;
1824 rb = kmalloc(sizeof(*rb) + size, flags);
1825 if (rb == NULL)
1826 goto out;
1827
1828 rb->rg_size = size;
1829 rb->rg_owner = NULL;
1830 rc = rpcrdma_register_internal(ia, rb->rg_base, size,
1831 &rb->rg_mr, &rb->rg_iov);
1832 if (rc)
1833 goto out_free;
1834
1835 return rb;
1836
1837 out_free:
1838 kfree(rb);
1839 out:
1840 return ERR_PTR(rc);
1841 }
1842
1843 /**
1844 * rpcrdma_free_regbuf - deregister and free registered buffer
1845 * @ia: controlling rpcrdma_ia
1846 * @rb: regbuf to be deregistered and freed
1847 */
1848 void
1849 rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
1850 {
1851 if (rb) {
1852 rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
1853 kfree(rb);
1854 }
1855 }
1856
1857 /*
1858 * Wrappers for chunk registration, shared by read/write chunk code.
1859 */
1860
1861 static void
1862 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1863 {
1864 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1865 seg->mr_dmalen = seg->mr_len;
1866 if (seg->mr_page)
1867 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1868 seg->mr_page, offset_in_page(seg->mr_offset),
1869 seg->mr_dmalen, seg->mr_dir);
1870 else
1871 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1872 seg->mr_offset,
1873 seg->mr_dmalen, seg->mr_dir);
1874 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1875 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1876 __func__,
1877 (unsigned long long)seg->mr_dma,
1878 seg->mr_offset, seg->mr_dmalen);
1879 }
1880 }
1881
1882 static void
1883 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1884 {
1885 if (seg->mr_page)
1886 ib_dma_unmap_page(ia->ri_id->device,
1887 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1888 else
1889 ib_dma_unmap_single(ia->ri_id->device,
1890 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1891 }
1892
1893 static int
1894 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1895 int *nsegs, int writing, struct rpcrdma_ia *ia,
1896 struct rpcrdma_xprt *r_xprt)
1897 {
1898 struct rpcrdma_mr_seg *seg1 = seg;
1899 struct rpcrdma_mw *mw = seg1->rl_mw;
1900 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1901 struct ib_mr *mr = frmr->fr_mr;
1902 struct ib_send_wr fastreg_wr, *bad_wr;
1903 u8 key;
1904 int len, pageoff;
1905 int i, rc;
1906 int seg_len;
1907 u64 pa;
1908 int page_no;
1909
1910 pageoff = offset_in_page(seg1->mr_offset);
1911 seg1->mr_offset -= pageoff; /* start of page */
1912 seg1->mr_len += pageoff;
1913 len = -pageoff;
1914 if (*nsegs > ia->ri_max_frmr_depth)
1915 *nsegs = ia->ri_max_frmr_depth;
1916 for (page_no = i = 0; i < *nsegs;) {
1917 rpcrdma_map_one(ia, seg, writing);
1918 pa = seg->mr_dma;
1919 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1920 frmr->fr_pgl->page_list[page_no++] = pa;
1921 pa += PAGE_SIZE;
1922 }
1923 len += seg->mr_len;
1924 ++seg;
1925 ++i;
1926 /* Check for holes */
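		/* An FRMR maps a single virtually contiguous region, so
		 * coalescing must stop at the first segment that does not
		 * start, or whose predecessor does not end, on a page
		 * boundary.
		 */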
1927 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1928 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1929 break;
1930 }
1931 dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n",
1932 __func__, mw, i, len);
1933
1934 frmr->fr_state = FRMR_IS_VALID;
1935
1936 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1937 fastreg_wr.wr_id = (unsigned long)(void *)mw;
1938 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1939 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
1940 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1941 fastreg_wr.wr.fast_reg.page_list_len = page_no;
1942 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1943 fastreg_wr.wr.fast_reg.length = len;
1944
1945 /* Bump the key */
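	/* Incrementing the low-order byte of the rkey gives each
	 * registration a fresh key, so rkeys advertised for earlier,
	 * now-invalidated registrations of this MR no longer match.
	 */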
1946 key = (u8)(mr->rkey & 0x000000FF);
1947 ib_update_fast_reg_key(mr, ++key);
1948
1949 fastreg_wr.wr.fast_reg.access_flags = (writing ?
1950 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1951 IB_ACCESS_REMOTE_READ);
1952 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
1953 DECR_CQCOUNT(&r_xprt->rx_ep);
1954
1955 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
1956 if (rc) {
1957 dprintk("RPC: %s: failed ib_post_send for register,"
1958 " status %i\n", __func__, rc);
1959 ib_update_fast_reg_key(mr, --key);
1960 goto out_err;
1961 } else {
1962 seg1->mr_rkey = mr->rkey;
1963 seg1->mr_base = seg1->mr_dma + pageoff;
1964 seg1->mr_nsegs = i;
1965 seg1->mr_len = len;
1966 }
1967 *nsegs = i;
1968 return 0;
1969 out_err:
1970 frmr->fr_state = FRMR_IS_INVALID;
1971 while (i--)
1972 rpcrdma_unmap_one(ia, --seg);
1973 return rc;
1974 }
1975
1976 static int
1977 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1978 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1979 {
1980 struct rpcrdma_mr_seg *seg1 = seg;
1981 struct ib_send_wr invalidate_wr, *bad_wr;
1982 int rc;
1983
1984 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1985
1986 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1987 invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
1988 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1989 invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
1990 DECR_CQCOUNT(&r_xprt->rx_ep);
1991
1992 read_lock(&ia->ri_qplock);
1993 while (seg1->mr_nsegs--)
1994 rpcrdma_unmap_one(ia, seg++);
1995 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1996 read_unlock(&ia->ri_qplock);
1997 if (rc) {
1998 /* Force rpcrdma_buffer_get() to retry */
1999 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
2000 dprintk("RPC: %s: failed ib_post_send for invalidate,"
2001 " status %i\n", __func__, rc);
2002 }
2003 return rc;
2004 }
2005
2006 static int
2007 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
2008 int *nsegs, int writing, struct rpcrdma_ia *ia)
2009 {
2010 struct rpcrdma_mr_seg *seg1 = seg;
2011 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
2012 int len, pageoff, i, rc;
2013
2014 pageoff = offset_in_page(seg1->mr_offset);
2015 seg1->mr_offset -= pageoff; /* start of page */
2016 seg1->mr_len += pageoff;
2017 len = -pageoff;
2018 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
2019 *nsegs = RPCRDMA_MAX_DATA_SEGS;
2020 for (i = 0; i < *nsegs;) {
2021 rpcrdma_map_one(ia, seg, writing);
2022 physaddrs[i] = seg->mr_dma;
2023 len += seg->mr_len;
2024 ++seg;
2025 ++i;
2026 /* Check for holes */
2027 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
2028 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
2029 break;
2030 }
2031 rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
2032 if (rc) {
2033 dprintk("RPC: %s: failed ib_map_phys_fmr "
2034 "%u@0x%llx+%i (%d)... status %i\n", __func__,
2035 len, (unsigned long long)seg1->mr_dma,
2036 pageoff, i, rc);
2037 while (i--)
2038 rpcrdma_unmap_one(ia, --seg);
2039 } else {
2040 seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
2041 seg1->mr_base = seg1->mr_dma + pageoff;
2042 seg1->mr_nsegs = i;
2043 seg1->mr_len = len;
2044 }
2045 *nsegs = i;
2046 return rc;
2047 }
2048
2049 static int
2050 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
2051 struct rpcrdma_ia *ia)
2052 {
2053 struct rpcrdma_mr_seg *seg1 = seg;
2054 LIST_HEAD(l);
2055 int rc;
2056
2057 list_add(&seg1->rl_mw->r.fmr->list, &l);
2058 rc = ib_unmap_fmr(&l);
2059 read_lock(&ia->ri_qplock);
2060 while (seg1->mr_nsegs--)
2061 rpcrdma_unmap_one(ia, seg++);
2062 read_unlock(&ia->ri_qplock);
2063 if (rc)
2064 dprintk("RPC: %s: failed ib_unmap_fmr,"
2065 " status %i\n", __func__, rc);
2066 return rc;
2067 }
2068
2069 int
2070 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
2071 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
2072 {
2073 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2074 int rc = 0;
2075
2076 switch (ia->ri_memreg_strategy) {
2077
2078 case RPCRDMA_ALLPHYSICAL:
2079 rpcrdma_map_one(ia, seg, writing);
2080 seg->mr_rkey = ia->ri_bind_mem->rkey;
2081 seg->mr_base = seg->mr_dma;
2082 seg->mr_nsegs = 1;
2083 nsegs = 1;
2084 break;
2085
2086 /* Registration using frmr registration */
2087 case RPCRDMA_FRMR:
2088 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
2089 break;
2090
2091 /* Registration using fmr memory registration */
2092 case RPCRDMA_MTHCAFMR:
2093 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
2094 break;
2095
2096 default:
2097 return -EIO;
2098 }
2099 if (rc)
2100 return rc;
2101
2102 return nsegs;
2103 }
2104
2105 int
2106 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
2107 struct rpcrdma_xprt *r_xprt)
2108 {
2109 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2110 int nsegs = seg->mr_nsegs, rc;
2111
2112 switch (ia->ri_memreg_strategy) {
2113
2114 case RPCRDMA_ALLPHYSICAL:
2115 read_lock(&ia->ri_qplock);
2116 rpcrdma_unmap_one(ia, seg);
2117 read_unlock(&ia->ri_qplock);
2118 break;
2119
2120 case RPCRDMA_FRMR:
2121 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
2122 break;
2123
2124 case RPCRDMA_MTHCAFMR:
2125 rc = rpcrdma_deregister_fmr_external(seg, ia);
2126 break;
2127
2128 default:
2129 break;
2130 }
2131 return nsegs;
2132 }
2133
2134 /*
2135 * Prepost any receive buffer, then post send.
2136 *
2137 * Receive buffer is donated to hardware, reclaimed upon recv completion.
2138 */
2139 int
2140 rpcrdma_ep_post(struct rpcrdma_ia *ia,
2141 struct rpcrdma_ep *ep,
2142 struct rpcrdma_req *req)
2143 {
2144 struct ib_send_wr send_wr, *send_wr_fail;
2145 struct rpcrdma_rep *rep = req->rl_reply;
2146 int rc;
2147
2148 if (rep) {
2149 rc = rpcrdma_ep_post_recv(ia, ep, rep);
2150 if (rc)
2151 goto out;
2152 req->rl_reply = NULL;
2153 }
2154
2155 send_wr.next = NULL;
2156 send_wr.wr_id = 0ULL; /* no send cookie */
2157 send_wr.sg_list = req->rl_send_iov;
2158 send_wr.num_sge = req->rl_niovs;
2159 send_wr.opcode = IB_WR_SEND;
2160 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
2161 ib_dma_sync_single_for_device(ia->ri_id->device,
2162 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
2163 DMA_TO_DEVICE);
2164 ib_dma_sync_single_for_device(ia->ri_id->device,
2165 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
2166 DMA_TO_DEVICE);
2167 ib_dma_sync_single_for_device(ia->ri_id->device,
2168 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
2169 DMA_TO_DEVICE);
2170
2171 if (DECR_CQCOUNT(ep) > 0)
2172 send_wr.send_flags = 0;
2173 else { /* Provider must take a send completion every now and then */
2174 INIT_CQCOUNT(ep);
2175 send_wr.send_flags = IB_SEND_SIGNALED;
2176 }
2177
2178 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
2179 if (rc)
2180 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
2181 rc);
2182 out:
2183 return rc;
2184 }
2185
2186 /*
2187 * (Re)post a receive buffer.
2188 */
2189 int
2190 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2191 struct rpcrdma_ep *ep,
2192 struct rpcrdma_rep *rep)
2193 {
2194 struct ib_recv_wr recv_wr, *recv_wr_fail;
2195 int rc;
2196
2197 recv_wr.next = NULL;
2198 recv_wr.wr_id = (u64) (unsigned long) rep;
2199 recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
2200 recv_wr.num_sge = 1;
2201
2202 ib_dma_sync_single_for_cpu(ia->ri_id->device,
2203 rdmab_addr(rep->rr_rdmabuf),
2204 rdmab_length(rep->rr_rdmabuf),
2205 DMA_BIDIRECTIONAL);
2206
2207 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
2208
2209 if (rc)
2210 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
2211 rc);
2212 return rc;
2213 }
2214
2215 /* Physical mapping means one Read/Write list entry per-page.
2216 * All list entries must fit within an inline buffer
2217 *
2218 * NB: The server must return a Write list for NFS READ,
2219 * which has the same constraint. Factor in the inline
2220 * rsize as well.
2221 */
2222 static size_t
2223 rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
2224 {
2225 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2226 unsigned int inline_size, pages;
2227
2228 inline_size = min_t(unsigned int,
2229 cdata->inline_wsize, cdata->inline_rsize);
2230 inline_size -= RPCRDMA_HDRLEN_MIN;
2231 pages = inline_size / sizeof(struct rpcrdma_segment);
2232 return pages << PAGE_SHIFT;
2233 }
2234
2235 static size_t
2236 rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2237 {
2238 return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2239 }
2240
2241 size_t
2242 rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2243 {
2244 size_t result;
2245
2246 switch (r_xprt->rx_ia.ri_memreg_strategy) {
2247 case RPCRDMA_ALLPHYSICAL:
2248 result = rpcrdma_physical_max_payload(r_xprt);
2249 break;
2250 default:
2251 result = rpcrdma_mr_max_payload(r_xprt);
2252 }
2253 return result;
2254 }