/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/sunrpc/addr.h>
#include <asm/bitops.h>

#include "xprt_rdma.h"

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
/*
 * handle replies in tasklet context, using a single, global list
 * rdma tasklet function -- just turn around and call the func
 * for all replies on the list
 */
static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
static LIST_HEAD(rpcrdma_tasklets_g);

static void
rpcrdma_run_tasklet(unsigned long data)
{
	struct rpcrdma_rep *rep;
	void (*func)(struct rpcrdma_rep *);
	unsigned long flags;

	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	while (!list_empty(&rpcrdma_tasklets_g)) {
		rep = list_entry(rpcrdma_tasklets_g.next,
				 struct rpcrdma_rep, rr_list);
		list_del(&rep->rr_list);
		func = rep->rr_func;
		rep->rr_func = NULL;
		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);

		if (func)
			func(rep);
		else
			rpcrdma_recv_buffer_put(rep);

		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	}
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
}

static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
static const char * const async_event[] = {
	"communication established",
	"send queue drained",
	"path migration successful",
	"device fatal error",
};

#define ASYNC_MSG(status)					\
	((status) < ARRAY_SIZE(async_event) ?			\
		async_event[(status)] : "unknown async error")
static void
rpcrdma_schedule_tasklet(struct list_head *sched_list)
{
	unsigned long flags;

	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	list_splice_tail(sched_list, &rpcrdma_tasklets_g);
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
	tasklet_schedule(&rpcrdma_tasklet_g);
}
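
/* Asynchronous provider errors (QP and CQ upcalls below) are logged;
 * if the transport is currently connected, the endpoint is marked
 * broken and any connect waiters are woken so the caller can recover.
 */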
static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	pr_err("RPC: %s: %s on device %s ep %p\n",
	       __func__, ASYNC_MSG(event->event),
	       event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}
static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	pr_err("RPC: %s: %s on device %s ep %p\n",
	       __func__, ASYNC_MSG(event->event),
	       event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}
static const char * const wc_status[] = {
	"local length error",
	"local QP operation error",
	"local EE context operation error",
	"local protection error",
	"memory management operation error",
	"bad response error",
	"local access error",
	"remote invalid request error",
	"remote access error",
	"remote operation error",
	"transport retry counter exceeded",
	"RNR retry counter exceeded",
	"local RDD violation error",
	"remote invalid RD request",
	"invalid EE context number",
	"invalid EE context state",
	"response timeout error",
};

#define COMPLETION_MSG(status)					\
	((status) < ARRAY_SIZE(wc_status) ?			\
		wc_status[(status)] : "unexpected completion error")
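
/* A send completion with wr_id == 0 is an ordinary (unsignaled) SEND;
 * any other wr_id carries a pointer to the rpcrdma_mw whose FRMR work
 * request completed, and that FRMR is marked stale on error.
 */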
static void
rpcrdma_sendcq_process_wc(struct ib_wc *wc)
{
	if (likely(wc->status == IB_WC_SUCCESS))
		return;

	/* WARNING: Only wr_id and status are reliable at this point */
	if (wc->wr_id == 0ULL) {
		if (wc->status != IB_WC_WR_FLUSH_ERR)
			pr_err("RPC: %s: SEND: %s\n",
			       __func__, COMPLETION_MSG(wc->status));
	} else {
		struct rpcrdma_mw *r;

		r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		r->r.frmr.fr_state = FRMR_IS_STALE;
		pr_err("RPC: %s: frmr %p (stale): %s\n",
		       __func__, r, COMPLETION_MSG(wc->status));
	}
}
static int
rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
	struct ib_wc *wcs;
	int budget, count, rc;

	budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
	do {
		wcs = ep->rep_send_wcs;

		rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
		if (rc <= 0)
			return rc;

		count = rc;
		while (count-- > 0)
			rpcrdma_sendcq_process_wc(wcs++);
	} while (rc == RPCRDMA_POLLSIZE && --budget);
	return 0;
}
/*
 * Handle send, fast_reg_mr, and local_inv completions.
 *
 * Send events are typically suppressed and thus do not result
 * in an upcall. Occasionally one is signaled, however. This
 * prevents the provider's completion queue from wrapping and
 * losing a completion.
 */
static void
rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
{
	struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
	int rc;

	rc = rpcrdma_sendcq_poll(cq, ep);
	if (rc) {
		dprintk("RPC: %s: ib_poll_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rc = ib_req_notify_cq(cq,
			IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	if (rc == 0)
		return;
	if (rc < 0) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rpcrdma_sendcq_poll(cq, ep);
}
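
/* Each successful receive is DMA-synced for the CPU, its length is
 * recorded, and the rep is queued on sched_list for the reply tasklet.
 * Failed receives (other than flushes) are logged.
 */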
static void
rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
{
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long)wc->wr_id;

	/* WARNING: Only wr_id and status are reliable at this point */
	if (wc->status != IB_WC_SUCCESS)
		goto out_fail;

	/* status == SUCCESS means all fields in wc are trustworthy */
	if (wc->opcode != IB_WC_RECV)
		return;

	dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
		__func__, rep, wc->byte_len);

	rep->rr_len = wc->byte_len;
	ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
				   rdmab_addr(rep->rr_rdmabuf),
				   rep->rr_len, DMA_FROM_DEVICE);
	prefetch(rdmab_to_msg(rep->rr_rdmabuf));

out_schedule:
	list_add_tail(&rep->rr_list, sched_list);
	return;
out_fail:
	if (wc->status != IB_WC_WR_FLUSH_ERR)
		pr_err("RPC: %s: rep %p: %s\n",
		       __func__, rep, COMPLETION_MSG(wc->status));
	rep->rr_len = ~0U;
	goto out_schedule;
}
static int
rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
	struct list_head sched_list;
	struct ib_wc *wcs;
	int budget, count, rc;

	INIT_LIST_HEAD(&sched_list);
	budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
	do {
		wcs = ep->rep_recv_wcs;

		rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
		if (rc <= 0)
			goto out_schedule;

		count = rc;
		while (count-- > 0)
			rpcrdma_recvcq_process_wc(wcs++, &sched_list);
	} while (rc == RPCRDMA_POLLSIZE && --budget);
	rc = 0;

out_schedule:
	rpcrdma_schedule_tasklet(&sched_list);
	return rc;
}
/*
 * Handle receive completions.
 *
 * It is reentrant but processes single events in order to maintain
 * ordering of receives to keep server credits.
 *
 * It is the responsibility of the scheduled tasklet to return
 * recv buffers to the pool. NOTE: this affects synchronization of
 * connection shutdown. That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 */
static void
rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
{
	struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
	int rc;

	rc = rpcrdma_recvcq_poll(cq, ep);
	if (rc) {
		dprintk("RPC: %s: ib_poll_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rc = ib_req_notify_cq(cq,
			IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	if (rc == 0)
		return;
	if (rc < 0) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rpcrdma_recvcq_poll(cq, ep);
}
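
/* Drain both completion queues: pending receives are processed onto
 * the reply tasklet list and pending send completions are consumed,
 * so nothing is left queued on either CQ.
 */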
static void
rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
{
	struct ib_wc wc;
	LIST_HEAD(sched_list);

	while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
		rpcrdma_recvcq_process_wc(&wc, &sched_list);
	if (!list_empty(&sched_list))
		rpcrdma_schedule_tasklet(&sched_list);
	while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
		rpcrdma_sendcq_process_wc(&wc);
}
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static const char * const conn[] = {
	/* RDMA CM event names, indexed by event number */
};

#define CONNECTION_MSG(status)						\
	((status) < ARRAY_SIZE(conn) ?					\
		conn[(status)] : "unrecognized connection error")
#endif
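
/* CM event handler. Records address and route resolution results,
 * translates connection state changes into ep->rep_connected, and
 * wakes anyone waiting on rep_connect_wait.
 */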
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
	struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
#endif
	struct ib_qp_attr *attr = &ia->ri_qp_attr;
	struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, attr,
			    IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			    iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr->max_dest_rd_atomic,
			attr->max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		/*FALLTHROUGH*/
	default:
		dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
			__func__, sap, rpc_get_port(sap), ep,
			CONNECTION_MSG(event->event));
		break;
	}

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
	if (connstate == 1) {
		int ird = attr->max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;

		pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
			sap, rpc_get_port(sap),
			ia->ri_id->device->name,
			ia->ri_ops->ro_displayname,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
			sap, rpc_get_port(sap), connstate);
	}
#endif

	return 0;
}
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}
/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
	struct ib_wc wc;
	int count = 0;

	while (1 == ib_poll_cq(cq, 1, &wc))
		++count;

	if (count)
		dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
			__func__, count, wc.opcode);
}
/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection zone.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct ib_device_attr *devattr = &ia->ri_devattr;
	int rc, mem_priv;

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}

	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	rc = ib_query_device(ia->ri_id->device, devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		goto out3;
	}

	if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
		ia->ri_have_dma_lkey = 1;
		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
	}

	if (memreg == RPCRDMA_FRMR) {
		/* Requires both frmr reg and local dma lkey */
		if (((devattr->device_cap_flags &
		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
		      (devattr->max_fast_reg_page_list_len == 0)) {
			dprintk("RPC: %s: FRMR registration "
				"not supported by HCA\n", __func__);
			memreg = RPCRDMA_MTHCAFMR;
		} else {
			/* Mind the ia limit on FRMR page list depth */
			ia->ri_max_frmr_depth = min_t(unsigned int,
				RPCRDMA_MAX_DATA_SEGS,
				devattr->max_fast_reg_page_list_len);
		}
	}
	if (memreg == RPCRDMA_MTHCAFMR) {
		if (!ia->ri_id->device->alloc_fmr) {
			dprintk("RPC: %s: MTHCAFMR registration "
				"not supported by HCA\n", __func__);
			memreg = RPCRDMA_ALLPHYSICAL;
		}
	}

	/*
	 * Optionally obtain an underlying physical identity mapping in
	 * order to do a memory window-based bind. This base registration
	 * is protected from remote access - that is enabled only by binding
	 * for the specific bytes targeted during each RPC operation, and
	 * revoked after the corresponding completion similar to a storage
	 * adapter.
	 */
	switch (memreg) {
	case RPCRDMA_FRMR:
		ia->ri_ops = &rpcrdma_frwr_memreg_ops;
		break;
	case RPCRDMA_ALLPHYSICAL:
		ia->ri_ops = &rpcrdma_physical_memreg_ops;
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_REMOTE_WRITE |
				IB_ACCESS_REMOTE_READ;
		goto register_setup;
	case RPCRDMA_MTHCAFMR:
		ia->ri_ops = &rpcrdma_fmr_memreg_ops;
		if (ia->ri_have_dma_lkey)
			break;
		mem_priv = IB_ACCESS_LOCAL_WRITE;
	register_setup:
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			rc = -ENOMEM;
			goto out3;
		}
		break;
	default:
		printk(KERN_ERR "RPC: Unsupported memory "
				"registration mode: %d\n", memreg);
		rc = -ENOMEM;
		goto out3;
	}
	dprintk("RPC: %s: memory registration strategy is '%s'\n",
		__func__, ia->ri_ops->ro_displayname);

	/* Else will do memory reg/dereg for each chunk */
	ia->ri_memreg_strategy = memreg;

	rwlock_init(&ia->ri_qplock);
	return 0;

out3:
	ib_dealloc_pd(ia->ri_pd);
	ia->ri_pd = NULL;
out2:
	rdma_destroy_id(ia->ri_id);
	ia->ri_id = NULL;
out1:
	return rc;
}
/*
 * Clean up/close an IA.
 *   o if event handles and PD have been initialized, free them.
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}

	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = NULL;
	}

	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
		rc = ib_dealloc_pd(ia->ri_pd);
		dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
			__func__, rc);
	}
}
/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr *devattr = &ia->ri_devattr;
	struct ib_cq *sendcq, *recvcq;
	int rc, err;

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > devattr->max_qp_wr)
		cdata->max_requests = devattr->max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	/* send_cq and recv_cq initialized below */
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR: {
		int depth = 7;

		/* Add room for frmr register and invalidate WRs.
		 * 1. FRMR reg WR for head
		 * 2. FRMR invalidate WR for head
		 * 3. N FRMR reg WRs for pagelist
		 * 4. N FRMR invalidate WRs for pagelist
		 * 5. FRMR reg WR for tail
		 * 6. FRMR invalidate WR for tail
		 * 7. The RDMA_SEND WR
		 */

		/* Calculate N if the device max FRMR depth is smaller than
		 * RPCRDMA_MAX_DATA_SEGS.
		 */
		if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
			int delta = RPCRDMA_MAX_DATA_SEGS -
				    ia->ri_max_frmr_depth;

			do {
				depth += 2; /* FRMR reg + invalidate */
				delta -= ia->ri_max_frmr_depth;
			} while (delta > 0);
		}
		ep->rep_attr.cap.max_send_wr *= depth;
		if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
			cdata->max_requests = devattr->max_qp_wr / depth;
			if (!cdata->max_requests)
				return -EINVAL;
			ep->rep_attr.cap.max_send_wr = cdata->max_requests *
						       depth;
		}
		break;
	}
	default:
		break;
	}
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	if (cdata->padding) {
		ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
						      GFP_KERNEL);
		if (IS_ERR(ep->rep_padbuf))
			return PTR_ERR(ep->rep_padbuf);
	} else
		ep->rep_padbuf = NULL;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
	if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
		ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
	else if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
	init_waitqueue_head(&ep->rep_connect_wait);
	INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);

	sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
				  rpcrdma_cq_async_error_upcall, ep,
				  ep->rep_attr.cap.max_send_wr + 1, 0);
	if (IS_ERR(sendcq)) {
		rc = PTR_ERR(sendcq);
		dprintk("RPC: %s: failed to create send CQ: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
				  rpcrdma_cq_async_error_upcall, ep,
				  ep->rep_attr.cap.max_recv_wr + 1, 0);
	if (IS_ERR(recvcq)) {
		rc = PTR_ERR(recvcq);
		dprintk("RPC: %s: failed to create recv CQ: %i\n",
			__func__, rc);
		goto out2;
	}

	rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		ib_destroy_cq(recvcq);
		goto out2;
	}

	ep->rep_attr.send_cq = sendcq;
	ep->rep_attr.recv_cq = recvcq;

	/* Initialize cma parameters */

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (devattr->max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources =
						devattr->max_qp_rd_atom;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(sendcq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	rpcrdma_free_regbuf(ia, ep->rep_padbuf);
	return rc;
}
/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 */
void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	cancel_delayed_work_sync(&ep->rep_connect_worker);

	if (ia->ri_id->qp) {
		rpcrdma_ep_disconnect(ep, ia);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	rpcrdma_free_regbuf(ia, ep->rep_padbuf);

	rpcrdma_clean_cq(ep->rep_attr.recv_cq);
	rc = ib_destroy_cq(ep->rep_attr.recv_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	rpcrdma_clean_cq(ep->rep_attr.send_cq);
	rc = ib_destroy_cq(ep->rep_attr.send_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);
}
/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id, *old;
	int rc = 0;
	int retry_count = 0;

	if (ep->rep_connected != 0) {
		struct rpcrdma_xprt *xprt;
retry:
		dprintk("RPC: %s: reconnecting...\n", __func__);

		rpcrdma_ep_disconnect(ep, ia);
		rpcrdma_flush_cqs(ep);

		switch (ia->ri_memreg_strategy) {
		case RPCRDMA_FRMR:
			rpcrdma_reset_frmrs(ia);
			break;
		case RPCRDMA_MTHCAFMR:
			rpcrdma_reset_fmrs(ia);
			break;
		case RPCRDMA_ALLPHYSICAL:
			break;
		default:
			rc = -EIO;
			goto out;
		}

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = -EHOSTUNREACH;
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 */
		if (ia->ri_id->device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETUNREACH;
			goto out;
		}
		/* END TEMP */
		rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
		if (rc) {
			dprintk("RPC: %s: rdma_create_qp failed %i\n",
				__func__, rc);
			rdma_destroy_id(id);
			rc = -ENETUNREACH;
			goto out;
		}

		write_lock(&ia->ri_qplock);
		old = ia->ri_id;
		ia->ri_id = id;
		write_unlock(&ia->ri_qplock);

		rdma_destroy_qp(old);
		rdma_destroy_id(old);
	} else {
		dprintk("RPC: %s: connecting...\n", __func__);
		rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
		if (rc) {
			dprintk("RPC: %s: rdma_create_qp failed %i\n",
				__func__, rc);
			/* do not update ep->rep_connected */
			return -ENETUNREACH;
		}
	}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
			__func__, rc);
		goto out;
	}

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort.
	 */
	if (ep->rep_connected == -ECONNREFUSED &&
	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
				ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		dprintk("RPC: %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}
/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
void
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rpcrdma_flush_cqs(ep);
	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
					 ep->rep_connected != 1);
		dprintk("RPC: %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}
}
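
/* Allocate a single rpcrdma_req and attach it to the transport's
 * buffer pool.
 */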
static struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_req *req;

	req = kzalloc(sizeof(*req), GFP_KERNEL);
	if (req == NULL)
		return ERR_PTR(-ENOMEM);

	req->rl_buffer = &r_xprt->rx_buf;
	return req;
}
static struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct rpcrdma_rep *rep;
	int rc;

	rc = -ENOMEM;
	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
	if (rep == NULL)
		goto out;

	rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
					       GFP_KERNEL);
	if (IS_ERR(rep->rr_rdmabuf)) {
		rc = PTR_ERR(rep->rr_rdmabuf);
		goto out_free;
	}

	rep->rr_buffer = &r_xprt->rx_buf;
	return rep;

out_free:
	kfree(rep);
out:
	return ERR_PTR(rc);
}
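
/* Pre-allocate FMRs for the buffer pool: (rb_max_requests + 1) *
 * RPCRDMA_MAX_SEGS memory windows, each able to map up to
 * RPCRDMA_MAX_DATA_SEGS pages.
 */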
static int
rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
{
	int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
	struct ib_fmr_attr fmr_attr = {
		.max_pages	= RPCRDMA_MAX_DATA_SEGS,
		.page_shift	= PAGE_SHIFT
	};
	struct rpcrdma_mw *r;
	int i, rc;

	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
	dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);

	while (i--) {
		r = kzalloc(sizeof(*r), GFP_KERNEL);
		if (r == NULL)
			return -ENOMEM;

		r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
		if (IS_ERR(r->r.fmr)) {
			rc = PTR_ERR(r->r.fmr);
			dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
				__func__, rc);
			goto out_free;
		}

		list_add(&r->mw_list, &buf->rb_mws);
		list_add(&r->mw_all, &buf->rb_all);
	}
	return 0;

out_free:
	kfree(r);
	return rc;
}
static int
rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
{
	struct rpcrdma_frmr *f;
	struct rpcrdma_mw *r;
	int i, rc;

	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
	dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);

	while (i--) {
		r = kzalloc(sizeof(*r), GFP_KERNEL);
		if (r == NULL)
			return -ENOMEM;
		f = &r->r.frmr;

		f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
						ia->ri_max_frmr_depth);
		if (IS_ERR(f->fr_mr)) {
			rc = PTR_ERR(f->fr_mr);
			dprintk("RPC: %s: ib_alloc_fast_reg_mr "
				"failed %i\n", __func__, rc);
			goto out_free;
		}

		f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
							ia->ri_max_frmr_depth);
		if (IS_ERR(f->fr_pgl)) {
			rc = PTR_ERR(f->fr_pgl);
			dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
				"failed %i\n", __func__, rc);

			ib_dereg_mr(f->fr_mr);
			goto out_free;
		}

		list_add(&r->mw_list, &buf->rb_mws);
		list_add(&r->mw_all, &buf->rb_all);
	}

	return 0;

out_free:
	kfree(r);
	return rc;
}
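
/* Build the transport's buffer pool: the send/recv pointer arrays,
 * the per-strategy MW list (FRMR or FMR), and one req/rep pair per
 * credit (rb_max_requests).
 */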
int
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	char *p;
	size_t len;
	int i, rc;

	buf->rb_max_requests = cdata->max_requests;
	spin_lock_init(&buf->rb_lock);

	/* Need to allocate:
	 *   1.  arrays for send and recv pointers
	 *   2.  arrays of struct rpcrdma_req to fill in pointers
	 *   3.  array of struct rpcrdma_rep for replies
	 * Send/recv buffers in req/rep need to be registered
	 */
	len = buf->rb_max_requests *
		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));

	p = kzalloc(len, GFP_KERNEL);
	if (p == NULL) {
		dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
			__func__, len);
		rc = -ENOMEM;
		goto out;
	}
	buf->rb_pool = p;	/* for freeing it later */

	buf->rb_send_bufs = (struct rpcrdma_req **) p;
	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

	INIT_LIST_HEAD(&buf->rb_mws);
	INIT_LIST_HEAD(&buf->rb_all);
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		rc = rpcrdma_init_frmrs(ia, buf);
		if (rc)
			goto out;
		break;
	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_init_fmrs(ia, buf);
		if (rc)
			goto out;
		break;
	default:
		break;
	}

	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;
		struct rpcrdma_rep *rep;

		req = rpcrdma_create_req(r_xprt);
		if (IS_ERR(req)) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = PTR_ERR(req);
			goto out;
		}
		buf->rb_send_bufs[i] = req;

		rep = rpcrdma_create_rep(r_xprt);
		if (IS_ERR(rep)) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = PTR_ERR(rep);
			goto out;
		}
		buf->rb_recv_bufs[i] = rep;
	}

	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}
static void
rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
{
	if (!rep)
		return;

	rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
	kfree(rep);
}

static void
rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
	if (!req)
		return;

	rpcrdma_free_regbuf(ia, req->rl_sendbuf);
	rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
	kfree(req);
}
static void
rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_mw *r;
	int rc;

	while (!list_empty(&buf->rb_all)) {
		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
		list_del(&r->mw_all);
		list_del(&r->mw_list);

		rc = ib_dealloc_fmr(r->r.fmr);
		if (rc)
			dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
				__func__, rc);

		kfree(r);
	}
}
static void
rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_mw *r;
	int rc;

	while (!list_empty(&buf->rb_all)) {
		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
		list_del(&r->mw_all);
		list_del(&r->mw_list);

		rc = ib_dereg_mr(r->r.frmr.fr_mr);
		if (rc)
			dprintk("RPC: %s: ib_dereg_mr failed %i\n",
				__func__, rc);
		ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);

		kfree(r);
	}
}
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	int i;

	/* clean up in reverse order from create
	 *   1.  recv mr memory (mr free, then kfree)
	 *   2.  send mr memory (mr free, then kfree)
	 */
	dprintk("RPC: %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs)
			rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
		if (buf->rb_send_bufs)
			rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
	}

	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		rpcrdma_destroy_frmrs(buf);
		break;
	case RPCRDMA_MTHCAFMR:
		rpcrdma_destroy_fmrs(buf);
		break;
	default:
		break;
	}

	kfree(buf->rb_pool);
}
/* After a disconnect, unmap all FMRs.
 *
 * This is invoked only in the transport connect worker in order
 * to serialize with rpcrdma_register_fmr_external().
 */
static void
rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
{
	struct rpcrdma_xprt *r_xprt =
				container_of(ia, struct rpcrdma_xprt, rx_ia);
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct list_head *pos;
	struct rpcrdma_mw *r;
	LIST_HEAD(l);
	int rc;

	list_for_each(pos, &buf->rb_all) {
		r = list_entry(pos, struct rpcrdma_mw, mw_all);

		INIT_LIST_HEAD(&l);
		list_add(&r->r.fmr->list, &l);
		rc = ib_unmap_fmr(&l);
		if (rc)
			dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
				__func__, rc);
	}
}
/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
 * an unusable state. Find FRMRs in this state and dereg / reg
 * each. FRMRs that are VALID and attached to an rpcrdma_req are
 * also torn down.
 *
 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
 *
 * This is invoked only in the transport connect worker in order
 * to serialize with rpcrdma_register_frmr_external().
 */
static void
rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
{
	struct rpcrdma_xprt *r_xprt =
				container_of(ia, struct rpcrdma_xprt, rx_ia);
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct list_head *pos;
	struct rpcrdma_mw *r;
	int rc;

	list_for_each(pos, &buf->rb_all) {
		r = list_entry(pos, struct rpcrdma_mw, mw_all);

		if (r->r.frmr.fr_state == FRMR_IS_INVALID)
			continue;

		rc = ib_dereg_mr(r->r.frmr.fr_mr);
		if (rc)
			dprintk("RPC: %s: ib_dereg_mr failed %i\n",
				__func__, rc);
		ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);

		r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
					ia->ri_max_frmr_depth);
		if (IS_ERR(r->r.frmr.fr_mr)) {
			rc = PTR_ERR(r->r.frmr.fr_mr);
			dprintk("RPC: %s: ib_alloc_fast_reg_mr"
				" failed %i\n", __func__, rc);
			continue;
		}
		r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
					ia->ri_id->device,
					ia->ri_max_frmr_depth);
		if (IS_ERR(r->r.frmr.fr_pgl)) {
			rc = PTR_ERR(r->r.frmr.fr_pgl);
			dprintk("RPC: %s: "
				"ib_alloc_fast_reg_page_list "
				"failed %i\n", __func__, rc);

			ib_dereg_mr(r->r.frmr.fr_mr);
			continue;
		}
		r->r.frmr.fr_state = FRMR_IS_INVALID;
	}
}
/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
 * some req segments uninitialized.
 */
static void
rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
{
	if (*mw) {
		list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
		*mw = NULL;
	}
}

/* Cycle mw's back in reverse order, and "spin" them.
 * This delays and scrambles reuse as much as possible.
 */
static void
rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
{
	struct rpcrdma_mr_seg *seg = req->rl_segments;
	struct rpcrdma_mr_seg *seg1 = seg;
	int i;

	for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
		rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
	rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
}
static void
rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
{
	buf->rb_send_bufs[--buf->rb_send_index] = req;
	if (req->rl_reply) {
		buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
		req->rl_reply->rr_func = NULL;
		req->rl_reply = NULL;
	}
}
/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
 * Redo only the ib_post_send().
 */
static void
rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
{
	struct rpcrdma_xprt *r_xprt =
				container_of(ia, struct rpcrdma_xprt, rx_ia);
	struct ib_send_wr invalidate_wr, *bad_wr;
	int rc;

	dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);

	/* When this FRMR is re-inserted into rb_mws, it is no longer stale */
	r->r.frmr.fr_state = FRMR_IS_INVALID;

	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
	invalidate_wr.wr_id = (unsigned long)(void *)r;
	invalidate_wr.opcode = IB_WR_LOCAL_INV;
	invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
		__func__, r, r->r.frmr.fr_mr->rkey);

	read_lock(&ia->ri_qplock);
	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
	read_unlock(&ia->ri_qplock);
	if (rc) {
		/* Force rpcrdma_buffer_get() to retry */
		r->r.frmr.fr_state = FRMR_IS_STALE;
		dprintk("RPC: %s: ib_post_send failed, %i\n",
			__func__, rc);
	}
}
static void
rpcrdma_retry_flushed_linv(struct list_head *stale,
			   struct rpcrdma_buffer *buf)
{
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	struct list_head *pos;
	struct rpcrdma_mw *r;
	unsigned long flags;

	list_for_each(pos, stale) {
		r = list_entry(pos, struct rpcrdma_mw, mw_list);
		rpcrdma_retry_local_inv(r, ia);
	}

	spin_lock_irqsave(&buf->rb_lock, flags);
	list_splice_tail(stale, &buf->rb_mws);
	spin_unlock_irqrestore(&buf->rb_lock, flags);
}
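
/* Pull FRMRs off rb_mws and attach them to the req's segments,
 * starting at the tail of rl_segments. FRMRs found in the STALE
 * state are diverted onto the caller's "stale" list for recovery.
 */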
static struct rpcrdma_req *
rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
			 struct list_head *stale)
{
	struct rpcrdma_mw *r;
	int i;

	i = RPCRDMA_MAX_SEGS - 1;
	while (!list_empty(&buf->rb_mws)) {
		r = list_entry(buf->rb_mws.next,
			       struct rpcrdma_mw, mw_list);
		list_del(&r->mw_list);
		if (r->r.frmr.fr_state == FRMR_IS_STALE) {
			list_add(&r->mw_list, stale);
			continue;
		}
		req->rl_segments[i].rl_mw = r;
		if (unlikely(i-- == 0))
			return req;	/* Success */
	}

	/* Not enough entries on rb_mws for this req */
	rpcrdma_buffer_put_sendbuf(req, buf);
	rpcrdma_buffer_put_mrs(req, buf);
	return NULL;
}
static struct rpcrdma_req *
rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
{
	struct rpcrdma_mw *r;
	int i;

	i = RPCRDMA_MAX_SEGS - 1;
	while (!list_empty(&buf->rb_mws)) {
		r = list_entry(buf->rb_mws.next,
			       struct rpcrdma_mw, mw_list);
		list_del(&r->mw_list);
		req->rl_segments[i].rl_mw = r;
		if (unlikely(i-- == 0))
			return req;	/* Success */
	}

	/* Not enough entries on rb_mws for this req */
	rpcrdma_buffer_put_sendbuf(req, buf);
	rpcrdma_buffer_put_mrs(req, buf);
	return NULL;
}
/*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if needed) is attached to send buffer upon return.
 *
 * rb_send_index and rb_recv_index MUST always be pointing to the
 * *next* available buffer (non-NULL). They are incremented after
 * removing buffers, and decremented *before* returning them.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
	struct list_head stale;
	struct rpcrdma_req *req;
	unsigned long flags;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_send_index == buffers->rb_max_requests) {
		spin_unlock_irqrestore(&buffers->rb_lock, flags);
		dprintk("RPC: %s: out of request buffers\n", __func__);
		return ((struct rpcrdma_req *)NULL);
	}

	req = buffers->rb_send_bufs[buffers->rb_send_index];
	if (buffers->rb_send_index < buffers->rb_recv_index) {
		dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
			__func__,
			buffers->rb_recv_index - buffers->rb_send_index);
		req->rl_reply = NULL;
	} else {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;

	INIT_LIST_HEAD(&stale);
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
		break;
	case RPCRDMA_MTHCAFMR:
		req = rpcrdma_buffer_get_fmrs(req, buffers);
		break;
	default:
		break;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	if (!list_empty(&stale))
		rpcrdma_retry_flushed_linv(&stale, buffers);
	return req;
}
/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
	unsigned long flags;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	rpcrdma_buffer_put_sendbuf(req, buffers);
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
	case RPCRDMA_MTHCAFMR:
		rpcrdma_buffer_put_mrs(req, buffers);
		break;
	default:
		break;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}
/*
 * Recover reply buffers from pool.
 * This happens when recovering from error conditions.
 * Post-increment counter/array index.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_recv_index < buffers->rb_max_requests) {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}
/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = rep->rr_buffer;
	unsigned long flags;

	rep->rr_func = NULL;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}
/*
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */

static int
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
			  struct ib_mr **mrp, struct ib_sge *iov)
{
	struct ib_phys_buf ipb;
	struct ib_mr *mr;
	int rc;

	/*
	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
	 */
	iov->addr = ib_dma_map_single(ia->ri_id->device,
			va, len, DMA_BIDIRECTIONAL);
	if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
		return -ENOMEM;

	iov->length = len;

	if (ia->ri_have_dma_lkey) {
		*mrp = NULL;
		iov->lkey = ia->ri_dma_lkey;
		return 0;
	} else if (ia->ri_bind_mem != NULL) {
		*mrp = NULL;
		iov->lkey = ia->ri_bind_mem->lkey;
		return 0;
	}

	ipb.addr = iov->addr;
	ipb.size = iov->length;
	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
			IB_ACCESS_LOCAL_WRITE, &iov->addr);

	dprintk("RPC: %s: phys convert: 0x%llx "
			"registered 0x%llx length %d\n",
			__func__, (unsigned long long)ipb.addr,
			(unsigned long long)iov->addr, len);

	if (IS_ERR(mr)) {
		*mrp = NULL;
		rc = PTR_ERR(mr);
		dprintk("RPC: %s: failed with %i\n", __func__, rc);
	} else {
		*mrp = mr;
		iov->lkey = mr->lkey;
		rc = 0;
	}

	return rc;
}
static int
rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
			    struct ib_mr *mr, struct ib_sge *iov)
{
	int rc;

	ib_dma_unmap_single(ia->ri_id->device,
			iov->addr, iov->length, DMA_BIDIRECTIONAL);

	if (NULL == mr)
		return 0;

	rc = ib_dereg_mr(mr);
	if (rc)
		dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
	return rc;
}
/**
 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
 * @ia: controlling rpcrdma_ia
 * @size: size of buffer to be allocated, in bytes
 * @flags: GFP flags
 *
 * Returns pointer to private header of an area of internally
 * registered memory, or an ERR_PTR. The registered buffer follows
 * the end of the private header.
 *
 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
 * receiving the payload of RDMA RECV operations. regbufs are not
 * used for RDMA READ/WRITE operations, thus are registered only for
 * LOCAL access.
 */
struct rpcrdma_regbuf *
rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
{
	struct rpcrdma_regbuf *rb;
	int rc;

	rc = -ENOMEM;
	rb = kmalloc(sizeof(*rb) + size, flags);
	if (rb == NULL)
		goto out;

	rb->rg_size = size;
	rb->rg_owner = NULL;
	rc = rpcrdma_register_internal(ia, rb->rg_base, size,
				       &rb->rg_mr, &rb->rg_iov);
	if (rc)
		goto out_free;

	return rb;

out_free:
	kfree(rb);
out:
	return ERR_PTR(rc);
}
/**
 * rpcrdma_free_regbuf - deregister and free registered buffer
 * @ia: controlling rpcrdma_ia
 * @rb: regbuf to be deregistered and freed
 */
void
rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
	if (rb) {
		rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
		kfree(rb);
	}
}
/*
 * Wrappers for chunk registration, shared by read/write chunk code.
 */

static void
rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
{
	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	seg->mr_dmalen = seg->mr_len;
	if (seg->mr_page)
		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
				seg->mr_page, offset_in_page(seg->mr_offset),
				seg->mr_dmalen, seg->mr_dir);
	else
		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
				seg->mr_offset,
				seg->mr_dmalen, seg->mr_dir);
	if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
		dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
			__func__,
			(unsigned long long)seg->mr_dma,
			seg->mr_offset, seg->mr_dmalen);
	}
}

static void
rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
{
	if (seg->mr_page)
		ib_dma_unmap_page(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
	else
		ib_dma_unmap_single(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
}
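
/* Gather physically contiguous pages (up to ri_max_frmr_depth) into
 * the FRMR's page list, bump the MR key to generate a fresh rkey, and
 * post a FAST_REG_MR work request to register the mapping. Mapping
 * stops early at the first page-alignment hole.
 */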
static int
rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct rpcrdma_mw *mw = seg1->rl_mw;
	struct rpcrdma_frmr *frmr = &mw->r.frmr;
	struct ib_mr *mr = frmr->fr_mr;
	struct ib_send_wr fastreg_wr, *bad_wr;
	u8 key;
	int len, pageoff;
	int i, rc;
	int seg_len;
	u64 pa;
	int page_no;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > ia->ri_max_frmr_depth)
		*nsegs = ia->ri_max_frmr_depth;
	for (page_no = i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		pa = seg->mr_dma;
		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
			frmr->fr_pgl->page_list[page_no++] = pa;
			pa += PAGE_SIZE;
		}
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n",
		__func__, mw, i, len);

	frmr->fr_state = FRMR_IS_VALID;

	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
	fastreg_wr.wr_id = (unsigned long)(void *)mw;
	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
	fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
	fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
	fastreg_wr.wr.fast_reg.page_list_len = page_no;
	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
	fastreg_wr.wr.fast_reg.length = len;

	/* Bump the key */
	key = (u8)(mr->rkey & 0x000000FF);
	ib_update_fast_reg_key(mr, ++key);

	fastreg_wr.wr.fast_reg.access_flags = (writing ?
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
				IB_ACCESS_REMOTE_READ);
	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
	if (rc) {
		dprintk("RPC: %s: failed ib_post_send for register,"
			" status %i\n", __func__, rc);
		ib_update_fast_reg_key(mr, --key);
		goto out_err;
	} else {
		seg1->mr_rkey = mr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return 0;
out_err:
	frmr->fr_state = FRMR_IS_INVALID;
	while (i--)
		rpcrdma_unmap_one(ia, --seg);
	return rc;
}
static int
rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr invalidate_wr, *bad_wr;
	int rc;

	seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;

	memset(&invalidate_wr, 0, sizeof invalidate_wr);
	invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
	invalidate_wr.opcode = IB_WR_LOCAL_INV;
	invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	read_lock(&ia->ri_qplock);
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
	read_unlock(&ia->ri_qplock);
	if (rc) {
		/* Force rpcrdma_buffer_get() to retry */
		seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
		dprintk("RPC: %s: failed ib_post_send for invalidate,"
			" status %i\n", __func__, rc);
	}
	return rc;
}
static int
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
	int len, pageoff, i, rc;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		physaddrs[i] = seg->mr_dma;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
	if (rc) {
		dprintk("RPC: %s: failed ib_map_phys_fmr "
			"%u@0x%llx+%i (%d)... status %i\n", __func__,
			len, (unsigned long long)seg1->mr_dma,
			pageoff, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}
static int
rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	LIST_HEAD(l);
	int rc;

	list_add(&seg1->rl_mw->r.fmr->list, &l);
	rc = ib_unmap_fmr(&l);
	read_lock(&ia->ri_qplock);
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	read_unlock(&ia->ri_qplock);
	if (rc)
		dprintk("RPC: %s: failed ib_unmap_fmr,"
			" status %i\n", __func__, rc);
	return rc;
}
int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int rc = 0;

	switch (ia->ri_memreg_strategy) {

	case RPCRDMA_ALLPHYSICAL:
		rpcrdma_map_one(ia, seg, writing);
		seg->mr_rkey = ia->ri_bind_mem->rkey;
		seg->mr_base = seg->mr_dma;
		seg->mr_nsegs = 1;
		nsegs = 1;
		break;

	/* Registration using frmr registration */
	case RPCRDMA_FRMR:
		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing,
						    ia, r_xprt);
		break;

	/* Registration using fmr memory registration */
	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
		break;

	default:
		return -EIO;
	}
	if (rc)
		return rc;

	return nsegs;
}
int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int nsegs = seg->mr_nsegs, rc;

	switch (ia->ri_memreg_strategy) {

	case RPCRDMA_ALLPHYSICAL:
		read_lock(&ia->ri_qplock);
		rpcrdma_unmap_one(ia, seg);
		read_unlock(&ia->ri_qplock);
		break;

	case RPCRDMA_FRMR:
		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
		break;

	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_deregister_fmr_external(seg, ia);
		break;

	default:
		break;
	}
	return nsegs;
}
/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = 0ULL;	/* no send cookie */
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
		ib_dma_sync_single_for_device(ia->ri_id->device,
			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
			DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
		DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
		DMA_TO_DEVICE);

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}
/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_id = (u64) (unsigned long) rep;
	recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_id->device,
				   rdmab_addr(rep->rr_rdmabuf),
				   rdmab_length(rep->rr_rdmabuf),
				   DMA_BIDIRECTIONAL);

	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
			rc);
	return rc;
}
/* Physical mapping means one Read/Write list entry per-page.
 * All list entries must fit within an inline buffer.
 *
 * NB: The server must return a Write list for NFS READ,
 *     which has the same constraint. Factor in the inline
 *     rsize as well.
 */
static size_t
rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	unsigned int inline_size, pages;

	inline_size = min_t(unsigned int,
			    cdata->inline_wsize, cdata->inline_rsize);
	inline_size -= RPCRDMA_HDRLEN_MIN;
	pages = inline_size / sizeof(struct rpcrdma_segment);
	return pages << PAGE_SHIFT;
}

static size_t
rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
{
	return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
}

size_t
rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
{
	size_t result;

	switch (r_xprt->rx_ia.ri_memreg_strategy) {
	case RPCRDMA_ALLPHYSICAL:
		result = rpcrdma_physical_max_payload(r_xprt);
		break;
	default:
		result = rpcrdma_mr_max_payload(r_xprt);
	}
	return result;
}
);