xprtrdma: Cap req_cqinit
[deliverable/linux.git] / net/sunrpc/xprtrdma/verbs.c
f58851e6 1/*
c56c65fb
TT
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
f58851e6
TT
38 */
39
c56c65fb
TT
40/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
a6b7a407 50#include <linux/interrupt.h>
5a0e3ad6 51#include <linux/slab.h>
65866f82 52#include <asm/bitops.h>
c56c65fb 53
f58851e6
TT
54#include "xprt_rdma.h"
55
c56c65fb
TT
56/*
57 * Globals/Macros
58 */
59
60#ifdef RPC_DEBUG
61# define RPCDBG_FACILITY RPCDBG_TRANS
62#endif
63
9f9d802a
CL
64static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
65
c56c65fb
TT
66/*
67 * internal functions
68 */
69
70/*
 71 * Handle replies in tasklet context, using a single, global list.
 72 * The rdma tasklet function just turns around and calls the reply
 73 * handler for each reply on the list.
74 */
75
76static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
77static LIST_HEAD(rpcrdma_tasklets_g);
78
79static void
80rpcrdma_run_tasklet(unsigned long data)
81{
82 struct rpcrdma_rep *rep;
83 void (*func)(struct rpcrdma_rep *);
84 unsigned long flags;
85
86 data = data;
87 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
88 while (!list_empty(&rpcrdma_tasklets_g)) {
89 rep = list_entry(rpcrdma_tasklets_g.next,
90 struct rpcrdma_rep, rr_list);
91 list_del(&rep->rr_list);
92 func = rep->rr_func;
93 rep->rr_func = NULL;
94 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
95
96 if (func)
97 func(rep);
98 else
99 rpcrdma_recv_buffer_put(rep);
100
101 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
102 }
103 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
104}
105
106static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
107
c56c65fb
TT
108static void
109rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
110{
111 struct rpcrdma_ep *ep = context;
112
113 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
114 __func__, event->event, event->device->name, context);
115 if (ep->rep_connected == 1) {
116 ep->rep_connected = -EIO;
117 ep->rep_func(ep);
118 wake_up_all(&ep->rep_connect_wait);
119 }
120}
121
122static void
123rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
124{
125 struct rpcrdma_ep *ep = context;
126
127 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
128 __func__, event->event, event->device->name, context);
129 if (ep->rep_connected == 1) {
130 ep->rep_connected = -EIO;
131 ep->rep_func(ep);
132 wake_up_all(&ep->rep_connect_wait);
133 }
134}
135
fc664485
CL
136static void
137rpcrdma_sendcq_process_wc(struct ib_wc *wc)
c56c65fb 138{
fc664485 139 struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
c56c65fb 140
fc664485
CL
141 dprintk("RPC: %s: frmr %p status %X opcode %d\n",
142 __func__, frmr, wc->status, wc->opcode);
c56c65fb 143
fc664485 144 if (wc->wr_id == 0ULL)
c56c65fb 145 return;
dab7e3b8 146 if (wc->status != IB_WC_SUCCESS)
9f9d802a 147 frmr->r.frmr.fr_state = FRMR_IS_STALE;
c56c65fb
TT
148}
149
fc664485 150static int
1c00dd07 151rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
c56c65fb 152{
1c00dd07 153 struct ib_wc *wcs;
8301a2c0 154 int budget, count, rc;
c56c65fb 155
8301a2c0 156 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
1c00dd07
CL
157 do {
158 wcs = ep->rep_send_wcs;
159
160 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
161 if (rc <= 0)
162 return rc;
163
164 count = rc;
165 while (count-- > 0)
166 rpcrdma_sendcq_process_wc(wcs++);
8301a2c0 167 } while (rc == RPCRDMA_POLLSIZE && --budget);
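	/* Worked example (constants illustrative, not checked against this
	 * tree): with RPCRDMA_WC_BUDGET = 1024 and RPCRDMA_POLLSIZE = 16,
	 * at most 64 batches of 16 completions are reaped per upcall,
	 * bounding the time spent in a single completion handler.
	 */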
1c00dd07 168 return 0;
fc664485 169}
c56c65fb 170
fc664485
CL
171/*
172 * Handle send, fast_reg_mr, and local_inv completions.
173 *
174 * Send events are typically suppressed and thus do not result
175 * in an upcall. Occasionally one is signaled, however. This
176 * prevents the provider's completion queue from wrapping and
177 * losing a completion.
178 */
179static void
180rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
181{
1c00dd07 182 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
fc664485
CL
183 int rc;
184
1c00dd07 185 rc = rpcrdma_sendcq_poll(cq, ep);
fc664485
CL
186 if (rc) {
187 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
188 __func__, rc);
189 return;
c56c65fb
TT
190 }
191
7f23f6f6
CL
192 rc = ib_req_notify_cq(cq,
193 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
194 if (rc == 0)
195 return;
196 if (rc < 0) {
fc664485
CL
197 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
198 __func__, rc);
199 return;
200 }
201
1c00dd07 202 rpcrdma_sendcq_poll(cq, ep);
fc664485
CL
203}
204
205static void
bb96193d 206rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
fc664485
CL
207{
208 struct rpcrdma_rep *rep =
209 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
210
211 dprintk("RPC: %s: rep %p status %X opcode %X length %u\n",
212 __func__, rep, wc->status, wc->opcode, wc->byte_len);
213
214 if (wc->status != IB_WC_SUCCESS) {
215 rep->rr_len = ~0U;
216 goto out_schedule;
217 }
218 if (wc->opcode != IB_WC_RECV)
219 return;
220
221 rep->rr_len = wc->byte_len;
222 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
223 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
224
225 if (rep->rr_len >= 16) {
226 struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
227 unsigned int credits = ntohl(p->rm_credit);
228
229 if (credits == 0)
230 credits = 1; /* don't deadlock */
231 else if (credits > rep->rr_buffer->rb_max_requests)
232 credits = rep->rr_buffer->rb_max_requests;
233 atomic_set(&rep->rr_buffer->rb_credits, credits);
234 }
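	/* rm_credit is the server's credit grant from the RPC/RDMA header;
	 * because the value comes straight off the wire it is clamped above
	 * to [1, rb_max_requests] so a zero or runaway grant cannot stall
	 * or overrun the transport.
	 */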
235
236out_schedule:
bb96193d 237 list_add_tail(&rep->rr_list, sched_list);
fc664485
CL
238}
239
240static int
1c00dd07 241rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
fc664485 242{
bb96193d 243 struct list_head sched_list;
1c00dd07 244 struct ib_wc *wcs;
8301a2c0 245 int budget, count, rc;
bb96193d 246 unsigned long flags;
fc664485 247
bb96193d 248 INIT_LIST_HEAD(&sched_list);
8301a2c0 249 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
1c00dd07
CL
250 do {
251 wcs = ep->rep_recv_wcs;
252
253 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
254 if (rc <= 0)
bb96193d 255 goto out_schedule;
1c00dd07
CL
256
257 count = rc;
258 while (count-- > 0)
bb96193d 259 rpcrdma_recvcq_process_wc(wcs++, &sched_list);
8301a2c0 260 } while (rc == RPCRDMA_POLLSIZE && --budget);
bb96193d
CL
261 rc = 0;
262
263out_schedule:
264 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
265 list_splice_tail(&sched_list, &rpcrdma_tasklets_g);
266 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
267 tasklet_schedule(&rpcrdma_tasklet_g);
268 return rc;
c56c65fb
TT
269}
270
271/*
fc664485 272 * Handle receive completions.
c56c65fb 273 *
c56c65fb
TT
274 * It is reentrant but processes single events in order to maintain
275 * ordering of receives to keep server credits.
276 *
277 * It is the responsibility of the scheduled tasklet to return
278 * recv buffers to the pool. NOTE: this affects synchronization of
279 * connection shutdown. That is, the structures required for
280 * the completion of the reply handler must remain intact until
281 * all memory has been reclaimed.
c56c65fb
TT
282 */
283static void
fc664485 284rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
c56c65fb 285{
1c00dd07 286 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
c56c65fb
TT
287 int rc;
288
1c00dd07 289 rc = rpcrdma_recvcq_poll(cq, ep);
fc664485
CL
290 if (rc) {
291 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
292 __func__, rc);
c56c65fb 293 return;
fc664485 294 }
c56c65fb 295
7f23f6f6
CL
296 rc = ib_req_notify_cq(cq,
297 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
298 if (rc == 0)
299 return;
300 if (rc < 0) {
fc664485 301 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
c56c65fb
TT
302 __func__, rc);
303 return;
304 }
305
1c00dd07 306 rpcrdma_recvcq_poll(cq, ep);
c56c65fb
TT
307}
308
a7bc211a
CL
309static void
310rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
311{
312 rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep);
313 rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
314}
315
c56c65fb
TT
316#ifdef RPC_DEBUG
317static const char * const conn[] = {
318 "address resolved",
319 "address error",
320 "route resolved",
321 "route error",
322 "connect request",
323 "connect response",
324 "connect error",
325 "unreachable",
326 "rejected",
327 "established",
328 "disconnected",
8079fb78
CL
329 "device removal",
330 "multicast join",
331 "multicast error",
332 "address change",
333 "timewait exit",
c56c65fb 334};
8079fb78
CL
335
336#define CONNECTION_MSG(status) \
337 ((status) < ARRAY_SIZE(conn) ? \
338 conn[(status)] : "unrecognized connection error")
c56c65fb
TT
339#endif
340
341static int
342rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
343{
344 struct rpcrdma_xprt *xprt = id->context;
345 struct rpcrdma_ia *ia = &xprt->rx_ia;
346 struct rpcrdma_ep *ep = &xprt->rx_ep;
ff0db049 347#ifdef RPC_DEBUG
c56c65fb 348 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
ff0db049 349#endif
c56c65fb
TT
350 struct ib_qp_attr attr;
351 struct ib_qp_init_attr iattr;
352 int connstate = 0;
353
354 switch (event->event) {
355 case RDMA_CM_EVENT_ADDR_RESOLVED:
356 case RDMA_CM_EVENT_ROUTE_RESOLVED:
5675add3 357 ia->ri_async_rc = 0;
c56c65fb
TT
358 complete(&ia->ri_done);
359 break;
360 case RDMA_CM_EVENT_ADDR_ERROR:
361 ia->ri_async_rc = -EHOSTUNREACH;
362 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
363 __func__, ep);
364 complete(&ia->ri_done);
365 break;
366 case RDMA_CM_EVENT_ROUTE_ERROR:
367 ia->ri_async_rc = -ENETUNREACH;
368 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
369 __func__, ep);
370 complete(&ia->ri_done);
371 break;
372 case RDMA_CM_EVENT_ESTABLISHED:
373 connstate = 1;
374 ib_query_qp(ia->ri_id->qp, &attr,
375 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
376 &iattr);
377 dprintk("RPC: %s: %d responder resources"
378 " (%d initiator)\n",
379 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
380 goto connected;
381 case RDMA_CM_EVENT_CONNECT_ERROR:
382 connstate = -ENOTCONN;
383 goto connected;
384 case RDMA_CM_EVENT_UNREACHABLE:
385 connstate = -ENETDOWN;
386 goto connected;
387 case RDMA_CM_EVENT_REJECTED:
388 connstate = -ECONNREFUSED;
389 goto connected;
390 case RDMA_CM_EVENT_DISCONNECTED:
391 connstate = -ECONNABORTED;
392 goto connected;
393 case RDMA_CM_EVENT_DEVICE_REMOVAL:
394 connstate = -ENODEV;
395connected:
c56c65fb
TT
396 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
397 dprintk("RPC: %s: %sconnected\n",
398 __func__, connstate > 0 ? "" : "dis");
399 ep->rep_connected = connstate;
400 ep->rep_func(ep);
401 wake_up_all(&ep->rep_connect_wait);
8079fb78 402 /*FALLTHROUGH*/
c56c65fb 403 default:
8079fb78
CL
404 dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n",
405 __func__, &addr->sin_addr.s_addr,
406 ntohs(addr->sin_port), ep,
407 CONNECTION_MSG(event->event));
c56c65fb
TT
408 break;
409 }
410
b3cd8d45
TT
411#ifdef RPC_DEBUG
412 if (connstate == 1) {
413 int ird = attr.max_dest_rd_atomic;
414 int tird = ep->rep_remote_cma.responder_resources;
21454aaa 415 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
b3cd8d45 416 "on %s, memreg %d slots %d ird %d%s\n",
21454aaa 417 &addr->sin_addr.s_addr,
b3cd8d45
TT
418 ntohs(addr->sin_port),
419 ia->ri_id->device->name,
420 ia->ri_memreg_strategy,
421 xprt->rx_buf.rb_max_requests,
422 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
423 } else if (connstate < 0) {
21454aaa
HH
424 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
425 &addr->sin_addr.s_addr,
b3cd8d45
TT
426 ntohs(addr->sin_port),
427 connstate);
428 }
429#endif
430
c56c65fb
TT
431 return 0;
432}
433
434static struct rdma_cm_id *
435rpcrdma_create_id(struct rpcrdma_xprt *xprt,
436 struct rpcrdma_ia *ia, struct sockaddr *addr)
437{
438 struct rdma_cm_id *id;
439 int rc;
440
1a954051
TT
441 init_completion(&ia->ri_done);
442
b26f9b99 443 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
c56c65fb
TT
444 if (IS_ERR(id)) {
445 rc = PTR_ERR(id);
446 dprintk("RPC: %s: rdma_create_id() failed %i\n",
447 __func__, rc);
448 return id;
449 }
450
5675add3 451 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb
TT
452 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
453 if (rc) {
454 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
455 __func__, rc);
456 goto out;
457 }
5675add3
TT
458 wait_for_completion_interruptible_timeout(&ia->ri_done,
459 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
c56c65fb
TT
460 rc = ia->ri_async_rc;
461 if (rc)
462 goto out;
463
5675add3 464 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb
TT
465 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
466 if (rc) {
467 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
468 __func__, rc);
469 goto out;
470 }
5675add3
TT
471 wait_for_completion_interruptible_timeout(&ia->ri_done,
472 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
c56c65fb
TT
473 rc = ia->ri_async_rc;
474 if (rc)
475 goto out;
476
477 return id;
478
479out:
480 rdma_destroy_id(id);
481 return ERR_PTR(rc);
482}
483
484/*
485 * Drain any cq, prior to teardown.
486 */
487static void
488rpcrdma_clean_cq(struct ib_cq *cq)
489{
490 struct ib_wc wc;
491 int count = 0;
492
493 while (1 == ib_poll_cq(cq, 1, &wc))
494 ++count;
495
496 if (count)
497 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
498 __func__, count, wc.opcode);
499}
500
501/*
502 * Exported functions.
503 */
504
505/*
506 * Open and initialize an Interface Adapter.
507 * o initializes fields of struct rpcrdma_ia, including
508 * interface and provider attributes and protection zone.
509 */
510int
511rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
512{
bd7ed1d1
TT
513 int rc, mem_priv;
514 struct ib_device_attr devattr;
c56c65fb
TT
515 struct rpcrdma_ia *ia = &xprt->rx_ia;
516
c56c65fb
TT
517 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
518 if (IS_ERR(ia->ri_id)) {
519 rc = PTR_ERR(ia->ri_id);
520 goto out1;
521 }
522
523 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
524 if (IS_ERR(ia->ri_pd)) {
525 rc = PTR_ERR(ia->ri_pd);
526 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
527 __func__, rc);
528 goto out2;
529 }
530
bd7ed1d1
TT
531 /*
532 * Query the device to determine if the requested memory
533 * registration strategy is supported. If it isn't, set the
534 * strategy to a globally supported model.
535 */
536 rc = ib_query_device(ia->ri_id->device, &devattr);
537 if (rc) {
538 dprintk("RPC: %s: ib_query_device failed %d\n",
539 __func__, rc);
540 goto out2;
541 }
542
543 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
544 ia->ri_have_dma_lkey = 1;
545 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
546 }
547
f10eafd3 548 if (memreg == RPCRDMA_FRMR) {
3197d309
TT
549 /* Requires both frmr reg and local dma lkey */
550 if ((devattr.device_cap_flags &
551 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
552 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
3197d309 553 dprintk("RPC: %s: FRMR registration "
f10eafd3
CL
554 "not supported by HCA\n", __func__);
555 memreg = RPCRDMA_MTHCAFMR;
0fc6c4e7
SW
556 } else {
557 /* Mind the ia limit on FRMR page list depth */
558 ia->ri_max_frmr_depth = min_t(unsigned int,
559 RPCRDMA_MAX_DATA_SEGS,
560 devattr.max_fast_reg_page_list_len);
bd7ed1d1 561 }
f10eafd3
CL
562 }
563 if (memreg == RPCRDMA_MTHCAFMR) {
564 if (!ia->ri_id->device->alloc_fmr) {
565 dprintk("RPC: %s: MTHCAFMR registration "
566 "not supported by HCA\n", __func__);
f10eafd3 567 memreg = RPCRDMA_ALLPHYSICAL;
f10eafd3 568 }
bd7ed1d1
TT
569 }
570
c56c65fb
TT
571 /*
572 * Optionally obtain an underlying physical identity mapping in
573 * order to do a memory window-based bind. This base registration
574 * is protected from remote access - that is enabled only by binding
575 * for the specific bytes targeted during each RPC operation, and
576 * revoked after the corresponding completion similar to a storage
577 * adapter.
578 */
bd7ed1d1 579 switch (memreg) {
3197d309 580 case RPCRDMA_FRMR:
bd7ed1d1 581 break;
bd7ed1d1
TT
582 case RPCRDMA_ALLPHYSICAL:
583 mem_priv = IB_ACCESS_LOCAL_WRITE |
584 IB_ACCESS_REMOTE_WRITE |
585 IB_ACCESS_REMOTE_READ;
586 goto register_setup;
bd7ed1d1
TT
587 case RPCRDMA_MTHCAFMR:
588 if (ia->ri_have_dma_lkey)
c56c65fb 589 break;
bd7ed1d1
TT
590 mem_priv = IB_ACCESS_LOCAL_WRITE;
591 register_setup:
c56c65fb
TT
592 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
593 if (IS_ERR(ia->ri_bind_mem)) {
594 printk(KERN_ALERT "%s: ib_get_dma_mr for "
0ac531c1 595 "phys register failed with %lX\n",
c56c65fb 596 __func__, PTR_ERR(ia->ri_bind_mem));
0ac531c1
CL
597 rc = -ENOMEM;
598 goto out2;
c56c65fb 599 }
bd7ed1d1
TT
600 break;
601 default:
cdd9ade7
CL
602 printk(KERN_ERR "RPC: Unsupported memory "
603 "registration mode: %d\n", memreg);
604 rc = -ENOMEM;
bd7ed1d1 605 goto out2;
c56c65fb 606 }
bd7ed1d1
TT
607 dprintk("RPC: %s: memory registration strategy is %d\n",
608 __func__, memreg);
c56c65fb
TT
609
610 /* Else will do memory reg/dereg for each chunk */
611 ia->ri_memreg_strategy = memreg;
612
73806c88 613 rwlock_init(&ia->ri_qplock);
c56c65fb
TT
614 return 0;
615out2:
616 rdma_destroy_id(ia->ri_id);
fee08caf 617 ia->ri_id = NULL;
c56c65fb
TT
618out1:
619 return rc;
620}
621
622/*
623 * Clean up/close an IA.
624 * o if event handles and PD have been initialized, free them.
625 * o close the IA
626 */
627void
628rpcrdma_ia_close(struct rpcrdma_ia *ia)
629{
630 int rc;
631
632 dprintk("RPC: %s: entering\n", __func__);
633 if (ia->ri_bind_mem != NULL) {
634 rc = ib_dereg_mr(ia->ri_bind_mem);
635 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
636 __func__, rc);
637 }
fee08caf
TT
638 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
639 if (ia->ri_id->qp)
640 rdma_destroy_qp(ia->ri_id);
641 rdma_destroy_id(ia->ri_id);
642 ia->ri_id = NULL;
643 }
c56c65fb
TT
644 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
645 rc = ib_dealloc_pd(ia->ri_pd);
646 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
647 __func__, rc);
648 }
c56c65fb
TT
649}
650
651/*
652 * Create unconnected endpoint.
653 */
654int
655rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
656 struct rpcrdma_create_data_internal *cdata)
657{
658 struct ib_device_attr devattr;
fc664485 659 struct ib_cq *sendcq, *recvcq;
5d40a8a5 660 int rc, err;
c56c65fb
TT
661
662 rc = ib_query_device(ia->ri_id->device, &devattr);
663 if (rc) {
664 dprintk("RPC: %s: ib_query_device failed %d\n",
665 __func__, rc);
666 return rc;
667 }
668
669 /* check provider's send/recv wr limits */
670 if (cdata->max_requests > devattr.max_qp_wr)
671 cdata->max_requests = devattr.max_qp_wr;
672
673 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
674 ep->rep_attr.qp_context = ep;
675 /* send_cq and recv_cq initialized below */
676 ep->rep_attr.srq = NULL;
677 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
678 switch (ia->ri_memreg_strategy) {
0fc6c4e7
SW
679 case RPCRDMA_FRMR: {
680 int depth = 7;
681
15cdc644
TT
682 /* Add room for frmr register and invalidate WRs.
683 * 1. FRMR reg WR for head
684 * 2. FRMR invalidate WR for head
0fc6c4e7
SW
685 * 3. N FRMR reg WRs for pagelist
686 * 4. N FRMR invalidate WRs for pagelist
15cdc644
TT
687 * 5. FRMR reg WR for tail
688 * 6. FRMR invalidate WR for tail
689 * 7. The RDMA_SEND WR
690 */
0fc6c4e7
SW
691
692 /* Calculate N if the device max FRMR depth is smaller than
693 * RPCRDMA_MAX_DATA_SEGS.
694 */
695 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
696 int delta = RPCRDMA_MAX_DATA_SEGS -
697 ia->ri_max_frmr_depth;
698
699 do {
700 depth += 2; /* FRMR reg + invalidate */
701 delta -= ia->ri_max_frmr_depth;
702 } while (delta > 0);
703
704 }
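		/* Worked example (segment counts illustrative only): if
		 * RPCRDMA_MAX_DATA_SEGS were 64 and the device reported a
		 * maximum FRMR depth of 16, delta would start at 48 and the
		 * loop above would run three times, giving depth = 13 WRs
		 * per RPC before the scaling below.
		 */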
705 ep->rep_attr.cap.max_send_wr *= depth;
15cdc644 706 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
0fc6c4e7 707 cdata->max_requests = devattr.max_qp_wr / depth;
15cdc644
TT
708 if (!cdata->max_requests)
709 return -EINVAL;
0fc6c4e7
SW
710 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
711 depth;
15cdc644 712 }
3197d309 713 break;
0fc6c4e7 714 }
c56c65fb
TT
715 default:
716 break;
717 }
718 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
719 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
720 ep->rep_attr.cap.max_recv_sge = 1;
721 ep->rep_attr.cap.max_inline_data = 0;
722 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
723 ep->rep_attr.qp_type = IB_QPT_RC;
724 ep->rep_attr.port_num = ~0;
725
726 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
727 "iovs: send %d recv %d\n",
728 __func__,
729 ep->rep_attr.cap.max_send_wr,
730 ep->rep_attr.cap.max_recv_wr,
731 ep->rep_attr.cap.max_send_sge,
732 ep->rep_attr.cap.max_recv_sge);
733
734 /* set trigger for requesting send completion */
fc664485 735 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
e7104a2a
CL
736 if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
737 ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
738 else if (ep->rep_cqinit <= 2)
c56c65fb
TT
739 ep->rep_cqinit = 0;
740 INIT_CQCOUNT(ep);
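	/* Illustration (numbers assumed): with max_send_wr = 1024 the
	 * midpoint heuristic alone would leave 511 sends unsignaled; the
	 * cap above (RPCRDMA_MAX_UNSIGNALED_SENDS) requests a completion
	 * often enough to keep the provider's send queue from wrapping or
	 * stalling while waiting for a CQE.
	 */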
741 ep->rep_ia = ia;
742 init_waitqueue_head(&ep->rep_connect_wait);
254f91e2 743 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
c56c65fb 744
fc664485 745 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
1c00dd07 746 rpcrdma_cq_async_error_upcall, ep,
c56c65fb 747 ep->rep_attr.cap.max_send_wr + 1, 0);
fc664485
CL
748 if (IS_ERR(sendcq)) {
749 rc = PTR_ERR(sendcq);
750 dprintk("RPC: %s: failed to create send CQ: %i\n",
c56c65fb
TT
751 __func__, rc);
752 goto out1;
753 }
754
fc664485 755 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
c56c65fb
TT
756 if (rc) {
757 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
758 __func__, rc);
759 goto out2;
760 }
761
fc664485 762 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
1c00dd07 763 rpcrdma_cq_async_error_upcall, ep,
fc664485
CL
764 ep->rep_attr.cap.max_recv_wr + 1, 0);
765 if (IS_ERR(recvcq)) {
766 rc = PTR_ERR(recvcq);
767 dprintk("RPC: %s: failed to create recv CQ: %i\n",
768 __func__, rc);
769 goto out2;
770 }
771
772 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
773 if (rc) {
774 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
775 __func__, rc);
776 ib_destroy_cq(recvcq);
777 goto out2;
778 }
779
780 ep->rep_attr.send_cq = sendcq;
781 ep->rep_attr.recv_cq = recvcq;
c56c65fb
TT
782
783 /* Initialize cma parameters */
784
785 /* RPC/RDMA does not use private data */
786 ep->rep_remote_cma.private_data = NULL;
787 ep->rep_remote_cma.private_data_len = 0;
788
789 /* Client offers RDMA Read but does not initiate */
b334eaab 790 ep->rep_remote_cma.initiator_depth = 0;
03ff8821 791 if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
b334eaab
TT
792 ep->rep_remote_cma.responder_resources = 32;
793 else
c56c65fb 794 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
c56c65fb
TT
795
796 ep->rep_remote_cma.retry_count = 7;
797 ep->rep_remote_cma.flow_control = 0;
798 ep->rep_remote_cma.rnr_retry_count = 0;
799
800 return 0;
801
802out2:
fc664485 803 err = ib_destroy_cq(sendcq);
5d40a8a5
CL
804 if (err)
805 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
806 __func__, err);
c56c65fb
TT
807out1:
808 return rc;
809}
810
811/*
812 * rpcrdma_ep_destroy
813 *
814 * Disconnect and destroy endpoint. After this, the only
815 * valid operations on the ep are to free it (if dynamically
816 * allocated) or re-create it.
c56c65fb 817 */
7f1d5419 818void
c56c65fb
TT
819rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
820{
821 int rc;
822
823 dprintk("RPC: %s: entering, connected is %d\n",
824 __func__, ep->rep_connected);
825
254f91e2
CL
826 cancel_delayed_work_sync(&ep->rep_connect_worker);
827
c56c65fb 828 if (ia->ri_id->qp) {
282191cb 829 rpcrdma_ep_disconnect(ep, ia);
fee08caf
TT
830 rdma_destroy_qp(ia->ri_id);
831 ia->ri_id->qp = NULL;
c56c65fb
TT
832 }
833
c56c65fb
TT
834 /* padding - could be done in rpcrdma_buffer_destroy... */
835 if (ep->rep_pad_mr) {
836 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
837 ep->rep_pad_mr = NULL;
838 }
839
fc664485
CL
840 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
841 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
842 if (rc)
843 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
844 __func__, rc);
845
846 rpcrdma_clean_cq(ep->rep_attr.send_cq);
847 rc = ib_destroy_cq(ep->rep_attr.send_cq);
c56c65fb
TT
848 if (rc)
849 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
850 __func__, rc);
c56c65fb
TT
851}
852
853/*
854 * Connect unconnected endpoint.
855 */
856int
857rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
858{
73806c88 859 struct rdma_cm_id *id, *old;
c56c65fb
TT
860 int rc = 0;
861 int retry_count = 0;
c56c65fb 862
c055551e 863 if (ep->rep_connected != 0) {
c56c65fb
TT
864 struct rpcrdma_xprt *xprt;
865retry:
ec62f40d 866 dprintk("RPC: %s: reconnecting...\n", __func__);
282191cb
CL
867
868 rpcrdma_ep_disconnect(ep, ia);
a7bc211a 869 rpcrdma_flush_cqs(ep);
c56c65fb 870
9f9d802a
CL
871 if (ia->ri_memreg_strategy == RPCRDMA_FRMR)
872 rpcrdma_reset_frmrs(ia);
873
c56c65fb
TT
874 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
875 id = rpcrdma_create_id(xprt, ia,
876 (struct sockaddr *)&xprt->rx_data.addr);
877 if (IS_ERR(id)) {
ec62f40d 878 rc = -EHOSTUNREACH;
c56c65fb
TT
879 goto out;
880 }
881 /* TEMP TEMP TEMP - fail if new device:
882 * Deregister/remarshal *all* requests!
883 * Close and recreate adapter, pd, etc!
884 * Re-determine all attributes still sane!
885 * More stuff I haven't thought of!
886 * Rrrgh!
887 */
888 if (ia->ri_id->device != id->device) {
889 printk("RPC: %s: can't reconnect on "
890 "different device!\n", __func__);
891 rdma_destroy_id(id);
ec62f40d 892 rc = -ENETUNREACH;
c56c65fb
TT
893 goto out;
894 }
895 /* END TEMP */
ec62f40d
CL
896 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
897 if (rc) {
898 dprintk("RPC: %s: rdma_create_qp failed %i\n",
899 __func__, rc);
900 rdma_destroy_id(id);
901 rc = -ENETUNREACH;
902 goto out;
903 }
73806c88
CL
904
905 write_lock(&ia->ri_qplock);
906 old = ia->ri_id;
c56c65fb 907 ia->ri_id = id;
73806c88
CL
908 write_unlock(&ia->ri_qplock);
909
910 rdma_destroy_qp(old);
911 rdma_destroy_id(old);
ec62f40d
CL
912 } else {
913 dprintk("RPC: %s: connecting...\n", __func__);
914 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
915 if (rc) {
916 dprintk("RPC: %s: rdma_create_qp failed %i\n",
917 __func__, rc);
918 /* do not update ep->rep_connected */
919 return -ENETUNREACH;
920 }
c56c65fb
TT
921 }
922
c56c65fb
TT
923 ep->rep_connected = 0;
924
925 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
926 if (rc) {
927 dprintk("RPC: %s: rdma_connect() failed with %i\n",
928 __func__, rc);
929 goto out;
930 }
931
c56c65fb
TT
932 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
933
934 /*
935 * Check state. A non-peer reject indicates no listener
936 * (ECONNREFUSED), which may be a transient state. All
 937 * others indicate a transport condition for which a best-effort
 938 * connection attempt has already been made.
939 */
f64f9e71
JP
940 if (ep->rep_connected == -ECONNREFUSED &&
941 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
c56c65fb
TT
942 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
943 goto retry;
944 }
945 if (ep->rep_connected <= 0) {
946 /* Sometimes, the only way to reliably connect to remote
947 * CMs is to use same nonzero values for ORD and IRD. */
b334eaab
TT
948 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
949 (ep->rep_remote_cma.responder_resources == 0 ||
950 ep->rep_remote_cma.initiator_depth !=
951 ep->rep_remote_cma.responder_resources)) {
952 if (ep->rep_remote_cma.responder_resources == 0)
953 ep->rep_remote_cma.responder_resources = 1;
954 ep->rep_remote_cma.initiator_depth =
955 ep->rep_remote_cma.responder_resources;
c56c65fb 956 goto retry;
b334eaab 957 }
c56c65fb
TT
958 rc = ep->rep_connected;
959 } else {
960 dprintk("RPC: %s: connected\n", __func__);
961 }
962
963out:
964 if (rc)
965 ep->rep_connected = rc;
966 return rc;
967}
968
969/*
970 * rpcrdma_ep_disconnect
971 *
972 * This is separate from destroy to facilitate the ability
973 * to reconnect without recreating the endpoint.
974 *
975 * This call is not reentrant, and must not be made in parallel
976 * on the same endpoint.
977 */
282191cb 978void
c56c65fb
TT
979rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
980{
981 int rc;
982
a7bc211a 983 rpcrdma_flush_cqs(ep);
c56c65fb
TT
984 rc = rdma_disconnect(ia->ri_id);
985 if (!rc) {
986 /* returns without wait if not connected */
987 wait_event_interruptible(ep->rep_connect_wait,
988 ep->rep_connected != 1);
989 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
990 (ep->rep_connected == 1) ? "still " : "dis");
991 } else {
992 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
993 ep->rep_connected = rc;
994 }
c56c65fb
TT
995}
996
2e84522c
CL
997static int
998rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
999{
1000 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1001 struct ib_fmr_attr fmr_attr = {
1002 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1003 .max_maps = 1,
1004 .page_shift = PAGE_SHIFT
1005 };
1006 struct rpcrdma_mw *r;
1007 int i, rc;
1008
1009 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
 1010 dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
1011
1012 while (i--) {
1013 r = kzalloc(sizeof(*r), GFP_KERNEL);
1014 if (r == NULL)
1015 return -ENOMEM;
1016
1017 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1018 if (IS_ERR(r->r.fmr)) {
1019 rc = PTR_ERR(r->r.fmr);
1020 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1021 __func__, rc);
1022 goto out_free;
1023 }
1024
1025 list_add(&r->mw_list, &buf->rb_mws);
1026 list_add(&r->mw_all, &buf->rb_all);
1027 }
1028 return 0;
1029
1030out_free:
1031 kfree(r);
1032 return rc;
1033}
1034
1035static int
1036rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1037{
1038 struct rpcrdma_frmr *f;
1039 struct rpcrdma_mw *r;
1040 int i, rc;
1041
1042 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
 1043 dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
1044
1045 while (i--) {
1046 r = kzalloc(sizeof(*r), GFP_KERNEL);
1047 if (r == NULL)
1048 return -ENOMEM;
1049 f = &r->r.frmr;
1050
1051 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1052 ia->ri_max_frmr_depth);
1053 if (IS_ERR(f->fr_mr)) {
1054 rc = PTR_ERR(f->fr_mr);
1055 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1056 "failed %i\n", __func__, rc);
1057 goto out_free;
1058 }
1059
1060 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1061 ia->ri_max_frmr_depth);
1062 if (IS_ERR(f->fr_pgl)) {
1063 rc = PTR_ERR(f->fr_pgl);
1064 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1065 "failed %i\n", __func__, rc);
1066
1067 ib_dereg_mr(f->fr_mr);
1068 goto out_free;
1069 }
1070
1071 list_add(&r->mw_list, &buf->rb_mws);
1072 list_add(&r->mw_all, &buf->rb_all);
1073 }
1074
1075 return 0;
1076
1077out_free:
1078 kfree(r);
1079 return rc;
1080}
1081
c56c65fb
TT
1082int
1083rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1084 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
1085{
1086 char *p;
65866f82 1087 size_t len, rlen, wlen;
c56c65fb
TT
1088 int i, rc;
1089
1090 buf->rb_max_requests = cdata->max_requests;
1091 spin_lock_init(&buf->rb_lock);
1092 atomic_set(&buf->rb_credits, 1);
1093
1094 /* Need to allocate:
1095 * 1. arrays for send and recv pointers
1096 * 2. arrays of struct rpcrdma_req to fill in pointers
1097 * 3. array of struct rpcrdma_rep for replies
1098 * 4. padding, if any
c56c65fb
TT
1099 * Send/recv buffers in req/rep need to be registered
1100 */
c56c65fb
TT
1101 len = buf->rb_max_requests *
1102 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1103 len += cdata->padding;
c56c65fb 1104
c56c65fb
TT
1105 p = kzalloc(len, GFP_KERNEL);
1106 if (p == NULL) {
1107 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1108 __func__, len);
1109 rc = -ENOMEM;
1110 goto out;
1111 }
1112 buf->rb_pool = p; /* for freeing it later */
1113
1114 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1115 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1116 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1117 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1118
1119 /*
1120 * Register the zeroed pad buffer, if any.
1121 */
1122 if (cdata->padding) {
1123 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1124 &ep->rep_pad_mr, &ep->rep_pad);
1125 if (rc)
1126 goto out;
1127 }
1128 p += cdata->padding;
1129
c56c65fb 1130 INIT_LIST_HEAD(&buf->rb_mws);
3111d72c 1131 INIT_LIST_HEAD(&buf->rb_all);
c56c65fb 1132 switch (ia->ri_memreg_strategy) {
3197d309 1133 case RPCRDMA_FRMR:
2e84522c
CL
1134 rc = rpcrdma_init_frmrs(ia, buf);
1135 if (rc)
1136 goto out;
3197d309 1137 break;
c56c65fb 1138 case RPCRDMA_MTHCAFMR:
2e84522c
CL
1139 rc = rpcrdma_init_fmrs(ia, buf);
1140 if (rc)
1141 goto out;
c56c65fb 1142 break;
c56c65fb
TT
1143 default:
1144 break;
1145 }
1146
1147 /*
1148 * Allocate/init the request/reply buffers. Doing this
1149 * using kmalloc for now -- one for each buf.
1150 */
65866f82
CL
1151 wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
1152 rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
1153 dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
1154 __func__, wlen, rlen);
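	/* 1 << fls(x) is the smallest power of two strictly greater than x,
	 * so each buffer becomes a power-of-two allocation; e.g. an inline
	 * size of 1024 plus the request header rounds up to 2048 (sizes
	 * illustrative).
	 */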
1155
c56c65fb
TT
1156 for (i = 0; i < buf->rb_max_requests; i++) {
1157 struct rpcrdma_req *req;
1158 struct rpcrdma_rep *rep;
1159
65866f82 1160 req = kmalloc(wlen, GFP_KERNEL);
c56c65fb
TT
1161 if (req == NULL) {
1162 dprintk("RPC: %s: request buffer %d alloc"
1163 " failed\n", __func__, i);
1164 rc = -ENOMEM;
1165 goto out;
1166 }
1167 memset(req, 0, sizeof(struct rpcrdma_req));
1168 buf->rb_send_bufs[i] = req;
1169 buf->rb_send_bufs[i]->rl_buffer = buf;
1170
1171 rc = rpcrdma_register_internal(ia, req->rl_base,
65866f82 1172 wlen - offsetof(struct rpcrdma_req, rl_base),
c56c65fb
TT
1173 &buf->rb_send_bufs[i]->rl_handle,
1174 &buf->rb_send_bufs[i]->rl_iov);
1175 if (rc)
1176 goto out;
1177
65866f82
CL
1178 buf->rb_send_bufs[i]->rl_size = wlen -
1179 sizeof(struct rpcrdma_req);
c56c65fb 1180
65866f82 1181 rep = kmalloc(rlen, GFP_KERNEL);
c56c65fb
TT
1182 if (rep == NULL) {
1183 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1184 __func__, i);
1185 rc = -ENOMEM;
1186 goto out;
1187 }
1188 memset(rep, 0, sizeof(struct rpcrdma_rep));
1189 buf->rb_recv_bufs[i] = rep;
1190 buf->rb_recv_bufs[i]->rr_buffer = buf;
c56c65fb
TT
1191
1192 rc = rpcrdma_register_internal(ia, rep->rr_base,
65866f82 1193 rlen - offsetof(struct rpcrdma_rep, rr_base),
c56c65fb
TT
1194 &buf->rb_recv_bufs[i]->rr_handle,
1195 &buf->rb_recv_bufs[i]->rr_iov);
1196 if (rc)
1197 goto out;
1198
1199 }
1200 dprintk("RPC: %s: max_requests %d\n",
1201 __func__, buf->rb_max_requests);
1202 /* done */
1203 return 0;
1204out:
1205 rpcrdma_buffer_destroy(buf);
1206 return rc;
1207}
1208
2e84522c
CL
1209static void
1210rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1211{
1212 struct rpcrdma_mw *r;
1213 int rc;
1214
1215 while (!list_empty(&buf->rb_all)) {
1216 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1217 list_del(&r->mw_all);
1218 list_del(&r->mw_list);
1219
1220 rc = ib_dealloc_fmr(r->r.fmr);
1221 if (rc)
1222 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1223 __func__, rc);
1224
1225 kfree(r);
1226 }
1227}
1228
1229static void
1230rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1231{
1232 struct rpcrdma_mw *r;
1233 int rc;
1234
1235 while (!list_empty(&buf->rb_all)) {
1236 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1237 list_del(&r->mw_all);
1238 list_del(&r->mw_list);
1239
1240 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1241 if (rc)
1242 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1243 __func__, rc);
1244 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1245
1246 kfree(r);
1247 }
1248}
1249
c56c65fb
TT
1250void
1251rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1252{
c56c65fb 1253 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
2e84522c 1254 int i;
c56c65fb
TT
1255
1256 /* clean up in reverse order from create
1257 * 1. recv mr memory (mr free, then kfree)
c56c65fb 1258 * 2. send mr memory (mr free, then kfree)
2e84522c 1259 * 3. MWs
c56c65fb
TT
1260 */
1261 dprintk("RPC: %s: entering\n", __func__);
1262
1263 for (i = 0; i < buf->rb_max_requests; i++) {
1264 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1265 rpcrdma_deregister_internal(ia,
1266 buf->rb_recv_bufs[i]->rr_handle,
1267 &buf->rb_recv_bufs[i]->rr_iov);
1268 kfree(buf->rb_recv_bufs[i]);
1269 }
1270 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
c56c65fb
TT
1271 rpcrdma_deregister_internal(ia,
1272 buf->rb_send_bufs[i]->rl_handle,
1273 &buf->rb_send_bufs[i]->rl_iov);
1274 kfree(buf->rb_send_bufs[i]);
1275 }
1276 }
1277
2e84522c
CL
1278 switch (ia->ri_memreg_strategy) {
1279 case RPCRDMA_FRMR:
1280 rpcrdma_destroy_frmrs(buf);
1281 break;
1282 case RPCRDMA_MTHCAFMR:
1283 rpcrdma_destroy_fmrs(buf);
1284 break;
1285 default:
1286 break;
4034ba04
AA
1287 }
1288
c56c65fb
TT
1289 kfree(buf->rb_pool);
1290}
1291
9f9d802a
CL
1292/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1293 * an unusable state. Find FRMRs in this state and dereg / reg
1294 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1295 * also torn down.
1296 *
1297 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1298 *
1299 * This is invoked only in the transport connect worker in order
1300 * to serialize with rpcrdma_register_frmr_external().
1301 */
1302static void
1303rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1304{
1305 struct rpcrdma_xprt *r_xprt =
1306 container_of(ia, struct rpcrdma_xprt, rx_ia);
1307 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1308 struct list_head *pos;
1309 struct rpcrdma_mw *r;
1310 int rc;
1311
1312 list_for_each(pos, &buf->rb_all) {
1313 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1314
1315 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1316 continue;
1317
1318 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1319 if (rc)
1320 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1321 __func__, rc);
1322 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1323
1324 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1325 ia->ri_max_frmr_depth);
1326 if (IS_ERR(r->r.frmr.fr_mr)) {
1327 rc = PTR_ERR(r->r.frmr.fr_mr);
1328 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1329 " failed %i\n", __func__, rc);
1330 continue;
1331 }
1332 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1333 ia->ri_id->device,
1334 ia->ri_max_frmr_depth);
1335 if (IS_ERR(r->r.frmr.fr_pgl)) {
1336 rc = PTR_ERR(r->r.frmr.fr_pgl);
1337 dprintk("RPC: %s: "
1338 "ib_alloc_fast_reg_page_list "
1339 "failed %i\n", __func__, rc);
1340
1341 ib_dereg_mr(r->r.frmr.fr_mr);
1342 continue;
1343 }
1344 r->r.frmr.fr_state = FRMR_IS_INVALID;
1345 }
1346}
1347
c2922c02
CL
1348/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1349 * some req segments uninitialized.
1350 */
1351static void
1352rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1353{
1354 if (*mw) {
1355 list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
1356 *mw = NULL;
1357 }
1358}
1359
1360/* Cycle mw's back in reverse order, and "spin" them.
1361 * This delays and scrambles reuse as much as possible.
1362 */
1363static void
1364rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1365{
1366 struct rpcrdma_mr_seg *seg = req->rl_segments;
1367 struct rpcrdma_mr_seg *seg1 = seg;
1368 int i;
1369
1370 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
1371 rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf);
1372 rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf);
1373}
1374
1375static void
1376rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1377{
1378 buf->rb_send_bufs[--buf->rb_send_index] = req;
1379 req->rl_niovs = 0;
1380 if (req->rl_reply) {
1381 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1382 req->rl_reply->rr_func = NULL;
1383 req->rl_reply = NULL;
1384 }
1385}
1386
ddb6bebc
CL
1387/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
1388 * Redo only the ib_post_send().
1389 */
1390static void
1391rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1392{
1393 struct rpcrdma_xprt *r_xprt =
1394 container_of(ia, struct rpcrdma_xprt, rx_ia);
1395 struct ib_send_wr invalidate_wr, *bad_wr;
1396 int rc;
1397
1398 dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
1399
1400 /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
dab7e3b8 1401 r->r.frmr.fr_state = FRMR_IS_INVALID;
ddb6bebc
CL
1402
1403 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1404 invalidate_wr.wr_id = (unsigned long)(void *)r;
1405 invalidate_wr.opcode = IB_WR_LOCAL_INV;
ddb6bebc
CL
1406 invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1407 DECR_CQCOUNT(&r_xprt->rx_ep);
1408
1409 dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
1410 __func__, r, r->r.frmr.fr_mr->rkey);
1411
1412 read_lock(&ia->ri_qplock);
1413 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1414 read_unlock(&ia->ri_qplock);
1415 if (rc) {
1416 /* Force rpcrdma_buffer_get() to retry */
1417 r->r.frmr.fr_state = FRMR_IS_STALE;
1418 dprintk("RPC: %s: ib_post_send failed, %i\n",
1419 __func__, rc);
1420 }
1421}
1422
1423static void
1424rpcrdma_retry_flushed_linv(struct list_head *stale,
1425 struct rpcrdma_buffer *buf)
1426{
1427 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1428 struct list_head *pos;
1429 struct rpcrdma_mw *r;
1430 unsigned long flags;
1431
1432 list_for_each(pos, stale) {
1433 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1434 rpcrdma_retry_local_inv(r, ia);
1435 }
1436
1437 spin_lock_irqsave(&buf->rb_lock, flags);
1438 list_splice_tail(stale, &buf->rb_mws);
1439 spin_unlock_irqrestore(&buf->rb_lock, flags);
1440}
1441
1442static struct rpcrdma_req *
1443rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1444 struct list_head *stale)
1445{
1446 struct rpcrdma_mw *r;
1447 int i;
1448
1449 i = RPCRDMA_MAX_SEGS - 1;
1450 while (!list_empty(&buf->rb_mws)) {
1451 r = list_entry(buf->rb_mws.next,
1452 struct rpcrdma_mw, mw_list);
1453 list_del(&r->mw_list);
1454 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1455 list_add(&r->mw_list, stale);
1456 continue;
1457 }
1458 req->rl_segments[i].mr_chunk.rl_mw = r;
1459 if (unlikely(i-- == 0))
1460 return req; /* Success */
1461 }
1462
1463 /* Not enough entries on rb_mws for this req */
1464 rpcrdma_buffer_put_sendbuf(req, buf);
1465 rpcrdma_buffer_put_mrs(req, buf);
1466 return NULL;
1467}
1468
c2922c02 1469static struct rpcrdma_req *
ddb6bebc 1470rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
c2922c02
CL
1471{
1472 struct rpcrdma_mw *r;
1473 int i;
1474
1475 i = RPCRDMA_MAX_SEGS - 1;
1476 while (!list_empty(&buf->rb_mws)) {
1477 r = list_entry(buf->rb_mws.next,
1478 struct rpcrdma_mw, mw_list);
1479 list_del(&r->mw_list);
1480 req->rl_segments[i].mr_chunk.rl_mw = r;
1481 if (unlikely(i-- == 0))
1482 return req; /* Success */
1483 }
1484
1485 /* Not enough entries on rb_mws for this req */
1486 rpcrdma_buffer_put_sendbuf(req, buf);
1487 rpcrdma_buffer_put_mrs(req, buf);
1488 return NULL;
1489}
1490
c56c65fb
TT
1491/*
1492 * Get a set of request/reply buffers.
1493 *
1494 * Reply buffer (if needed) is attached to send buffer upon return.
1495 * Rule:
1496 * rb_send_index and rb_recv_index MUST always be pointing to the
1497 * *next* available buffer (non-NULL). They are incremented after
1498 * removing buffers, and decremented *before* returning them.
1499 */
1500struct rpcrdma_req *
1501rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1502{
c2922c02 1503 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
ddb6bebc 1504 struct list_head stale;
c56c65fb
TT
1505 struct rpcrdma_req *req;
1506 unsigned long flags;
1507
1508 spin_lock_irqsave(&buffers->rb_lock, flags);
1509 if (buffers->rb_send_index == buffers->rb_max_requests) {
1510 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1511 dprintk("RPC: %s: out of request buffers\n", __func__);
1512 return ((struct rpcrdma_req *)NULL);
1513 }
1514
1515 req = buffers->rb_send_bufs[buffers->rb_send_index];
1516 if (buffers->rb_send_index < buffers->rb_recv_index) {
1517 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1518 __func__,
1519 buffers->rb_recv_index - buffers->rb_send_index);
1520 req->rl_reply = NULL;
1521 } else {
1522 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1523 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1524 }
1525 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
ddb6bebc
CL
1526
1527 INIT_LIST_HEAD(&stale);
c2922c02
CL
1528 switch (ia->ri_memreg_strategy) {
1529 case RPCRDMA_FRMR:
ddb6bebc
CL
1530 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1531 break;
c2922c02 1532 case RPCRDMA_MTHCAFMR:
ddb6bebc 1533 req = rpcrdma_buffer_get_fmrs(req, buffers);
c2922c02
CL
1534 break;
1535 default:
1536 break;
c56c65fb
TT
1537 }
1538 spin_unlock_irqrestore(&buffers->rb_lock, flags);
ddb6bebc
CL
1539 if (!list_empty(&stale))
1540 rpcrdma_retry_flushed_linv(&stale, buffers);
c56c65fb
TT
1541 return req;
1542}
1543
1544/*
1545 * Put request/reply buffers back into pool.
1546 * Pre-decrement counter/array index.
1547 */
1548void
1549rpcrdma_buffer_put(struct rpcrdma_req *req)
1550{
1551 struct rpcrdma_buffer *buffers = req->rl_buffer;
1552 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
c56c65fb
TT
1553 unsigned long flags;
1554
c56c65fb 1555 spin_lock_irqsave(&buffers->rb_lock, flags);
c2922c02 1556 rpcrdma_buffer_put_sendbuf(req, buffers);
c56c65fb 1557 switch (ia->ri_memreg_strategy) {
3197d309 1558 case RPCRDMA_FRMR:
c56c65fb 1559 case RPCRDMA_MTHCAFMR:
c2922c02 1560 rpcrdma_buffer_put_mrs(req, buffers);
c56c65fb
TT
1561 break;
1562 default:
1563 break;
1564 }
1565 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1566}
1567
1568/*
1569 * Recover reply buffers from pool.
1570 * This happens when recovering from error conditions.
1571 * Post-increment counter/array index.
1572 */
1573void
1574rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1575{
1576 struct rpcrdma_buffer *buffers = req->rl_buffer;
1577 unsigned long flags;
1578
1579 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1580 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1581 spin_lock_irqsave(&buffers->rb_lock, flags);
1582 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1583 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1584 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1585 }
1586 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1587}
1588
1589/*
1590 * Put reply buffers back into pool when not attached to
b45ccfd2 1591 * request. This happens in error conditions.
c56c65fb
TT
1592 */
1593void
1594rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1595{
1596 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1597 unsigned long flags;
1598
1599 rep->rr_func = NULL;
1600 spin_lock_irqsave(&buffers->rb_lock, flags);
1601 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1602 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1603}
1604
1605/*
1606 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1607 */
1608
1609int
1610rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1611 struct ib_mr **mrp, struct ib_sge *iov)
1612{
1613 struct ib_phys_buf ipb;
1614 struct ib_mr *mr;
1615 int rc;
1616
1617 /*
1618 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1619 */
1620 iov->addr = ib_dma_map_single(ia->ri_id->device,
1621 va, len, DMA_BIDIRECTIONAL);
bf858ab0
YB
1622 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1623 return -ENOMEM;
1624
c56c65fb
TT
1625 iov->length = len;
1626
bd7ed1d1
TT
1627 if (ia->ri_have_dma_lkey) {
1628 *mrp = NULL;
1629 iov->lkey = ia->ri_dma_lkey;
1630 return 0;
1631 } else if (ia->ri_bind_mem != NULL) {
c56c65fb
TT
1632 *mrp = NULL;
1633 iov->lkey = ia->ri_bind_mem->lkey;
1634 return 0;
1635 }
1636
1637 ipb.addr = iov->addr;
1638 ipb.size = iov->length;
1639 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1640 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1641
1642 dprintk("RPC: %s: phys convert: 0x%llx "
1643 "registered 0x%llx length %d\n",
a56daeb7
AM
1644 __func__, (unsigned long long)ipb.addr,
1645 (unsigned long long)iov->addr, len);
c56c65fb
TT
1646
1647 if (IS_ERR(mr)) {
1648 *mrp = NULL;
1649 rc = PTR_ERR(mr);
1650 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1651 } else {
1652 *mrp = mr;
1653 iov->lkey = mr->lkey;
1654 rc = 0;
1655 }
1656
1657 return rc;
1658}
1659
1660int
1661rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1662 struct ib_mr *mr, struct ib_sge *iov)
1663{
1664 int rc;
1665
1666 ib_dma_unmap_single(ia->ri_id->device,
1667 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1668
1669 if (NULL == mr)
1670 return 0;
1671
1672 rc = ib_dereg_mr(mr);
1673 if (rc)
1674 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1675 return rc;
1676}
1677
1678/*
1679 * Wrappers for chunk registration, shared by read/write chunk code.
1680 */
1681
1682static void
1683rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1684{
1685 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1686 seg->mr_dmalen = seg->mr_len;
1687 if (seg->mr_page)
1688 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1689 seg->mr_page, offset_in_page(seg->mr_offset),
1690 seg->mr_dmalen, seg->mr_dir);
1691 else
1692 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1693 seg->mr_offset,
1694 seg->mr_dmalen, seg->mr_dir);
5c635e09
TT
1695 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1696 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1697 __func__,
986d4abb
RD
1698 (unsigned long long)seg->mr_dma,
1699 seg->mr_offset, seg->mr_dmalen);
5c635e09 1700 }
c56c65fb
TT
1701}
1702
1703static void
1704rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1705{
1706 if (seg->mr_page)
1707 ib_dma_unmap_page(ia->ri_id->device,
1708 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1709 else
1710 ib_dma_unmap_single(ia->ri_id->device,
1711 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1712}
1713
3197d309
TT
1714static int
1715rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1716 int *nsegs, int writing, struct rpcrdma_ia *ia,
1717 struct rpcrdma_xprt *r_xprt)
1718{
1719 struct rpcrdma_mr_seg *seg1 = seg;
0dbb4108
CL
1720 struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw;
1721 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1722 struct ib_mr *mr = frmr->fr_mr;
f590e878 1723 struct ib_send_wr fastreg_wr, *bad_wr;
3197d309
TT
1724 u8 key;
1725 int len, pageoff;
1726 int i, rc;
9b78145c
TT
1727 int seg_len;
1728 u64 pa;
1729 int page_no;
3197d309
TT
1730
1731 pageoff = offset_in_page(seg1->mr_offset);
1732 seg1->mr_offset -= pageoff; /* start of page */
1733 seg1->mr_len += pageoff;
1734 len = -pageoff;
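	/* Descriptive note: mr_len was inflated by pageoff and len starts at
	 * -pageoff, so after the mapping loop below len holds the true byte
	 * count while the registration stays page-aligned; mr_base adds
	 * pageoff back afterward to point at the actual data.
	 */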
0fc6c4e7
SW
1735 if (*nsegs > ia->ri_max_frmr_depth)
1736 *nsegs = ia->ri_max_frmr_depth;
9b78145c 1737 for (page_no = i = 0; i < *nsegs;) {
3197d309 1738 rpcrdma_map_one(ia, seg, writing);
9b78145c
TT
1739 pa = seg->mr_dma;
1740 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
0dbb4108 1741 frmr->fr_pgl->page_list[page_no++] = pa;
9b78145c
TT
1742 pa += PAGE_SIZE;
1743 }
3197d309
TT
1744 len += seg->mr_len;
1745 ++seg;
1746 ++i;
1747 /* Check for holes */
1748 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1749 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1750 break;
1751 }
1752 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
0dbb4108 1753 __func__, mw, i);
3197d309 1754
05055722
CL
1755 frmr->fr_state = FRMR_IS_VALID;
1756
f590e878
CL
1757 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1758 fastreg_wr.wr_id = (unsigned long)(void *)mw;
1759 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1760 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1761 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1762 fastreg_wr.wr.fast_reg.page_list_len = page_no;
1763 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1764 fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1765 if (fastreg_wr.wr.fast_reg.length < len) {
5fc83f47
CL
1766 rc = -EIO;
1767 goto out_err;
c977dea2
CL
1768 }
1769
1770 /* Bump the key */
0dbb4108
CL
1771 key = (u8)(mr->rkey & 0x000000FF);
1772 ib_update_fast_reg_key(mr, ++key);
c977dea2 1773
f590e878 1774 fastreg_wr.wr.fast_reg.access_flags = (writing ?
68743082
VP
1775 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1776 IB_ACCESS_REMOTE_READ);
f590e878 1777 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
3197d309
TT
1778 DECR_CQCOUNT(&r_xprt->rx_ep);
1779
f590e878 1780 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
3197d309
TT
1781 if (rc) {
1782 dprintk("RPC: %s: failed ib_post_send for register,"
1783 " status %i\n", __func__, rc);
c93e986a 1784 ib_update_fast_reg_key(mr, --key);
5fc83f47 1785 goto out_err;
3197d309 1786 } else {
0dbb4108 1787 seg1->mr_rkey = mr->rkey;
3197d309
TT
1788 seg1->mr_base = seg1->mr_dma + pageoff;
1789 seg1->mr_nsegs = i;
1790 seg1->mr_len = len;
1791 }
1792 *nsegs = i;
5fc83f47
CL
1793 return 0;
1794out_err:
05055722 1795 frmr->fr_state = FRMR_IS_INVALID;
5fc83f47
CL
1796 while (i--)
1797 rpcrdma_unmap_one(ia, --seg);
3197d309
TT
1798 return rc;
1799}
1800
1801static int
1802rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1803 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1804{
1805 struct rpcrdma_mr_seg *seg1 = seg;
1806 struct ib_send_wr invalidate_wr, *bad_wr;
1807 int rc;
1808
dab7e3b8
CL
1809 seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1810
3197d309 1811 memset(&invalidate_wr, 0, sizeof invalidate_wr);
5c635e09 1812 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
3197d309 1813 invalidate_wr.opcode = IB_WR_LOCAL_INV;
3197d309
TT
1814 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1815 DECR_CQCOUNT(&r_xprt->rx_ep);
1816
73806c88
CL
1817 read_lock(&ia->ri_qplock);
1818 while (seg1->mr_nsegs--)
1819 rpcrdma_unmap_one(ia, seg++);
3197d309 1820 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
73806c88 1821 read_unlock(&ia->ri_qplock);
dab7e3b8
CL
1822 if (rc) {
1823 /* Force rpcrdma_buffer_get() to retry */
1824 seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
3197d309
TT
1825 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1826 " status %i\n", __func__, rc);
dab7e3b8 1827 }
3197d309
TT
1828 return rc;
1829}
1830
8d4ba034
TT
1831static int
1832rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1833 int *nsegs, int writing, struct rpcrdma_ia *ia)
1834{
1835 struct rpcrdma_mr_seg *seg1 = seg;
1836 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1837 int len, pageoff, i, rc;
1838
1839 pageoff = offset_in_page(seg1->mr_offset);
1840 seg1->mr_offset -= pageoff; /* start of page */
1841 seg1->mr_len += pageoff;
1842 len = -pageoff;
1843 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1844 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1845 for (i = 0; i < *nsegs;) {
1846 rpcrdma_map_one(ia, seg, writing);
1847 physaddrs[i] = seg->mr_dma;
1848 len += seg->mr_len;
1849 ++seg;
1850 ++i;
1851 /* Check for holes */
1852 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1853 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1854 break;
1855 }
1856 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1857 physaddrs, i, seg1->mr_dma);
1858 if (rc) {
1859 dprintk("RPC: %s: failed ib_map_phys_fmr "
1860 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1861 len, (unsigned long long)seg1->mr_dma,
1862 pageoff, i, rc);
1863 while (i--)
1864 rpcrdma_unmap_one(ia, --seg);
1865 } else {
1866 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1867 seg1->mr_base = seg1->mr_dma + pageoff;
1868 seg1->mr_nsegs = i;
1869 seg1->mr_len = len;
1870 }
1871 *nsegs = i;
1872 return rc;
1873}
1874
1875static int
1876rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1877 struct rpcrdma_ia *ia)
1878{
1879 struct rpcrdma_mr_seg *seg1 = seg;
1880 LIST_HEAD(l);
1881 int rc;
1882
1883 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1884 rc = ib_unmap_fmr(&l);
73806c88 1885 read_lock(&ia->ri_qplock);
8d4ba034
TT
1886 while (seg1->mr_nsegs--)
1887 rpcrdma_unmap_one(ia, seg++);
73806c88 1888 read_unlock(&ia->ri_qplock);
8d4ba034
TT
1889 if (rc)
1890 dprintk("RPC: %s: failed ib_unmap_fmr,"
1891 " status %i\n", __func__, rc);
1892 return rc;
1893}
1894
c56c65fb
TT
1895int
1896rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1897 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1898{
1899 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
c56c65fb
TT
1900 int rc = 0;
1901
1902 switch (ia->ri_memreg_strategy) {
1903
c56c65fb
TT
1904 case RPCRDMA_ALLPHYSICAL:
1905 rpcrdma_map_one(ia, seg, writing);
1906 seg->mr_rkey = ia->ri_bind_mem->rkey;
1907 seg->mr_base = seg->mr_dma;
1908 seg->mr_nsegs = 1;
1909 nsegs = 1;
1910 break;
c56c65fb 1911
3197d309
TT
1912 /* Registration using frmr registration */
1913 case RPCRDMA_FRMR:
1914 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1915 break;
1916
8d4ba034 1917 /* Registration using fmr memory registration */
c56c65fb 1918 case RPCRDMA_MTHCAFMR:
8d4ba034 1919 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
c56c65fb
TT
1920 break;
1921
c56c65fb 1922 default:
92b98361 1923 return -EIO;
c56c65fb
TT
1924 }
1925 if (rc)
92b98361 1926 return rc;
c56c65fb
TT
1927
1928 return nsegs;
1929}
1930
1931int
1932rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
13c9ff8f 1933 struct rpcrdma_xprt *r_xprt)
c56c65fb
TT
1934{
1935 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
c56c65fb
TT
1936 int nsegs = seg->mr_nsegs, rc;
1937
1938 switch (ia->ri_memreg_strategy) {
1939
c56c65fb 1940 case RPCRDMA_ALLPHYSICAL:
73806c88 1941 read_lock(&ia->ri_qplock);
c56c65fb 1942 rpcrdma_unmap_one(ia, seg);
73806c88 1943 read_unlock(&ia->ri_qplock);
c56c65fb 1944 break;
c56c65fb 1945
3197d309
TT
1946 case RPCRDMA_FRMR:
1947 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1948 break;
1949
c56c65fb 1950 case RPCRDMA_MTHCAFMR:
8d4ba034 1951 rc = rpcrdma_deregister_fmr_external(seg, ia);
c56c65fb
TT
1952 break;
1953
c56c65fb 1954 default:
c56c65fb
TT
1955 break;
1956 }
c56c65fb
TT
1957 return nsegs;
1958}
1959
1960/*
1961 * Prepost any receive buffer, then post send.
1962 *
1963 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1964 */
1965int
1966rpcrdma_ep_post(struct rpcrdma_ia *ia,
1967 struct rpcrdma_ep *ep,
1968 struct rpcrdma_req *req)
1969{
1970 struct ib_send_wr send_wr, *send_wr_fail;
1971 struct rpcrdma_rep *rep = req->rl_reply;
1972 int rc;
1973
1974 if (rep) {
1975 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1976 if (rc)
1977 goto out;
1978 req->rl_reply = NULL;
1979 }
1980
1981 send_wr.next = NULL;
1982 send_wr.wr_id = 0ULL; /* no send cookie */
1983 send_wr.sg_list = req->rl_send_iov;
1984 send_wr.num_sge = req->rl_niovs;
1985 send_wr.opcode = IB_WR_SEND;
c56c65fb
TT
1986 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1987 ib_dma_sync_single_for_device(ia->ri_id->device,
1988 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1989 DMA_TO_DEVICE);
1990 ib_dma_sync_single_for_device(ia->ri_id->device,
1991 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1992 DMA_TO_DEVICE);
1993 ib_dma_sync_single_for_device(ia->ri_id->device,
1994 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1995 DMA_TO_DEVICE);
1996
1997 if (DECR_CQCOUNT(ep) > 0)
1998 send_wr.send_flags = 0;
1999 else { /* Provider must take a send completion every now and then */
2000 INIT_CQCOUNT(ep);
2001 send_wr.send_flags = IB_SEND_SIGNALED;
2002 }
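	/* In effect, roughly one Work Request in every rep_cqinit is
	 * signaled (FRMR registration/invalidate posts also decrement the
	 * counter), matching the cap set up in rpcrdma_ep_create().
	 */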
2003
2004 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
2005 if (rc)
2006 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
2007 rc);
2008out:
2009 return rc;
2010}
2011
2012/*
2013 * (Re)post a receive buffer.
2014 */
2015int
2016rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2017 struct rpcrdma_ep *ep,
2018 struct rpcrdma_rep *rep)
2019{
2020 struct ib_recv_wr recv_wr, *recv_wr_fail;
2021 int rc;
2022
2023 recv_wr.next = NULL;
2024 recv_wr.wr_id = (u64) (unsigned long) rep;
2025 recv_wr.sg_list = &rep->rr_iov;
2026 recv_wr.num_sge = 1;
2027
2028 ib_dma_sync_single_for_cpu(ia->ri_id->device,
2029 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
2030
c56c65fb
TT
2031 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
2032
2033 if (rc)
2034 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
2035 rc);
2036 return rc;
2037}
43e95988
CL
2038
2039/* Physical mapping means one Read/Write list entry per-page.
2040 * All list entries must fit within an inline buffer
2041 *
2042 * NB: The server must return a Write list for NFS READ,
2043 * which has the same constraint. Factor in the inline
2044 * rsize as well.
2045 */
2046static size_t
2047rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
2048{
2049 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2050 unsigned int inline_size, pages;
2051
2052 inline_size = min_t(unsigned int,
2053 cdata->inline_wsize, cdata->inline_rsize);
2054 inline_size -= RPCRDMA_HDRLEN_MIN;
2055 pages = inline_size / sizeof(struct rpcrdma_segment);
2056 return pages << PAGE_SHIFT;
2057}
2058
2059static size_t
2060rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2061{
2062 return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2063}
2064
2065size_t
2066rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2067{
2068 size_t result;
2069
2070 switch (r_xprt->rx_ia.ri_memreg_strategy) {
2071 case RPCRDMA_ALLPHYSICAL:
2072 result = rpcrdma_physical_max_payload(r_xprt);
2073 break;
2074 default:
2075 result = rpcrdma_mr_max_payload(r_xprt);
2076 }
2077 return result;
2078}