xprtrdma: Add a "deregister_external" op for each memreg mode
[deliverable/linux.git] / net / sunrpc / xprtrdma / verbs.c
f58851e6 1/*
c56c65fb
TT
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
f58851e6
TT
38 */
39
c56c65fb
TT
40/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
a6b7a407 50#include <linux/interrupt.h>
5a0e3ad6 51#include <linux/slab.h>
eba8ff66 52#include <linux/prefetch.h>
0dd39cae 53#include <linux/sunrpc/addr.h>
65866f82 54#include <asm/bitops.h>
c56c65fb 55
f58851e6
TT
56#include "xprt_rdma.h"
57
c56c65fb
TT
58/*
59 * Globals/Macros
60 */
61
f895b252 62#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
c56c65fb
TT
63# define RPCDBG_FACILITY RPCDBG_TRANS
64#endif
65
9f9d802a 66static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
467c9674 67static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
9f9d802a 68
c56c65fb
TT
69/*
70 * internal functions
71 */
72
73/*
74 * handle replies in tasklet context, using a single, global list
75 * rdma tasklet function -- just turn around and call the func
76 * for all replies on the list
77 */
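/*
 * Dispatch path, as implemented below: rpcrdma_recvcq_process_wc()
 * collects completed rpcrdma_rep structures on a local sched_list,
 * rpcrdma_schedule_tasklet() splices that list onto the global
 * rpcrdma_tasklets_g and schedules rpcrdma_tasklet_g, and
 * rpcrdma_run_tasklet() then invokes each rep's rr_func callback
 * (or returns the rep to the buffer pool when no callback is set).
 */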
78
79static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
80static LIST_HEAD(rpcrdma_tasklets_g);
81
82static void
83rpcrdma_run_tasklet(unsigned long data)
84{
85 struct rpcrdma_rep *rep;
86 void (*func)(struct rpcrdma_rep *);
87 unsigned long flags;
88
 89 data = data; /* tasklet data argument is unused */
90 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
91 while (!list_empty(&rpcrdma_tasklets_g)) {
92 rep = list_entry(rpcrdma_tasklets_g.next,
93 struct rpcrdma_rep, rr_list);
94 list_del(&rep->rr_list);
95 func = rep->rr_func;
96 rep->rr_func = NULL;
97 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
98
99 if (func)
100 func(rep);
101 else
102 rpcrdma_recv_buffer_put(rep);
103
104 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
105 }
106 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
107}
108
109static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
110
7ff11de1
CL
111static const char * const async_event[] = {
112 "CQ error",
113 "QP fatal error",
114 "QP request error",
115 "QP access error",
116 "communication established",
117 "send queue drained",
118 "path migration successful",
119 "path mig error",
120 "device fatal error",
121 "port active",
122 "port error",
123 "LID change",
124 "P_key change",
125 "SM change",
126 "SRQ error",
127 "SRQ limit reached",
128 "last WQE reached",
129 "client reregister",
130 "GID change",
131};
132
133#define ASYNC_MSG(status) \
134 ((status) < ARRAY_SIZE(async_event) ? \
135 async_event[(status)] : "unknown async error")
136
f1a03b76
CL
137static void
138rpcrdma_schedule_tasklet(struct list_head *sched_list)
139{
140 unsigned long flags;
141
142 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
143 list_splice_tail(sched_list, &rpcrdma_tasklets_g);
144 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
145 tasklet_schedule(&rpcrdma_tasklet_g);
146}
147
c56c65fb
TT
148static void
149rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
150{
151 struct rpcrdma_ep *ep = context;
152
7ff11de1
CL
153 pr_err("RPC: %s: %s on device %s ep %p\n",
154 __func__, ASYNC_MSG(event->event),
155 event->device->name, context);
c56c65fb
TT
156 if (ep->rep_connected == 1) {
157 ep->rep_connected = -EIO;
afadc468 158 rpcrdma_conn_func(ep);
c56c65fb
TT
159 wake_up_all(&ep->rep_connect_wait);
160 }
161}
162
163static void
164rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
165{
166 struct rpcrdma_ep *ep = context;
167
7ff11de1
CL
168 pr_err("RPC: %s: %s on device %s ep %p\n",
169 __func__, ASYNC_MSG(event->event),
170 event->device->name, context);
c56c65fb
TT
171 if (ep->rep_connected == 1) {
172 ep->rep_connected = -EIO;
afadc468 173 rpcrdma_conn_func(ep);
c56c65fb
TT
174 wake_up_all(&ep->rep_connect_wait);
175 }
176}
177
8502427c
CL
178static const char * const wc_status[] = {
179 "success",
180 "local length error",
181 "local QP operation error",
182 "local EE context operation error",
183 "local protection error",
184 "WR flushed",
185 "memory management operation error",
186 "bad response error",
187 "local access error",
188 "remote invalid request error",
189 "remote access error",
190 "remote operation error",
191 "transport retry counter exceeded",
192 "RNR retrycounter exceeded",
193 "local RDD violation error",
194 "remove invalid RD request",
195 "operation aborted",
196 "invalid EE context number",
197 "invalid EE context state",
198 "fatal error",
199 "response timeout error",
200 "general error",
201};
202
203#define COMPLETION_MSG(status) \
204 ((status) < ARRAY_SIZE(wc_status) ? \
205 wc_status[(status)] : "unexpected completion error")
206
fc664485
CL
207static void
208rpcrdma_sendcq_process_wc(struct ib_wc *wc)
c56c65fb 209{
8502427c 210 if (likely(wc->status == IB_WC_SUCCESS))
c56c65fb 211 return;
8502427c
CL
212
213 /* WARNING: Only wr_id and status are reliable at this point */
214 if (wc->wr_id == 0ULL) {
215 if (wc->status != IB_WC_WR_FLUSH_ERR)
216 pr_err("RPC: %s: SEND: %s\n",
217 __func__, COMPLETION_MSG(wc->status));
218 } else {
219 struct rpcrdma_mw *r;
220
221 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
222 r->r.frmr.fr_state = FRMR_IS_STALE;
223 pr_err("RPC: %s: frmr %p (stale): %s\n",
224 __func__, r, COMPLETION_MSG(wc->status));
225 }
c56c65fb
TT
226}
227
fc664485 228static int
1c00dd07 229rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
c56c65fb 230{
1c00dd07 231 struct ib_wc *wcs;
8301a2c0 232 int budget, count, rc;
c56c65fb 233
8301a2c0 234 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
1c00dd07
CL
235 do {
236 wcs = ep->rep_send_wcs;
237
238 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
239 if (rc <= 0)
240 return rc;
241
242 count = rc;
243 while (count-- > 0)
244 rpcrdma_sendcq_process_wc(wcs++);
8301a2c0 245 } while (rc == RPCRDMA_POLLSIZE && --budget);
1c00dd07 246 return 0;
fc664485 247}
c56c65fb 248
fc664485
CL
249/*
250 * Handle send, fast_reg_mr, and local_inv completions.
251 *
252 * Send events are typically suppressed and thus do not result
253 * in an upcall. Occasionally one is signaled, however. This
254 * prevents the provider's completion queue from wrapping and
255 * losing a completion.
256 */
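/*
 * The signaling interval is governed by ep->rep_cqinit, computed in
 * rpcrdma_ep_create() from the send queue depth, and by the CQCOUNT
 * macros consumed in rpcrdma_ep_post(): roughly one SEND in every
 * rep_cqinit posts carries IB_SEND_SIGNALED.
 */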
257static void
258rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
259{
1c00dd07 260 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
fc664485
CL
261 int rc;
262
1c00dd07 263 rc = rpcrdma_sendcq_poll(cq, ep);
fc664485
CL
264 if (rc) {
265 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
266 __func__, rc);
267 return;
c56c65fb
TT
268 }
269
7f23f6f6
CL
270 rc = ib_req_notify_cq(cq,
271 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
272 if (rc == 0)
273 return;
274 if (rc < 0) {
fc664485
CL
275 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
276 __func__, rc);
277 return;
278 }
279
1c00dd07 280 rpcrdma_sendcq_poll(cq, ep);
fc664485
CL
281}
282
283static void
bb96193d 284rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
fc664485
CL
285{
286 struct rpcrdma_rep *rep =
287 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
288
8502427c
CL
289 /* WARNING: Only wr_id and status are reliable at this point */
290 if (wc->status != IB_WC_SUCCESS)
291 goto out_fail;
fc664485 292
8502427c 293 /* status == SUCCESS means all fields in wc are trustworthy */
fc664485
CL
294 if (wc->opcode != IB_WC_RECV)
295 return;
296
8502427c
CL
297 dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
298 __func__, rep, wc->byte_len);
299
fc664485
CL
300 rep->rr_len = wc->byte_len;
301 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
6b1184cd
CL
302 rdmab_addr(rep->rr_rdmabuf),
303 rep->rr_len, DMA_FROM_DEVICE);
304 prefetch(rdmab_to_msg(rep->rr_rdmabuf));
fc664485
CL
305
306out_schedule:
bb96193d 307 list_add_tail(&rep->rr_list, sched_list);
8502427c
CL
308 return;
309out_fail:
310 if (wc->status != IB_WC_WR_FLUSH_ERR)
311 pr_err("RPC: %s: rep %p: %s\n",
312 __func__, rep, COMPLETION_MSG(wc->status));
313 rep->rr_len = ~0U;
314 goto out_schedule;
fc664485
CL
315}
316
317static int
1c00dd07 318rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
fc664485 319{
bb96193d 320 struct list_head sched_list;
1c00dd07 321 struct ib_wc *wcs;
8301a2c0 322 int budget, count, rc;
fc664485 323
bb96193d 324 INIT_LIST_HEAD(&sched_list);
8301a2c0 325 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
1c00dd07
CL
326 do {
327 wcs = ep->rep_recv_wcs;
328
329 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
330 if (rc <= 0)
bb96193d 331 goto out_schedule;
1c00dd07
CL
332
333 count = rc;
334 while (count-- > 0)
bb96193d 335 rpcrdma_recvcq_process_wc(wcs++, &sched_list);
8301a2c0 336 } while (rc == RPCRDMA_POLLSIZE && --budget);
bb96193d
CL
337 rc = 0;
338
339out_schedule:
f1a03b76 340 rpcrdma_schedule_tasklet(&sched_list);
bb96193d 341 return rc;
c56c65fb
TT
342}
343
344/*
fc664485 345 * Handle receive completions.
c56c65fb 346 *
c56c65fb
TT
347 * It is reentrant but processes single events in order to maintain
 348 * ordering of receives, which preserves the server's credit accounting.
349 *
350 * It is the responsibility of the scheduled tasklet to return
351 * recv buffers to the pool. NOTE: this affects synchronization of
352 * connection shutdown. That is, the structures required for
353 * the completion of the reply handler must remain intact until
354 * all memory has been reclaimed.
c56c65fb
TT
355 */
356static void
fc664485 357rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
c56c65fb 358{
1c00dd07 359 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
c56c65fb
TT
360 int rc;
361
1c00dd07 362 rc = rpcrdma_recvcq_poll(cq, ep);
fc664485
CL
363 if (rc) {
364 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
365 __func__, rc);
c56c65fb 366 return;
fc664485 367 }
c56c65fb 368
7f23f6f6
CL
369 rc = ib_req_notify_cq(cq,
370 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
371 if (rc == 0)
372 return;
373 if (rc < 0) {
fc664485 374 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
c56c65fb
TT
375 __func__, rc);
376 return;
377 }
378
1c00dd07 379 rpcrdma_recvcq_poll(cq, ep);
c56c65fb
TT
380}
381
a7bc211a
CL
382static void
383rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
384{
5c166bef
CL
385 struct ib_wc wc;
386 LIST_HEAD(sched_list);
387
388 while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
389 rpcrdma_recvcq_process_wc(&wc, &sched_list);
390 if (!list_empty(&sched_list))
391 rpcrdma_schedule_tasklet(&sched_list);
392 while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
393 rpcrdma_sendcq_process_wc(&wc);
a7bc211a
CL
394}
395
f895b252 396#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
c56c65fb
TT
397static const char * const conn[] = {
398 "address resolved",
399 "address error",
400 "route resolved",
401 "route error",
402 "connect request",
403 "connect response",
404 "connect error",
405 "unreachable",
406 "rejected",
407 "established",
408 "disconnected",
8079fb78
CL
409 "device removal",
410 "multicast join",
411 "multicast error",
412 "address change",
413 "timewait exit",
c56c65fb 414};
8079fb78
CL
415
416#define CONNECTION_MSG(status) \
417 ((status) < ARRAY_SIZE(conn) ? \
418 conn[(status)] : "unrecognized connection error")
c56c65fb
TT
419#endif
420
421static int
422rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
423{
424 struct rpcrdma_xprt *xprt = id->context;
425 struct rpcrdma_ia *ia = &xprt->rx_ia;
426 struct rpcrdma_ep *ep = &xprt->rx_ep;
f895b252 427#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
0dd39cae 428 struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
ff0db049 429#endif
ce1ab9ab
CL
430 struct ib_qp_attr *attr = &ia->ri_qp_attr;
431 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
c56c65fb
TT
432 int connstate = 0;
433
434 switch (event->event) {
435 case RDMA_CM_EVENT_ADDR_RESOLVED:
436 case RDMA_CM_EVENT_ROUTE_RESOLVED:
5675add3 437 ia->ri_async_rc = 0;
c56c65fb
TT
438 complete(&ia->ri_done);
439 break;
440 case RDMA_CM_EVENT_ADDR_ERROR:
441 ia->ri_async_rc = -EHOSTUNREACH;
442 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
443 __func__, ep);
444 complete(&ia->ri_done);
445 break;
446 case RDMA_CM_EVENT_ROUTE_ERROR:
447 ia->ri_async_rc = -ENETUNREACH;
448 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
449 __func__, ep);
450 complete(&ia->ri_done);
451 break;
452 case RDMA_CM_EVENT_ESTABLISHED:
453 connstate = 1;
ce1ab9ab
CL
454 ib_query_qp(ia->ri_id->qp, attr,
455 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
456 iattr);
c56c65fb
TT
457 dprintk("RPC: %s: %d responder resources"
458 " (%d initiator)\n",
ce1ab9ab
CL
459 __func__, attr->max_dest_rd_atomic,
460 attr->max_rd_atomic);
c56c65fb
TT
461 goto connected;
462 case RDMA_CM_EVENT_CONNECT_ERROR:
463 connstate = -ENOTCONN;
464 goto connected;
465 case RDMA_CM_EVENT_UNREACHABLE:
466 connstate = -ENETDOWN;
467 goto connected;
468 case RDMA_CM_EVENT_REJECTED:
469 connstate = -ECONNREFUSED;
470 goto connected;
471 case RDMA_CM_EVENT_DISCONNECTED:
472 connstate = -ECONNABORTED;
473 goto connected;
474 case RDMA_CM_EVENT_DEVICE_REMOVAL:
475 connstate = -ENODEV;
476connected:
c56c65fb
TT
477 dprintk("RPC: %s: %sconnected\n",
478 __func__, connstate > 0 ? "" : "dis");
479 ep->rep_connected = connstate;
afadc468 480 rpcrdma_conn_func(ep);
c56c65fb 481 wake_up_all(&ep->rep_connect_wait);
8079fb78 482 /*FALLTHROUGH*/
c56c65fb 483 default:
0dd39cae
CL
484 dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
485 __func__, sap, rpc_get_port(sap), ep,
8079fb78 486 CONNECTION_MSG(event->event));
c56c65fb
TT
487 break;
488 }
489
f895b252 490#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
b3cd8d45 491 if (connstate == 1) {
ce1ab9ab 492 int ird = attr->max_dest_rd_atomic;
b3cd8d45 493 int tird = ep->rep_remote_cma.responder_resources;
0dd39cae 494
a0ce85f5 495 pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
0dd39cae 496 sap, rpc_get_port(sap),
b3cd8d45 497 ia->ri_id->device->name,
a0ce85f5 498 ia->ri_ops->ro_displayname,
b3cd8d45
TT
499 xprt->rx_buf.rb_max_requests,
500 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
501 } else if (connstate < 0) {
0dd39cae
CL
502 pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
503 sap, rpc_get_port(sap), connstate);
b3cd8d45
TT
504 }
505#endif
506
c56c65fb
TT
507 return 0;
508}
509
510static struct rdma_cm_id *
511rpcrdma_create_id(struct rpcrdma_xprt *xprt,
512 struct rpcrdma_ia *ia, struct sockaddr *addr)
513{
514 struct rdma_cm_id *id;
515 int rc;
516
1a954051
TT
517 init_completion(&ia->ri_done);
518
b26f9b99 519 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
c56c65fb
TT
520 if (IS_ERR(id)) {
521 rc = PTR_ERR(id);
522 dprintk("RPC: %s: rdma_create_id() failed %i\n",
523 __func__, rc);
524 return id;
525 }
526
5675add3 527 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb
TT
528 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
529 if (rc) {
530 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
531 __func__, rc);
532 goto out;
533 }
5675add3
TT
534 wait_for_completion_interruptible_timeout(&ia->ri_done,
535 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
c56c65fb
TT
536 rc = ia->ri_async_rc;
537 if (rc)
538 goto out;
539
5675add3 540 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb
TT
541 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
542 if (rc) {
543 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
544 __func__, rc);
545 goto out;
546 }
5675add3
TT
547 wait_for_completion_interruptible_timeout(&ia->ri_done,
548 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
c56c65fb
TT
549 rc = ia->ri_async_rc;
550 if (rc)
551 goto out;
552
553 return id;
554
555out:
556 rdma_destroy_id(id);
557 return ERR_PTR(rc);
558}
559
560/*
561 * Drain any cq, prior to teardown.
562 */
563static void
564rpcrdma_clean_cq(struct ib_cq *cq)
565{
566 struct ib_wc wc;
567 int count = 0;
568
569 while (1 == ib_poll_cq(cq, 1, &wc))
570 ++count;
571
572 if (count)
573 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
574 __func__, count, wc.opcode);
575}
576
577/*
578 * Exported functions.
579 */
580
581/*
582 * Open and initialize an Interface Adapter.
583 * o initializes fields of struct rpcrdma_ia, including
584 * interface and provider attributes and protection zone.
585 */
586int
587rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
588{
bd7ed1d1 589 int rc, mem_priv;
c56c65fb 590 struct rpcrdma_ia *ia = &xprt->rx_ia;
7bc7972c 591 struct ib_device_attr *devattr = &ia->ri_devattr;
c56c65fb 592
c56c65fb
TT
593 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
594 if (IS_ERR(ia->ri_id)) {
595 rc = PTR_ERR(ia->ri_id);
596 goto out1;
597 }
598
599 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
600 if (IS_ERR(ia->ri_pd)) {
601 rc = PTR_ERR(ia->ri_pd);
602 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
603 __func__, rc);
604 goto out2;
605 }
606
7bc7972c 607 rc = ib_query_device(ia->ri_id->device, devattr);
bd7ed1d1
TT
608 if (rc) {
609 dprintk("RPC: %s: ib_query_device failed %d\n",
610 __func__, rc);
5ae711a2 611 goto out3;
bd7ed1d1
TT
612 }
613
7bc7972c 614 if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
bd7ed1d1
TT
615 ia->ri_have_dma_lkey = 1;
616 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
617 }
618
f10eafd3 619 if (memreg == RPCRDMA_FRMR) {
3197d309 620 /* Requires both frmr reg and local dma lkey */
41f97028 621 if (((devattr->device_cap_flags &
3197d309 622 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
41f97028
CL
623 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
624 (devattr->max_fast_reg_page_list_len == 0)) {
3197d309 625 dprintk("RPC: %s: FRMR registration "
f10eafd3
CL
626 "not supported by HCA\n", __func__);
627 memreg = RPCRDMA_MTHCAFMR;
0fc6c4e7
SW
628 } else {
629 /* Mind the ia limit on FRMR page list depth */
630 ia->ri_max_frmr_depth = min_t(unsigned int,
631 RPCRDMA_MAX_DATA_SEGS,
7bc7972c 632 devattr->max_fast_reg_page_list_len);
bd7ed1d1 633 }
f10eafd3
CL
634 }
635 if (memreg == RPCRDMA_MTHCAFMR) {
636 if (!ia->ri_id->device->alloc_fmr) {
637 dprintk("RPC: %s: MTHCAFMR registration "
638 "not supported by HCA\n", __func__);
f10eafd3 639 memreg = RPCRDMA_ALLPHYSICAL;
f10eafd3 640 }
bd7ed1d1
TT
641 }
642
c56c65fb
TT
643 /*
644 * Optionally obtain an underlying physical identity mapping in
645 * order to do a memory window-based bind. This base registration
646 * is protected from remote access - that is enabled only by binding
647 * for the specific bytes targeted during each RPC operation, and
648 * revoked after the corresponding completion similar to a storage
649 * adapter.
650 */
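 /* In practice: the FRMR case never takes the ib_get_dma_mr() path;
  * the FMR case takes it only when the device lacks a local DMA
  * lkey; and ALLPHYSICAL always registers a DMA MR with remote
  * read/write access, since it does no per-RPC registration.
  */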
bd7ed1d1 651 switch (memreg) {
3197d309 652 case RPCRDMA_FRMR:
a0ce85f5 653 ia->ri_ops = &rpcrdma_frwr_memreg_ops;
bd7ed1d1 654 break;
bd7ed1d1 655 case RPCRDMA_ALLPHYSICAL:
a0ce85f5 656 ia->ri_ops = &rpcrdma_physical_memreg_ops;
bd7ed1d1
TT
657 mem_priv = IB_ACCESS_LOCAL_WRITE |
658 IB_ACCESS_REMOTE_WRITE |
659 IB_ACCESS_REMOTE_READ;
660 goto register_setup;
bd7ed1d1 661 case RPCRDMA_MTHCAFMR:
a0ce85f5 662 ia->ri_ops = &rpcrdma_fmr_memreg_ops;
bd7ed1d1 663 if (ia->ri_have_dma_lkey)
c56c65fb 664 break;
bd7ed1d1
TT
665 mem_priv = IB_ACCESS_LOCAL_WRITE;
666 register_setup:
c56c65fb
TT
667 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
668 if (IS_ERR(ia->ri_bind_mem)) {
669 printk(KERN_ALERT "%s: ib_get_dma_mr for "
0ac531c1 670 "phys register failed with %lX\n",
c56c65fb 671 __func__, PTR_ERR(ia->ri_bind_mem));
0ac531c1 672 rc = -ENOMEM;
5ae711a2 673 goto out3;
c56c65fb 674 }
bd7ed1d1
TT
675 break;
676 default:
cdd9ade7
CL
677 printk(KERN_ERR "RPC: Unsupported memory "
678 "registration mode: %d\n", memreg);
679 rc = -ENOMEM;
5ae711a2 680 goto out3;
c56c65fb 681 }
a0ce85f5
CL
682 dprintk("RPC: %s: memory registration strategy is '%s'\n",
683 __func__, ia->ri_ops->ro_displayname);
c56c65fb
TT
684
685 /* Else will do memory reg/dereg for each chunk */
686 ia->ri_memreg_strategy = memreg;
687
73806c88 688 rwlock_init(&ia->ri_qplock);
c56c65fb 689 return 0;
5ae711a2
CL
690
691out3:
692 ib_dealloc_pd(ia->ri_pd);
693 ia->ri_pd = NULL;
c56c65fb
TT
694out2:
695 rdma_destroy_id(ia->ri_id);
fee08caf 696 ia->ri_id = NULL;
c56c65fb
TT
697out1:
698 return rc;
699}
700
701/*
702 * Clean up/close an IA.
703 * o if event handles and PD have been initialized, free them.
704 * o close the IA
705 */
706void
707rpcrdma_ia_close(struct rpcrdma_ia *ia)
708{
709 int rc;
710
711 dprintk("RPC: %s: entering\n", __func__);
712 if (ia->ri_bind_mem != NULL) {
713 rc = ib_dereg_mr(ia->ri_bind_mem);
714 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
715 __func__, rc);
716 }
fee08caf
TT
717 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
718 if (ia->ri_id->qp)
719 rdma_destroy_qp(ia->ri_id);
720 rdma_destroy_id(ia->ri_id);
721 ia->ri_id = NULL;
722 }
c56c65fb
TT
723 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
724 rc = ib_dealloc_pd(ia->ri_pd);
725 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
726 __func__, rc);
727 }
c56c65fb
TT
728}
729
730/*
731 * Create unconnected endpoint.
732 */
733int
734rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
735 struct rpcrdma_create_data_internal *cdata)
736{
7bc7972c 737 struct ib_device_attr *devattr = &ia->ri_devattr;
fc664485 738 struct ib_cq *sendcq, *recvcq;
5d40a8a5 739 int rc, err;
c56c65fb 740
c56c65fb 741 /* check provider's send/recv wr limits */
7bc7972c
CL
742 if (cdata->max_requests > devattr->max_qp_wr)
743 cdata->max_requests = devattr->max_qp_wr;
c56c65fb
TT
744
745 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
746 ep->rep_attr.qp_context = ep;
747 /* send_cq and recv_cq initialized below */
748 ep->rep_attr.srq = NULL;
749 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
750 switch (ia->ri_memreg_strategy) {
0fc6c4e7
SW
751 case RPCRDMA_FRMR: {
752 int depth = 7;
753
15cdc644
TT
754 /* Add room for frmr register and invalidate WRs.
755 * 1. FRMR reg WR for head
756 * 2. FRMR invalidate WR for head
0fc6c4e7
SW
757 * 3. N FRMR reg WRs for pagelist
758 * 4. N FRMR invalidate WRs for pagelist
15cdc644
TT
759 * 5. FRMR reg WR for tail
760 * 6. FRMR invalidate WR for tail
761 * 7. The RDMA_SEND WR
762 */
0fc6c4e7
SW
763
764 /* Calculate N if the device max FRMR depth is smaller than
765 * RPCRDMA_MAX_DATA_SEGS.
766 */
767 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
768 int delta = RPCRDMA_MAX_DATA_SEGS -
769 ia->ri_max_frmr_depth;
770
771 do {
772 depth += 2; /* FRMR reg + invalidate */
773 delta -= ia->ri_max_frmr_depth;
774 } while (delta > 0);
775
776 }
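 /* Worked example (illustrative numbers only): if
  * RPCRDMA_MAX_DATA_SEGS were 64 and the device reported
  * ia->ri_max_frmr_depth == 16, delta would start at 48 and the
  * loop would run three times, giving depth = 7 + 3 * 2 = 13
  * work requests per RPC send.
  */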
777 ep->rep_attr.cap.max_send_wr *= depth;
7bc7972c
CL
778 if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
779 cdata->max_requests = devattr->max_qp_wr / depth;
15cdc644
TT
780 if (!cdata->max_requests)
781 return -EINVAL;
0fc6c4e7
SW
782 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
783 depth;
15cdc644 784 }
3197d309 785 break;
0fc6c4e7 786 }
c56c65fb
TT
787 default:
788 break;
789 }
790 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
791 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
792 ep->rep_attr.cap.max_recv_sge = 1;
793 ep->rep_attr.cap.max_inline_data = 0;
794 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
795 ep->rep_attr.qp_type = IB_QPT_RC;
796 ep->rep_attr.port_num = ~0;
797
c05fbb5a
CL
798 if (cdata->padding) {
799 ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
800 GFP_KERNEL);
801 if (IS_ERR(ep->rep_padbuf))
802 return PTR_ERR(ep->rep_padbuf);
803 } else
804 ep->rep_padbuf = NULL;
805
c56c65fb
TT
806 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
807 "iovs: send %d recv %d\n",
808 __func__,
809 ep->rep_attr.cap.max_send_wr,
810 ep->rep_attr.cap.max_recv_wr,
811 ep->rep_attr.cap.max_send_sge,
812 ep->rep_attr.cap.max_recv_sge);
813
814 /* set trigger for requesting send completion */
fc664485 815 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
e7104a2a
CL
816 if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
817 ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
818 else if (ep->rep_cqinit <= 2)
c56c65fb
TT
819 ep->rep_cqinit = 0;
820 INIT_CQCOUNT(ep);
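 /* Example (illustrative): with cap.max_send_wr == 128, rep_cqinit
  * starts at 63 (subject to the RPCRDMA_MAX_UNSIGNALED_SENDS cap
  * above), so rpcrdma_ep_post() requests a signaled completion only
  * about once every 64 sends; a computed value <= 2 disables the
  * batching and every send is signaled.
  */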
c56c65fb 821 init_waitqueue_head(&ep->rep_connect_wait);
254f91e2 822 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
c56c65fb 823
fc664485 824 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
1c00dd07 825 rpcrdma_cq_async_error_upcall, ep,
c56c65fb 826 ep->rep_attr.cap.max_send_wr + 1, 0);
fc664485
CL
827 if (IS_ERR(sendcq)) {
828 rc = PTR_ERR(sendcq);
829 dprintk("RPC: %s: failed to create send CQ: %i\n",
c56c65fb
TT
830 __func__, rc);
831 goto out1;
832 }
833
fc664485 834 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
c56c65fb
TT
835 if (rc) {
836 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
837 __func__, rc);
838 goto out2;
839 }
840
fc664485 841 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
1c00dd07 842 rpcrdma_cq_async_error_upcall, ep,
fc664485
CL
843 ep->rep_attr.cap.max_recv_wr + 1, 0);
844 if (IS_ERR(recvcq)) {
845 rc = PTR_ERR(recvcq);
846 dprintk("RPC: %s: failed to create recv CQ: %i\n",
847 __func__, rc);
848 goto out2;
849 }
850
851 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
852 if (rc) {
853 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
854 __func__, rc);
855 ib_destroy_cq(recvcq);
856 goto out2;
857 }
858
859 ep->rep_attr.send_cq = sendcq;
860 ep->rep_attr.recv_cq = recvcq;
c56c65fb
TT
861
862 /* Initialize cma parameters */
863
864 /* RPC/RDMA does not use private data */
865 ep->rep_remote_cma.private_data = NULL;
866 ep->rep_remote_cma.private_data_len = 0;
867
868 /* Client offers RDMA Read but does not initiate */
b334eaab 869 ep->rep_remote_cma.initiator_depth = 0;
7bc7972c 870 if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */
b334eaab
TT
871 ep->rep_remote_cma.responder_resources = 32;
872 else
7bc7972c
CL
873 ep->rep_remote_cma.responder_resources =
874 devattr->max_qp_rd_atom;
c56c65fb
TT
875
876 ep->rep_remote_cma.retry_count = 7;
877 ep->rep_remote_cma.flow_control = 0;
878 ep->rep_remote_cma.rnr_retry_count = 0;
879
880 return 0;
881
882out2:
fc664485 883 err = ib_destroy_cq(sendcq);
5d40a8a5
CL
884 if (err)
885 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
886 __func__, err);
c56c65fb 887out1:
c05fbb5a 888 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
c56c65fb
TT
889 return rc;
890}
891
892/*
893 * rpcrdma_ep_destroy
894 *
895 * Disconnect and destroy endpoint. After this, the only
896 * valid operations on the ep are to free it (if dynamically
897 * allocated) or re-create it.
c56c65fb 898 */
7f1d5419 899void
c56c65fb
TT
900rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
901{
902 int rc;
903
904 dprintk("RPC: %s: entering, connected is %d\n",
905 __func__, ep->rep_connected);
906
254f91e2
CL
907 cancel_delayed_work_sync(&ep->rep_connect_worker);
908
c56c65fb 909 if (ia->ri_id->qp) {
282191cb 910 rpcrdma_ep_disconnect(ep, ia);
fee08caf
TT
911 rdma_destroy_qp(ia->ri_id);
912 ia->ri_id->qp = NULL;
c56c65fb
TT
913 }
914
c05fbb5a 915 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
c56c65fb 916
fc664485
CL
917 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
918 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
919 if (rc)
920 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
921 __func__, rc);
922
923 rpcrdma_clean_cq(ep->rep_attr.send_cq);
924 rc = ib_destroy_cq(ep->rep_attr.send_cq);
c56c65fb
TT
925 if (rc)
926 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
927 __func__, rc);
c56c65fb
TT
928}
929
930/*
931 * Connect unconnected endpoint.
932 */
933int
934rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
935{
73806c88 936 struct rdma_cm_id *id, *old;
c56c65fb
TT
937 int rc = 0;
938 int retry_count = 0;
c56c65fb 939
c055551e 940 if (ep->rep_connected != 0) {
c56c65fb
TT
941 struct rpcrdma_xprt *xprt;
942retry:
ec62f40d 943 dprintk("RPC: %s: reconnecting...\n", __func__);
282191cb
CL
944
945 rpcrdma_ep_disconnect(ep, ia);
a7bc211a 946 rpcrdma_flush_cqs(ep);
c56c65fb 947
467c9674
CL
948 switch (ia->ri_memreg_strategy) {
949 case RPCRDMA_FRMR:
9f9d802a 950 rpcrdma_reset_frmrs(ia);
467c9674
CL
951 break;
952 case RPCRDMA_MTHCAFMR:
953 rpcrdma_reset_fmrs(ia);
954 break;
955 case RPCRDMA_ALLPHYSICAL:
956 break;
957 default:
958 rc = -EIO;
959 goto out;
960 }
9f9d802a 961
c56c65fb
TT
962 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
963 id = rpcrdma_create_id(xprt, ia,
964 (struct sockaddr *)&xprt->rx_data.addr);
965 if (IS_ERR(id)) {
ec62f40d 966 rc = -EHOSTUNREACH;
c56c65fb
TT
967 goto out;
968 }
969 /* TEMP TEMP TEMP - fail if new device:
970 * Deregister/remarshal *all* requests!
971 * Close and recreate adapter, pd, etc!
972 * Re-determine all attributes still sane!
973 * More stuff I haven't thought of!
974 * Rrrgh!
975 */
976 if (ia->ri_id->device != id->device) {
977 printk("RPC: %s: can't reconnect on "
978 "different device!\n", __func__);
979 rdma_destroy_id(id);
ec62f40d 980 rc = -ENETUNREACH;
c56c65fb
TT
981 goto out;
982 }
983 /* END TEMP */
ec62f40d
CL
984 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
985 if (rc) {
986 dprintk("RPC: %s: rdma_create_qp failed %i\n",
987 __func__, rc);
988 rdma_destroy_id(id);
989 rc = -ENETUNREACH;
990 goto out;
991 }
73806c88
CL
992
993 write_lock(&ia->ri_qplock);
994 old = ia->ri_id;
c56c65fb 995 ia->ri_id = id;
73806c88
CL
996 write_unlock(&ia->ri_qplock);
997
998 rdma_destroy_qp(old);
999 rdma_destroy_id(old);
ec62f40d
CL
1000 } else {
1001 dprintk("RPC: %s: connecting...\n", __func__);
1002 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
1003 if (rc) {
1004 dprintk("RPC: %s: rdma_create_qp failed %i\n",
1005 __func__, rc);
1006 /* do not update ep->rep_connected */
1007 return -ENETUNREACH;
1008 }
c56c65fb
TT
1009 }
1010
c56c65fb
TT
1011 ep->rep_connected = 0;
1012
1013 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
1014 if (rc) {
1015 dprintk("RPC: %s: rdma_connect() failed with %i\n",
1016 __func__, rc);
1017 goto out;
1018 }
1019
c56c65fb
TT
1020 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
1021
1022 /*
1023 * Check state. A non-peer reject indicates no listener
1024 * (ECONNREFUSED), which may be a transient state. All
1025 * others indicate a transport condition which has already
 1026 * undergone a best-effort connection attempt.
1027 */
f64f9e71
JP
1028 if (ep->rep_connected == -ECONNREFUSED &&
1029 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
c56c65fb
TT
1030 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
1031 goto retry;
1032 }
1033 if (ep->rep_connected <= 0) {
1034 /* Sometimes, the only way to reliably connect to remote
1035 * CMs is to use same nonzero values for ORD and IRD. */
b334eaab
TT
1036 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
1037 (ep->rep_remote_cma.responder_resources == 0 ||
1038 ep->rep_remote_cma.initiator_depth !=
1039 ep->rep_remote_cma.responder_resources)) {
1040 if (ep->rep_remote_cma.responder_resources == 0)
1041 ep->rep_remote_cma.responder_resources = 1;
1042 ep->rep_remote_cma.initiator_depth =
1043 ep->rep_remote_cma.responder_resources;
c56c65fb 1044 goto retry;
b334eaab 1045 }
c56c65fb
TT
1046 rc = ep->rep_connected;
1047 } else {
1048 dprintk("RPC: %s: connected\n", __func__);
1049 }
1050
1051out:
1052 if (rc)
1053 ep->rep_connected = rc;
1054 return rc;
1055}
1056
1057/*
1058 * rpcrdma_ep_disconnect
1059 *
1060 * This is separate from destroy to facilitate the ability
1061 * to reconnect without recreating the endpoint.
1062 *
1063 * This call is not reentrant, and must not be made in parallel
1064 * on the same endpoint.
1065 */
282191cb 1066void
c56c65fb
TT
1067rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
1068{
1069 int rc;
1070
a7bc211a 1071 rpcrdma_flush_cqs(ep);
c56c65fb
TT
1072 rc = rdma_disconnect(ia->ri_id);
1073 if (!rc) {
1074 /* returns without wait if not connected */
1075 wait_event_interruptible(ep->rep_connect_wait,
1076 ep->rep_connected != 1);
1077 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
1078 (ep->rep_connected == 1) ? "still " : "dis");
1079 } else {
1080 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
1081 ep->rep_connected = rc;
1082 }
c56c65fb
TT
1083}
1084
1392402c
CL
1085static struct rpcrdma_req *
1086rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
1087{
1392402c 1088 struct rpcrdma_req *req;
1392402c 1089
85275c87 1090 req = kzalloc(sizeof(*req), GFP_KERNEL);
1392402c 1091 if (req == NULL)
85275c87 1092 return ERR_PTR(-ENOMEM);
1392402c 1093
1392402c
CL
1094 req->rl_buffer = &r_xprt->rx_buf;
1095 return req;
1392402c
CL
1096}
1097
1098static struct rpcrdma_rep *
1099rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
1100{
1101 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1392402c
CL
1102 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1103 struct rpcrdma_rep *rep;
1104 int rc;
1105
1106 rc = -ENOMEM;
6b1184cd 1107 rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1392402c
CL
1108 if (rep == NULL)
1109 goto out;
1392402c 1110
6b1184cd
CL
1111 rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
1112 GFP_KERNEL);
1113 if (IS_ERR(rep->rr_rdmabuf)) {
1114 rc = PTR_ERR(rep->rr_rdmabuf);
1392402c 1115 goto out_free;
6b1184cd 1116 }
1392402c
CL
1117
1118 rep->rr_buffer = &r_xprt->rx_buf;
1119 return rep;
1120
1121out_free:
1122 kfree(rep);
1123out:
1124 return ERR_PTR(rc);
1125}
1126
2e84522c
CL
1127static int
1128rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1129{
1130 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1131 struct ib_fmr_attr fmr_attr = {
1132 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1133 .max_maps = 1,
1134 .page_shift = PAGE_SHIFT
1135 };
1136 struct rpcrdma_mw *r;
1137 int i, rc;
1138
1139 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1140 dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
1141
1142 while (i--) {
1143 r = kzalloc(sizeof(*r), GFP_KERNEL);
1144 if (r == NULL)
1145 return -ENOMEM;
1146
1147 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1148 if (IS_ERR(r->r.fmr)) {
1149 rc = PTR_ERR(r->r.fmr);
1150 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1151 __func__, rc);
1152 goto out_free;
1153 }
1154
1155 list_add(&r->mw_list, &buf->rb_mws);
1156 list_add(&r->mw_all, &buf->rb_all);
1157 }
1158 return 0;
1159
1160out_free:
1161 kfree(r);
1162 return rc;
1163}
1164
1165static int
1166rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1167{
1168 struct rpcrdma_frmr *f;
1169 struct rpcrdma_mw *r;
1170 int i, rc;
1171
1172 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1173 dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
1174
1175 while (i--) {
1176 r = kzalloc(sizeof(*r), GFP_KERNEL);
1177 if (r == NULL)
1178 return -ENOMEM;
1179 f = &r->r.frmr;
1180
1181 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1182 ia->ri_max_frmr_depth);
1183 if (IS_ERR(f->fr_mr)) {
1184 rc = PTR_ERR(f->fr_mr);
1185 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1186 "failed %i\n", __func__, rc);
1187 goto out_free;
1188 }
1189
1190 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1191 ia->ri_max_frmr_depth);
1192 if (IS_ERR(f->fr_pgl)) {
1193 rc = PTR_ERR(f->fr_pgl);
1194 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1195 "failed %i\n", __func__, rc);
1196
1197 ib_dereg_mr(f->fr_mr);
1198 goto out_free;
1199 }
1200
1201 list_add(&r->mw_list, &buf->rb_mws);
1202 list_add(&r->mw_all, &buf->rb_all);
1203 }
1204
1205 return 0;
1206
1207out_free:
1208 kfree(r);
1209 return rc;
1210}
1211
c56c65fb 1212int
ac920d04 1213rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
c56c65fb 1214{
ac920d04
CL
1215 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1216 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1217 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
c56c65fb 1218 char *p;
1392402c 1219 size_t len;
c56c65fb
TT
1220 int i, rc;
1221
1222 buf->rb_max_requests = cdata->max_requests;
1223 spin_lock_init(&buf->rb_lock);
c56c65fb
TT
1224
1225 /* Need to allocate:
1226 * 1. arrays for send and recv pointers
1227 * 2. arrays of struct rpcrdma_req to fill in pointers
1228 * 3. array of struct rpcrdma_rep for replies
c56c65fb
TT
1229 * Send/recv buffers in req/rep need to be registered
1230 */
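 /* Layout of the single rb_pool allocation (a sketch of the pointer
  * arithmetic below): an array of rpcrdma_req pointers immediately
  * followed by an array of rpcrdma_rep pointers. The req and rep
  * structures themselves are allocated separately by
  * rpcrdma_create_req() and rpcrdma_create_rep().
  */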
c56c65fb
TT
1231 len = buf->rb_max_requests *
1232 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
c56c65fb 1233
c56c65fb
TT
1234 p = kzalloc(len, GFP_KERNEL);
1235 if (p == NULL) {
1236 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1237 __func__, len);
1238 rc = -ENOMEM;
1239 goto out;
1240 }
1241 buf->rb_pool = p; /* for freeing it later */
1242
1243 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1244 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1245 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1246 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1247
c56c65fb 1248 INIT_LIST_HEAD(&buf->rb_mws);
3111d72c 1249 INIT_LIST_HEAD(&buf->rb_all);
c56c65fb 1250 switch (ia->ri_memreg_strategy) {
3197d309 1251 case RPCRDMA_FRMR:
2e84522c
CL
1252 rc = rpcrdma_init_frmrs(ia, buf);
1253 if (rc)
1254 goto out;
3197d309 1255 break;
c56c65fb 1256 case RPCRDMA_MTHCAFMR:
2e84522c
CL
1257 rc = rpcrdma_init_fmrs(ia, buf);
1258 if (rc)
1259 goto out;
c56c65fb 1260 break;
c56c65fb
TT
1261 default:
1262 break;
1263 }
1264
c56c65fb
TT
1265 for (i = 0; i < buf->rb_max_requests; i++) {
1266 struct rpcrdma_req *req;
1267 struct rpcrdma_rep *rep;
1268
1392402c
CL
1269 req = rpcrdma_create_req(r_xprt);
1270 if (IS_ERR(req)) {
c56c65fb
TT
1271 dprintk("RPC: %s: request buffer %d alloc"
1272 " failed\n", __func__, i);
1392402c 1273 rc = PTR_ERR(req);
c56c65fb
TT
1274 goto out;
1275 }
c56c65fb 1276 buf->rb_send_bufs[i] = req;
c56c65fb 1277
1392402c
CL
1278 rep = rpcrdma_create_rep(r_xprt);
1279 if (IS_ERR(rep)) {
c56c65fb
TT
1280 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1281 __func__, i);
1392402c 1282 rc = PTR_ERR(rep);
c56c65fb
TT
1283 goto out;
1284 }
c56c65fb 1285 buf->rb_recv_bufs[i] = rep;
c56c65fb 1286 }
1392402c 1287
c56c65fb
TT
1288 return 0;
1289out:
1290 rpcrdma_buffer_destroy(buf);
1291 return rc;
1292}
1293
1392402c
CL
1294static void
1295rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
1296{
1297 if (!rep)
1298 return;
1299
6b1184cd 1300 rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
1392402c
CL
1301 kfree(rep);
1302}
1303
1304static void
1305rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
1306{
1307 if (!req)
1308 return;
1309
0ca77dc3 1310 rpcrdma_free_regbuf(ia, req->rl_sendbuf);
85275c87 1311 rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
1392402c
CL
1312 kfree(req);
1313}
1314
2e84522c
CL
1315static void
1316rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1317{
1318 struct rpcrdma_mw *r;
1319 int rc;
1320
1321 while (!list_empty(&buf->rb_all)) {
1322 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1323 list_del(&r->mw_all);
1324 list_del(&r->mw_list);
1325
1326 rc = ib_dealloc_fmr(r->r.fmr);
1327 if (rc)
1328 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1329 __func__, rc);
1330
1331 kfree(r);
1332 }
1333}
1334
1335static void
1336rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1337{
1338 struct rpcrdma_mw *r;
1339 int rc;
1340
1341 while (!list_empty(&buf->rb_all)) {
1342 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1343 list_del(&r->mw_all);
1344 list_del(&r->mw_list);
1345
1346 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1347 if (rc)
1348 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1349 __func__, rc);
1350 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1351
1352 kfree(r);
1353 }
1354}
1355
c56c65fb
TT
1356void
1357rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1358{
c56c65fb 1359 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
2e84522c 1360 int i;
c56c65fb
TT
1361
1362 /* clean up in reverse order from create
1363 * 1. recv mr memory (mr free, then kfree)
c56c65fb 1364 * 2. send mr memory (mr free, then kfree)
2e84522c 1365 * 3. MWs
c56c65fb
TT
1366 */
1367 dprintk("RPC: %s: entering\n", __func__);
1368
1369 for (i = 0; i < buf->rb_max_requests; i++) {
1392402c
CL
1370 if (buf->rb_recv_bufs)
1371 rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
1372 if (buf->rb_send_bufs)
1373 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
c56c65fb
TT
1374 }
1375
2e84522c
CL
1376 switch (ia->ri_memreg_strategy) {
1377 case RPCRDMA_FRMR:
1378 rpcrdma_destroy_frmrs(buf);
1379 break;
1380 case RPCRDMA_MTHCAFMR:
1381 rpcrdma_destroy_fmrs(buf);
1382 break;
1383 default:
1384 break;
4034ba04
AA
1385 }
1386
c56c65fb
TT
1387 kfree(buf->rb_pool);
1388}
1389
467c9674
CL
1390/* After a disconnect, unmap all FMRs.
1391 *
1392 * This is invoked only in the transport connect worker in order
1393 * to serialize with rpcrdma_register_fmr_external().
1394 */
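/* Unlike the FRMR case below, FMRs survive a disconnect intact:
 * resetting them only requires ib_unmap_fmr() on each one, with no
 * need to destroy and re-allocate the memory regions.
 */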
1395static void
1396rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
1397{
1398 struct rpcrdma_xprt *r_xprt =
1399 container_of(ia, struct rpcrdma_xprt, rx_ia);
1400 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1401 struct list_head *pos;
1402 struct rpcrdma_mw *r;
1403 LIST_HEAD(l);
1404 int rc;
1405
1406 list_for_each(pos, &buf->rb_all) {
1407 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1408
1409 INIT_LIST_HEAD(&l);
1410 list_add(&r->r.fmr->list, &l);
1411 rc = ib_unmap_fmr(&l);
1412 if (rc)
1413 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
1414 __func__, rc);
1415 }
1416}
1417
9f9d802a
CL
1418/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1419 * an unusable state. Find FRMRs in this state and dereg / reg
1420 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1421 * also torn down.
1422 *
1423 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1424 *
1425 * This is invoked only in the transport connect worker in order
1426 * to serialize with rpcrdma_register_frmr_external().
1427 */
1428static void
1429rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1430{
1431 struct rpcrdma_xprt *r_xprt =
1432 container_of(ia, struct rpcrdma_xprt, rx_ia);
1433 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1434 struct list_head *pos;
1435 struct rpcrdma_mw *r;
1436 int rc;
1437
1438 list_for_each(pos, &buf->rb_all) {
1439 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1440
1441 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1442 continue;
1443
1444 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1445 if (rc)
1446 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1447 __func__, rc);
1448 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1449
1450 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1451 ia->ri_max_frmr_depth);
1452 if (IS_ERR(r->r.frmr.fr_mr)) {
1453 rc = PTR_ERR(r->r.frmr.fr_mr);
1454 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1455 " failed %i\n", __func__, rc);
1456 continue;
1457 }
1458 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1459 ia->ri_id->device,
1460 ia->ri_max_frmr_depth);
1461 if (IS_ERR(r->r.frmr.fr_pgl)) {
1462 rc = PTR_ERR(r->r.frmr.fr_pgl);
1463 dprintk("RPC: %s: "
1464 "ib_alloc_fast_reg_page_list "
1465 "failed %i\n", __func__, rc);
1466
1467 ib_dereg_mr(r->r.frmr.fr_mr);
1468 continue;
1469 }
1470 r->r.frmr.fr_state = FRMR_IS_INVALID;
1471 }
1472}
1473
c2922c02
CL
1474/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1475 * some req segments uninitialized.
1476 */
1477static void
1478rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1479{
1480 if (*mw) {
1481 list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
1482 *mw = NULL;
1483 }
1484}
1485
1486/* Cycle mw's back in reverse order, and "spin" them.
1487 * This delays and scrambles reuse as much as possible.
1488 */
1489static void
1490rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1491{
1492 struct rpcrdma_mr_seg *seg = req->rl_segments;
1493 struct rpcrdma_mr_seg *seg1 = seg;
1494 int i;
1495
1496 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
3eb35810
CL
1497 rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
1498 rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
c2922c02
CL
1499}
1500
1501static void
1502rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1503{
1504 buf->rb_send_bufs[--buf->rb_send_index] = req;
1505 req->rl_niovs = 0;
1506 if (req->rl_reply) {
1507 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1508 req->rl_reply->rr_func = NULL;
1509 req->rl_reply = NULL;
1510 }
1511}
1512
6814baea 1513/* rpcrdma_unmap_one() was already done during deregistration.
ddb6bebc
CL
1514 * Redo only the ib_post_send().
1515 */
1516static void
1517rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1518{
1519 struct rpcrdma_xprt *r_xprt =
1520 container_of(ia, struct rpcrdma_xprt, rx_ia);
1521 struct ib_send_wr invalidate_wr, *bad_wr;
1522 int rc;
1523
1524 dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
1525
1526 /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
dab7e3b8 1527 r->r.frmr.fr_state = FRMR_IS_INVALID;
ddb6bebc
CL
1528
1529 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1530 invalidate_wr.wr_id = (unsigned long)(void *)r;
1531 invalidate_wr.opcode = IB_WR_LOCAL_INV;
ddb6bebc
CL
1532 invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1533 DECR_CQCOUNT(&r_xprt->rx_ep);
1534
1535 dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
1536 __func__, r, r->r.frmr.fr_mr->rkey);
1537
1538 read_lock(&ia->ri_qplock);
1539 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1540 read_unlock(&ia->ri_qplock);
1541 if (rc) {
1542 /* Force rpcrdma_buffer_get() to retry */
1543 r->r.frmr.fr_state = FRMR_IS_STALE;
1544 dprintk("RPC: %s: ib_post_send failed, %i\n",
1545 __func__, rc);
1546 }
1547}
1548
1549static void
1550rpcrdma_retry_flushed_linv(struct list_head *stale,
1551 struct rpcrdma_buffer *buf)
1552{
1553 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1554 struct list_head *pos;
1555 struct rpcrdma_mw *r;
1556 unsigned long flags;
1557
1558 list_for_each(pos, stale) {
1559 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1560 rpcrdma_retry_local_inv(r, ia);
1561 }
1562
1563 spin_lock_irqsave(&buf->rb_lock, flags);
1564 list_splice_tail(stale, &buf->rb_mws);
1565 spin_unlock_irqrestore(&buf->rb_lock, flags);
1566}
1567
1568static struct rpcrdma_req *
1569rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1570 struct list_head *stale)
1571{
1572 struct rpcrdma_mw *r;
1573 int i;
1574
1575 i = RPCRDMA_MAX_SEGS - 1;
1576 while (!list_empty(&buf->rb_mws)) {
1577 r = list_entry(buf->rb_mws.next,
1578 struct rpcrdma_mw, mw_list);
1579 list_del(&r->mw_list);
1580 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1581 list_add(&r->mw_list, stale);
1582 continue;
1583 }
3eb35810 1584 req->rl_segments[i].rl_mw = r;
ddb6bebc
CL
1585 if (unlikely(i-- == 0))
1586 return req; /* Success */
1587 }
1588
1589 /* Not enough entries on rb_mws for this req */
1590 rpcrdma_buffer_put_sendbuf(req, buf);
1591 rpcrdma_buffer_put_mrs(req, buf);
1592 return NULL;
1593}
1594
c2922c02 1595static struct rpcrdma_req *
ddb6bebc 1596rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
c2922c02
CL
1597{
1598 struct rpcrdma_mw *r;
1599 int i;
1600
1601 i = RPCRDMA_MAX_SEGS - 1;
1602 while (!list_empty(&buf->rb_mws)) {
1603 r = list_entry(buf->rb_mws.next,
1604 struct rpcrdma_mw, mw_list);
1605 list_del(&r->mw_list);
3eb35810 1606 req->rl_segments[i].rl_mw = r;
c2922c02
CL
1607 if (unlikely(i-- == 0))
1608 return req; /* Success */
1609 }
1610
1611 /* Not enough entries on rb_mws for this req */
1612 rpcrdma_buffer_put_sendbuf(req, buf);
1613 rpcrdma_buffer_put_mrs(req, buf);
1614 return NULL;
1615}
1616
c56c65fb
TT
1617/*
1618 * Get a set of request/reply buffers.
1619 *
1620 * Reply buffer (if needed) is attached to send buffer upon return.
1621 * Rule:
1622 * rb_send_index and rb_recv_index MUST always be pointing to the
1623 * *next* available buffer (non-NULL). They are incremented after
1624 * removing buffers, and decremented *before* returning them.
1625 */
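/* For illustration: with rb_max_requests == 2 and every buffer
 * available (both indices at 0), taking one req/rep pair advances
 * both indices to 1, and rpcrdma_buffer_put() pre-decrements them
 * back to 0 when the pair is returned.
 */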
1626struct rpcrdma_req *
1627rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1628{
c2922c02 1629 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
ddb6bebc 1630 struct list_head stale;
c56c65fb
TT
1631 struct rpcrdma_req *req;
1632 unsigned long flags;
1633
1634 spin_lock_irqsave(&buffers->rb_lock, flags);
1635 if (buffers->rb_send_index == buffers->rb_max_requests) {
1636 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1637 dprintk("RPC: %s: out of request buffers\n", __func__);
1638 return ((struct rpcrdma_req *)NULL);
1639 }
1640
1641 req = buffers->rb_send_bufs[buffers->rb_send_index];
1642 if (buffers->rb_send_index < buffers->rb_recv_index) {
1643 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1644 __func__,
1645 buffers->rb_recv_index - buffers->rb_send_index);
1646 req->rl_reply = NULL;
1647 } else {
1648 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1649 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1650 }
1651 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
ddb6bebc
CL
1652
1653 INIT_LIST_HEAD(&stale);
c2922c02
CL
1654 switch (ia->ri_memreg_strategy) {
1655 case RPCRDMA_FRMR:
ddb6bebc
CL
1656 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1657 break;
c2922c02 1658 case RPCRDMA_MTHCAFMR:
ddb6bebc 1659 req = rpcrdma_buffer_get_fmrs(req, buffers);
c2922c02
CL
1660 break;
1661 default:
1662 break;
c56c65fb
TT
1663 }
1664 spin_unlock_irqrestore(&buffers->rb_lock, flags);
ddb6bebc
CL
1665 if (!list_empty(&stale))
1666 rpcrdma_retry_flushed_linv(&stale, buffers);
c56c65fb
TT
1667 return req;
1668}
1669
1670/*
1671 * Put request/reply buffers back into pool.
1672 * Pre-decrement counter/array index.
1673 */
1674void
1675rpcrdma_buffer_put(struct rpcrdma_req *req)
1676{
1677 struct rpcrdma_buffer *buffers = req->rl_buffer;
1678 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
c56c65fb
TT
1679 unsigned long flags;
1680
c56c65fb 1681 spin_lock_irqsave(&buffers->rb_lock, flags);
c2922c02 1682 rpcrdma_buffer_put_sendbuf(req, buffers);
c56c65fb 1683 switch (ia->ri_memreg_strategy) {
3197d309 1684 case RPCRDMA_FRMR:
c56c65fb 1685 case RPCRDMA_MTHCAFMR:
c2922c02 1686 rpcrdma_buffer_put_mrs(req, buffers);
c56c65fb
TT
1687 break;
1688 default:
1689 break;
1690 }
1691 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1692}
1693
1694/*
1695 * Recover reply buffers from pool.
1696 * This happens when recovering from error conditions.
1697 * Post-increment counter/array index.
1698 */
1699void
1700rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1701{
1702 struct rpcrdma_buffer *buffers = req->rl_buffer;
1703 unsigned long flags;
1704
c56c65fb
TT
1705 spin_lock_irqsave(&buffers->rb_lock, flags);
1706 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1707 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1708 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1709 }
1710 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1711}
1712
1713/*
1714 * Put reply buffers back into pool when not attached to
b45ccfd2 1715 * request. This happens in error conditions.
c56c65fb
TT
1716 */
1717void
1718rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1719{
1720 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1721 unsigned long flags;
1722
1723 rep->rr_func = NULL;
1724 spin_lock_irqsave(&buffers->rb_lock, flags);
1725 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1726 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1727}
1728
1729/*
1730 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1731 */
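/* Three lkey sources are tried in order below: the device's global
 * DMA lkey (when the HCA advertises one), the ia->ri_bind_mem DMA MR
 * created at IA open time, and finally a one-off ib_reg_phys_mr()
 * registration as a last resort.
 */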
1732
df515ca7 1733static int
c56c65fb
TT
1734rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1735 struct ib_mr **mrp, struct ib_sge *iov)
1736{
1737 struct ib_phys_buf ipb;
1738 struct ib_mr *mr;
1739 int rc;
1740
1741 /*
1742 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1743 */
1744 iov->addr = ib_dma_map_single(ia->ri_id->device,
1745 va, len, DMA_BIDIRECTIONAL);
bf858ab0
YB
1746 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1747 return -ENOMEM;
1748
c56c65fb
TT
1749 iov->length = len;
1750
bd7ed1d1
TT
1751 if (ia->ri_have_dma_lkey) {
1752 *mrp = NULL;
1753 iov->lkey = ia->ri_dma_lkey;
1754 return 0;
1755 } else if (ia->ri_bind_mem != NULL) {
c56c65fb
TT
1756 *mrp = NULL;
1757 iov->lkey = ia->ri_bind_mem->lkey;
1758 return 0;
1759 }
1760
1761 ipb.addr = iov->addr;
1762 ipb.size = iov->length;
1763 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1764 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1765
1766 dprintk("RPC: %s: phys convert: 0x%llx "
1767 "registered 0x%llx length %d\n",
a56daeb7
AM
1768 __func__, (unsigned long long)ipb.addr,
1769 (unsigned long long)iov->addr, len);
c56c65fb
TT
1770
1771 if (IS_ERR(mr)) {
1772 *mrp = NULL;
1773 rc = PTR_ERR(mr);
1774 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1775 } else {
1776 *mrp = mr;
1777 iov->lkey = mr->lkey;
1778 rc = 0;
1779 }
1780
1781 return rc;
1782}
1783
df515ca7 1784static int
c56c65fb
TT
1785rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1786 struct ib_mr *mr, struct ib_sge *iov)
1787{
1788 int rc;
1789
1790 ib_dma_unmap_single(ia->ri_id->device,
1791 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1792
1793 if (NULL == mr)
1794 return 0;
1795
1796 rc = ib_dereg_mr(mr);
1797 if (rc)
1798 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1799 return rc;
1800}
1801
9128c3e7
CL
1802/**
1803 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
1804 * @ia: controlling rpcrdma_ia
1805 * @size: size of buffer to be allocated, in bytes
1806 * @flags: GFP flags
1807 *
1808 * Returns pointer to private header of an area of internally
1809 * registered memory, or an ERR_PTR. The registered buffer follows
1810 * the end of the private header.
1811 *
1812 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
1813 * receiving the payload of RDMA RECV operations. regbufs are not
1814 * used for RDMA READ/WRITE operations, thus are registered only for
1815 * LOCAL access.
1816 */
1817struct rpcrdma_regbuf *
1818rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
1819{
1820 struct rpcrdma_regbuf *rb;
1821 int rc;
1822
1823 rc = -ENOMEM;
1824 rb = kmalloc(sizeof(*rb) + size, flags);
1825 if (rb == NULL)
1826 goto out;
1827
1828 rb->rg_size = size;
1829 rb->rg_owner = NULL;
1830 rc = rpcrdma_register_internal(ia, rb->rg_base, size,
1831 &rb->rg_mr, &rb->rg_iov);
1832 if (rc)
1833 goto out_free;
1834
1835 return rb;
1836
1837out_free:
1838 kfree(rb);
1839out:
1840 return ERR_PTR(rc);
1841}
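/* Callers in this file include rpcrdma_ep_create(), which uses a
 * regbuf for the optional zero pad, and rpcrdma_create_rep(), which
 * uses one as the RDMA RECV target; the sizes come from the
 * transport's create data (cdata->padding and cdata->inline_rsize).
 */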
1842
1843/**
1844 * rpcrdma_free_regbuf - deregister and free registered buffer
1845 * @ia: controlling rpcrdma_ia
1846 * @rb: regbuf to be deregistered and freed
1847 */
1848void
1849rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
1850{
1851 if (rb) {
1852 rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
1853 kfree(rb);
1854 }
1855}
1856
c56c65fb
TT
1857/*
1858 * Wrappers for chunk registration, shared by read/write chunk code.
1859 */
1860
9c1b4d77
CL
1861void
1862rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, bool writing)
c56c65fb
TT
1863{
1864 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1865 seg->mr_dmalen = seg->mr_len;
1866 if (seg->mr_page)
1867 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1868 seg->mr_page, offset_in_page(seg->mr_offset),
1869 seg->mr_dmalen, seg->mr_dir);
1870 else
1871 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1872 seg->mr_offset,
1873 seg->mr_dmalen, seg->mr_dir);
5c635e09
TT
1874 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1875 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1876 __func__,
986d4abb
RD
1877 (unsigned long long)seg->mr_dma,
1878 seg->mr_offset, seg->mr_dmalen);
5c635e09 1879 }
c56c65fb
TT
1880}
1881
9c1b4d77 1882void
c56c65fb
TT
1883rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1884{
1885 if (seg->mr_page)
1886 ib_dma_unmap_page(ia->ri_id->device,
1887 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1888 else
1889 ib_dma_unmap_single(ia->ri_id->device,
1890 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1891}
1892
c56c65fb
TT
1893/*
1894 * Prepost any receive buffer, then post send.
1895 *
1896 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1897 */
1898int
1899rpcrdma_ep_post(struct rpcrdma_ia *ia,
1900 struct rpcrdma_ep *ep,
1901 struct rpcrdma_req *req)
1902{
1903 struct ib_send_wr send_wr, *send_wr_fail;
1904 struct rpcrdma_rep *rep = req->rl_reply;
1905 int rc;
1906
1907 if (rep) {
1908 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1909 if (rc)
1910 goto out;
1911 req->rl_reply = NULL;
1912 }
1913
1914 send_wr.next = NULL;
1915 send_wr.wr_id = 0ULL; /* no send cookie */
1916 send_wr.sg_list = req->rl_send_iov;
1917 send_wr.num_sge = req->rl_niovs;
1918 send_wr.opcode = IB_WR_SEND;
c56c65fb
TT
1919 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1920 ib_dma_sync_single_for_device(ia->ri_id->device,
1921 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1922 DMA_TO_DEVICE);
1923 ib_dma_sync_single_for_device(ia->ri_id->device,
1924 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1925 DMA_TO_DEVICE);
1926 ib_dma_sync_single_for_device(ia->ri_id->device,
1927 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1928 DMA_TO_DEVICE);
1929
1930 if (DECR_CQCOUNT(ep) > 0)
1931 send_wr.send_flags = 0;
1932 else { /* Provider must take a send completion every now and then */
1933 INIT_CQCOUNT(ep);
1934 send_wr.send_flags = IB_SEND_SIGNALED;
1935 }
1936
1937 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1938 if (rc)
1939 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1940 rc);
1941out:
1942 return rc;
1943}
1944
1945/*
1946 * (Re)post a receive buffer.
1947 */
1948int
1949rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1950 struct rpcrdma_ep *ep,
1951 struct rpcrdma_rep *rep)
1952{
1953 struct ib_recv_wr recv_wr, *recv_wr_fail;
1954 int rc;
1955
1956 recv_wr.next = NULL;
1957 recv_wr.wr_id = (u64) (unsigned long) rep;
6b1184cd 1958 recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
c56c65fb
TT
1959 recv_wr.num_sge = 1;
1960
1961 ib_dma_sync_single_for_cpu(ia->ri_id->device,
6b1184cd
CL
1962 rdmab_addr(rep->rr_rdmabuf),
1963 rdmab_length(rep->rr_rdmabuf),
1964 DMA_BIDIRECTIONAL);
c56c65fb 1965
c56c65fb
TT
1966 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1967
1968 if (rc)
1969 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1970 rc);
1971 return rc;
1972}
43e95988 1973
1c9351ee 1974/* How many chunk list items fit within our inline buffers?
43e95988 1975 */
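/* For illustration only (the real constants are defined elsewhere):
 * with 1024-byte inline buffers, a 28-byte RPCRDMA_HDRLEN_MIN and a
 * 16-byte struct rpcrdma_segment, bytes = 996, 996 / 16 = 62, and
 * the fls()-based rounding below yields 1 << 5 = 32 segments.
 */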
1c9351ee
CL
1976unsigned int
1977rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
43e95988
CL
1978{
1979 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1c9351ee 1980 int bytes, segments;
43e95988 1981
1c9351ee
CL
1982 bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
1983 bytes -= RPCRDMA_HDRLEN_MIN;
1984 if (bytes < sizeof(struct rpcrdma_segment) * 2) {
1985 pr_warn("RPC: %s: inline threshold too small\n",
1986 __func__);
1987 return 0;
43e95988 1988 }
1c9351ee
CL
1989
1990 segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
1991 dprintk("RPC: %s: max chunk list size = %d segments\n",
1992 __func__, segments);
1993 return segments;
43e95988 1994}