RDMA/cma: Pass QP type into rdma_create_id()
[deliverable/linux.git] / net / sunrpc / xprtrdma / verbs.c
CommitLineData
/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following
 * disclaimer in the documentation and/or other materials provided
 * with the distribution.
 *
 * Neither the name of the Network Appliance, Inc. nor the names of
 * its contributors may be used to endorse or promote products
 * derived from this software without specific prior written
 * permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
39
c56c65fb
TT
/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */
49
50#include <linux/pci.h> /* for Tavor hack below */
5a0e3ad6 51#include <linux/slab.h>
c56c65fb 52
f58851e6
TT
53#include "xprt_rdma.h"
54
c56c65fb
TT
55/*
56 * Globals/Macros
57 */
58
59#ifdef RPC_DEBUG
60# define RPCDBG_FACILITY RPCDBG_TRANS
61#endif
62
63/*
64 * internal functions
65 */
66
67/*
68 * handle replies in tasklet context, using a single, global list
69 * rdma tasklet function -- just turn around and call the func
70 * for all replies on the list
71 */
72
73static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
74static LIST_HEAD(rpcrdma_tasklets_g);
75
76static void
77rpcrdma_run_tasklet(unsigned long data)
78{
79 struct rpcrdma_rep *rep;
80 void (*func)(struct rpcrdma_rep *);
81 unsigned long flags;
82
83 data = data;
84 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
85 while (!list_empty(&rpcrdma_tasklets_g)) {
86 rep = list_entry(rpcrdma_tasklets_g.next,
87 struct rpcrdma_rep, rr_list);
88 list_del(&rep->rr_list);
89 func = rep->rr_func;
90 rep->rr_func = NULL;
91 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
92
93 if (func)
94 func(rep);
95 else
96 rpcrdma_recv_buffer_put(rep);
97
98 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
99 }
100 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
101}
102
103static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
104
105static inline void
106rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
107{
108 unsigned long flags;
109
110 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
111 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
112 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
113 tasklet_schedule(&rpcrdma_tasklet_g);
114}
115
116static void
117rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
118{
119 struct rpcrdma_ep *ep = context;
120
121 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
122 __func__, event->event, event->device->name, context);
123 if (ep->rep_connected == 1) {
124 ep->rep_connected = -EIO;
125 ep->rep_func(ep);
126 wake_up_all(&ep->rep_connect_wait);
127 }
128}
129
130static void
131rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
132{
133 struct rpcrdma_ep *ep = context;
134
135 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
136 __func__, event->event, event->device->name, context);
137 if (ep->rep_connected == 1) {
138 ep->rep_connected = -EIO;
139 ep->rep_func(ep);
140 wake_up_all(&ep->rep_connect_wait);
141 }
142}
143
144static inline
145void rpcrdma_event_process(struct ib_wc *wc)
146{
5c635e09 147 struct rpcrdma_mw *frmr;
c56c65fb
TT
148 struct rpcrdma_rep *rep =
149 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
150
151 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
152 __func__, rep, wc->status, wc->opcode, wc->byte_len);
153
154 if (!rep) /* send or bind completion that we don't care about */
155 return;
156
157 if (IB_WC_SUCCESS != wc->status) {
5c635e09
TT
158 dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
159 __func__, wc->opcode, wc->status);
c56c65fb 160 rep->rr_len = ~0U;
5c635e09
TT
161 if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
162 rpcrdma_schedule_tasklet(rep);
c56c65fb
TT
163 return;
164 }
165
166 switch (wc->opcode) {
5c635e09
TT
167 case IB_WC_FAST_REG_MR:
168 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
169 frmr->r.frmr.state = FRMR_IS_VALID;
170 break;
171 case IB_WC_LOCAL_INV:
172 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
173 frmr->r.frmr.state = FRMR_IS_INVALID;
174 break;
c56c65fb
TT
175 case IB_WC_RECV:
176 rep->rr_len = wc->byte_len;
177 ib_dma_sync_single_for_cpu(
178 rdmab_to_ia(rep->rr_buffer)->ri_id->device,
179 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
180 /* Keep (only) the most recent credits, after check validity */
181 if (rep->rr_len >= 16) {
182 struct rpcrdma_msg *p =
183 (struct rpcrdma_msg *) rep->rr_base;
184 unsigned int credits = ntohl(p->rm_credit);
185 if (credits == 0) {
186 dprintk("RPC: %s: server"
187 " dropped credits to 0!\n", __func__);
188 /* don't deadlock */
189 credits = 1;
190 } else if (credits > rep->rr_buffer->rb_max_requests) {
191 dprintk("RPC: %s: server"
192 " over-crediting: %d (%d)\n",
193 __func__, credits,
194 rep->rr_buffer->rb_max_requests);
195 credits = rep->rr_buffer->rb_max_requests;
196 }
197 atomic_set(&rep->rr_buffer->rb_credits, credits);
198 }
199 /* fall through */
200 case IB_WC_BIND_MW:
201 rpcrdma_schedule_tasklet(rep);
202 break;
203 default:
204 dprintk("RPC: %s: unexpected WC event %X\n",
205 __func__, wc->opcode);
206 break;
207 }
208}
209
210static inline int
211rpcrdma_cq_poll(struct ib_cq *cq)
212{
213 struct ib_wc wc;
214 int rc;
215
216 for (;;) {
217 rc = ib_poll_cq(cq, 1, &wc);
218 if (rc < 0) {
219 dprintk("RPC: %s: ib_poll_cq failed %i\n",
220 __func__, rc);
221 return rc;
222 }
223 if (rc == 0)
224 break;
225
226 rpcrdma_event_process(&wc);
227 }
228
229 return 0;
230}
231
232/*
233 * rpcrdma_cq_event_upcall
234 *
235 * This upcall handles recv, send, bind and unbind events.
236 * It is reentrant but processes single events in order to maintain
237 * ordering of receives to keep server credits.
238 *
239 * It is the responsibility of the scheduled tasklet to return
240 * recv buffers to the pool. NOTE: this affects synchronization of
241 * connection shutdown. That is, the structures required for
242 * the completion of the reply handler must remain intact until
243 * all memory has been reclaimed.
244 *
245 * Note that send events are suppressed and do not result in an upcall.
246 */
247static void
248rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
249{
250 int rc;
251
252 rc = rpcrdma_cq_poll(cq);
253 if (rc)
254 return;
255
256 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
257 if (rc) {
258 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
259 __func__, rc);
260 return;
261 }
262
263 rpcrdma_cq_poll(cq);
264}
265
#ifdef RPC_DEBUG
/*
 * Human-readable names for connection manager events, used only in
 * debug output. rpcrdma_conn_upcall() indexes this table directly
 * with the numeric event code (for codes 0 through 11), so the order
 * of the entries must not change.
 */
static const char * const conn[] = {
	"address resolved",	/*  0 */
	"address error",	/*  1 */
	"route resolved",	/*  2 */
	"route error",		/*  3 */
	"connect request",	/*  4 */
	"connect response",	/*  5 */
	"connect error",	/*  6 */
	"unreachable",		/*  7 */
	"rejected",		/*  8 */
	"established",		/*  9 */
	"disconnected",		/* 10 */
	"device removal"	/* 11 */
};
#endif
282
/*
 * Connection manager event handler.
 *
 * @id:    cm_id the event was raised on; its context is the
 *         rpcrdma_xprt that owns the connection
 * @event: the event being reported
 *
 * Address and route resolution events store their outcome in
 * ia->ri_async_rc and complete ia->ri_done. Connection state events
 * are translated into a value for ep->rep_connected (1 when
 * established, a negative errno otherwise); the credit count is reset
 * to 1, the endpoint's rep_func callback is invoked and sleepers on
 * rep_connect_wait are woken.
 *
 * Always returns 0.
 */
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		/*
		 * The query result is informational only and its return
		 * value is not checked, so start from a zeroed attr: the
		 * messages below then report zeros rather than stack
		 * garbage if the query fails.
		 */
		memset(&attr, 0, sizeof(attr));
		ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event <= 11) ? conn[event->event] :
						"unknown connection error",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ep, event->event);
		/* any connection state change restarts with a single credit */
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		dprintk("RPC: %s: unexpected CM event %d\n",
			__func__, event->event);
		break;
	}

#ifdef RPC_DEBUG
	if (connstate == 1) {
		int ird = attr.max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
			"on %s, memreg %d slots %d ird %d%s\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ia->ri_id->device->name,
			ia->ri_memreg_strategy,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			connstate);
	}
#endif

	return 0;
}
380
381static struct rdma_cm_id *
382rpcrdma_create_id(struct rpcrdma_xprt *xprt,
383 struct rpcrdma_ia *ia, struct sockaddr *addr)
384{
385 struct rdma_cm_id *id;
386 int rc;
387
1a954051
TT
388 init_completion(&ia->ri_done);
389
b26f9b99 390 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
c56c65fb
TT
391 if (IS_ERR(id)) {
392 rc = PTR_ERR(id);
393 dprintk("RPC: %s: rdma_create_id() failed %i\n",
394 __func__, rc);
395 return id;
396 }
397
5675add3 398 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb
TT
399 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
400 if (rc) {
401 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
402 __func__, rc);
403 goto out;
404 }
5675add3
TT
405 wait_for_completion_interruptible_timeout(&ia->ri_done,
406 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
c56c65fb
TT
407 rc = ia->ri_async_rc;
408 if (rc)
409 goto out;
410
5675add3 411 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb
TT
412 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
413 if (rc) {
414 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
415 __func__, rc);
416 goto out;
417 }
5675add3
TT
418 wait_for_completion_interruptible_timeout(&ia->ri_done,
419 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
c56c65fb
TT
420 rc = ia->ri_async_rc;
421 if (rc)
422 goto out;
423
424 return id;
425
426out:
427 rdma_destroy_id(id);
428 return ERR_PTR(rc);
429}
430
431/*
432 * Drain any cq, prior to teardown.
433 */
434static void
435rpcrdma_clean_cq(struct ib_cq *cq)
436{
437 struct ib_wc wc;
438 int count = 0;
439
440 while (1 == ib_poll_cq(cq, 1, &wc))
441 ++count;
442
443 if (count)
444 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
445 __func__, count, wc.opcode);
446}
447
448/*
449 * Exported functions.
450 */
451
452/*
453 * Open and initialize an Interface Adapter.
454 * o initializes fields of struct rpcrdma_ia, including
455 * interface and provider attributes and protection zone.
456 */
457int
458rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
459{
bd7ed1d1
TT
460 int rc, mem_priv;
461 struct ib_device_attr devattr;
c56c65fb
TT
462 struct rpcrdma_ia *ia = &xprt->rx_ia;
463
c56c65fb
TT
464 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
465 if (IS_ERR(ia->ri_id)) {
466 rc = PTR_ERR(ia->ri_id);
467 goto out1;
468 }
469
470 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
471 if (IS_ERR(ia->ri_pd)) {
472 rc = PTR_ERR(ia->ri_pd);
473 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
474 __func__, rc);
475 goto out2;
476 }
477
bd7ed1d1
TT
478 /*
479 * Query the device to determine if the requested memory
480 * registration strategy is supported. If it isn't, set the
481 * strategy to a globally supported model.
482 */
483 rc = ib_query_device(ia->ri_id->device, &devattr);
484 if (rc) {
485 dprintk("RPC: %s: ib_query_device failed %d\n",
486 __func__, rc);
487 goto out2;
488 }
489
490 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
491 ia->ri_have_dma_lkey = 1;
492 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
493 }
494
495 switch (memreg) {
496 case RPCRDMA_MEMWINDOWS:
497 case RPCRDMA_MEMWINDOWS_ASYNC:
498 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
499 dprintk("RPC: %s: MEMWINDOWS registration "
500 "specified but not supported by adapter, "
501 "using slower RPCRDMA_REGISTER\n",
502 __func__);
503 memreg = RPCRDMA_REGISTER;
504 }
505 break;
506 case RPCRDMA_MTHCAFMR:
507 if (!ia->ri_id->device->alloc_fmr) {
508#if RPCRDMA_PERSISTENT_REGISTRATION
509 dprintk("RPC: %s: MTHCAFMR registration "
510 "specified but not supported by adapter, "
511 "using riskier RPCRDMA_ALLPHYSICAL\n",
512 __func__);
513 memreg = RPCRDMA_ALLPHYSICAL;
514#else
515 dprintk("RPC: %s: MTHCAFMR registration "
516 "specified but not supported by adapter, "
517 "using slower RPCRDMA_REGISTER\n",
518 __func__);
519 memreg = RPCRDMA_REGISTER;
3197d309
TT
520#endif
521 }
522 break;
523 case RPCRDMA_FRMR:
524 /* Requires both frmr reg and local dma lkey */
525 if ((devattr.device_cap_flags &
526 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
527 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
528#if RPCRDMA_PERSISTENT_REGISTRATION
529 dprintk("RPC: %s: FRMR registration "
530 "specified but not supported by adapter, "
531 "using riskier RPCRDMA_ALLPHYSICAL\n",
532 __func__);
533 memreg = RPCRDMA_ALLPHYSICAL;
534#else
535 dprintk("RPC: %s: FRMR registration "
536 "specified but not supported by adapter, "
537 "using slower RPCRDMA_REGISTER\n",
538 __func__);
539 memreg = RPCRDMA_REGISTER;
bd7ed1d1
TT
540#endif
541 }
542 break;
543 }
544
c56c65fb
TT
545 /*
546 * Optionally obtain an underlying physical identity mapping in
547 * order to do a memory window-based bind. This base registration
548 * is protected from remote access - that is enabled only by binding
549 * for the specific bytes targeted during each RPC operation, and
550 * revoked after the corresponding completion similar to a storage
551 * adapter.
552 */
bd7ed1d1
TT
553 switch (memreg) {
554 case RPCRDMA_BOUNCEBUFFERS:
555 case RPCRDMA_REGISTER:
3197d309 556 case RPCRDMA_FRMR:
bd7ed1d1 557 break;
c56c65fb 558#if RPCRDMA_PERSISTENT_REGISTRATION
bd7ed1d1
TT
559 case RPCRDMA_ALLPHYSICAL:
560 mem_priv = IB_ACCESS_LOCAL_WRITE |
561 IB_ACCESS_REMOTE_WRITE |
562 IB_ACCESS_REMOTE_READ;
563 goto register_setup;
c56c65fb 564#endif
bd7ed1d1
TT
565 case RPCRDMA_MEMWINDOWS_ASYNC:
566 case RPCRDMA_MEMWINDOWS:
567 mem_priv = IB_ACCESS_LOCAL_WRITE |
568 IB_ACCESS_MW_BIND;
569 goto register_setup;
570 case RPCRDMA_MTHCAFMR:
571 if (ia->ri_have_dma_lkey)
c56c65fb 572 break;
bd7ed1d1
TT
573 mem_priv = IB_ACCESS_LOCAL_WRITE;
574 register_setup:
c56c65fb
TT
575 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
576 if (IS_ERR(ia->ri_bind_mem)) {
577 printk(KERN_ALERT "%s: ib_get_dma_mr for "
578 "phys register failed with %lX\n\t"
579 "Will continue with degraded performance\n",
580 __func__, PTR_ERR(ia->ri_bind_mem));
581 memreg = RPCRDMA_REGISTER;
582 ia->ri_bind_mem = NULL;
583 }
bd7ed1d1
TT
584 break;
585 default:
586 printk(KERN_ERR "%s: invalid memory registration mode %d\n",
587 __func__, memreg);
588 rc = -EINVAL;
589 goto out2;
c56c65fb 590 }
bd7ed1d1
TT
591 dprintk("RPC: %s: memory registration strategy is %d\n",
592 __func__, memreg);
c56c65fb
TT
593
594 /* Else will do memory reg/dereg for each chunk */
595 ia->ri_memreg_strategy = memreg;
596
597 return 0;
598out2:
599 rdma_destroy_id(ia->ri_id);
fee08caf 600 ia->ri_id = NULL;
c56c65fb
TT
601out1:
602 return rc;
603}
604
605/*
606 * Clean up/close an IA.
607 * o if event handles and PD have been initialized, free them.
608 * o close the IA
609 */
610void
611rpcrdma_ia_close(struct rpcrdma_ia *ia)
612{
613 int rc;
614
615 dprintk("RPC: %s: entering\n", __func__);
616 if (ia->ri_bind_mem != NULL) {
617 rc = ib_dereg_mr(ia->ri_bind_mem);
618 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
619 __func__, rc);
620 }
fee08caf
TT
621 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
622 if (ia->ri_id->qp)
623 rdma_destroy_qp(ia->ri_id);
624 rdma_destroy_id(ia->ri_id);
625 ia->ri_id = NULL;
626 }
c56c65fb
TT
627 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
628 rc = ib_dealloc_pd(ia->ri_pd);
629 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
630 __func__, rc);
631 }
c56c65fb
TT
632}
633
634/*
635 * Create unconnected endpoint.
636 */
637int
638rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
639 struct rpcrdma_create_data_internal *cdata)
640{
641 struct ib_device_attr devattr;
5d40a8a5 642 int rc, err;
c56c65fb
TT
643
644 rc = ib_query_device(ia->ri_id->device, &devattr);
645 if (rc) {
646 dprintk("RPC: %s: ib_query_device failed %d\n",
647 __func__, rc);
648 return rc;
649 }
650
651 /* check provider's send/recv wr limits */
652 if (cdata->max_requests > devattr.max_qp_wr)
653 cdata->max_requests = devattr.max_qp_wr;
654
655 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
656 ep->rep_attr.qp_context = ep;
657 /* send_cq and recv_cq initialized below */
658 ep->rep_attr.srq = NULL;
659 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
660 switch (ia->ri_memreg_strategy) {
3197d309 661 case RPCRDMA_FRMR:
15cdc644
TT
662 /* Add room for frmr register and invalidate WRs.
663 * 1. FRMR reg WR for head
664 * 2. FRMR invalidate WR for head
665 * 3. FRMR reg WR for pagelist
666 * 4. FRMR invalidate WR for pagelist
667 * 5. FRMR reg WR for tail
668 * 6. FRMR invalidate WR for tail
669 * 7. The RDMA_SEND WR
670 */
671 ep->rep_attr.cap.max_send_wr *= 7;
672 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
673 cdata->max_requests = devattr.max_qp_wr / 7;
674 if (!cdata->max_requests)
675 return -EINVAL;
676 ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
677 }
3197d309 678 break;
c56c65fb
TT
679 case RPCRDMA_MEMWINDOWS_ASYNC:
680 case RPCRDMA_MEMWINDOWS:
681 /* Add room for mw_binds+unbinds - overkill! */
682 ep->rep_attr.cap.max_send_wr++;
683 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
684 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
685 return -EINVAL;
686 break;
687 default:
688 break;
689 }
690 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
691 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
692 ep->rep_attr.cap.max_recv_sge = 1;
693 ep->rep_attr.cap.max_inline_data = 0;
694 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
695 ep->rep_attr.qp_type = IB_QPT_RC;
696 ep->rep_attr.port_num = ~0;
697
698 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
699 "iovs: send %d recv %d\n",
700 __func__,
701 ep->rep_attr.cap.max_send_wr,
702 ep->rep_attr.cap.max_recv_wr,
703 ep->rep_attr.cap.max_send_sge,
704 ep->rep_attr.cap.max_recv_sge);
705
706 /* set trigger for requesting send completion */
707 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
708 switch (ia->ri_memreg_strategy) {
709 case RPCRDMA_MEMWINDOWS_ASYNC:
710 case RPCRDMA_MEMWINDOWS:
711 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
712 break;
713 default:
714 break;
715 }
716 if (ep->rep_cqinit <= 2)
717 ep->rep_cqinit = 0;
718 INIT_CQCOUNT(ep);
719 ep->rep_ia = ia;
720 init_waitqueue_head(&ep->rep_connect_wait);
721
722 /*
723 * Create a single cq for receive dto and mw_bind (only ever
724 * care about unbind, really). Send completions are suppressed.
725 * Use single threaded tasklet upcalls to maintain ordering.
726 */
727 ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
728 rpcrdma_cq_async_error_upcall, NULL,
729 ep->rep_attr.cap.max_recv_wr +
730 ep->rep_attr.cap.max_send_wr + 1, 0);
731 if (IS_ERR(ep->rep_cq)) {
732 rc = PTR_ERR(ep->rep_cq);
733 dprintk("RPC: %s: ib_create_cq failed: %i\n",
734 __func__, rc);
735 goto out1;
736 }
737
738 rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
739 if (rc) {
740 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
741 __func__, rc);
742 goto out2;
743 }
744
745 ep->rep_attr.send_cq = ep->rep_cq;
746 ep->rep_attr.recv_cq = ep->rep_cq;
747
748 /* Initialize cma parameters */
749
750 /* RPC/RDMA does not use private data */
751 ep->rep_remote_cma.private_data = NULL;
752 ep->rep_remote_cma.private_data_len = 0;
753
754 /* Client offers RDMA Read but does not initiate */
b334eaab
TT
755 ep->rep_remote_cma.initiator_depth = 0;
756 if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
c56c65fb 757 ep->rep_remote_cma.responder_resources = 0;
b334eaab
TT
758 else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
759 ep->rep_remote_cma.responder_resources = 32;
760 else
c56c65fb 761 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
c56c65fb
TT
762
763 ep->rep_remote_cma.retry_count = 7;
764 ep->rep_remote_cma.flow_control = 0;
765 ep->rep_remote_cma.rnr_retry_count = 0;
766
767 return 0;
768
769out2:
5d40a8a5
CL
770 err = ib_destroy_cq(ep->rep_cq);
771 if (err)
772 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
773 __func__, err);
c56c65fb
TT
774out1:
775 return rc;
776}
777
778/*
779 * rpcrdma_ep_destroy
780 *
781 * Disconnect and destroy endpoint. After this, the only
782 * valid operations on the ep are to free it (if dynamically
783 * allocated) or re-create it.
784 *
785 * The caller's error handling must be sure to not leak the endpoint
786 * if this function fails.
787 */
788int
789rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
790{
791 int rc;
792
793 dprintk("RPC: %s: entering, connected is %d\n",
794 __func__, ep->rep_connected);
795
796 if (ia->ri_id->qp) {
797 rc = rpcrdma_ep_disconnect(ep, ia);
798 if (rc)
799 dprintk("RPC: %s: rpcrdma_ep_disconnect"
800 " returned %i\n", __func__, rc);
fee08caf
TT
801 rdma_destroy_qp(ia->ri_id);
802 ia->ri_id->qp = NULL;
c56c65fb
TT
803 }
804
c56c65fb
TT
805 /* padding - could be done in rpcrdma_buffer_destroy... */
806 if (ep->rep_pad_mr) {
807 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
808 ep->rep_pad_mr = NULL;
809 }
810
c56c65fb
TT
811 rpcrdma_clean_cq(ep->rep_cq);
812 rc = ib_destroy_cq(ep->rep_cq);
813 if (rc)
814 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
815 __func__, rc);
816
817 return rc;
818}
819
820/*
821 * Connect unconnected endpoint.
822 */
823int
824rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
825{
826 struct rdma_cm_id *id;
827 int rc = 0;
828 int retry_count = 0;
c56c65fb 829
c055551e 830 if (ep->rep_connected != 0) {
c56c65fb
TT
831 struct rpcrdma_xprt *xprt;
832retry:
833 rc = rpcrdma_ep_disconnect(ep, ia);
834 if (rc && rc != -ENOTCONN)
835 dprintk("RPC: %s: rpcrdma_ep_disconnect"
836 " status %i\n", __func__, rc);
837 rpcrdma_clean_cq(ep->rep_cq);
838
839 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
840 id = rpcrdma_create_id(xprt, ia,
841 (struct sockaddr *)&xprt->rx_data.addr);
842 if (IS_ERR(id)) {
843 rc = PTR_ERR(id);
844 goto out;
845 }
846 /* TEMP TEMP TEMP - fail if new device:
847 * Deregister/remarshal *all* requests!
848 * Close and recreate adapter, pd, etc!
849 * Re-determine all attributes still sane!
850 * More stuff I haven't thought of!
851 * Rrrgh!
852 */
853 if (ia->ri_id->device != id->device) {
854 printk("RPC: %s: can't reconnect on "
855 "different device!\n", __func__);
856 rdma_destroy_id(id);
857 rc = -ENETDOWN;
858 goto out;
859 }
860 /* END TEMP */
1a954051 861 rdma_destroy_qp(ia->ri_id);
c56c65fb
TT
862 rdma_destroy_id(ia->ri_id);
863 ia->ri_id = id;
864 }
865
866 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
867 if (rc) {
868 dprintk("RPC: %s: rdma_create_qp failed %i\n",
869 __func__, rc);
870 goto out;
871 }
872
873/* XXX Tavor device performs badly with 2K MTU! */
874if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
875 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
876 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
877 (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
878 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
879 struct ib_qp_attr attr = {
880 .path_mtu = IB_MTU_1024
881 };
882 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
883 }
884}
885
c56c65fb
TT
886 ep->rep_connected = 0;
887
888 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
889 if (rc) {
890 dprintk("RPC: %s: rdma_connect() failed with %i\n",
891 __func__, rc);
892 goto out;
893 }
894
c56c65fb
TT
895 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
896
897 /*
898 * Check state. A non-peer reject indicates no listener
899 * (ECONNREFUSED), which may be a transient state. All
900 * others indicate a transport condition which has already
901 * undergone a best-effort.
902 */
f64f9e71
JP
903 if (ep->rep_connected == -ECONNREFUSED &&
904 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
c56c65fb
TT
905 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
906 goto retry;
907 }
908 if (ep->rep_connected <= 0) {
909 /* Sometimes, the only way to reliably connect to remote
910 * CMs is to use same nonzero values for ORD and IRD. */
b334eaab
TT
911 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
912 (ep->rep_remote_cma.responder_resources == 0 ||
913 ep->rep_remote_cma.initiator_depth !=
914 ep->rep_remote_cma.responder_resources)) {
915 if (ep->rep_remote_cma.responder_resources == 0)
916 ep->rep_remote_cma.responder_resources = 1;
917 ep->rep_remote_cma.initiator_depth =
918 ep->rep_remote_cma.responder_resources;
c56c65fb 919 goto retry;
b334eaab 920 }
c56c65fb
TT
921 rc = ep->rep_connected;
922 } else {
923 dprintk("RPC: %s: connected\n", __func__);
924 }
925
926out:
927 if (rc)
928 ep->rep_connected = rc;
929 return rc;
930}
931
932/*
933 * rpcrdma_ep_disconnect
934 *
935 * This is separate from destroy to facilitate the ability
936 * to reconnect without recreating the endpoint.
937 *
938 * This call is not reentrant, and must not be made in parallel
939 * on the same endpoint.
940 */
941int
942rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
943{
944 int rc;
945
946 rpcrdma_clean_cq(ep->rep_cq);
947 rc = rdma_disconnect(ia->ri_id);
948 if (!rc) {
949 /* returns without wait if not connected */
950 wait_event_interruptible(ep->rep_connect_wait,
951 ep->rep_connected != 1);
952 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
953 (ep->rep_connected == 1) ? "still " : "dis");
954 } else {
955 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
956 ep->rep_connected = rc;
957 }
958 return rc;
959}
960
961/*
962 * Initialize buffer memory
963 */
964int
965rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
966 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
967{
968 char *p;
969 size_t len;
970 int i, rc;
8d4ba034 971 struct rpcrdma_mw *r;
c56c65fb
TT
972
973 buf->rb_max_requests = cdata->max_requests;
974 spin_lock_init(&buf->rb_lock);
975 atomic_set(&buf->rb_credits, 1);
976
977 /* Need to allocate:
978 * 1. arrays for send and recv pointers
979 * 2. arrays of struct rpcrdma_req to fill in pointers
980 * 3. array of struct rpcrdma_rep for replies
981 * 4. padding, if any
3197d309 982 * 5. mw's, fmr's or frmr's, if any
c56c65fb
TT
983 * Send/recv buffers in req/rep need to be registered
984 */
985
986 len = buf->rb_max_requests *
987 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
988 len += cdata->padding;
989 switch (ia->ri_memreg_strategy) {
3197d309
TT
990 case RPCRDMA_FRMR:
991 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
992 sizeof(struct rpcrdma_mw);
993 break;
c56c65fb
TT
994 case RPCRDMA_MTHCAFMR:
995 /* TBD we are perhaps overallocating here */
996 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
997 sizeof(struct rpcrdma_mw);
998 break;
999 case RPCRDMA_MEMWINDOWS_ASYNC:
1000 case RPCRDMA_MEMWINDOWS:
1001 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1002 sizeof(struct rpcrdma_mw);
1003 break;
1004 default:
1005 break;
1006 }
1007
1008 /* allocate 1, 4 and 5 in one shot */
1009 p = kzalloc(len, GFP_KERNEL);
1010 if (p == NULL) {
1011 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1012 __func__, len);
1013 rc = -ENOMEM;
1014 goto out;
1015 }
1016 buf->rb_pool = p; /* for freeing it later */
1017
1018 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1019 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1020 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1021 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1022
1023 /*
1024 * Register the zeroed pad buffer, if any.
1025 */
1026 if (cdata->padding) {
1027 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1028 &ep->rep_pad_mr, &ep->rep_pad);
1029 if (rc)
1030 goto out;
1031 }
1032 p += cdata->padding;
1033
1034 /*
1035 * Allocate the fmr's, or mw's for mw_bind chunk registration.
1036 * We "cycle" the mw's in order to minimize rkey reuse,
1037 * and also reduce unbind-to-bind collision.
1038 */
1039 INIT_LIST_HEAD(&buf->rb_mws);
8d4ba034 1040 r = (struct rpcrdma_mw *)p;
c56c65fb 1041 switch (ia->ri_memreg_strategy) {
3197d309
TT
1042 case RPCRDMA_FRMR:
1043 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1044 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1045 RPCRDMA_MAX_SEGS);
1046 if (IS_ERR(r->r.frmr.fr_mr)) {
1047 rc = PTR_ERR(r->r.frmr.fr_mr);
1048 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1049 " failed %i\n", __func__, rc);
1050 goto out;
1051 }
1052 r->r.frmr.fr_pgl =
1053 ib_alloc_fast_reg_page_list(ia->ri_id->device,
1054 RPCRDMA_MAX_SEGS);
1055 if (IS_ERR(r->r.frmr.fr_pgl)) {
1056 rc = PTR_ERR(r->r.frmr.fr_pgl);
1057 dprintk("RPC: %s: "
1058 "ib_alloc_fast_reg_page_list "
1059 "failed %i\n", __func__, rc);
1060 goto out;
1061 }
1062 list_add(&r->mw_list, &buf->rb_mws);
1063 ++r;
1064 }
1065 break;
c56c65fb 1066 case RPCRDMA_MTHCAFMR:
c56c65fb
TT
1067 /* TBD we are perhaps overallocating here */
1068 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
8d4ba034
TT
1069 static struct ib_fmr_attr fa =
1070 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
c56c65fb
TT
1071 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1072 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1073 &fa);
1074 if (IS_ERR(r->r.fmr)) {
1075 rc = PTR_ERR(r->r.fmr);
1076 dprintk("RPC: %s: ib_alloc_fmr"
1077 " failed %i\n", __func__, rc);
1078 goto out;
1079 }
1080 list_add(&r->mw_list, &buf->rb_mws);
1081 ++r;
1082 }
c56c65fb
TT
1083 break;
1084 case RPCRDMA_MEMWINDOWS_ASYNC:
1085 case RPCRDMA_MEMWINDOWS:
c56c65fb
TT
1086 /* Allocate one extra request's worth, for full cycling */
1087 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1088 r->r.mw = ib_alloc_mw(ia->ri_pd);
1089 if (IS_ERR(r->r.mw)) {
1090 rc = PTR_ERR(r->r.mw);
1091 dprintk("RPC: %s: ib_alloc_mw"
1092 " failed %i\n", __func__, rc);
1093 goto out;
1094 }
1095 list_add(&r->mw_list, &buf->rb_mws);
1096 ++r;
1097 }
c56c65fb
TT
1098 break;
1099 default:
1100 break;
1101 }
1102
1103 /*
1104 * Allocate/init the request/reply buffers. Doing this
1105 * using kmalloc for now -- one for each buf.
1106 */
1107 for (i = 0; i < buf->rb_max_requests; i++) {
1108 struct rpcrdma_req *req;
1109 struct rpcrdma_rep *rep;
1110
1111 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1112 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1113 /* Typical ~2400b, so rounding up saves work later */
1114 if (len < 4096)
1115 len = 4096;
1116 req = kmalloc(len, GFP_KERNEL);
1117 if (req == NULL) {
1118 dprintk("RPC: %s: request buffer %d alloc"
1119 " failed\n", __func__, i);
1120 rc = -ENOMEM;
1121 goto out;
1122 }
1123 memset(req, 0, sizeof(struct rpcrdma_req));
1124 buf->rb_send_bufs[i] = req;
1125 buf->rb_send_bufs[i]->rl_buffer = buf;
1126
1127 rc = rpcrdma_register_internal(ia, req->rl_base,
1128 len - offsetof(struct rpcrdma_req, rl_base),
1129 &buf->rb_send_bufs[i]->rl_handle,
1130 &buf->rb_send_bufs[i]->rl_iov);
1131 if (rc)
1132 goto out;
1133
1134 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1135
1136 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1137 rep = kmalloc(len, GFP_KERNEL);
1138 if (rep == NULL) {
1139 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1140 __func__, i);
1141 rc = -ENOMEM;
1142 goto out;
1143 }
1144 memset(rep, 0, sizeof(struct rpcrdma_rep));
1145 buf->rb_recv_bufs[i] = rep;
1146 buf->rb_recv_bufs[i]->rr_buffer = buf;
1147 init_waitqueue_head(&rep->rr_unbind);
1148
1149 rc = rpcrdma_register_internal(ia, rep->rr_base,
1150 len - offsetof(struct rpcrdma_rep, rr_base),
1151 &buf->rb_recv_bufs[i]->rr_handle,
1152 &buf->rb_recv_bufs[i]->rr_iov);
1153 if (rc)
1154 goto out;
1155
1156 }
1157 dprintk("RPC: %s: max_requests %d\n",
1158 __func__, buf->rb_max_requests);
1159 /* done */
1160 return 0;
1161out:
1162 rpcrdma_buffer_destroy(buf);
1163 return rc;
1164}
1165
1166/*
1167 * Unregister and destroy buffer memory. Need to deal with
1168 * partial initialization, so it's callable from failed create.
1169 * Must be called before destroying endpoint, as registrations
1170 * reference it.
1171 */
1172void
1173rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1174{
1175 int rc, i;
1176 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
8d4ba034 1177 struct rpcrdma_mw *r;
c56c65fb
TT
1178
1179 /* clean up in reverse order from create
1180 * 1. recv mr memory (mr free, then kfree)
1181 * 1a. bind mw memory
1182 * 2. send mr memory (mr free, then kfree)
1183 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1184 * 4. arrays
1185 */
1186 dprintk("RPC: %s: entering\n", __func__);
1187
1188 for (i = 0; i < buf->rb_max_requests; i++) {
1189 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1190 rpcrdma_deregister_internal(ia,
1191 buf->rb_recv_bufs[i]->rr_handle,
1192 &buf->rb_recv_bufs[i]->rr_iov);
1193 kfree(buf->rb_recv_bufs[i]);
1194 }
1195 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1196 while (!list_empty(&buf->rb_mws)) {
c56c65fb
TT
1197 r = list_entry(buf->rb_mws.next,
1198 struct rpcrdma_mw, mw_list);
1199 list_del(&r->mw_list);
1200 switch (ia->ri_memreg_strategy) {
3197d309
TT
1201 case RPCRDMA_FRMR:
1202 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1203 if (rc)
1204 dprintk("RPC: %s:"
1205 " ib_dereg_mr"
1206 " failed %i\n",
1207 __func__, rc);
1208 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1209 break;
c56c65fb
TT
1210 case RPCRDMA_MTHCAFMR:
1211 rc = ib_dealloc_fmr(r->r.fmr);
1212 if (rc)
1213 dprintk("RPC: %s:"
1214 " ib_dealloc_fmr"
1215 " failed %i\n",
1216 __func__, rc);
1217 break;
1218 case RPCRDMA_MEMWINDOWS_ASYNC:
1219 case RPCRDMA_MEMWINDOWS:
1220 rc = ib_dealloc_mw(r->r.mw);
1221 if (rc)
1222 dprintk("RPC: %s:"
1223 " ib_dealloc_mw"
1224 " failed %i\n",
1225 __func__, rc);
1226 break;
1227 default:
1228 break;
1229 }
1230 }
1231 rpcrdma_deregister_internal(ia,
1232 buf->rb_send_bufs[i]->rl_handle,
1233 &buf->rb_send_bufs[i]->rl_iov);
1234 kfree(buf->rb_send_bufs[i]);
1235 }
1236 }
1237
1238 kfree(buf->rb_pool);
1239}
1240
1241/*
1242 * Get a set of request/reply buffers.
1243 *
1244 * Reply buffer (if needed) is attached to send buffer upon return.
1245 * Rule:
1246 * rb_send_index and rb_recv_index MUST always be pointing to the
1247 * *next* available buffer (non-NULL). They are incremented after
1248 * removing buffers, and decremented *before* returning them.
1249 */
1250struct rpcrdma_req *
1251rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1252{
1253 struct rpcrdma_req *req;
1254 unsigned long flags;
8d4ba034
TT
1255 int i;
1256 struct rpcrdma_mw *r;
c56c65fb
TT
1257
1258 spin_lock_irqsave(&buffers->rb_lock, flags);
1259 if (buffers->rb_send_index == buffers->rb_max_requests) {
1260 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1261 dprintk("RPC: %s: out of request buffers\n", __func__);
1262 return ((struct rpcrdma_req *)NULL);
1263 }
1264
1265 req = buffers->rb_send_bufs[buffers->rb_send_index];
1266 if (buffers->rb_send_index < buffers->rb_recv_index) {
1267 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1268 __func__,
1269 buffers->rb_recv_index - buffers->rb_send_index);
1270 req->rl_reply = NULL;
1271 } else {
1272 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1273 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1274 }
1275 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1276 if (!list_empty(&buffers->rb_mws)) {
8d4ba034 1277 i = RPCRDMA_MAX_SEGS - 1;
c56c65fb 1278 do {
c56c65fb
TT
1279 r = list_entry(buffers->rb_mws.next,
1280 struct rpcrdma_mw, mw_list);
1281 list_del(&r->mw_list);
1282 req->rl_segments[i].mr_chunk.rl_mw = r;
1283 } while (--i >= 0);
1284 }
1285 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1286 return req;
1287}
1288
1289/*
1290 * Put request/reply buffers back into pool.
1291 * Pre-decrement counter/array index.
1292 */
1293void
1294rpcrdma_buffer_put(struct rpcrdma_req *req)
1295{
1296 struct rpcrdma_buffer *buffers = req->rl_buffer;
1297 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1298 int i;
1299 unsigned long flags;
1300
1301 BUG_ON(req->rl_nchunks != 0);
1302 spin_lock_irqsave(&buffers->rb_lock, flags);
1303 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1304 req->rl_niovs = 0;
1305 if (req->rl_reply) {
1306 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1307 init_waitqueue_head(&req->rl_reply->rr_unbind);
1308 req->rl_reply->rr_func = NULL;
1309 req->rl_reply = NULL;
1310 }
1311 switch (ia->ri_memreg_strategy) {
3197d309 1312 case RPCRDMA_FRMR:
c56c65fb
TT
1313 case RPCRDMA_MTHCAFMR:
1314 case RPCRDMA_MEMWINDOWS_ASYNC:
1315 case RPCRDMA_MEMWINDOWS:
1316 /*
1317 * Cycle mw's back in reverse order, and "spin" them.
1318 * This delays and scrambles reuse as much as possible.
1319 */
1320 i = 1;
1321 do {
1322 struct rpcrdma_mw **mw;
1323 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1324 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1325 *mw = NULL;
1326 } while (++i < RPCRDMA_MAX_SEGS);
1327 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1328 &buffers->rb_mws);
1329 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1330 break;
1331 default:
1332 break;
1333 }
1334 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1335}
1336
1337/*
1338 * Recover reply buffers from pool.
1339 * This happens when recovering from error conditions.
1340 * Post-increment counter/array index.
1341 */
1342void
1343rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1344{
1345 struct rpcrdma_buffer *buffers = req->rl_buffer;
1346 unsigned long flags;
1347
1348 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1349 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1350 spin_lock_irqsave(&buffers->rb_lock, flags);
1351 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1352 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1353 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1354 }
1355 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1356}
1357
1358/*
1359 * Put reply buffers back into pool when not attached to
1360 * request. This happens in error conditions, and when
1361 * aborting unbinds. Pre-decrement counter/array index.
1362 */
1363void
1364rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1365{
1366 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1367 unsigned long flags;
1368
1369 rep->rr_func = NULL;
1370 spin_lock_irqsave(&buffers->rb_lock, flags);
1371 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1372 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1373}
1374
1375/*
1376 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1377 */
1378
1379int
1380rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1381 struct ib_mr **mrp, struct ib_sge *iov)
1382{
1383 struct ib_phys_buf ipb;
1384 struct ib_mr *mr;
1385 int rc;
1386
1387 /*
1388 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1389 */
1390 iov->addr = ib_dma_map_single(ia->ri_id->device,
1391 va, len, DMA_BIDIRECTIONAL);
1392 iov->length = len;
1393
bd7ed1d1
TT
1394 if (ia->ri_have_dma_lkey) {
1395 *mrp = NULL;
1396 iov->lkey = ia->ri_dma_lkey;
1397 return 0;
1398 } else if (ia->ri_bind_mem != NULL) {
c56c65fb
TT
1399 *mrp = NULL;
1400 iov->lkey = ia->ri_bind_mem->lkey;
1401 return 0;
1402 }
1403
1404 ipb.addr = iov->addr;
1405 ipb.size = iov->length;
1406 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1407 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1408
1409 dprintk("RPC: %s: phys convert: 0x%llx "
1410 "registered 0x%llx length %d\n",
a56daeb7
AM
1411 __func__, (unsigned long long)ipb.addr,
1412 (unsigned long long)iov->addr, len);
c56c65fb
TT
1413
1414 if (IS_ERR(mr)) {
1415 *mrp = NULL;
1416 rc = PTR_ERR(mr);
1417 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1418 } else {
1419 *mrp = mr;
1420 iov->lkey = mr->lkey;
1421 rc = 0;
1422 }
1423
1424 return rc;
1425}
1426
1427int
1428rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1429 struct ib_mr *mr, struct ib_sge *iov)
1430{
1431 int rc;
1432
1433 ib_dma_unmap_single(ia->ri_id->device,
1434 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1435
1436 if (NULL == mr)
1437 return 0;
1438
1439 rc = ib_dereg_mr(mr);
1440 if (rc)
1441 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1442 return rc;
1443}
1444
1445/*
1446 * Wrappers for chunk registration, shared by read/write chunk code.
1447 */
1448
1449static void
1450rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1451{
1452 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1453 seg->mr_dmalen = seg->mr_len;
1454 if (seg->mr_page)
1455 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1456 seg->mr_page, offset_in_page(seg->mr_offset),
1457 seg->mr_dmalen, seg->mr_dir);
1458 else
1459 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1460 seg->mr_offset,
1461 seg->mr_dmalen, seg->mr_dir);
5c635e09
TT
1462 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1463 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1464 __func__,
986d4abb
RD
1465 (unsigned long long)seg->mr_dma,
1466 seg->mr_offset, seg->mr_dmalen);
5c635e09 1467 }
c56c65fb
TT
1468}
1469
1470static void
1471rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1472{
1473 if (seg->mr_page)
1474 ib_dma_unmap_page(ia->ri_id->device,
1475 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1476 else
1477 ib_dma_unmap_single(ia->ri_id->device,
1478 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1479}
1480
3197d309
TT
1481static int
1482rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1483 int *nsegs, int writing, struct rpcrdma_ia *ia,
1484 struct rpcrdma_xprt *r_xprt)
1485{
1486 struct rpcrdma_mr_seg *seg1 = seg;
5c635e09
TT
1487 struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
1488
3197d309
TT
1489 u8 key;
1490 int len, pageoff;
1491 int i, rc;
1492
1493 pageoff = offset_in_page(seg1->mr_offset);
1494 seg1->mr_offset -= pageoff; /* start of page */
1495 seg1->mr_len += pageoff;
1496 len = -pageoff;
1497 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1498 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1499 for (i = 0; i < *nsegs;) {
1500 rpcrdma_map_one(ia, seg, writing);
1501 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1502 len += seg->mr_len;
5c635e09 1503 BUG_ON(seg->mr_len > PAGE_SIZE);
3197d309
TT
1504 ++seg;
1505 ++i;
1506 /* Check for holes */
1507 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1508 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1509 break;
1510 }
1511 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1512 __func__, seg1->mr_chunk.rl_mw, i);
1513
5c635e09
TT
1514 if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
1515 dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
1516 __func__,
1517 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
1518 /* Invalidate before using. */
1519 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1520 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1521 invalidate_wr.next = &frmr_wr;
1522 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1523 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1524 invalidate_wr.ex.invalidate_rkey =
1525 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1526 DECR_CQCOUNT(&r_xprt->rx_ep);
1527 post_wr = &invalidate_wr;
1528 } else
1529 post_wr = &frmr_wr;
1530
3197d309
TT
1531 /* Bump the key */
1532 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1533 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1534
1535 /* Prepare FRMR WR */
1536 memset(&frmr_wr, 0, sizeof frmr_wr);
5c635e09 1537 frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
3197d309 1538 frmr_wr.opcode = IB_WR_FAST_REG_MR;
5c635e09 1539 frmr_wr.send_flags = IB_SEND_SIGNALED;
7a8b80eb 1540 frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
3197d309
TT
1541 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1542 frmr_wr.wr.fast_reg.page_list_len = i;
1543 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1544 frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
5c635e09 1545 BUG_ON(frmr_wr.wr.fast_reg.length < len);
3197d309 1546 frmr_wr.wr.fast_reg.access_flags = (writing ?
68743082
VP
1547 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1548 IB_ACCESS_REMOTE_READ);
3197d309
TT
1549 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1550 DECR_CQCOUNT(&r_xprt->rx_ep);
1551
5c635e09 1552 rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
3197d309
TT
1553
1554 if (rc) {
1555 dprintk("RPC: %s: failed ib_post_send for register,"
1556 " status %i\n", __func__, rc);
1557 while (i--)
1558 rpcrdma_unmap_one(ia, --seg);
1559 } else {
1560 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1561 seg1->mr_base = seg1->mr_dma + pageoff;
1562 seg1->mr_nsegs = i;
1563 seg1->mr_len = len;
1564 }
1565 *nsegs = i;
1566 return rc;
1567}
1568
1569static int
1570rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1571 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1572{
1573 struct rpcrdma_mr_seg *seg1 = seg;
1574 struct ib_send_wr invalidate_wr, *bad_wr;
1575 int rc;
1576
1577 while (seg1->mr_nsegs--)
1578 rpcrdma_unmap_one(ia, seg++);
1579
1580 memset(&invalidate_wr, 0, sizeof invalidate_wr);
5c635e09 1581 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
3197d309 1582 invalidate_wr.opcode = IB_WR_LOCAL_INV;
5c635e09 1583 invalidate_wr.send_flags = IB_SEND_SIGNALED;
3197d309
TT
1584 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1585 DECR_CQCOUNT(&r_xprt->rx_ep);
1586
1587 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1588 if (rc)
1589 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1590 " status %i\n", __func__, rc);
1591 return rc;
1592}
1593
8d4ba034
TT
1594static int
1595rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1596 int *nsegs, int writing, struct rpcrdma_ia *ia)
1597{
1598 struct rpcrdma_mr_seg *seg1 = seg;
1599 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1600 int len, pageoff, i, rc;
1601
1602 pageoff = offset_in_page(seg1->mr_offset);
1603 seg1->mr_offset -= pageoff; /* start of page */
1604 seg1->mr_len += pageoff;
1605 len = -pageoff;
1606 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1607 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1608 for (i = 0; i < *nsegs;) {
1609 rpcrdma_map_one(ia, seg, writing);
1610 physaddrs[i] = seg->mr_dma;
1611 len += seg->mr_len;
1612 ++seg;
1613 ++i;
1614 /* Check for holes */
1615 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1616 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1617 break;
1618 }
1619 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1620 physaddrs, i, seg1->mr_dma);
1621 if (rc) {
1622 dprintk("RPC: %s: failed ib_map_phys_fmr "
1623 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1624 len, (unsigned long long)seg1->mr_dma,
1625 pageoff, i, rc);
1626 while (i--)
1627 rpcrdma_unmap_one(ia, --seg);
1628 } else {
1629 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1630 seg1->mr_base = seg1->mr_dma + pageoff;
1631 seg1->mr_nsegs = i;
1632 seg1->mr_len = len;
1633 }
1634 *nsegs = i;
1635 return rc;
1636}
1637
1638static int
1639rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1640 struct rpcrdma_ia *ia)
1641{
1642 struct rpcrdma_mr_seg *seg1 = seg;
1643 LIST_HEAD(l);
1644 int rc;
1645
1646 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1647 rc = ib_unmap_fmr(&l);
1648 while (seg1->mr_nsegs--)
1649 rpcrdma_unmap_one(ia, seg++);
1650 if (rc)
1651 dprintk("RPC: %s: failed ib_unmap_fmr,"
1652 " status %i\n", __func__, rc);
1653 return rc;
1654}
1655
1656static int
1657rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1658 int *nsegs, int writing, struct rpcrdma_ia *ia,
1659 struct rpcrdma_xprt *r_xprt)
1660{
1661 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1662 IB_ACCESS_REMOTE_READ);
1663 struct ib_mw_bind param;
1664 int rc;
1665
1666 *nsegs = 1;
1667 rpcrdma_map_one(ia, seg, writing);
1668 param.mr = ia->ri_bind_mem;
1669 param.wr_id = 0ULL; /* no send cookie */
1670 param.addr = seg->mr_dma;
1671 param.length = seg->mr_len;
1672 param.send_flags = 0;
1673 param.mw_access_flags = mem_priv;
1674
1675 DECR_CQCOUNT(&r_xprt->rx_ep);
1676 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1677 if (rc) {
1678 dprintk("RPC: %s: failed ib_bind_mw "
1679 "%u@0x%llx status %i\n",
1680 __func__, seg->mr_len,
1681 (unsigned long long)seg->mr_dma, rc);
1682 rpcrdma_unmap_one(ia, seg);
1683 } else {
1684 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1685 seg->mr_base = param.addr;
1686 seg->mr_nsegs = 1;
1687 }
1688 return rc;
1689}
1690
1691static int
1692rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1693 struct rpcrdma_ia *ia,
1694 struct rpcrdma_xprt *r_xprt, void **r)
1695{
1696 struct ib_mw_bind param;
1697 LIST_HEAD(l);
1698 int rc;
1699
1700 BUG_ON(seg->mr_nsegs != 1);
1701 param.mr = ia->ri_bind_mem;
1702 param.addr = 0ULL; /* unbind */
1703 param.length = 0;
1704 param.mw_access_flags = 0;
1705 if (*r) {
1706 param.wr_id = (u64) (unsigned long) *r;
1707 param.send_flags = IB_SEND_SIGNALED;
1708 INIT_CQCOUNT(&r_xprt->rx_ep);
1709 } else {
1710 param.wr_id = 0ULL;
1711 param.send_flags = 0;
1712 DECR_CQCOUNT(&r_xprt->rx_ep);
1713 }
1714 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1715 rpcrdma_unmap_one(ia, seg);
1716 if (rc)
1717 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1718 " status %i\n", __func__, rc);
1719 else
1720 *r = NULL; /* will upcall on completion */
1721 return rc;
1722}
1723
1724static int
1725rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1726 int *nsegs, int writing, struct rpcrdma_ia *ia)
1727{
1728 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1729 IB_ACCESS_REMOTE_READ);
1730 struct rpcrdma_mr_seg *seg1 = seg;
1731 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1732 int len, i, rc = 0;
1733
1734 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1735 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1736 for (len = 0, i = 0; i < *nsegs;) {
1737 rpcrdma_map_one(ia, seg, writing);
1738 ipb[i].addr = seg->mr_dma;
1739 ipb[i].size = seg->mr_len;
1740 len += seg->mr_len;
1741 ++seg;
1742 ++i;
1743 /* Check for holes */
1744 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1745 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1746 break;
1747 }
1748 seg1->mr_base = seg1->mr_dma;
1749 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1750 ipb, i, mem_priv, &seg1->mr_base);
1751 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1752 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1753 dprintk("RPC: %s: failed ib_reg_phys_mr "
1754 "%u@0x%llx (%d)... status %i\n",
1755 __func__, len,
1756 (unsigned long long)seg1->mr_dma, i, rc);
1757 while (i--)
1758 rpcrdma_unmap_one(ia, --seg);
1759 } else {
1760 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1761 seg1->mr_nsegs = i;
1762 seg1->mr_len = len;
1763 }
1764 *nsegs = i;
1765 return rc;
1766}
1767
1768static int
1769rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1770 struct rpcrdma_ia *ia)
1771{
1772 struct rpcrdma_mr_seg *seg1 = seg;
1773 int rc;
1774
1775 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1776 seg1->mr_chunk.rl_mr = NULL;
1777 while (seg1->mr_nsegs--)
1778 rpcrdma_unmap_one(ia, seg++);
1779 if (rc)
1780 dprintk("RPC: %s: failed ib_dereg_mr,"
1781 " status %i\n", __func__, rc);
1782 return rc;
1783}
1784
c56c65fb
TT
1785int
1786rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1787 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1788{
1789 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
c56c65fb
TT
1790 int rc = 0;
1791
1792 switch (ia->ri_memreg_strategy) {
1793
1794#if RPCRDMA_PERSISTENT_REGISTRATION
1795 case RPCRDMA_ALLPHYSICAL:
1796 rpcrdma_map_one(ia, seg, writing);
1797 seg->mr_rkey = ia->ri_bind_mem->rkey;
1798 seg->mr_base = seg->mr_dma;
1799 seg->mr_nsegs = 1;
1800 nsegs = 1;
1801 break;
1802#endif
1803
3197d309
TT
1804 /* Registration using frmr registration */
1805 case RPCRDMA_FRMR:
1806 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1807 break;
1808
8d4ba034 1809 /* Registration using fmr memory registration */
c56c65fb 1810 case RPCRDMA_MTHCAFMR:
8d4ba034 1811 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
c56c65fb
TT
1812 break;
1813
1814 /* Registration using memory windows */
1815 case RPCRDMA_MEMWINDOWS_ASYNC:
1816 case RPCRDMA_MEMWINDOWS:
8d4ba034 1817 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
c56c65fb
TT
1818 break;
1819
1820 /* Default registration each time */
1821 default:
8d4ba034 1822 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
c56c65fb
TT
1823 break;
1824 }
1825 if (rc)
1826 return -1;
1827
1828 return nsegs;
1829}
1830
1831int
1832rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1833 struct rpcrdma_xprt *r_xprt, void *r)
1834{
1835 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
c56c65fb
TT
1836 int nsegs = seg->mr_nsegs, rc;
1837
1838 switch (ia->ri_memreg_strategy) {
1839
1840#if RPCRDMA_PERSISTENT_REGISTRATION
1841 case RPCRDMA_ALLPHYSICAL:
1842 BUG_ON(nsegs != 1);
1843 rpcrdma_unmap_one(ia, seg);
1844 rc = 0;
1845 break;
1846#endif
1847
3197d309
TT
1848 case RPCRDMA_FRMR:
1849 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1850 break;
1851
c56c65fb 1852 case RPCRDMA_MTHCAFMR:
8d4ba034 1853 rc = rpcrdma_deregister_fmr_external(seg, ia);
c56c65fb
TT
1854 break;
1855
1856 case RPCRDMA_MEMWINDOWS_ASYNC:
1857 case RPCRDMA_MEMWINDOWS:
8d4ba034 1858 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
c56c65fb
TT
1859 break;
1860
1861 default:
8d4ba034 1862 rc = rpcrdma_deregister_default_external(seg, ia);
c56c65fb
TT
1863 break;
1864 }
1865 if (r) {
1866 struct rpcrdma_rep *rep = r;
1867 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1868 rep->rr_func = NULL;
1869 func(rep); /* dereg done, callback now */
1870 }
1871 return nsegs;
1872}
1873
1874/*
1875 * Prepost any receive buffer, then post send.
1876 *
1877 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1878 */
1879int
1880rpcrdma_ep_post(struct rpcrdma_ia *ia,
1881 struct rpcrdma_ep *ep,
1882 struct rpcrdma_req *req)
1883{
1884 struct ib_send_wr send_wr, *send_wr_fail;
1885 struct rpcrdma_rep *rep = req->rl_reply;
1886 int rc;
1887
1888 if (rep) {
1889 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1890 if (rc)
1891 goto out;
1892 req->rl_reply = NULL;
1893 }
1894
1895 send_wr.next = NULL;
1896 send_wr.wr_id = 0ULL; /* no send cookie */
1897 send_wr.sg_list = req->rl_send_iov;
1898 send_wr.num_sge = req->rl_niovs;
1899 send_wr.opcode = IB_WR_SEND;
c56c65fb
TT
1900 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1901 ib_dma_sync_single_for_device(ia->ri_id->device,
1902 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1903 DMA_TO_DEVICE);
1904 ib_dma_sync_single_for_device(ia->ri_id->device,
1905 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1906 DMA_TO_DEVICE);
1907 ib_dma_sync_single_for_device(ia->ri_id->device,
1908 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1909 DMA_TO_DEVICE);
1910
1911 if (DECR_CQCOUNT(ep) > 0)
1912 send_wr.send_flags = 0;
1913 else { /* Provider must take a send completion every now and then */
1914 INIT_CQCOUNT(ep);
1915 send_wr.send_flags = IB_SEND_SIGNALED;
1916 }
1917
1918 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1919 if (rc)
1920 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1921 rc);
1922out:
1923 return rc;
1924}
1925
1926/*
1927 * (Re)post a receive buffer.
1928 */
1929int
1930rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1931 struct rpcrdma_ep *ep,
1932 struct rpcrdma_rep *rep)
1933{
1934 struct ib_recv_wr recv_wr, *recv_wr_fail;
1935 int rc;
1936
1937 recv_wr.next = NULL;
1938 recv_wr.wr_id = (u64) (unsigned long) rep;
1939 recv_wr.sg_list = &rep->rr_iov;
1940 recv_wr.num_sge = 1;
1941
1942 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1943 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1944
1945 DECR_CQCOUNT(ep);
1946 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1947
1948 if (rc)
1949 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1950 rc);
1951 return rc;
1952}
This page took 0.367753 seconds and 5 git commands to generate.