RPC/RDMA: check selected memory registration mode at runtime.
net/sunrpc/xprtrdma/verbs.c
1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50#include <linux/pci.h> /* for Tavor hack below */
51
52#include "xprt_rdma.h"
53
54/*
55 * Globals/Macros
56 */
57
58#ifdef RPC_DEBUG
59# define RPCDBG_FACILITY RPCDBG_TRANS
60#endif
61
62/*
63 * internal functions
64 */
65
66/*
67 * handle replies in tasklet context, using a single, global list
68 * rdma tasklet function -- just turn around and call the func
69 * for all replies on the list
70 */
71
72static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
73static LIST_HEAD(rpcrdma_tasklets_g);
74
75static void
76rpcrdma_run_tasklet(unsigned long data)
77{
78 struct rpcrdma_rep *rep;
79 void (*func)(struct rpcrdma_rep *);
80 unsigned long flags;
81
82 data = data;
83 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
84 while (!list_empty(&rpcrdma_tasklets_g)) {
85 rep = list_entry(rpcrdma_tasklets_g.next,
86 struct rpcrdma_rep, rr_list);
87 list_del(&rep->rr_list);
88 func = rep->rr_func;
89 rep->rr_func = NULL;
90 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
91
92 if (func)
93 func(rep);
94 else
95 rpcrdma_recv_buffer_put(rep);
96
97 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
98 }
99 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
100}
101
102static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
103
104static inline void
105rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
106{
107 unsigned long flags;
108
109 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
110 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
111 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
112 tasklet_schedule(&rpcrdma_tasklet_g);
113}
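/*
 * Note on the handoff above: completion context only queues each
 * rpcrdma_rep on the global rpcrdma_tasklets_g list; rpcrdma_run_tasklet()
 * then pops entries and drops rpcrdma_tk_lock_g before invoking rr_func,
 * so reply handlers never run with the list lock held.  A rep queued
 * without an rr_func is simply returned to the receive buffer pool.
 */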
114
115static void
116rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
117{
118 struct rpcrdma_ep *ep = context;
119
120 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
121 __func__, event->event, event->device->name, context);
122 if (ep->rep_connected == 1) {
123 ep->rep_connected = -EIO;
124 ep->rep_func(ep);
125 wake_up_all(&ep->rep_connect_wait);
126 }
127}
128
129static void
130rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
131{
132 struct rpcrdma_ep *ep = context;
133
134 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
135 __func__, event->event, event->device->name, context);
136 if (ep->rep_connected == 1) {
137 ep->rep_connected = -EIO;
138 ep->rep_func(ep);
139 wake_up_all(&ep->rep_connect_wait);
140 }
141}
142
143static inline
144void rpcrdma_event_process(struct ib_wc *wc)
145{
146 struct rpcrdma_rep *rep =
147 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
148
149 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
150 __func__, rep, wc->status, wc->opcode, wc->byte_len);
151
152 if (!rep) /* send or bind completion that we don't care about */
153 return;
154
155 if (IB_WC_SUCCESS != wc->status) {
156 dprintk("RPC: %s: %s WC status %X, connection lost\n",
157 __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
158 wc->status);
159 rep->rr_len = ~0U;
160 rpcrdma_schedule_tasklet(rep);
161 return;
162 }
163
164 switch (wc->opcode) {
165 case IB_WC_RECV:
166 rep->rr_len = wc->byte_len;
167 ib_dma_sync_single_for_cpu(
168 rdmab_to_ia(rep->rr_buffer)->ri_id->device,
169 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
 170 /* Keep only the most recent credit value, after checking its validity */
171 if (rep->rr_len >= 16) {
172 struct rpcrdma_msg *p =
173 (struct rpcrdma_msg *) rep->rr_base;
174 unsigned int credits = ntohl(p->rm_credit);
175 if (credits == 0) {
176 dprintk("RPC: %s: server"
177 " dropped credits to 0!\n", __func__);
178 /* don't deadlock */
179 credits = 1;
180 } else if (credits > rep->rr_buffer->rb_max_requests) {
181 dprintk("RPC: %s: server"
182 " over-crediting: %d (%d)\n",
183 __func__, credits,
184 rep->rr_buffer->rb_max_requests);
185 credits = rep->rr_buffer->rb_max_requests;
186 }
187 atomic_set(&rep->rr_buffer->rb_credits, credits);
188 }
189 /* fall through */
190 case IB_WC_BIND_MW:
191 rpcrdma_schedule_tasklet(rep);
192 break;
193 default:
194 dprintk("RPC: %s: unexpected WC event %X\n",
195 __func__, wc->opcode);
196 break;
197 }
198}
199
200static inline int
201rpcrdma_cq_poll(struct ib_cq *cq)
202{
203 struct ib_wc wc;
204 int rc;
205
206 for (;;) {
207 rc = ib_poll_cq(cq, 1, &wc);
208 if (rc < 0) {
209 dprintk("RPC: %s: ib_poll_cq failed %i\n",
210 __func__, rc);
211 return rc;
212 }
213 if (rc == 0)
214 break;
215
216 rpcrdma_event_process(&wc);
217 }
218
219 return 0;
220}
221
222/*
223 * rpcrdma_cq_event_upcall
224 *
225 * This upcall handles recv, send, bind and unbind events.
226 * It is reentrant but processes single events in order to maintain
227 * ordering of receives to keep server credits.
228 *
229 * It is the responsibility of the scheduled tasklet to return
230 * recv buffers to the pool. NOTE: this affects synchronization of
231 * connection shutdown. That is, the structures required for
232 * the completion of the reply handler must remain intact until
233 * all memory has been reclaimed.
234 *
235 * Note that send events are suppressed and do not result in an upcall.
236 */
237static void
238rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
239{
240 int rc;
241
242 rc = rpcrdma_cq_poll(cq);
243 if (rc)
244 return;
245
246 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
247 if (rc) {
248 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
249 __func__, rc);
250 return;
251 }
252
253 rpcrdma_cq_poll(cq);
254}
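/*
 * The poll / re-arm / poll-again sequence above closes the window in
 * which a completion could arrive after the CQ has been drained but
 * before notification is re-requested: any such entry is picked up by
 * the second rpcrdma_cq_poll() call instead of waiting for a later event.
 */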
255
256#ifdef RPC_DEBUG
257static const char * const conn[] = {
258 "address resolved",
259 "address error",
260 "route resolved",
261 "route error",
262 "connect request",
263 "connect response",
264 "connect error",
265 "unreachable",
266 "rejected",
267 "established",
268 "disconnected",
269 "device removal"
270};
271#endif
272
273static int
274rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
275{
276 struct rpcrdma_xprt *xprt = id->context;
277 struct rpcrdma_ia *ia = &xprt->rx_ia;
278 struct rpcrdma_ep *ep = &xprt->rx_ep;
279 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
280 struct ib_qp_attr attr;
281 struct ib_qp_init_attr iattr;
282 int connstate = 0;
283
284 switch (event->event) {
285 case RDMA_CM_EVENT_ADDR_RESOLVED:
286 case RDMA_CM_EVENT_ROUTE_RESOLVED:
287 complete(&ia->ri_done);
288 break;
289 case RDMA_CM_EVENT_ADDR_ERROR:
290 ia->ri_async_rc = -EHOSTUNREACH;
291 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
292 __func__, ep);
293 complete(&ia->ri_done);
294 break;
295 case RDMA_CM_EVENT_ROUTE_ERROR:
296 ia->ri_async_rc = -ENETUNREACH;
297 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
298 __func__, ep);
299 complete(&ia->ri_done);
300 break;
301 case RDMA_CM_EVENT_ESTABLISHED:
302 connstate = 1;
303 ib_query_qp(ia->ri_id->qp, &attr,
304 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
305 &iattr);
306 dprintk("RPC: %s: %d responder resources"
307 " (%d initiator)\n",
308 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
309 goto connected;
310 case RDMA_CM_EVENT_CONNECT_ERROR:
311 connstate = -ENOTCONN;
312 goto connected;
313 case RDMA_CM_EVENT_UNREACHABLE:
314 connstate = -ENETDOWN;
315 goto connected;
316 case RDMA_CM_EVENT_REJECTED:
317 connstate = -ECONNREFUSED;
318 goto connected;
319 case RDMA_CM_EVENT_DISCONNECTED:
320 connstate = -ECONNABORTED;
321 goto connected;
322 case RDMA_CM_EVENT_DEVICE_REMOVAL:
323 connstate = -ENODEV;
324connected:
325 dprintk("RPC: %s: %s: %u.%u.%u.%u:%u"
326 " (ep 0x%p event 0x%x)\n",
327 __func__,
328 (event->event <= 11) ? conn[event->event] :
329 "unknown connection error",
330 NIPQUAD(addr->sin_addr.s_addr),
331 ntohs(addr->sin_port),
332 ep, event->event);
333 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
334 dprintk("RPC: %s: %sconnected\n",
335 __func__, connstate > 0 ? "" : "dis");
336 ep->rep_connected = connstate;
337 ep->rep_func(ep);
338 wake_up_all(&ep->rep_connect_wait);
339 break;
340 default:
341 ia->ri_async_rc = -EINVAL;
342 dprintk("RPC: %s: unexpected CM event %X\n",
343 __func__, event->event);
344 complete(&ia->ri_done);
345 break;
346 }
347
348 return 0;
349}
350
351static struct rdma_cm_id *
352rpcrdma_create_id(struct rpcrdma_xprt *xprt,
353 struct rpcrdma_ia *ia, struct sockaddr *addr)
354{
355 struct rdma_cm_id *id;
356 int rc;
357
358 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
359 if (IS_ERR(id)) {
360 rc = PTR_ERR(id);
361 dprintk("RPC: %s: rdma_create_id() failed %i\n",
362 __func__, rc);
363 return id;
364 }
365
366 ia->ri_async_rc = 0;
367 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
368 if (rc) {
369 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
370 __func__, rc);
371 goto out;
372 }
373 wait_for_completion(&ia->ri_done);
374 rc = ia->ri_async_rc;
375 if (rc)
376 goto out;
377
378 ia->ri_async_rc = 0;
379 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
380 if (rc) {
381 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
382 __func__, rc);
383 goto out;
384 }
385 wait_for_completion(&ia->ri_done);
386 rc = ia->ri_async_rc;
387 if (rc)
388 goto out;
389
390 return id;
391
392out:
393 rdma_destroy_id(id);
394 return ERR_PTR(rc);
395}
396
397/*
398 * Drain any cq, prior to teardown.
399 */
400static void
401rpcrdma_clean_cq(struct ib_cq *cq)
402{
403 struct ib_wc wc;
404 int count = 0;
405
406 while (1 == ib_poll_cq(cq, 1, &wc))
407 ++count;
408
409 if (count)
410 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
411 __func__, count, wc.opcode);
412}
413
414/*
415 * Exported functions.
416 */
417
418/*
419 * Open and initialize an Interface Adapter.
420 * o initializes fields of struct rpcrdma_ia, including
421 * interface and provider attributes and protection zone.
422 */
423int
424rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
425{
426 int rc, mem_priv;
427 struct ib_device_attr devattr;
428 struct rpcrdma_ia *ia = &xprt->rx_ia;
429
430 init_completion(&ia->ri_done);
431
432 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
433 if (IS_ERR(ia->ri_id)) {
434 rc = PTR_ERR(ia->ri_id);
435 goto out1;
436 }
437
438 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
439 if (IS_ERR(ia->ri_pd)) {
440 rc = PTR_ERR(ia->ri_pd);
441 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
442 __func__, rc);
443 goto out2;
444 }
445
446 /*
447 * Query the device to determine if the requested memory
448 * registration strategy is supported. If it isn't, set the
449 * strategy to a globally supported model.
450 */
451 rc = ib_query_device(ia->ri_id->device, &devattr);
452 if (rc) {
453 dprintk("RPC: %s: ib_query_device failed %d\n",
454 __func__, rc);
455 goto out2;
456 }
457
458 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
459 ia->ri_have_dma_lkey = 1;
460 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
461 }
462
463 switch (memreg) {
464 case RPCRDMA_MEMWINDOWS:
465 case RPCRDMA_MEMWINDOWS_ASYNC:
466 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
467 dprintk("RPC: %s: MEMWINDOWS registration "
468 "specified but not supported by adapter, "
469 "using slower RPCRDMA_REGISTER\n",
470 __func__);
471 memreg = RPCRDMA_REGISTER;
472 }
473 break;
474 case RPCRDMA_MTHCAFMR:
475 if (!ia->ri_id->device->alloc_fmr) {
476#if RPCRDMA_PERSISTENT_REGISTRATION
477 dprintk("RPC: %s: MTHCAFMR registration "
478 "specified but not supported by adapter, "
479 "using riskier RPCRDMA_ALLPHYSICAL\n",
480 __func__);
481 memreg = RPCRDMA_ALLPHYSICAL;
482#else
483 dprintk("RPC: %s: MTHCAFMR registration "
484 "specified but not supported by adapter, "
485 "using slower RPCRDMA_REGISTER\n",
486 __func__);
487 memreg = RPCRDMA_REGISTER;
488#endif
489 }
490 break;
491 }
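/*
 * Summary of the runtime capability check above: a requested registration
 * mode is kept only if the adapter can support it.  Memory windows fall
 * back to RPCRDMA_REGISTER when IB_DEVICE_MEM_WINDOW is missing, and
 * MTHCAFMR falls back to RPCRDMA_ALLPHYSICAL (when persistent registration
 * is compiled in) or RPCRDMA_REGISTER when the device has no alloc_fmr verb.
 */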
492
493 /*
494 * Optionally obtain an underlying physical identity mapping in
495 * order to do a memory window-based bind. This base registration
496 * is protected from remote access - that is enabled only by binding
497 * for the specific bytes targeted during each RPC operation, and
498 * revoked after the corresponding completion similar to a storage
499 * adapter.
500 */
501 switch (memreg) {
502 case RPCRDMA_BOUNCEBUFFERS:
503 case RPCRDMA_REGISTER:
504 break;
505#if RPCRDMA_PERSISTENT_REGISTRATION
506 case RPCRDMA_ALLPHYSICAL:
507 mem_priv = IB_ACCESS_LOCAL_WRITE |
508 IB_ACCESS_REMOTE_WRITE |
509 IB_ACCESS_REMOTE_READ;
510 goto register_setup;
511#endif
512 case RPCRDMA_MEMWINDOWS_ASYNC:
513 case RPCRDMA_MEMWINDOWS:
514 mem_priv = IB_ACCESS_LOCAL_WRITE |
515 IB_ACCESS_MW_BIND;
516 goto register_setup;
517 case RPCRDMA_MTHCAFMR:
518 if (ia->ri_have_dma_lkey)
 519 break;
520 mem_priv = IB_ACCESS_LOCAL_WRITE;
521 register_setup:
522 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
523 if (IS_ERR(ia->ri_bind_mem)) {
524 printk(KERN_ALERT "%s: ib_get_dma_mr for "
525 "phys register failed with %lX\n\t"
526 "Will continue with degraded performance\n",
527 __func__, PTR_ERR(ia->ri_bind_mem));
528 memreg = RPCRDMA_REGISTER;
529 ia->ri_bind_mem = NULL;
530 }
531 break;
532 default:
533 printk(KERN_ERR "%s: invalid memory registration mode %d\n",
534 __func__, memreg);
535 rc = -EINVAL;
536 goto out2;
 537 }
538 dprintk("RPC: %s: memory registration strategy is %d\n",
539 __func__, memreg);
540
541 /* Else will do memory reg/dereg for each chunk */
542 ia->ri_memreg_strategy = memreg;
543
544 return 0;
545out2:
546 rdma_destroy_id(ia->ri_id);
547out1:
548 return rc;
549}
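/*
 * Illustrative caller sequence (a sketch only; the transport code that
 * actually drives these calls lives outside this file, and "cdata" is
 * assumed to have been filled in from the create/mount parameters):
 *
 *	rc = rpcrdma_ia_open(xprt, addr, memreg);
 *	if (rc == 0)
 *		rc = rpcrdma_ep_create(&xprt->rx_ep, &xprt->rx_ia, &cdata);
 *	if (rc == 0)
 *		rc = rpcrdma_buffer_create(&xprt->rx_buf, &xprt->rx_ep,
 *					   &xprt->rx_ia, &cdata);
 *	...
 *	rc = rpcrdma_ep_connect(&xprt->rx_ep, &xprt->rx_ia);
 */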
550
551/*
552 * Clean up/close an IA.
553 * o if event handles and PD have been initialized, free them.
554 * o close the IA
555 */
556void
557rpcrdma_ia_close(struct rpcrdma_ia *ia)
558{
559 int rc;
560
561 dprintk("RPC: %s: entering\n", __func__);
562 if (ia->ri_bind_mem != NULL) {
563 rc = ib_dereg_mr(ia->ri_bind_mem);
564 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
565 __func__, rc);
566 }
567 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
568 rdma_destroy_qp(ia->ri_id);
569 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
570 rc = ib_dealloc_pd(ia->ri_pd);
571 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
572 __func__, rc);
573 }
574 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
575 rdma_destroy_id(ia->ri_id);
576}
577
578/*
579 * Create unconnected endpoint.
580 */
581int
582rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
583 struct rpcrdma_create_data_internal *cdata)
584{
585 struct ib_device_attr devattr;
 586 int rc, err;
587
588 rc = ib_query_device(ia->ri_id->device, &devattr);
589 if (rc) {
590 dprintk("RPC: %s: ib_query_device failed %d\n",
591 __func__, rc);
592 return rc;
593 }
594
595 /* check provider's send/recv wr limits */
596 if (cdata->max_requests > devattr.max_qp_wr)
597 cdata->max_requests = devattr.max_qp_wr;
598
599 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
600 ep->rep_attr.qp_context = ep;
601 /* send_cq and recv_cq initialized below */
602 ep->rep_attr.srq = NULL;
603 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
604 switch (ia->ri_memreg_strategy) {
605 case RPCRDMA_MEMWINDOWS_ASYNC:
606 case RPCRDMA_MEMWINDOWS:
607 /* Add room for mw_binds+unbinds - overkill! */
608 ep->rep_attr.cap.max_send_wr++;
609 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
610 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
611 return -EINVAL;
612 break;
613 default:
614 break;
615 }
616 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
617 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
618 ep->rep_attr.cap.max_recv_sge = 1;
619 ep->rep_attr.cap.max_inline_data = 0;
620 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
621 ep->rep_attr.qp_type = IB_QPT_RC;
622 ep->rep_attr.port_num = ~0;
623
624 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
625 "iovs: send %d recv %d\n",
626 __func__,
627 ep->rep_attr.cap.max_send_wr,
628 ep->rep_attr.cap.max_recv_wr,
629 ep->rep_attr.cap.max_send_sge,
630 ep->rep_attr.cap.max_recv_sge);
631
632 /* set trigger for requesting send completion */
633 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
634 switch (ia->ri_memreg_strategy) {
635 case RPCRDMA_MEMWINDOWS_ASYNC:
636 case RPCRDMA_MEMWINDOWS:
637 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
638 break;
639 default:
640 break;
641 }
642 if (ep->rep_cqinit <= 2)
643 ep->rep_cqinit = 0;
644 INIT_CQCOUNT(ep);
645 ep->rep_ia = ia;
646 init_waitqueue_head(&ep->rep_connect_wait);
647
648 /*
649 * Create a single cq for receive dto and mw_bind (only ever
650 * care about unbind, really). Send completions are suppressed.
651 * Use single threaded tasklet upcalls to maintain ordering.
652 */
653 ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
654 rpcrdma_cq_async_error_upcall, NULL,
655 ep->rep_attr.cap.max_recv_wr +
656 ep->rep_attr.cap.max_send_wr + 1, 0);
657 if (IS_ERR(ep->rep_cq)) {
658 rc = PTR_ERR(ep->rep_cq);
659 dprintk("RPC: %s: ib_create_cq failed: %i\n",
660 __func__, rc);
661 goto out1;
662 }
663
664 rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
665 if (rc) {
666 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
667 __func__, rc);
668 goto out2;
669 }
670
671 ep->rep_attr.send_cq = ep->rep_cq;
672 ep->rep_attr.recv_cq = ep->rep_cq;
673
674 /* Initialize cma parameters */
675
676 /* RPC/RDMA does not use private data */
677 ep->rep_remote_cma.private_data = NULL;
678 ep->rep_remote_cma.private_data_len = 0;
679
680 /* Client offers RDMA Read but does not initiate */
681 switch (ia->ri_memreg_strategy) {
682 case RPCRDMA_BOUNCEBUFFERS:
683 ep->rep_remote_cma.responder_resources = 0;
684 break;
685 case RPCRDMA_MTHCAFMR:
686 case RPCRDMA_REGISTER:
687 ep->rep_remote_cma.responder_resources = cdata->max_requests *
688 (RPCRDMA_MAX_DATA_SEGS / 8);
689 break;
690 case RPCRDMA_MEMWINDOWS:
691 case RPCRDMA_MEMWINDOWS_ASYNC:
692#if RPCRDMA_PERSISTENT_REGISTRATION
693 case RPCRDMA_ALLPHYSICAL:
694#endif
695 ep->rep_remote_cma.responder_resources = cdata->max_requests *
696 (RPCRDMA_MAX_DATA_SEGS / 2);
697 break;
698 default:
699 break;
700 }
701 if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom)
702 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
703 ep->rep_remote_cma.initiator_depth = 0;
704
705 ep->rep_remote_cma.retry_count = 7;
706 ep->rep_remote_cma.flow_control = 0;
707 ep->rep_remote_cma.rnr_retry_count = 0;
708
709 return 0;
710
711out2:
712 err = ib_destroy_cq(ep->rep_cq);
713 if (err)
714 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
715 __func__, err);
716out1:
717 return rc;
718}
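/*
 * rpcrdma_ep_create() above sizes a single completion queue for both send
 * and receive work (max_send_wr + max_recv_wr + 1 entries) and uses it as
 * the QP's send_cq and recv_cq.  rep_cqinit then sets how many sends may be
 * posted unsignaled before one must request a completion, since routine
 * send completions are otherwise suppressed.
 */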
719
720/*
721 * rpcrdma_ep_destroy
722 *
723 * Disconnect and destroy endpoint. After this, the only
724 * valid operations on the ep are to free it (if dynamically
725 * allocated) or re-create it.
726 *
727 * The caller's error handling must be sure to not leak the endpoint
728 * if this function fails.
729 */
730int
731rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
732{
733 int rc;
734
735 dprintk("RPC: %s: entering, connected is %d\n",
736 __func__, ep->rep_connected);
737
738 if (ia->ri_id->qp) {
739 rc = rpcrdma_ep_disconnect(ep, ia);
740 if (rc)
741 dprintk("RPC: %s: rpcrdma_ep_disconnect"
742 " returned %i\n", __func__, rc);
743 }
744
745 ep->rep_func = NULL;
746
747 /* padding - could be done in rpcrdma_buffer_destroy... */
748 if (ep->rep_pad_mr) {
749 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
750 ep->rep_pad_mr = NULL;
751 }
752
753 if (ia->ri_id->qp) {
754 rdma_destroy_qp(ia->ri_id);
755 ia->ri_id->qp = NULL;
756 }
757
758 rpcrdma_clean_cq(ep->rep_cq);
759 rc = ib_destroy_cq(ep->rep_cq);
760 if (rc)
761 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
762 __func__, rc);
763
764 return rc;
765}
766
767/*
768 * Connect unconnected endpoint.
769 */
770int
771rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
772{
773 struct rdma_cm_id *id;
774 int rc = 0;
775 int retry_count = 0;
776 int reconnect = (ep->rep_connected != 0);
777
778 if (reconnect) {
779 struct rpcrdma_xprt *xprt;
780retry:
781 rc = rpcrdma_ep_disconnect(ep, ia);
782 if (rc && rc != -ENOTCONN)
783 dprintk("RPC: %s: rpcrdma_ep_disconnect"
784 " status %i\n", __func__, rc);
785 rpcrdma_clean_cq(ep->rep_cq);
786
787 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
788 id = rpcrdma_create_id(xprt, ia,
789 (struct sockaddr *)&xprt->rx_data.addr);
790 if (IS_ERR(id)) {
791 rc = PTR_ERR(id);
792 goto out;
793 }
794 /* TEMP TEMP TEMP - fail if new device:
795 * Deregister/remarshal *all* requests!
796 * Close and recreate adapter, pd, etc!
797 * Re-determine all attributes still sane!
798 * More stuff I haven't thought of!
799 * Rrrgh!
800 */
801 if (ia->ri_id->device != id->device) {
802 printk("RPC: %s: can't reconnect on "
803 "different device!\n", __func__);
804 rdma_destroy_id(id);
805 rc = -ENETDOWN;
806 goto out;
807 }
808 /* END TEMP */
809 rdma_destroy_id(ia->ri_id);
810 ia->ri_id = id;
811 }
812
813 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
814 if (rc) {
815 dprintk("RPC: %s: rdma_create_qp failed %i\n",
816 __func__, rc);
817 goto out;
818 }
819
820/* XXX Tavor device performs badly with 2K MTU! */
821if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
822 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
823 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
824 (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
825 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
826 struct ib_qp_attr attr = {
827 .path_mtu = IB_MTU_1024
828 };
829 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
830 }
831}
832
833 /* Theoretically a client initiator_depth > 0 is not needed,
834 * but many peers fail to complete the connection unless they
835 * == responder_resources! */
836 if (ep->rep_remote_cma.initiator_depth !=
837 ep->rep_remote_cma.responder_resources)
838 ep->rep_remote_cma.initiator_depth =
839 ep->rep_remote_cma.responder_resources;
840
841 ep->rep_connected = 0;
842
843 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
844 if (rc) {
845 dprintk("RPC: %s: rdma_connect() failed with %i\n",
846 __func__, rc);
847 goto out;
848 }
849
850 if (reconnect)
851 return 0;
852
853 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
854
855 /*
856 * Check state. A non-peer reject indicates no listener
857 * (ECONNREFUSED), which may be a transient state. All
858 * others indicate a transport condition which has already
859 * undergone a best-effort.
860 */
861 if (ep->rep_connected == -ECONNREFUSED
862 && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
863 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
864 goto retry;
865 }
866 if (ep->rep_connected <= 0) {
867 /* Sometimes, the only way to reliably connect to remote
868 * CMs is to use same nonzero values for ORD and IRD. */
869 ep->rep_remote_cma.initiator_depth =
870 ep->rep_remote_cma.responder_resources;
871 if (ep->rep_remote_cma.initiator_depth == 0)
872 ++ep->rep_remote_cma.initiator_depth;
873 if (ep->rep_remote_cma.responder_resources == 0)
874 ++ep->rep_remote_cma.responder_resources;
875 if (retry_count++ == 0)
876 goto retry;
877 rc = ep->rep_connected;
878 } else {
879 dprintk("RPC: %s: connected\n", __func__);
880 }
881
882out:
883 if (rc)
884 ep->rep_connected = rc;
885 return rc;
886}
887
888/*
889 * rpcrdma_ep_disconnect
890 *
891 * This is separate from destroy to facilitate the ability
892 * to reconnect without recreating the endpoint.
893 *
894 * This call is not reentrant, and must not be made in parallel
895 * on the same endpoint.
896 */
897int
898rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
899{
900 int rc;
901
902 rpcrdma_clean_cq(ep->rep_cq);
903 rc = rdma_disconnect(ia->ri_id);
904 if (!rc) {
905 /* returns without wait if not connected */
906 wait_event_interruptible(ep->rep_connect_wait,
907 ep->rep_connected != 1);
908 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
909 (ep->rep_connected == 1) ? "still " : "dis");
910 } else {
911 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
912 ep->rep_connected = rc;
913 }
914 return rc;
915}
916
917/*
918 * Initialize buffer memory
919 */
920int
921rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
922 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
923{
924 char *p;
925 size_t len;
926 int i, rc;
 927 struct rpcrdma_mw *r;
928
929 buf->rb_max_requests = cdata->max_requests;
930 spin_lock_init(&buf->rb_lock);
931 atomic_set(&buf->rb_credits, 1);
932
933 /* Need to allocate:
934 * 1. arrays for send and recv pointers
935 * 2. arrays of struct rpcrdma_req to fill in pointers
936 * 3. array of struct rpcrdma_rep for replies
937 * 4. padding, if any
 938 * 5. mw's or fmr's, if any
939 * Send/recv buffers in req/rep need to be registered
940 */
941
942 len = buf->rb_max_requests *
943 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
944 len += cdata->padding;
945 switch (ia->ri_memreg_strategy) {
946 case RPCRDMA_MTHCAFMR:
947 /* TBD we are perhaps overallocating here */
948 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
949 sizeof(struct rpcrdma_mw);
950 break;
951 case RPCRDMA_MEMWINDOWS_ASYNC:
952 case RPCRDMA_MEMWINDOWS:
953 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
954 sizeof(struct rpcrdma_mw);
955 break;
956 default:
957 break;
958 }
959
960 /* allocate 1, 4 and 5 in one shot */
961 p = kzalloc(len, GFP_KERNEL);
962 if (p == NULL) {
963 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
964 __func__, len);
965 rc = -ENOMEM;
966 goto out;
967 }
968 buf->rb_pool = p; /* for freeing it later */
969
970 buf->rb_send_bufs = (struct rpcrdma_req **) p;
971 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
972 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
973 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
974
975 /*
976 * Register the zeroed pad buffer, if any.
977 */
978 if (cdata->padding) {
979 rc = rpcrdma_register_internal(ia, p, cdata->padding,
980 &ep->rep_pad_mr, &ep->rep_pad);
981 if (rc)
982 goto out;
983 }
984 p += cdata->padding;
985
986 /*
987 * Allocate the fmr's, or mw's for mw_bind chunk registration.
988 * We "cycle" the mw's in order to minimize rkey reuse,
989 * and also reduce unbind-to-bind collision.
990 */
991 INIT_LIST_HEAD(&buf->rb_mws);
 992 r = (struct rpcrdma_mw *)p;
993 switch (ia->ri_memreg_strategy) {
994 case RPCRDMA_MTHCAFMR:
995 /* TBD we are perhaps overallocating here */
996 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
997 static struct ib_fmr_attr fa =
998 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
999 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1000 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1001 &fa);
1002 if (IS_ERR(r->r.fmr)) {
1003 rc = PTR_ERR(r->r.fmr);
1004 dprintk("RPC: %s: ib_alloc_fmr"
1005 " failed %i\n", __func__, rc);
1006 goto out;
1007 }
1008 list_add(&r->mw_list, &buf->rb_mws);
1009 ++r;
1010 }
1011 break;
1012 case RPCRDMA_MEMWINDOWS_ASYNC:
1013 case RPCRDMA_MEMWINDOWS:
1014 /* Allocate one extra request's worth, for full cycling */
1015 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1016 r->r.mw = ib_alloc_mw(ia->ri_pd);
1017 if (IS_ERR(r->r.mw)) {
1018 rc = PTR_ERR(r->r.mw);
1019 dprintk("RPC: %s: ib_alloc_mw"
1020 " failed %i\n", __func__, rc);
1021 goto out;
1022 }
1023 list_add(&r->mw_list, &buf->rb_mws);
1024 ++r;
1025 }
1026 break;
1027 default:
1028 break;
1029 }
1030
1031 /*
1032 * Allocate/init the request/reply buffers. Doing this
1033 * using kmalloc for now -- one for each buf.
1034 */
1035 for (i = 0; i < buf->rb_max_requests; i++) {
1036 struct rpcrdma_req *req;
1037 struct rpcrdma_rep *rep;
1038
1039 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1040 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1041 /* Typical ~2400b, so rounding up saves work later */
1042 if (len < 4096)
1043 len = 4096;
1044 req = kmalloc(len, GFP_KERNEL);
1045 if (req == NULL) {
1046 dprintk("RPC: %s: request buffer %d alloc"
1047 " failed\n", __func__, i);
1048 rc = -ENOMEM;
1049 goto out;
1050 }
1051 memset(req, 0, sizeof(struct rpcrdma_req));
1052 buf->rb_send_bufs[i] = req;
1053 buf->rb_send_bufs[i]->rl_buffer = buf;
1054
1055 rc = rpcrdma_register_internal(ia, req->rl_base,
1056 len - offsetof(struct rpcrdma_req, rl_base),
1057 &buf->rb_send_bufs[i]->rl_handle,
1058 &buf->rb_send_bufs[i]->rl_iov);
1059 if (rc)
1060 goto out;
1061
1062 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1063
1064 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1065 rep = kmalloc(len, GFP_KERNEL);
1066 if (rep == NULL) {
1067 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1068 __func__, i);
1069 rc = -ENOMEM;
1070 goto out;
1071 }
1072 memset(rep, 0, sizeof(struct rpcrdma_rep));
1073 buf->rb_recv_bufs[i] = rep;
1074 buf->rb_recv_bufs[i]->rr_buffer = buf;
1075 init_waitqueue_head(&rep->rr_unbind);
1076
1077 rc = rpcrdma_register_internal(ia, rep->rr_base,
1078 len - offsetof(struct rpcrdma_rep, rr_base),
1079 &buf->rb_recv_bufs[i]->rr_handle,
1080 &buf->rb_recv_bufs[i]->rr_iov);
1081 if (rc)
1082 goto out;
1083
1084 }
1085 dprintk("RPC: %s: max_requests %d\n",
1086 __func__, buf->rb_max_requests);
1087 /* done */
1088 return 0;
1089out:
1090 rpcrdma_buffer_destroy(buf);
1091 return rc;
1092}
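/*
 * Layout of the single rb_pool allocation built above: the send and recv
 * pointer arrays come first, then the optional zeroed padding buffer, then
 * the array of rpcrdma_mw structures threaded onto rb_mws.  The req and rep
 * buffers themselves are separate kmalloc'ed regions, each registered so
 * that rl_iov/rr_iov can be used directly as SGEs.
 */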
1093
1094/*
1095 * Unregister and destroy buffer memory. Need to deal with
1096 * partial initialization, so it's callable from failed create.
1097 * Must be called before destroying endpoint, as registrations
1098 * reference it.
1099 */
1100void
1101rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1102{
1103 int rc, i;
1104 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
 1105 struct rpcrdma_mw *r;
1106
1107 /* clean up in reverse order from create
1108 * 1. recv mr memory (mr free, then kfree)
1109 * 1a. bind mw memory
1110 * 2. send mr memory (mr free, then kfree)
1111 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1112 * 4. arrays
1113 */
1114 dprintk("RPC: %s: entering\n", __func__);
1115
1116 for (i = 0; i < buf->rb_max_requests; i++) {
1117 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1118 rpcrdma_deregister_internal(ia,
1119 buf->rb_recv_bufs[i]->rr_handle,
1120 &buf->rb_recv_bufs[i]->rr_iov);
1121 kfree(buf->rb_recv_bufs[i]);
1122 }
1123 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1124 while (!list_empty(&buf->rb_mws)) {
1125 r = list_entry(buf->rb_mws.next,
1126 struct rpcrdma_mw, mw_list);
1127 list_del(&r->mw_list);
1128 switch (ia->ri_memreg_strategy) {
1129 case RPCRDMA_MTHCAFMR:
1130 rc = ib_dealloc_fmr(r->r.fmr);
1131 if (rc)
1132 dprintk("RPC: %s:"
1133 " ib_dealloc_fmr"
1134 " failed %i\n",
1135 __func__, rc);
1136 break;
1137 case RPCRDMA_MEMWINDOWS_ASYNC:
1138 case RPCRDMA_MEMWINDOWS:
1139 rc = ib_dealloc_mw(r->r.mw);
1140 if (rc)
1141 dprintk("RPC: %s:"
1142 " ib_dealloc_mw"
1143 " failed %i\n",
1144 __func__, rc);
1145 break;
1146 default:
1147 break;
1148 }
1149 }
1150 rpcrdma_deregister_internal(ia,
1151 buf->rb_send_bufs[i]->rl_handle,
1152 &buf->rb_send_bufs[i]->rl_iov);
1153 kfree(buf->rb_send_bufs[i]);
1154 }
1155 }
1156
1157 kfree(buf->rb_pool);
1158}
1159
1160/*
1161 * Get a set of request/reply buffers.
1162 *
1163 * Reply buffer (if needed) is attached to send buffer upon return.
1164 * Rule:
1165 * rb_send_index and rb_recv_index MUST always be pointing to the
1166 * *next* available buffer (non-NULL). They are incremented after
1167 * removing buffers, and decremented *before* returning them.
1168 */
1169struct rpcrdma_req *
1170rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1171{
1172 struct rpcrdma_req *req;
1173 unsigned long flags;
1174 int i;
1175 struct rpcrdma_mw *r;
1176
1177 spin_lock_irqsave(&buffers->rb_lock, flags);
1178 if (buffers->rb_send_index == buffers->rb_max_requests) {
1179 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1180 dprintk("RPC: %s: out of request buffers\n", __func__);
1181 return ((struct rpcrdma_req *)NULL);
1182 }
1183
1184 req = buffers->rb_send_bufs[buffers->rb_send_index];
1185 if (buffers->rb_send_index < buffers->rb_recv_index) {
1186 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1187 __func__,
1188 buffers->rb_recv_index - buffers->rb_send_index);
1189 req->rl_reply = NULL;
1190 } else {
1191 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1192 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1193 }
1194 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1195 if (!list_empty(&buffers->rb_mws)) {
 1196 i = RPCRDMA_MAX_SEGS - 1;
 1197 do {
1198 r = list_entry(buffers->rb_mws.next,
1199 struct rpcrdma_mw, mw_list);
1200 list_del(&r->mw_list);
1201 req->rl_segments[i].mr_chunk.rl_mw = r;
1202 } while (--i >= 0);
1203 }
1204 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1205 return req;
1206}
1207
1208/*
1209 * Put request/reply buffers back into pool.
1210 * Pre-decrement counter/array index.
1211 */
1212void
1213rpcrdma_buffer_put(struct rpcrdma_req *req)
1214{
1215 struct rpcrdma_buffer *buffers = req->rl_buffer;
1216 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1217 int i;
1218 unsigned long flags;
1219
1220 BUG_ON(req->rl_nchunks != 0);
1221 spin_lock_irqsave(&buffers->rb_lock, flags);
1222 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1223 req->rl_niovs = 0;
1224 if (req->rl_reply) {
1225 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1226 init_waitqueue_head(&req->rl_reply->rr_unbind);
1227 req->rl_reply->rr_func = NULL;
1228 req->rl_reply = NULL;
1229 }
1230 switch (ia->ri_memreg_strategy) {
1231 case RPCRDMA_MTHCAFMR:
1232 case RPCRDMA_MEMWINDOWS_ASYNC:
1233 case RPCRDMA_MEMWINDOWS:
1234 /*
1235 * Cycle mw's back in reverse order, and "spin" them.
1236 * This delays and scrambles reuse as much as possible.
1237 */
1238 i = 1;
1239 do {
1240 struct rpcrdma_mw **mw;
1241 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1242 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1243 *mw = NULL;
1244 } while (++i < RPCRDMA_MAX_SEGS);
1245 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1246 &buffers->rb_mws);
1247 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1248 break;
1249 default:
1250 break;
1251 }
1252 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1253}
1254
1255/*
1256 * Recover reply buffers from pool.
1257 * This happens when recovering from error conditions.
1258 * Post-increment counter/array index.
1259 */
1260void
1261rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1262{
1263 struct rpcrdma_buffer *buffers = req->rl_buffer;
1264 unsigned long flags;
1265
1266 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1267 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1268 spin_lock_irqsave(&buffers->rb_lock, flags);
1269 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1270 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1271 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1272 }
1273 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1274}
1275
1276/*
1277 * Put reply buffers back into pool when not attached to
1278 * request. This happens in error conditions, and when
1279 * aborting unbinds. Pre-decrement counter/array index.
1280 */
1281void
1282rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1283{
1284 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1285 unsigned long flags;
1286
1287 rep->rr_func = NULL;
1288 spin_lock_irqsave(&buffers->rb_lock, flags);
1289 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1290 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1291}
1292
1293/*
1294 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1295 */
1296
1297int
1298rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1299 struct ib_mr **mrp, struct ib_sge *iov)
1300{
1301 struct ib_phys_buf ipb;
1302 struct ib_mr *mr;
1303 int rc;
1304
1305 /*
1306 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1307 */
1308 iov->addr = ib_dma_map_single(ia->ri_id->device,
1309 va, len, DMA_BIDIRECTIONAL);
1310 iov->length = len;
1311
1312 if (ia->ri_have_dma_lkey) {
1313 *mrp = NULL;
1314 iov->lkey = ia->ri_dma_lkey;
1315 return 0;
1316 } else if (ia->ri_bind_mem != NULL) {
1317 *mrp = NULL;
1318 iov->lkey = ia->ri_bind_mem->lkey;
1319 return 0;
1320 }
1321
1322 ipb.addr = iov->addr;
1323 ipb.size = iov->length;
1324 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1325 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1326
1327 dprintk("RPC: %s: phys convert: 0x%llx "
1328 "registered 0x%llx length %d\n",
1329 __func__, (unsigned long long)ipb.addr,
1330 (unsigned long long)iov->addr, len);
1331
1332 if (IS_ERR(mr)) {
1333 *mrp = NULL;
1334 rc = PTR_ERR(mr);
1335 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1336 } else {
1337 *mrp = mr;
1338 iov->lkey = mr->lkey;
1339 rc = 0;
1340 }
1341
1342 return rc;
1343}
1344
1345int
1346rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1347 struct ib_mr *mr, struct ib_sge *iov)
1348{
1349 int rc;
1350
1351 ib_dma_unmap_single(ia->ri_id->device,
1352 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1353
1354 if (NULL == mr)
1355 return 0;
1356
1357 rc = ib_dereg_mr(mr);
1358 if (rc)
1359 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1360 return rc;
1361}
1362
1363/*
1364 * Wrappers for chunk registration, shared by read/write chunk code.
1365 */
1366
1367static void
1368rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1369{
1370 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1371 seg->mr_dmalen = seg->mr_len;
1372 if (seg->mr_page)
1373 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1374 seg->mr_page, offset_in_page(seg->mr_offset),
1375 seg->mr_dmalen, seg->mr_dir);
1376 else
1377 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1378 seg->mr_offset,
1379 seg->mr_dmalen, seg->mr_dir);
1380}
1381
1382static void
1383rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1384{
1385 if (seg->mr_page)
1386 ib_dma_unmap_page(ia->ri_id->device,
1387 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1388 else
1389 ib_dma_unmap_single(ia->ri_id->device,
1390 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1391}
1392
1393static int
1394rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1395 int *nsegs, int writing, struct rpcrdma_ia *ia)
1396{
1397 struct rpcrdma_mr_seg *seg1 = seg;
1398 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1399 int len, pageoff, i, rc;
1400
1401 pageoff = offset_in_page(seg1->mr_offset);
1402 seg1->mr_offset -= pageoff; /* start of page */
1403 seg1->mr_len += pageoff;
1404 len = -pageoff;
1405 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1406 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1407 for (i = 0; i < *nsegs;) {
1408 rpcrdma_map_one(ia, seg, writing);
1409 physaddrs[i] = seg->mr_dma;
1410 len += seg->mr_len;
1411 ++seg;
1412 ++i;
1413 /* Check for holes */
1414 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1415 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1416 break;
1417 }
1418 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1419 physaddrs, i, seg1->mr_dma);
1420 if (rc) {
1421 dprintk("RPC: %s: failed ib_map_phys_fmr "
1422 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1423 len, (unsigned long long)seg1->mr_dma,
1424 pageoff, i, rc);
1425 while (i--)
1426 rpcrdma_unmap_one(ia, --seg);
1427 } else {
1428 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1429 seg1->mr_base = seg1->mr_dma + pageoff;
1430 seg1->mr_nsegs = i;
1431 seg1->mr_len = len;
1432 }
1433 *nsegs = i;
1434 return rc;
1435}
1436
1437static int
1438rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1439 struct rpcrdma_ia *ia)
1440{
1441 struct rpcrdma_mr_seg *seg1 = seg;
1442 LIST_HEAD(l);
1443 int rc;
1444
1445 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1446 rc = ib_unmap_fmr(&l);
1447 while (seg1->mr_nsegs--)
1448 rpcrdma_unmap_one(ia, seg++);
1449 if (rc)
1450 dprintk("RPC: %s: failed ib_unmap_fmr,"
1451 " status %i\n", __func__, rc);
1452 return rc;
1453}
1454
1455static int
1456rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1457 int *nsegs, int writing, struct rpcrdma_ia *ia,
1458 struct rpcrdma_xprt *r_xprt)
1459{
1460 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1461 IB_ACCESS_REMOTE_READ);
1462 struct ib_mw_bind param;
1463 int rc;
1464
1465 *nsegs = 1;
1466 rpcrdma_map_one(ia, seg, writing);
1467 param.mr = ia->ri_bind_mem;
1468 param.wr_id = 0ULL; /* no send cookie */
1469 param.addr = seg->mr_dma;
1470 param.length = seg->mr_len;
1471 param.send_flags = 0;
1472 param.mw_access_flags = mem_priv;
1473
1474 DECR_CQCOUNT(&r_xprt->rx_ep);
1475 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1476 if (rc) {
1477 dprintk("RPC: %s: failed ib_bind_mw "
1478 "%u@0x%llx status %i\n",
1479 __func__, seg->mr_len,
1480 (unsigned long long)seg->mr_dma, rc);
1481 rpcrdma_unmap_one(ia, seg);
1482 } else {
1483 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1484 seg->mr_base = param.addr;
1485 seg->mr_nsegs = 1;
1486 }
1487 return rc;
1488}
1489
1490static int
1491rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1492 struct rpcrdma_ia *ia,
1493 struct rpcrdma_xprt *r_xprt, void **r)
1494{
1495 struct ib_mw_bind param;
1496 LIST_HEAD(l);
1497 int rc;
1498
1499 BUG_ON(seg->mr_nsegs != 1);
1500 param.mr = ia->ri_bind_mem;
1501 param.addr = 0ULL; /* unbind */
1502 param.length = 0;
1503 param.mw_access_flags = 0;
1504 if (*r) {
1505 param.wr_id = (u64) (unsigned long) *r;
1506 param.send_flags = IB_SEND_SIGNALED;
1507 INIT_CQCOUNT(&r_xprt->rx_ep);
1508 } else {
1509 param.wr_id = 0ULL;
1510 param.send_flags = 0;
1511 DECR_CQCOUNT(&r_xprt->rx_ep);
1512 }
1513 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1514 rpcrdma_unmap_one(ia, seg);
1515 if (rc)
1516 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1517 " status %i\n", __func__, rc);
1518 else
1519 *r = NULL; /* will upcall on completion */
1520 return rc;
1521}
1522
1523static int
1524rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1525 int *nsegs, int writing, struct rpcrdma_ia *ia)
1526{
1527 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1528 IB_ACCESS_REMOTE_READ);
1529 struct rpcrdma_mr_seg *seg1 = seg;
1530 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1531 int len, i, rc = 0;
1532
1533 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1534 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1535 for (len = 0, i = 0; i < *nsegs;) {
1536 rpcrdma_map_one(ia, seg, writing);
1537 ipb[i].addr = seg->mr_dma;
1538 ipb[i].size = seg->mr_len;
1539 len += seg->mr_len;
1540 ++seg;
1541 ++i;
1542 /* Check for holes */
1543 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1544 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1545 break;
1546 }
1547 seg1->mr_base = seg1->mr_dma;
1548 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1549 ipb, i, mem_priv, &seg1->mr_base);
1550 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1551 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1552 dprintk("RPC: %s: failed ib_reg_phys_mr "
1553 "%u@0x%llx (%d)... status %i\n",
1554 __func__, len,
1555 (unsigned long long)seg1->mr_dma, i, rc);
1556 while (i--)
1557 rpcrdma_unmap_one(ia, --seg);
1558 } else {
1559 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1560 seg1->mr_nsegs = i;
1561 seg1->mr_len = len;
1562 }
1563 *nsegs = i;
1564 return rc;
1565}
1566
1567static int
1568rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1569 struct rpcrdma_ia *ia)
1570{
1571 struct rpcrdma_mr_seg *seg1 = seg;
1572 int rc;
1573
1574 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1575 seg1->mr_chunk.rl_mr = NULL;
1576 while (seg1->mr_nsegs--)
1577 rpcrdma_unmap_one(ia, seg++);
1578 if (rc)
1579 dprintk("RPC: %s: failed ib_dereg_mr,"
1580 " status %i\n", __func__, rc);
1581 return rc;
1582}
1583
1584int
1585rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1586 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1587{
1588 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1589 int rc = 0;
1590
1591 switch (ia->ri_memreg_strategy) {
1592
1593#if RPCRDMA_PERSISTENT_REGISTRATION
1594 case RPCRDMA_ALLPHYSICAL:
1595 rpcrdma_map_one(ia, seg, writing);
1596 seg->mr_rkey = ia->ri_bind_mem->rkey;
1597 seg->mr_base = seg->mr_dma;
1598 seg->mr_nsegs = 1;
1599 nsegs = 1;
1600 break;
1601#endif
1602
 1603 /* Registration using fmr memory registration */
 1604 case RPCRDMA_MTHCAFMR:
 1605 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1606 break;
1607
1608 /* Registration using memory windows */
1609 case RPCRDMA_MEMWINDOWS_ASYNC:
1610 case RPCRDMA_MEMWINDOWS:
 1611 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1612 break;
1613
1614 /* Default registration each time */
1615 default:
 1616 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
1617 break;
1618 }
1619 if (rc)
1620 return -1;
1621
1622 return nsegs;
1623}
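/*
 * rpcrdma_register_external() returns the number of segments the chosen
 * strategy actually covered (the helpers may stop early at a page-alignment
 * hole), or -1 if the underlying registration verb failed.
 */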
1624
1625int
1626rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1627 struct rpcrdma_xprt *r_xprt, void *r)
1628{
1629 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1630 int nsegs = seg->mr_nsegs, rc;
1631
1632 switch (ia->ri_memreg_strategy) {
1633
1634#if RPCRDMA_PERSISTENT_REGISTRATION
1635 case RPCRDMA_ALLPHYSICAL:
1636 BUG_ON(nsegs != 1);
1637 rpcrdma_unmap_one(ia, seg);
1638 rc = 0;
1639 break;
1640#endif
1641
1642 case RPCRDMA_MTHCAFMR:
 1643 rc = rpcrdma_deregister_fmr_external(seg, ia);
1644 break;
1645
1646 case RPCRDMA_MEMWINDOWS_ASYNC:
1647 case RPCRDMA_MEMWINDOWS:
 1648 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1649 break;
1650
1651 default:
 1652 rc = rpcrdma_deregister_default_external(seg, ia);
1653 break;
1654 }
1655 if (r) {
1656 struct rpcrdma_rep *rep = r;
1657 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1658 rep->rr_func = NULL;
1659 func(rep); /* dereg done, callback now */
1660 }
1661 return nsegs;
1662}
1663
1664/*
1665 * Prepost any receive buffer, then post send.
1666 *
1667 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1668 */
1669int
1670rpcrdma_ep_post(struct rpcrdma_ia *ia,
1671 struct rpcrdma_ep *ep,
1672 struct rpcrdma_req *req)
1673{
1674 struct ib_send_wr send_wr, *send_wr_fail;
1675 struct rpcrdma_rep *rep = req->rl_reply;
1676 int rc;
1677
1678 if (rep) {
1679 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1680 if (rc)
1681 goto out;
1682 req->rl_reply = NULL;
1683 }
1684
1685 send_wr.next = NULL;
1686 send_wr.wr_id = 0ULL; /* no send cookie */
1687 send_wr.sg_list = req->rl_send_iov;
1688 send_wr.num_sge = req->rl_niovs;
1689 send_wr.opcode = IB_WR_SEND;
1690 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1691 ib_dma_sync_single_for_device(ia->ri_id->device,
1692 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1693 DMA_TO_DEVICE);
1694 ib_dma_sync_single_for_device(ia->ri_id->device,
1695 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1696 DMA_TO_DEVICE);
1697 ib_dma_sync_single_for_device(ia->ri_id->device,
1698 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1699 DMA_TO_DEVICE);
1700
1701 if (DECR_CQCOUNT(ep) > 0)
1702 send_wr.send_flags = 0;
1703 else { /* Provider must take a send completion every now and then */
1704 INIT_CQCOUNT(ep);
1705 send_wr.send_flags = IB_SEND_SIGNALED;
1706 }
1707
1708 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1709 if (rc)
1710 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1711 rc);
1712out:
1713 return rc;
1714}
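/*
 * Send-completion throttling in rpcrdma_ep_post() above: sends are normally
 * posted unsignaled; only when DECR_CQCOUNT() exhausts the budget set by
 * INIT_CQCOUNT() is a send posted with IB_SEND_SIGNALED, giving the provider
 * a periodic completion with which to retire send queue entries.
 */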
1715
1716/*
1717 * (Re)post a receive buffer.
1718 */
1719int
1720rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1721 struct rpcrdma_ep *ep,
1722 struct rpcrdma_rep *rep)
1723{
1724 struct ib_recv_wr recv_wr, *recv_wr_fail;
1725 int rc;
1726
1727 recv_wr.next = NULL;
1728 recv_wr.wr_id = (u64) (unsigned long) rep;
1729 recv_wr.sg_list = &rep->rr_iov;
1730 recv_wr.num_sge = 1;
1731
1732 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1733 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1734
1735 DECR_CQCOUNT(ep);
1736 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1737
1738 if (rc)
1739 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1740 rc);
1741 return rc;
1742}