1 /*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 /*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50 #include <linux/interrupt.h>
51 #include <linux/slab.h>
52 #include <asm/bitops.h>
53
54 #include "xprt_rdma.h"
55
56 /*
57 * Globals/Macros
58 */
59
60 #ifdef RPC_DEBUG
61 # define RPCDBG_FACILITY RPCDBG_TRANS
62 #endif
63
64 /*
65 * internal functions
66 */
67
68 /*
69 * handle replies in tasklet context, using a single, global list
70 * rdma tasklet function -- just turn around and call the func
71 * for all replies on the list
72 */
73
74 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
75 static LIST_HEAD(rpcrdma_tasklets_g);
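
/*
 * Flow: rpcrdma_recvcq_process_wc() hands a completed reply to
 * rpcrdma_schedule_tasklet(), which queues the rpcrdma_rep on
 * rpcrdma_tasklets_g under rpcrdma_tk_lock_g and kicks
 * rpcrdma_tasklet_g.  rpcrdma_run_tasklet() then drains the list in
 * softirq context, invoking rep->rr_func (the reply handler) or
 * returning the buffer to the pool when no handler is set.
 */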
76
77 static void
78 rpcrdma_run_tasklet(unsigned long data)
79 {
80 struct rpcrdma_rep *rep;
81 void (*func)(struct rpcrdma_rep *);
82 unsigned long flags;
83
84 data = data; /* tasklet data argument is unused */
85 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
86 while (!list_empty(&rpcrdma_tasklets_g)) {
87 rep = list_entry(rpcrdma_tasklets_g.next,
88 struct rpcrdma_rep, rr_list);
89 list_del(&rep->rr_list);
90 func = rep->rr_func;
91 rep->rr_func = NULL;
92 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
93
94 if (func)
95 func(rep);
96 else
97 rpcrdma_recv_buffer_put(rep);
98
99 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
100 }
101 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
102 }
103
104 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
105
106 static inline void
107 rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
108 {
109 unsigned long flags;
110
111 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
112 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
113 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
114 tasklet_schedule(&rpcrdma_tasklet_g);
115 }
116
117 static void
118 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
119 {
120 struct rpcrdma_ep *ep = context;
121
122 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
123 __func__, event->event, event->device->name, context);
124 if (ep->rep_connected == 1) {
125 ep->rep_connected = -EIO;
126 ep->rep_func(ep);
127 wake_up_all(&ep->rep_connect_wait);
128 }
129 }
130
131 static void
132 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
133 {
134 struct rpcrdma_ep *ep = context;
135
136 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
137 __func__, event->event, event->device->name, context);
138 if (ep->rep_connected == 1) {
139 ep->rep_connected = -EIO;
140 ep->rep_func(ep);
141 wake_up_all(&ep->rep_connect_wait);
142 }
143 }
144
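/*
 * Plain RDMA SENDs are posted with a zero wr_id (see rpcrdma_ep_post()),
 * so those completions need no bookkeeping here.  FAST_REG_MR and
 * LOCAL_INV WRs carry the rpcrdma_mw pointer in wr_id so the FRMR's
 * valid/invalid state can be tracked.
 */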
145 static void
146 rpcrdma_sendcq_process_wc(struct ib_wc *wc)
147 {
148 struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
149
150 dprintk("RPC: %s: frmr %p status %X opcode %d\n",
151 __func__, frmr, wc->status, wc->opcode);
152
153 if (wc->wr_id == 0ULL)
154 return;
155 if (wc->status != IB_WC_SUCCESS)
156 return;
157
158 if (wc->opcode == IB_WC_FAST_REG_MR)
159 frmr->r.frmr.state = FRMR_IS_VALID;
160 else if (wc->opcode == IB_WC_LOCAL_INV)
161 frmr->r.frmr.state = FRMR_IS_INVALID;
162 }
163
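/*
 * Drain the send CQ RPCRDMA_POLLSIZE entries at a time into the
 * endpoint's scratch array, stopping after roughly RPCRDMA_WC_BUDGET
 * completions so a single upcall cannot run unbounded.
 */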
164 static int
165 rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
166 {
167 struct ib_wc *wcs;
168 int budget, count, rc;
169
170 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
171 do {
172 wcs = ep->rep_send_wcs;
173
174 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
175 if (rc <= 0)
176 return rc;
177
178 count = rc;
179 while (count-- > 0)
180 rpcrdma_sendcq_process_wc(wcs++);
181 } while (rc == RPCRDMA_POLLSIZE && --budget);
182 return 0;
183 }
184
185 /*
186 * Handle send, fast_reg_mr, and local_inv completions.
187 *
188 * Send events are typically suppressed and thus do not result
189 * in an upcall. Occasionally one is signaled, however. This
190 * prevents the provider's completion queue from wrapping and
191 * losing a completion.
192 */
193 static void
194 rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
195 {
196 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
197 int rc;
198
199 rc = rpcrdma_sendcq_poll(cq, ep);
200 if (rc) {
201 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
202 __func__, rc);
203 return;
204 }
205
206 rc = ib_req_notify_cq(cq,
207 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
208 if (rc == 0)
209 return;
210 if (rc < 0) {
211 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
212 __func__, rc);
213 return;
214 }
215
216 rpcrdma_sendcq_poll(cq, ep);
217 }
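
/*
 * Note: with IB_CQ_REPORT_MISSED_EVENTS, ib_req_notify_cq() returns a
 * positive value when completions arrived between the final poll and
 * the re-arm; the extra poll above catches them rather than waiting
 * for the next interrupt.  The receive path below uses the same pattern.
 */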
218
219 static void
220 rpcrdma_recvcq_process_wc(struct ib_wc *wc)
221 {
222 struct rpcrdma_rep *rep =
223 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
224
225 dprintk("RPC: %s: rep %p status %X opcode %X length %u\n",
226 __func__, rep, wc->status, wc->opcode, wc->byte_len);
227
228 if (wc->status != IB_WC_SUCCESS) {
229 rep->rr_len = ~0U;
230 goto out_schedule;
231 }
232 if (wc->opcode != IB_WC_RECV)
233 return;
234
235 rep->rr_len = wc->byte_len;
236 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
237 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
238
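/* The first 16 bytes of a reply cover the RPC-over-RDMA header, whose
 * rm_credit field advertises how many requests the server currently
 * accepts.  Clamp it to a sane range and record it in rb_credits.
 */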
239 if (rep->rr_len >= 16) {
240 struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
241 unsigned int credits = ntohl(p->rm_credit);
242
243 if (credits == 0)
244 credits = 1; /* don't deadlock */
245 else if (credits > rep->rr_buffer->rb_max_requests)
246 credits = rep->rr_buffer->rb_max_requests;
247 atomic_set(&rep->rr_buffer->rb_credits, credits);
248 }
249
250 out_schedule:
251 rpcrdma_schedule_tasklet(rep);
252 }
253
254 static int
255 rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
256 {
257 struct ib_wc *wcs;
258 int budget, count, rc;
259
260 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
261 do {
262 wcs = ep->rep_recv_wcs;
263
264 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
265 if (rc <= 0)
266 return rc;
267
268 count = rc;
269 while (count-- > 0)
270 rpcrdma_recvcq_process_wc(wcs++);
271 } while (rc == RPCRDMA_POLLSIZE && --budget);
272 return 0;
273 }
274
275 /*
276 * Handle receive completions.
277 *
278 * It is reentrant but processes single events in order to maintain
279 * the ordering of receives needed to track server credits.
280 *
281 * It is the responsibility of the scheduled tasklet to return
282 * recv buffers to the pool. NOTE: this affects synchronization of
283 * connection shutdown. That is, the structures required for
284 * the completion of the reply handler must remain intact until
285 * all memory has been reclaimed.
286 */
287 static void
288 rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
289 {
290 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
291 int rc;
292
293 rc = rpcrdma_recvcq_poll(cq, ep);
294 if (rc) {
295 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
296 __func__, rc);
297 return;
298 }
299
300 rc = ib_req_notify_cq(cq,
301 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
302 if (rc == 0)
303 return;
304 if (rc < 0) {
305 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
306 __func__, rc);
307 return;
308 }
309
310 rpcrdma_recvcq_poll(cq, ep);
311 }
312
313 #ifdef RPC_DEBUG
314 static const char * const conn[] = {
315 "address resolved",
316 "address error",
317 "route resolved",
318 "route error",
319 "connect request",
320 "connect response",
321 "connect error",
322 "unreachable",
323 "rejected",
324 "established",
325 "disconnected",
326 "device removal"
327 };
328 #endif
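
/*
 * The strings above are indexed directly by enum rdma_cm_event_type
 * (RDMA_CM_EVENT_ADDR_RESOLVED == 0 through
 * RDMA_CM_EVENT_DEVICE_REMOVAL == 11), hence the "event->event <= 11"
 * guard in the upcall below.
 */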
329
330 static int
331 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
332 {
333 struct rpcrdma_xprt *xprt = id->context;
334 struct rpcrdma_ia *ia = &xprt->rx_ia;
335 struct rpcrdma_ep *ep = &xprt->rx_ep;
336 #ifdef RPC_DEBUG
337 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
338 #endif
339 struct ib_qp_attr attr;
340 struct ib_qp_init_attr iattr;
341 int connstate = 0;
342
343 switch (event->event) {
344 case RDMA_CM_EVENT_ADDR_RESOLVED:
345 case RDMA_CM_EVENT_ROUTE_RESOLVED:
346 ia->ri_async_rc = 0;
347 complete(&ia->ri_done);
348 break;
349 case RDMA_CM_EVENT_ADDR_ERROR:
350 ia->ri_async_rc = -EHOSTUNREACH;
351 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
352 __func__, ep);
353 complete(&ia->ri_done);
354 break;
355 case RDMA_CM_EVENT_ROUTE_ERROR:
356 ia->ri_async_rc = -ENETUNREACH;
357 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
358 __func__, ep);
359 complete(&ia->ri_done);
360 break;
361 case RDMA_CM_EVENT_ESTABLISHED:
362 connstate = 1;
363 ib_query_qp(ia->ri_id->qp, &attr,
364 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
365 &iattr);
366 dprintk("RPC: %s: %d responder resources"
367 " (%d initiator)\n",
368 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
369 goto connected;
370 case RDMA_CM_EVENT_CONNECT_ERROR:
371 connstate = -ENOTCONN;
372 goto connected;
373 case RDMA_CM_EVENT_UNREACHABLE:
374 connstate = -ENETDOWN;
375 goto connected;
376 case RDMA_CM_EVENT_REJECTED:
377 connstate = -ECONNREFUSED;
378 goto connected;
379 case RDMA_CM_EVENT_DISCONNECTED:
380 connstate = -ECONNABORTED;
381 goto connected;
382 case RDMA_CM_EVENT_DEVICE_REMOVAL:
383 connstate = -ENODEV;
384 connected:
385 dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
386 __func__,
387 (event->event <= 11) ? conn[event->event] :
388 "unknown connection error",
389 &addr->sin_addr.s_addr,
390 ntohs(addr->sin_port),
391 ep, event->event);
392 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
393 dprintk("RPC: %s: %sconnected\n",
394 __func__, connstate > 0 ? "" : "dis");
395 ep->rep_connected = connstate;
396 ep->rep_func(ep);
397 wake_up_all(&ep->rep_connect_wait);
398 break;
399 default:
400 dprintk("RPC: %s: unexpected CM event %d\n",
401 __func__, event->event);
402 break;
403 }
404
405 #ifdef RPC_DEBUG
406 if (connstate == 1) {
407 int ird = attr.max_dest_rd_atomic;
408 int tird = ep->rep_remote_cma.responder_resources;
409 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
410 "on %s, memreg %d slots %d ird %d%s\n",
411 &addr->sin_addr.s_addr,
412 ntohs(addr->sin_port),
413 ia->ri_id->device->name,
414 ia->ri_memreg_strategy,
415 xprt->rx_buf.rb_max_requests,
416 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
417 } else if (connstate < 0) {
418 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
419 &addr->sin_addr.s_addr,
420 ntohs(addr->sin_port),
421 connstate);
422 }
423 #endif
424
425 return 0;
426 }
427
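/*
 * Address and route resolution are asynchronous: ri_async_rc is preset
 * to -ETIMEDOUT, and rpcrdma_conn_upcall() overwrites it and completes
 * ri_done when the CM event arrives.  If the wait below times out, the
 * preset value is what the caller sees.
 */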
428 static struct rdma_cm_id *
429 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
430 struct rpcrdma_ia *ia, struct sockaddr *addr)
431 {
432 struct rdma_cm_id *id;
433 int rc;
434
435 init_completion(&ia->ri_done);
436
437 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
438 if (IS_ERR(id)) {
439 rc = PTR_ERR(id);
440 dprintk("RPC: %s: rdma_create_id() failed %i\n",
441 __func__, rc);
442 return id;
443 }
444
445 ia->ri_async_rc = -ETIMEDOUT;
446 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
447 if (rc) {
448 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
449 __func__, rc);
450 goto out;
451 }
452 wait_for_completion_interruptible_timeout(&ia->ri_done,
453 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
454 rc = ia->ri_async_rc;
455 if (rc)
456 goto out;
457
458 ia->ri_async_rc = -ETIMEDOUT;
459 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
460 if (rc) {
461 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
462 __func__, rc);
463 goto out;
464 }
465 wait_for_completion_interruptible_timeout(&ia->ri_done,
466 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
467 rc = ia->ri_async_rc;
468 if (rc)
469 goto out;
470
471 return id;
472
473 out:
474 rdma_destroy_id(id);
475 return ERR_PTR(rc);
476 }
477
478 /*
479 * Drain any cq, prior to teardown.
480 */
481 static void
482 rpcrdma_clean_cq(struct ib_cq *cq)
483 {
484 struct ib_wc wc;
485 int count = 0;
486
487 while (1 == ib_poll_cq(cq, 1, &wc))
488 ++count;
489
490 if (count)
491 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
492 __func__, count, wc.opcode);
493 }
494
495 /*
496 * Exported functions.
497 */
498
499 /*
500 * Open and initialize an Interface Adapter.
501 * o initializes fields of struct rpcrdma_ia, including
502 * interface and provider attributes and protection zone.
503 */
504 int
505 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
506 {
507 int rc, mem_priv;
508 struct ib_device_attr devattr;
509 struct rpcrdma_ia *ia = &xprt->rx_ia;
510
511 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
512 if (IS_ERR(ia->ri_id)) {
513 rc = PTR_ERR(ia->ri_id);
514 goto out1;
515 }
516
517 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
518 if (IS_ERR(ia->ri_pd)) {
519 rc = PTR_ERR(ia->ri_pd);
520 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
521 __func__, rc);
522 goto out2;
523 }
524
525 /*
526 * Query the device to determine if the requested memory
527 * registration strategy is supported. If it isn't, set the
528 * strategy to a globally supported model.
529 */
530 rc = ib_query_device(ia->ri_id->device, &devattr);
531 if (rc) {
532 dprintk("RPC: %s: ib_query_device failed %d\n",
533 __func__, rc);
534 goto out2;
535 }
536
537 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
538 ia->ri_have_dma_lkey = 1;
539 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
540 }
541
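/* Fallback order: FRMR needs both MEM_MGT_EXTENSIONS and a local DMA
 * lkey; without them, fall back to FMR.  FMR in turn needs the
 * device's alloc_fmr verb; without that, use ALLPHYSICAL when
 * persistent registration is built in, otherwise fail the open.
 */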
542 if (memreg == RPCRDMA_FRMR) {
543 /* Requires both frmr reg and local dma lkey */
544 if ((devattr.device_cap_flags &
545 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
546 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
547 dprintk("RPC: %s: FRMR registration "
548 "not supported by HCA\n", __func__);
549 memreg = RPCRDMA_MTHCAFMR;
550 } else {
551 /* Mind the ia limit on FRMR page list depth */
552 ia->ri_max_frmr_depth = min_t(unsigned int,
553 RPCRDMA_MAX_DATA_SEGS,
554 devattr.max_fast_reg_page_list_len);
555 }
556 }
557 if (memreg == RPCRDMA_MTHCAFMR) {
558 if (!ia->ri_id->device->alloc_fmr) {
559 dprintk("RPC: %s: MTHCAFMR registration "
560 "not supported by HCA\n", __func__);
561 #if RPCRDMA_PERSISTENT_REGISTRATION
562 memreg = RPCRDMA_ALLPHYSICAL;
563 #else
564 rc = -ENOMEM;
565 goto out2;
566 #endif
567 }
568 }
569
570 /*
571 * Optionally obtain an underlying physical identity mapping in
572 * order to do a memory window-based bind. This base registration
573 * is protected from remote access - that is enabled only by binding
574 * for the specific bytes targeted during each RPC operation, and
575 * revoked after the corresponding completion similar to a storage
576 * adapter.
577 */
578 switch (memreg) {
579 case RPCRDMA_FRMR:
580 break;
581 #if RPCRDMA_PERSISTENT_REGISTRATION
582 case RPCRDMA_ALLPHYSICAL:
583 mem_priv = IB_ACCESS_LOCAL_WRITE |
584 IB_ACCESS_REMOTE_WRITE |
585 IB_ACCESS_REMOTE_READ;
586 goto register_setup;
587 #endif
588 case RPCRDMA_MTHCAFMR:
589 if (ia->ri_have_dma_lkey)
590 break;
591 mem_priv = IB_ACCESS_LOCAL_WRITE;
592 #if RPCRDMA_PERSISTENT_REGISTRATION
593 register_setup:
594 #endif
595 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
596 if (IS_ERR(ia->ri_bind_mem)) {
597 printk(KERN_ALERT "%s: ib_get_dma_mr for "
598 "phys register failed with %lX\n",
599 __func__, PTR_ERR(ia->ri_bind_mem));
600 rc = -ENOMEM;
601 goto out2;
602 }
603 break;
604 default:
605 printk(KERN_ERR "RPC: Unsupported memory "
606 "registration mode: %d\n", memreg);
607 rc = -ENOMEM;
608 goto out2;
609 }
610 dprintk("RPC: %s: memory registration strategy is %d\n",
611 __func__, memreg);
612
613 /* The remaining strategies register/deregister memory per chunk */
614 ia->ri_memreg_strategy = memreg;
615
616 return 0;
617 out2:
618 rdma_destroy_id(ia->ri_id);
619 ia->ri_id = NULL;
620 out1:
621 return rc;
622 }
623
624 /*
625 * Clean up/close an IA.
626 * o if event handles and PD have been initialized, free them.
627 * o close the IA
628 */
629 void
630 rpcrdma_ia_close(struct rpcrdma_ia *ia)
631 {
632 int rc;
633
634 dprintk("RPC: %s: entering\n", __func__);
635 if (ia->ri_bind_mem != NULL) {
636 rc = ib_dereg_mr(ia->ri_bind_mem);
637 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
638 __func__, rc);
639 }
640 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
641 if (ia->ri_id->qp)
642 rdma_destroy_qp(ia->ri_id);
643 rdma_destroy_id(ia->ri_id);
644 ia->ri_id = NULL;
645 }
646 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
647 rc = ib_dealloc_pd(ia->ri_pd);
648 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
649 __func__, rc);
650 }
651 }
652
653 /*
654 * Create unconnected endpoint.
655 */
656 int
657 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
658 struct rpcrdma_create_data_internal *cdata)
659 {
660 struct ib_device_attr devattr;
661 struct ib_cq *sendcq, *recvcq;
662 int rc, err;
663
664 rc = ib_query_device(ia->ri_id->device, &devattr);
665 if (rc) {
666 dprintk("RPC: %s: ib_query_device failed %d\n",
667 __func__, rc);
668 return rc;
669 }
670
671 /* check provider's send/recv wr limits */
672 if (cdata->max_requests > devattr.max_qp_wr)
673 cdata->max_requests = devattr.max_qp_wr;
674
675 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
676 ep->rep_attr.qp_context = ep;
677 /* send_cq and recv_cq initialized below */
678 ep->rep_attr.srq = NULL;
679 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
680 switch (ia->ri_memreg_strategy) {
681 case RPCRDMA_FRMR: {
682 int depth = 7;
683
684 /* Add room for frmr register and invalidate WRs.
685 * 1. FRMR reg WR for head
686 * 2. FRMR invalidate WR for head
687 * 3. N FRMR reg WRs for pagelist
688 * 4. N FRMR invalidate WRs for pagelist
689 * 5. FRMR reg WR for tail
690 * 6. FRMR invalidate WR for tail
691 * 7. The RDMA_SEND WR
692 */
693
694 /* Calculate N if the device max FRMR depth is smaller than
695 * RPCRDMA_MAX_DATA_SEGS.
696 */
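/* Worked example (hypothetical values): if RPCRDMA_MAX_DATA_SEGS were
 * 64 and the device allowed 16 pages per FRMR, delta would start at 48
 * and the loop below would run three times, growing depth from 7 to 13,
 * i.e. 64/16 = 4 reg/invalidate pairs for the pagelist in total.
 */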
697 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
698 int delta = RPCRDMA_MAX_DATA_SEGS -
699 ia->ri_max_frmr_depth;
700
701 do {
702 depth += 2; /* FRMR reg + invalidate */
703 delta -= ia->ri_max_frmr_depth;
704 } while (delta > 0);
705
706 }
707 ep->rep_attr.cap.max_send_wr *= depth;
708 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
709 cdata->max_requests = devattr.max_qp_wr / depth;
710 if (!cdata->max_requests)
711 return -EINVAL;
712 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
713 depth;
714 }
715 break;
716 }
717 default:
718 break;
719 }
720 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
721 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
722 ep->rep_attr.cap.max_recv_sge = 1;
723 ep->rep_attr.cap.max_inline_data = 0;
724 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
725 ep->rep_attr.qp_type = IB_QPT_RC;
726 ep->rep_attr.port_num = ~0;
727
728 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
729 "iovs: send %d recv %d\n",
730 __func__,
731 ep->rep_attr.cap.max_send_wr,
732 ep->rep_attr.cap.max_recv_wr,
733 ep->rep_attr.cap.max_send_sge,
734 ep->rep_attr.cap.max_recv_sge);
735
736 /* set trigger for requesting send completion */
737 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
738 if (ep->rep_cqinit <= 2)
739 ep->rep_cqinit = 0;
740 INIT_CQCOUNT(ep);
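/* DECR_CQCOUNT() in rpcrdma_ep_post() counts sends down from this
 * value; when it reaches zero the next send is posted IB_SEND_SIGNALED
 * and the counter is reset, so roughly one send completion is requested
 * per max_send_wr/2 posts.
 */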
741 ep->rep_ia = ia;
742 init_waitqueue_head(&ep->rep_connect_wait);
743 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
744
745 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
746 rpcrdma_cq_async_error_upcall, ep,
747 ep->rep_attr.cap.max_send_wr + 1, 0);
748 if (IS_ERR(sendcq)) {
749 rc = PTR_ERR(sendcq);
750 dprintk("RPC: %s: failed to create send CQ: %i\n",
751 __func__, rc);
752 goto out1;
753 }
754
755 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
756 if (rc) {
757 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
758 __func__, rc);
759 goto out2;
760 }
761
762 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
763 rpcrdma_cq_async_error_upcall, ep,
764 ep->rep_attr.cap.max_recv_wr + 1, 0);
765 if (IS_ERR(recvcq)) {
766 rc = PTR_ERR(recvcq);
767 dprintk("RPC: %s: failed to create recv CQ: %i\n",
768 __func__, rc);
769 goto out2;
770 }
771
772 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
773 if (rc) {
774 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
775 __func__, rc);
776 ib_destroy_cq(recvcq);
777 goto out2;
778 }
779
780 ep->rep_attr.send_cq = sendcq;
781 ep->rep_attr.recv_cq = recvcq;
782
783 /* Initialize cma parameters */
784
785 /* RPC/RDMA does not use private data */
786 ep->rep_remote_cma.private_data = NULL;
787 ep->rep_remote_cma.private_data_len = 0;
788
789 /* Client offers RDMA Read but does not initiate */
790 ep->rep_remote_cma.initiator_depth = 0;
791 if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
792 ep->rep_remote_cma.responder_resources = 32;
793 else
794 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
795
796 ep->rep_remote_cma.retry_count = 7;
797 ep->rep_remote_cma.flow_control = 0;
798 ep->rep_remote_cma.rnr_retry_count = 0;
799
800 return 0;
801
802 out2:
803 err = ib_destroy_cq(sendcq);
804 if (err)
805 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
806 __func__, err);
807 out1:
808 return rc;
809 }
810
811 /*
812 * rpcrdma_ep_destroy
813 *
814 * Disconnect and destroy endpoint. After this, the only
815 * valid operations on the ep are to free it (if dynamically
816 * allocated) or re-create it.
817 */
818 void
819 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
820 {
821 int rc;
822
823 dprintk("RPC: %s: entering, connected is %d\n",
824 __func__, ep->rep_connected);
825
826 cancel_delayed_work_sync(&ep->rep_connect_worker);
827
828 if (ia->ri_id->qp) {
829 rc = rpcrdma_ep_disconnect(ep, ia);
830 if (rc)
831 dprintk("RPC: %s: rpcrdma_ep_disconnect"
832 " returned %i\n", __func__, rc);
833 rdma_destroy_qp(ia->ri_id);
834 ia->ri_id->qp = NULL;
835 }
836
837 /* padding - could be done in rpcrdma_buffer_destroy... */
838 if (ep->rep_pad_mr) {
839 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
840 ep->rep_pad_mr = NULL;
841 }
842
843 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
844 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
845 if (rc)
846 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
847 __func__, rc);
848
849 rpcrdma_clean_cq(ep->rep_attr.send_cq);
850 rc = ib_destroy_cq(ep->rep_attr.send_cq);
851 if (rc)
852 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
853 __func__, rc);
854 }
855
856 /*
857 * Connect unconnected endpoint.
858 */
859 int
860 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
861 {
862 struct rdma_cm_id *id;
863 int rc = 0;
864 int retry_count = 0;
865
866 if (ep->rep_connected != 0) {
867 struct rpcrdma_xprt *xprt;
868 retry:
869 dprintk("RPC: %s: reconnecting...\n", __func__);
870 rc = rpcrdma_ep_disconnect(ep, ia);
871 if (rc && rc != -ENOTCONN)
872 dprintk("RPC: %s: rpcrdma_ep_disconnect"
873 " status %i\n", __func__, rc);
874
875 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
876 rpcrdma_clean_cq(ep->rep_attr.send_cq);
877
878 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
879 id = rpcrdma_create_id(xprt, ia,
880 (struct sockaddr *)&xprt->rx_data.addr);
881 if (IS_ERR(id)) {
882 rc = -EHOSTUNREACH;
883 goto out;
884 }
885 /* TEMP TEMP TEMP - fail if new device:
886 * Deregister/remarshal *all* requests!
887 * Close and recreate adapter, pd, etc!
888 * Re-determine all attributes still sane!
889 * More stuff I haven't thought of!
890 * Rrrgh!
891 */
892 if (ia->ri_id->device != id->device) {
893 printk("RPC: %s: can't reconnect on "
894 "different device!\n", __func__);
895 rdma_destroy_id(id);
896 rc = -ENETUNREACH;
897 goto out;
898 }
899 /* END TEMP */
900 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
901 if (rc) {
902 dprintk("RPC: %s: rdma_create_qp failed %i\n",
903 __func__, rc);
904 rdma_destroy_id(id);
905 rc = -ENETUNREACH;
906 goto out;
907 }
908 rdma_destroy_qp(ia->ri_id);
909 rdma_destroy_id(ia->ri_id);
910 ia->ri_id = id;
911 } else {
912 dprintk("RPC: %s: connecting...\n", __func__);
913 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
914 if (rc) {
915 dprintk("RPC: %s: rdma_create_qp failed %i\n",
916 __func__, rc);
917 /* do not update ep->rep_connected */
918 return -ENETUNREACH;
919 }
920 }
921
922 ep->rep_connected = 0;
923
924 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
925 if (rc) {
926 dprintk("RPC: %s: rdma_connect() failed with %i\n",
927 __func__, rc);
928 goto out;
929 }
930
931 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
932
933 /*
934 * Check state. A non-peer reject indicates no listener
935 * (ECONNREFUSED), which may be a transient state. All
936 * others indicate a transport condition for which a best-effort
937 * recovery attempt has already been made.
938 */
939 if (ep->rep_connected == -ECONNREFUSED &&
940 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
941 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
942 goto retry;
943 }
944 if (ep->rep_connected <= 0) {
945 /* Sometimes, the only way to reliably connect to remote
946 * CMs is to use the same nonzero values for ORD and IRD. */
947 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
948 (ep->rep_remote_cma.responder_resources == 0 ||
949 ep->rep_remote_cma.initiator_depth !=
950 ep->rep_remote_cma.responder_resources)) {
951 if (ep->rep_remote_cma.responder_resources == 0)
952 ep->rep_remote_cma.responder_resources = 1;
953 ep->rep_remote_cma.initiator_depth =
954 ep->rep_remote_cma.responder_resources;
955 goto retry;
956 }
957 rc = ep->rep_connected;
958 } else {
959 dprintk("RPC: %s: connected\n", __func__);
960 }
961
962 out:
963 if (rc)
964 ep->rep_connected = rc;
965 return rc;
966 }
967
968 /*
969 * rpcrdma_ep_disconnect
970 *
971 * This is separate from destroy to facilitate the ability
972 * to reconnect without recreating the endpoint.
973 *
974 * This call is not reentrant, and must not be made in parallel
975 * on the same endpoint.
976 */
977 int
978 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
979 {
980 int rc;
981
982 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
983 rpcrdma_clean_cq(ep->rep_attr.send_cq);
984 rc = rdma_disconnect(ia->ri_id);
985 if (!rc) {
986 /* returns without wait if not connected */
987 wait_event_interruptible(ep->rep_connect_wait,
988 ep->rep_connected != 1);
989 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
990 (ep->rep_connected == 1) ? "still " : "dis");
991 } else {
992 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
993 ep->rep_connected = rc;
994 }
995 return rc;
996 }
997
998 /*
999 * Initialize buffer memory
1000 */
1001 int
1002 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1003 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
1004 {
1005 char *p;
1006 size_t len, rlen, wlen;
1007 int i, rc;
1008 struct rpcrdma_mw *r;
1009
1010 buf->rb_max_requests = cdata->max_requests;
1011 spin_lock_init(&buf->rb_lock);
1012 atomic_set(&buf->rb_credits, 1);
1013
1014 /* Need to allocate:
1015 * 1. arrays for send and recv pointers
1016 * 2. arrays of struct rpcrdma_req to fill in pointers
1017 * 3. array of struct rpcrdma_rep for replies
1018 * 4. padding, if any
1019 * 5. mw's, fmr's or frmr's, if any
1020 * Send/recv buffers in req/rep need to be registered
1021 */
1022
1023 len = buf->rb_max_requests *
1024 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1025 len += cdata->padding;
1026 switch (ia->ri_memreg_strategy) {
1027 case RPCRDMA_FRMR:
1028 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
1029 sizeof(struct rpcrdma_mw);
1030 break;
1031 case RPCRDMA_MTHCAFMR:
1032 /* TBD we are perhaps overallocating here */
1033 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1034 sizeof(struct rpcrdma_mw);
1035 break;
1036 default:
1037 break;
1038 }
1039
1040 /* allocate 1, 4 and 5 in one shot */
1041 p = kzalloc(len, GFP_KERNEL);
1042 if (p == NULL) {
1043 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1044 __func__, len);
1045 rc = -ENOMEM;
1046 goto out;
1047 }
1048 buf->rb_pool = p; /* for freeing it later */
1049
1050 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1051 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1052 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1053 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1054
1055 /*
1056 * Register the zeroed pad buffer, if any.
1057 */
1058 if (cdata->padding) {
1059 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1060 &ep->rep_pad_mr, &ep->rep_pad);
1061 if (rc)
1062 goto out;
1063 }
1064 p += cdata->padding;
1065
1066 INIT_LIST_HEAD(&buf->rb_mws);
1067 r = (struct rpcrdma_mw *)p;
1068 switch (ia->ri_memreg_strategy) {
1069 case RPCRDMA_FRMR:
1070 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1071 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1072 ia->ri_max_frmr_depth);
1073 if (IS_ERR(r->r.frmr.fr_mr)) {
1074 rc = PTR_ERR(r->r.frmr.fr_mr);
1075 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1076 " failed %i\n", __func__, rc);
1077 goto out;
1078 }
1079 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1080 ia->ri_id->device,
1081 ia->ri_max_frmr_depth);
1082 if (IS_ERR(r->r.frmr.fr_pgl)) {
1083 rc = PTR_ERR(r->r.frmr.fr_pgl);
1084 dprintk("RPC: %s: "
1085 "ib_alloc_fast_reg_page_list "
1086 "failed %i\n", __func__, rc);
1087
1088 ib_dereg_mr(r->r.frmr.fr_mr);
1089 goto out;
1090 }
1091 list_add(&r->mw_list, &buf->rb_mws);
1092 ++r;
1093 }
1094 break;
1095 case RPCRDMA_MTHCAFMR:
1096 /* TBD we are perhaps overallocating here */
1097 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1098 static struct ib_fmr_attr fa =
1099 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1100 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1101 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1102 &fa);
1103 if (IS_ERR(r->r.fmr)) {
1104 rc = PTR_ERR(r->r.fmr);
1105 dprintk("RPC: %s: ib_alloc_fmr"
1106 " failed %i\n", __func__, rc);
1107 goto out;
1108 }
1109 list_add(&r->mw_list, &buf->rb_mws);
1110 ++r;
1111 }
1112 break;
1113 default:
1114 break;
1115 }
1116
1117 /*
1118 * Allocate/init the request/reply buffers. Doing this
1119 * using kmalloc for now -- one for each buf.
1120 */
1121 wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
1122 rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
1123 dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
1124 __func__, wlen, rlen);
1125
1126 for (i = 0; i < buf->rb_max_requests; i++) {
1127 struct rpcrdma_req *req;
1128 struct rpcrdma_rep *rep;
1129
1130 req = kmalloc(wlen, GFP_KERNEL);
1131 if (req == NULL) {
1132 dprintk("RPC: %s: request buffer %d alloc"
1133 " failed\n", __func__, i);
1134 rc = -ENOMEM;
1135 goto out;
1136 }
1137 memset(req, 0, sizeof(struct rpcrdma_req));
1138 buf->rb_send_bufs[i] = req;
1139 buf->rb_send_bufs[i]->rl_buffer = buf;
1140
1141 rc = rpcrdma_register_internal(ia, req->rl_base,
1142 wlen - offsetof(struct rpcrdma_req, rl_base),
1143 &buf->rb_send_bufs[i]->rl_handle,
1144 &buf->rb_send_bufs[i]->rl_iov);
1145 if (rc)
1146 goto out;
1147
1148 buf->rb_send_bufs[i]->rl_size = wlen -
1149 sizeof(struct rpcrdma_req);
1150
1151 rep = kmalloc(rlen, GFP_KERNEL);
1152 if (rep == NULL) {
1153 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1154 __func__, i);
1155 rc = -ENOMEM;
1156 goto out;
1157 }
1158 memset(rep, 0, sizeof(struct rpcrdma_rep));
1159 buf->rb_recv_bufs[i] = rep;
1160 buf->rb_recv_bufs[i]->rr_buffer = buf;
1161
1162 rc = rpcrdma_register_internal(ia, rep->rr_base,
1163 rlen - offsetof(struct rpcrdma_rep, rr_base),
1164 &buf->rb_recv_bufs[i]->rr_handle,
1165 &buf->rb_recv_bufs[i]->rr_iov);
1166 if (rc)
1167 goto out;
1168
1169 }
1170 dprintk("RPC: %s: max_requests %d\n",
1171 __func__, buf->rb_max_requests);
1172 /* done */
1173 return 0;
1174 out:
1175 rpcrdma_buffer_destroy(buf);
1176 return rc;
1177 }
1178
1179 /*
1180 * Unregister and destroy buffer memory. Need to deal with
1181 * partial initialization, so it's callable from failed create.
1182 * Must be called before destroying endpoint, as registrations
1183 * reference it.
1184 */
1185 void
1186 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1187 {
1188 int rc, i;
1189 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1190 struct rpcrdma_mw *r;
1191
1192 /* clean up in reverse order from create
1193 * 1. recv mr memory (mr free, then kfree)
1194 * 2. send mr memory (mr free, then kfree)
1195 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1196 * 4. arrays
1197 */
1198 dprintk("RPC: %s: entering\n", __func__);
1199
1200 for (i = 0; i < buf->rb_max_requests; i++) {
1201 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1202 rpcrdma_deregister_internal(ia,
1203 buf->rb_recv_bufs[i]->rr_handle,
1204 &buf->rb_recv_bufs[i]->rr_iov);
1205 kfree(buf->rb_recv_bufs[i]);
1206 }
1207 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1208 rpcrdma_deregister_internal(ia,
1209 buf->rb_send_bufs[i]->rl_handle,
1210 &buf->rb_send_bufs[i]->rl_iov);
1211 kfree(buf->rb_send_bufs[i]);
1212 }
1213 }
1214
1215 while (!list_empty(&buf->rb_mws)) {
1216 r = list_entry(buf->rb_mws.next,
1217 struct rpcrdma_mw, mw_list);
1218 list_del(&r->mw_list);
1219 switch (ia->ri_memreg_strategy) {
1220 case RPCRDMA_FRMR:
1221 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1222 if (rc)
1223 dprintk("RPC: %s:"
1224 " ib_dereg_mr"
1225 " failed %i\n",
1226 __func__, rc);
1227 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1228 break;
1229 case RPCRDMA_MTHCAFMR:
1230 rc = ib_dealloc_fmr(r->r.fmr);
1231 if (rc)
1232 dprintk("RPC: %s:"
1233 " ib_dealloc_fmr"
1234 " failed %i\n",
1235 __func__, rc);
1236 break;
1237 default:
1238 break;
1239 }
1240 }
1241
1242 kfree(buf->rb_pool);
1243 }
1244
1245 /*
1246 * Get a set of request/reply buffers.
1247 *
1248 * Reply buffer (if needed) is attached to send buffer upon return.
1249 * Rule:
1250 * rb_send_index and rb_recv_index MUST always be pointing to the
1251 * *next* available buffer (non-NULL). They are incremented after
1252 * removing buffers, and decremented *before* returning them.
1253 */
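/*
 * When the registration strategy uses MWs (rb_mws is populated), each
 * request also takes RPCRDMA_MAX_SEGS of them off rb_mws while the
 * lock is held, one per rl_segments[] entry; rpcrdma_buffer_put()
 * cycles them back.
 */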
1254 struct rpcrdma_req *
1255 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1256 {
1257 struct rpcrdma_req *req;
1258 unsigned long flags;
1259 int i;
1260 struct rpcrdma_mw *r;
1261
1262 spin_lock_irqsave(&buffers->rb_lock, flags);
1263 if (buffers->rb_send_index == buffers->rb_max_requests) {
1264 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1265 dprintk("RPC: %s: out of request buffers\n", __func__);
1266 return ((struct rpcrdma_req *)NULL);
1267 }
1268
1269 req = buffers->rb_send_bufs[buffers->rb_send_index];
1270 if (buffers->rb_send_index < buffers->rb_recv_index) {
1271 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1272 __func__,
1273 buffers->rb_recv_index - buffers->rb_send_index);
1274 req->rl_reply = NULL;
1275 } else {
1276 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1277 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1278 }
1279 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1280 if (!list_empty(&buffers->rb_mws)) {
1281 i = RPCRDMA_MAX_SEGS - 1;
1282 do {
1283 r = list_entry(buffers->rb_mws.next,
1284 struct rpcrdma_mw, mw_list);
1285 list_del(&r->mw_list);
1286 req->rl_segments[i].mr_chunk.rl_mw = r;
1287 } while (--i >= 0);
1288 }
1289 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1290 return req;
1291 }
1292
1293 /*
1294 * Put request/reply buffers back into pool.
1295 * Pre-decrement counter/array index.
1296 */
1297 void
1298 rpcrdma_buffer_put(struct rpcrdma_req *req)
1299 {
1300 struct rpcrdma_buffer *buffers = req->rl_buffer;
1301 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1302 int i;
1303 unsigned long flags;
1304
1305 spin_lock_irqsave(&buffers->rb_lock, flags);
1306 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1307 req->rl_niovs = 0;
1308 if (req->rl_reply) {
1309 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1310 req->rl_reply->rr_func = NULL;
1311 req->rl_reply = NULL;
1312 }
1313 switch (ia->ri_memreg_strategy) {
1314 case RPCRDMA_FRMR:
1315 case RPCRDMA_MTHCAFMR:
1316 /*
1317 * Cycle mw's back in reverse order, and "spin" them.
1318 * This delays and scrambles reuse as much as possible.
1319 */
1320 i = 1;
1321 do {
1322 struct rpcrdma_mw **mw;
1323 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1324 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1325 *mw = NULL;
1326 } while (++i < RPCRDMA_MAX_SEGS);
1327 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1328 &buffers->rb_mws);
1329 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1330 break;
1331 default:
1332 break;
1333 }
1334 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1335 }
1336
1337 /*
1338 * Recover reply buffers from pool.
1339 * This happens when recovering from error conditions.
1340 * Post-increment counter/array index.
1341 */
1342 void
1343 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1344 {
1345 struct rpcrdma_buffer *buffers = req->rl_buffer;
1346 unsigned long flags;
1347
1348 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1349 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1350 spin_lock_irqsave(&buffers->rb_lock, flags);
1351 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1352 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1353 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1354 }
1355 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1356 }
1357
1358 /*
1359 * Put reply buffers back into pool when not attached to
1360 * request. This happens in error conditions.
1361 */
1362 void
1363 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1364 {
1365 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1366 unsigned long flags;
1367
1368 rep->rr_func = NULL;
1369 spin_lock_irqsave(&buffers->rb_lock, flags);
1370 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1371 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1372 }
1373
1374 /*
1375 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1376 */
1377
1378 int
1379 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1380 struct ib_mr **mrp, struct ib_sge *iov)
1381 {
1382 struct ib_phys_buf ipb;
1383 struct ib_mr *mr;
1384 int rc;
1385
1386 /*
1387 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1388 */
1389 iov->addr = ib_dma_map_single(ia->ri_id->device,
1390 va, len, DMA_BIDIRECTIONAL);
1391 iov->length = len;
1392
1393 if (ia->ri_have_dma_lkey) {
1394 *mrp = NULL;
1395 iov->lkey = ia->ri_dma_lkey;
1396 return 0;
1397 } else if (ia->ri_bind_mem != NULL) {
1398 *mrp = NULL;
1399 iov->lkey = ia->ri_bind_mem->lkey;
1400 return 0;
1401 }
1402
1403 ipb.addr = iov->addr;
1404 ipb.size = iov->length;
1405 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1406 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1407
1408 dprintk("RPC: %s: phys convert: 0x%llx "
1409 "registered 0x%llx length %d\n",
1410 __func__, (unsigned long long)ipb.addr,
1411 (unsigned long long)iov->addr, len);
1412
1413 if (IS_ERR(mr)) {
1414 *mrp = NULL;
1415 rc = PTR_ERR(mr);
1416 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1417 } else {
1418 *mrp = mr;
1419 iov->lkey = mr->lkey;
1420 rc = 0;
1421 }
1422
1423 return rc;
1424 }
1425
1426 int
1427 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1428 struct ib_mr *mr, struct ib_sge *iov)
1429 {
1430 int rc;
1431
1432 ib_dma_unmap_single(ia->ri_id->device,
1433 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1434
1435 if (NULL == mr)
1436 return 0;
1437
1438 rc = ib_dereg_mr(mr);
1439 if (rc)
1440 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1441 return rc;
1442 }
1443
1444 /*
1445 * Wrappers for chunk registration, shared by read/write chunk code.
1446 */
1447
1448 static void
1449 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1450 {
1451 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1452 seg->mr_dmalen = seg->mr_len;
1453 if (seg->mr_page)
1454 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1455 seg->mr_page, offset_in_page(seg->mr_offset),
1456 seg->mr_dmalen, seg->mr_dir);
1457 else
1458 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1459 seg->mr_offset,
1460 seg->mr_dmalen, seg->mr_dir);
1461 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1462 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1463 __func__,
1464 (unsigned long long)seg->mr_dma,
1465 seg->mr_offset, seg->mr_dmalen);
1466 }
1467 }
1468
1469 static void
1470 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1471 {
1472 if (seg->mr_page)
1473 ib_dma_unmap_page(ia->ri_id->device,
1474 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1475 else
1476 ib_dma_unmap_single(ia->ri_id->device,
1477 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1478 }
1479
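/*
 * FRMR registration coalesces adjacent segments into a single
 * fast-register WR as long as interior boundaries fall on page
 * boundaries; a hole ends the run.  If the MR was left in the valid
 * state by an earlier registration, a LOCAL_INV WR is chained in front
 * of the fast-register WR.
 */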
1480 static int
1481 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1482 int *nsegs, int writing, struct rpcrdma_ia *ia,
1483 struct rpcrdma_xprt *r_xprt)
1484 {
1485 struct rpcrdma_mr_seg *seg1 = seg;
1486 struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
1487
1488 u8 key;
1489 int len, pageoff;
1490 int i, rc;
1491 int seg_len;
1492 u64 pa;
1493 int page_no;
1494
1495 pageoff = offset_in_page(seg1->mr_offset);
1496 seg1->mr_offset -= pageoff; /* start of page */
1497 seg1->mr_len += pageoff;
1498 len = -pageoff;
1499 if (*nsegs > ia->ri_max_frmr_depth)
1500 *nsegs = ia->ri_max_frmr_depth;
1501 for (page_no = i = 0; i < *nsegs;) {
1502 rpcrdma_map_one(ia, seg, writing);
1503 pa = seg->mr_dma;
1504 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1505 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
1506 page_list[page_no++] = pa;
1507 pa += PAGE_SIZE;
1508 }
1509 len += seg->mr_len;
1510 ++seg;
1511 ++i;
1512 /* Check for holes */
1513 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1514 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1515 break;
1516 }
1517 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1518 __func__, seg1->mr_chunk.rl_mw, i);
1519
1520 if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
1521 dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
1522 __func__,
1523 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
1524 /* Invalidate before using. */
1525 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1526 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1527 invalidate_wr.next = &frmr_wr;
1528 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1529 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1530 invalidate_wr.ex.invalidate_rkey =
1531 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1532 DECR_CQCOUNT(&r_xprt->rx_ep);
1533 post_wr = &invalidate_wr;
1534 } else
1535 post_wr = &frmr_wr;
1536
1537 /* Prepare FRMR WR */
1538 memset(&frmr_wr, 0, sizeof frmr_wr);
1539 frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1540 frmr_wr.opcode = IB_WR_FAST_REG_MR;
1541 frmr_wr.send_flags = IB_SEND_SIGNALED;
1542 frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1543 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1544 frmr_wr.wr.fast_reg.page_list_len = page_no;
1545 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1546 frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1547 if (frmr_wr.wr.fast_reg.length < len) {
1548 while (seg1->mr_nsegs--)
1549 rpcrdma_unmap_one(ia, seg++);
1550 return -EIO;
1551 }
1552
1553 /* Bump the key */
1554 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1555 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1556
1557 frmr_wr.wr.fast_reg.access_flags = (writing ?
1558 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1559 IB_ACCESS_REMOTE_READ);
1560 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1561 DECR_CQCOUNT(&r_xprt->rx_ep);
1562
1563 rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
1564
1565 if (rc) {
1566 dprintk("RPC: %s: failed ib_post_send for register,"
1567 " status %i\n", __func__, rc);
1568 while (i--)
1569 rpcrdma_unmap_one(ia, --seg);
1570 } else {
1571 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1572 seg1->mr_base = seg1->mr_dma + pageoff;
1573 seg1->mr_nsegs = i;
1574 seg1->mr_len = len;
1575 }
1576 *nsegs = i;
1577 return rc;
1578 }
1579
1580 static int
1581 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1582 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1583 {
1584 struct rpcrdma_mr_seg *seg1 = seg;
1585 struct ib_send_wr invalidate_wr, *bad_wr;
1586 int rc;
1587
1588 while (seg1->mr_nsegs--)
1589 rpcrdma_unmap_one(ia, seg++);
1590
1591 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1592 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1593 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1594 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1595 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1596 DECR_CQCOUNT(&r_xprt->rx_ep);
1597
1598 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1599 if (rc)
1600 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1601 " status %i\n", __func__, rc);
1602 return rc;
1603 }
1604
1605 static int
1606 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1607 int *nsegs, int writing, struct rpcrdma_ia *ia)
1608 {
1609 struct rpcrdma_mr_seg *seg1 = seg;
1610 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1611 int len, pageoff, i, rc;
1612
1613 pageoff = offset_in_page(seg1->mr_offset);
1614 seg1->mr_offset -= pageoff; /* start of page */
1615 seg1->mr_len += pageoff;
1616 len = -pageoff;
1617 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1618 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1619 for (i = 0; i < *nsegs;) {
1620 rpcrdma_map_one(ia, seg, writing);
1621 physaddrs[i] = seg->mr_dma;
1622 len += seg->mr_len;
1623 ++seg;
1624 ++i;
1625 /* Check for holes */
1626 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1627 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1628 break;
1629 }
1630 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1631 physaddrs, i, seg1->mr_dma);
1632 if (rc) {
1633 dprintk("RPC: %s: failed ib_map_phys_fmr "
1634 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1635 len, (unsigned long long)seg1->mr_dma,
1636 pageoff, i, rc);
1637 while (i--)
1638 rpcrdma_unmap_one(ia, --seg);
1639 } else {
1640 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1641 seg1->mr_base = seg1->mr_dma + pageoff;
1642 seg1->mr_nsegs = i;
1643 seg1->mr_len = len;
1644 }
1645 *nsegs = i;
1646 return rc;
1647 }
1648
1649 static int
1650 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1651 struct rpcrdma_ia *ia)
1652 {
1653 struct rpcrdma_mr_seg *seg1 = seg;
1654 LIST_HEAD(l);
1655 int rc;
1656
1657 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1658 rc = ib_unmap_fmr(&l);
1659 while (seg1->mr_nsegs--)
1660 rpcrdma_unmap_one(ia, seg++);
1661 if (rc)
1662 dprintk("RPC: %s: failed ib_unmap_fmr,"
1663 " status %i\n", __func__, rc);
1664 return rc;
1665 }
1666
1667 int
1668 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1669 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1670 {
1671 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1672 int rc = 0;
1673
1674 switch (ia->ri_memreg_strategy) {
1675
1676 #if RPCRDMA_PERSISTENT_REGISTRATION
1677 case RPCRDMA_ALLPHYSICAL:
1678 rpcrdma_map_one(ia, seg, writing);
1679 seg->mr_rkey = ia->ri_bind_mem->rkey;
1680 seg->mr_base = seg->mr_dma;
1681 seg->mr_nsegs = 1;
1682 nsegs = 1;
1683 break;
1684 #endif
1685
1686 /* Registration using frmr registration */
1687 case RPCRDMA_FRMR:
1688 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1689 break;
1690
1691 /* Registration using fmr memory registration */
1692 case RPCRDMA_MTHCAFMR:
1693 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1694 break;
1695
1696 default:
1697 return -1;
1698 }
1699 if (rc)
1700 return -1;
1701
1702 return nsegs;
1703 }
1704
1705 int
1706 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1707 struct rpcrdma_xprt *r_xprt)
1708 {
1709 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1710 int nsegs = seg->mr_nsegs, rc;
1711
1712 switch (ia->ri_memreg_strategy) {
1713
1714 #if RPCRDMA_PERSISTENT_REGISTRATION
1715 case RPCRDMA_ALLPHYSICAL:
1716 rpcrdma_unmap_one(ia, seg);
1717 break;
1718 #endif
1719
1720 case RPCRDMA_FRMR:
1721 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1722 break;
1723
1724 case RPCRDMA_MTHCAFMR:
1725 rc = rpcrdma_deregister_fmr_external(seg, ia);
1726 break;
1727
1728 default:
1729 break;
1730 }
1731 return nsegs;
1732 }
1733
1734 /*
1735 * Prepost any receive buffer, then post send.
1736 *
1737 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1738 */
1739 int
1740 rpcrdma_ep_post(struct rpcrdma_ia *ia,
1741 struct rpcrdma_ep *ep,
1742 struct rpcrdma_req *req)
1743 {
1744 struct ib_send_wr send_wr, *send_wr_fail;
1745 struct rpcrdma_rep *rep = req->rl_reply;
1746 int rc;
1747
1748 if (rep) {
1749 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1750 if (rc)
1751 goto out;
1752 req->rl_reply = NULL;
1753 }
1754
1755 send_wr.next = NULL;
1756 send_wr.wr_id = 0ULL; /* no send cookie */
1757 send_wr.sg_list = req->rl_send_iov;
1758 send_wr.num_sge = req->rl_niovs;
1759 send_wr.opcode = IB_WR_SEND;
1760 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1761 ib_dma_sync_single_for_device(ia->ri_id->device,
1762 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1763 DMA_TO_DEVICE);
1764 ib_dma_sync_single_for_device(ia->ri_id->device,
1765 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1766 DMA_TO_DEVICE);
1767 ib_dma_sync_single_for_device(ia->ri_id->device,
1768 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1769 DMA_TO_DEVICE);
1770
1771 if (DECR_CQCOUNT(ep) > 0)
1772 send_wr.send_flags = 0;
1773 else { /* Provider must take a send completion every now and then */
1774 INIT_CQCOUNT(ep);
1775 send_wr.send_flags = IB_SEND_SIGNALED;
1776 }
1777
1778 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1779 if (rc)
1780 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1781 rc);
1782 out:
1783 return rc;
1784 }
1785
1786 /*
1787 * (Re)post a receive buffer.
1788 */
1789 int
1790 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1791 struct rpcrdma_ep *ep,
1792 struct rpcrdma_rep *rep)
1793 {
1794 struct ib_recv_wr recv_wr, *recv_wr_fail;
1795 int rc;
1796
1797 recv_wr.next = NULL;
1798 recv_wr.wr_id = (u64) (unsigned long) rep;
1799 recv_wr.sg_list = &rep->rr_iov;
1800 recv_wr.num_sge = 1;
1801
1802 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1803 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1804
1805 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1806
1807 if (rc)
1808 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1809 rc);
1810 return rc;
1811 }