/*
 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

#include "mlx5_ib.h"

#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

struct workqueue_struct *mlx5_ib_page_fault_wq;

void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
			      unsigned long end)
{
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
	u64 idx = 0, blk_start_idx = 0;
	int in_block = 0;
	u64 addr;

	if (!umem || !umem->odp_data) {
		pr_err("invalidation called on NULL umem or non-ODP umem\n");
		return;
	}

	mr = umem->odp_data->private;

	if (!mr || !mr->ibmr.pd)
		return;

	start = max_t(u64, ib_umem_start(umem), start);
	end = min_t(u64, ib_umem_end(umem), end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
	 * while we are doing the invalidation, no page fault will attempt to
	 * overwrite the same MTTs. Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */
	for (addr = start; addr < end; addr += (u64)umem->page_size) {
		idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of a bigger
		 * UMR.
		 */
		if (umem->odp_data->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}
		} else {
			u64 umr_offset = idx & umr_block_mask;

			if (in_block && umr_offset == 0) {
				mlx5_ib_update_mtt(mr, blk_start_idx,
						   idx - blk_start_idx, 1);
				in_block = 0;
			}
		}
	}
	if (in_block)
		mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
				   1);

	/*
	 * We are now sure that the device will not access the
	 * memory. We can safely unmap it, and mark it as dirty if
	 * needed.
	 */
	ib_umem_odp_unmap_dma_pages(umem, start, end);
}

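/*
 * A worked example of the chunking above (a sketch; it assumes
 * MLX5_UMR_MTT_ALIGNMENT is 64 bytes, so umr_block_mask == 7): with pages
 * present at MTT indices 0..9 and absent from index 10 onwards, a block is
 * opened at index 0. The holes at indices 10..15 do not close it because
 * they do not fall on an 8-entry boundary; at index 16 (umr_offset == 0) the
 * block is flushed with a single mlx5_ib_update_mtt(mr, 0, 16, 1) call.
 * This keeps each UMR aligned while batching many pages per command instead
 * of issuing one UMR per invalidated page.
 */
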
#define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do {	\
	if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name)	\
		ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name;	\
} while (0)

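/*
 * For illustration, a single invocation such as
 *	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps,
 *			       SEND);
 * expands (ignoring the do/while wrapper) to
 *	if (be32_to_cpu(hw_caps.per_transport_caps.ud_odp_caps) &
 *	    MLX5_ODP_SUPPORT_SEND)
 *		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
 * i.e. each hardware capability bit is copied to the IB_ODP_SUPPORT_* bit of
 * the same name.
 */
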
int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
{
	int err;
	struct mlx5_odp_caps hw_caps;
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG))
		return 0;

	err = mlx5_query_odp_caps(dev->mdev, &hw_caps);
	if (err)
		goto out;

	caps->general_caps = IB_ODP_SUPPORT;
	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps,
			       SEND);
	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
			       SEND);
	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
			       RECV);
	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
			       WRITE);
	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
			       READ);

out:
	return err;
}

static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
						   u32 key)
{
	u32 base_key = mlx5_base_mkey(key);
	struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
	struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr);

	if (!mmr || mmr->key != key || !mr->live)
		return NULL;

	return container_of(mmr, struct mlx5_ib_mr, mmr);
}

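/*
 * Note: the lookup above is intended to be safe only while the caller holds
 * the mr_srcu read lock (as pagefault_single_data_segment() below does);
 * together with the mr->live check it keeps the page-fault path from picking
 * up an MR that is concurrently being destroyed.
 */
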
static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
				      struct mlx5_ib_pfault *pfault,
				      int error)
{
	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
	int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn,
					      pfault->mpfault.flags,
					      error);
	if (ret)
		pr_err("Failed to resolve the page fault on QP 0x%x\n",
		       qp->mqp.qpn);
}

/*
 * Handle a single data segment in a page-fault WQE.
 *
 * Returns number of pages retrieved on success. The caller will continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling and possibly move the QP to an error state.
 * On other errors the QP should also be closed with an error.
 */
static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
					 struct mlx5_ib_pfault *pfault,
					 u32 key, u64 io_virt, size_t bcnt,
					 u32 *bytes_mapped)
{
	struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device);
	int srcu_key;
	unsigned int current_seq;
	u64 start_idx;
	int npages = 0, ret = 0;
	struct mlx5_ib_mr *mr;
	u64 access_mask = ODP_READ_ALLOWED_BIT;

	srcu_key = srcu_read_lock(&mib_dev->mr_srcu);
	mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key);
	/*
	 * If we didn't find the MR, it means the MR was closed while we were
	 * handling the ODP event. In this case we return -EFAULT so that the
	 * QP will be closed.
	 */
	if (!mr || !mr->ibmr.pd) {
		pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
		       key);
		ret = -EFAULT;
		goto srcu_unlock;
	}
	if (!mr->umem->odp_data) {
		pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
			 key);
		if (bytes_mapped)
			*bytes_mapped +=
				(bcnt - pfault->mpfault.bytes_committed);
		goto srcu_unlock;
	}
	if (mr->ibmr.pd != qp->ibqp.pd) {
		pr_err("Page-fault with different PDs for QP and MR.\n");
		ret = -EFAULT;
		goto srcu_unlock;
	}

	current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
	/*
	 * Ensure the sequence number is valid for some time before we call
	 * gup.
	 */
	smp_rmb();

	/*
	 * Avoid branches - this code will perform correctly
	 * in all iterations (in iteration 2 and above,
	 * bytes_committed == 0).
	 */
	io_virt += pfault->mpfault.bytes_committed;
	bcnt -= pfault->mpfault.bytes_committed;

	start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT;

	if (mr->umem->writable)
		access_mask |= ODP_WRITE_ALLOWED_BIT;
	npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt,
					   access_mask, current_seq);
	if (npages < 0) {
		ret = npages;
		goto srcu_unlock;
	}

	if (npages > 0) {
		mutex_lock(&mr->umem->odp_data->umem_mutex);
		if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
			/*
			 * No need to check whether the MTTs really belong to
			 * this MR, since ib_umem_odp_map_dma_pages already
			 * checks this.
			 */
			ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
		} else {
			ret = -EAGAIN;
		}
		mutex_unlock(&mr->umem->odp_data->umem_mutex);
		if (ret < 0) {
			if (ret != -EAGAIN)
				pr_err("Failed to update mkey page tables\n");
			goto srcu_unlock;
		}

		if (bytes_mapped) {
			u32 new_mappings = npages * PAGE_SIZE -
				(io_virt - round_down(io_virt, PAGE_SIZE));
			*bytes_mapped += min_t(u32, new_mappings, bcnt);
		}
	}

srcu_unlock:
	if (ret == -EAGAIN) {
		if (!mr->umem->odp_data->dying) {
			struct ib_umem_odp *odp_data = mr->umem->odp_data;
			unsigned long timeout =
				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);

			if (!wait_for_completion_timeout(
					&odp_data->notifier_completion,
					timeout)) {
				pr_warn("timeout waiting for mmu notifier completion\n");
			}
		} else {
			/* The MR is being killed, kill the QP as well. */
			ret = -EFAULT;
		}
	}
	srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
	pfault->mpfault.bytes_committed = 0;
	return ret ? ret : npages;
}

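/*
 * A short note on the ordering above: notifiers_seq is sampled before the
 * pages are mapped, and ib_umem_mmu_notifier_retry() is re-checked under
 * umem_mutex before the MTTs are written. If mlx5_ib_invalidate_range() ran
 * in between, the MTT update is skipped and the function waits (up to
 * MMU_NOTIFIER_TIMEOUT ms) for the notifier to finish before returning
 * -EAGAIN; the caller then resumes the QP so the hardware can retry the
 * access.
 */
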
/*
 * Parse a series of data segments for page fault handling.
 *
 * @qp the QP on which the fault occurred.
 * @pfault contains page fault information.
 * @wqe points at the first data segment in the WQE.
 * @wqe_end points after the end of the WQE.
 * @bytes_mapped receives the number of bytes that the function was able to
 *               map. This allows the caller to decide intelligently whether
 *               enough memory was mapped to resolve the page fault
 *               successfully (e.g. enough for the next MTU, or the entire
 *               WQE).
 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
 *                  the committed bytes).
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_qp *qp,
				   struct mlx5_ib_pfault *pfault, void *wqe,
				   void *wqe_end, u32 *bytes_mapped,
				   u32 *total_wqe_bytes, int receive_queue)
{
	int ret = 0, npages = 0;
	u64 io_virt;
	u32 key;
	u32 byte_count;
	size_t bcnt;
	int inline_segment;

	/* Skip SRQ next-WQE segment. */
	if (receive_queue && qp->ibqp.srq)
		wqe += sizeof(struct mlx5_wqe_srq_next_seg);

	if (bytes_mapped)
		*bytes_mapped = 0;
	if (total_wqe_bytes)
		*total_wqe_bytes = 0;

	while (wqe < wqe_end) {
		struct mlx5_wqe_data_seg *dseg = wqe;

		io_virt = be64_to_cpu(dseg->addr);
		key = be32_to_cpu(dseg->lkey);
		byte_count = be32_to_cpu(dseg->byte_count);
		inline_segment = !!(byte_count & MLX5_INLINE_SEG);
		bcnt = byte_count & ~MLX5_INLINE_SEG;

		if (inline_segment) {
			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
				     16);
		} else {
			wqe += sizeof(*dseg);
		}

		/* receive WQE end of sg list. */
		if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
		    io_virt == 0)
			break;

		if (!inline_segment && total_wqe_bytes) {
			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
					pfault->mpfault.bytes_committed);
		}

		/* A zero length data segment designates a length of 2GB. */
		if (bcnt == 0)
			bcnt = 1U << 31;

		if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) {
			pfault->mpfault.bytes_committed -=
				min_t(size_t, bcnt,
				      pfault->mpfault.bytes_committed);
			continue;
		}

		ret = pagefault_single_data_segment(qp, pfault, key, io_virt,
						    bcnt, bytes_mapped);
		if (ret < 0)
			break;
		npages += ret;
	}

	return ret < 0 ? ret : npages;
}

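/*
 * Note on the parsing loop above: inline segments carry their payload inside
 * the WQE itself, so they only advance the wqe pointer and consume
 * bytes_committed; they never reach pagefault_single_data_segment(). A
 * scatter entry with a byte count of zero is treated as 2GB (1U << 31), per
 * the convention noted in the loop.
 */
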
/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and set wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
	struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
	void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
	u16 wqe_index = pfault->mpfault.wqe.wqe_index;
	unsigned ds, opcode;
#if defined(DEBUG)
	u32 ctrl_wqe_index, ctrl_qpn;
#endif

	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
			    ds, wqe_length);
		return -EFAULT;
	}

	if (ds == 0) {
		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
			    wqe_index, qp->mqp.qpn);
		return -EFAULT;
	}

#if defined(DEBUG)
	ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
			  MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
			 MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
	if (wqe_index != ctrl_wqe_index) {
		mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
			    wqe_index, qp->mqp.qpn,
			    ctrl_wqe_index);
		return -EFAULT;
	}

	ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
		   MLX5_WQE_CTRL_QPN_SHIFT;
	if (qp->mqp.qpn != ctrl_qpn) {
		mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
			    wqe_index, qp->mqp.qpn,
			    ctrl_qpn);
		return -EFAULT;
	}
#endif /* DEBUG */

	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
	*wqe += sizeof(*ctrl);

	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
		 MLX5_WQE_CTRL_OPCODE_MASK;
	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		switch (opcode) {
		case MLX5_OPCODE_SEND:
		case MLX5_OPCODE_SEND_IMM:
		case MLX5_OPCODE_SEND_INVAL:
			if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
			      IB_ODP_SUPPORT_SEND))
				goto invalid_transport_or_opcode;
			break;
		case MLX5_OPCODE_RDMA_WRITE:
		case MLX5_OPCODE_RDMA_WRITE_IMM:
			if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
			      IB_ODP_SUPPORT_WRITE))
				goto invalid_transport_or_opcode;
			*wqe += sizeof(struct mlx5_wqe_raddr_seg);
			break;
		case MLX5_OPCODE_RDMA_READ:
			if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
			      IB_ODP_SUPPORT_READ))
				goto invalid_transport_or_opcode;
			*wqe += sizeof(struct mlx5_wqe_raddr_seg);
			break;
		default:
			goto invalid_transport_or_opcode;
		}
		break;
	case IB_QPT_UD:
		switch (opcode) {
		case MLX5_OPCODE_SEND:
		case MLX5_OPCODE_SEND_IMM:
			if (!(dev->odp_caps.per_transport_caps.ud_odp_caps &
			      IB_ODP_SUPPORT_SEND))
				goto invalid_transport_or_opcode;
			*wqe += sizeof(struct mlx5_wqe_datagram_seg);
			break;
		default:
			goto invalid_transport_or_opcode;
		}
		break;
	default:
invalid_transport_or_opcode:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n",
			    qp->ibqp.qp_type, opcode);
		return -EFAULT;
	}

	return 0;
}

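/*
 * The length validation above relies on the DS field of the control segment:
 * the WQE occupies ds units of MLX5_WQE_DS_UNITS bytes each, so for example a
 * WQE with ds == 4 spans 4 * MLX5_WQE_DS_UNITS bytes and must fit entirely
 * within the wqe_length bytes that were copied from the user's send queue.
 */
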
/*
 * Parse responder WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and set wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler(
	struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
	void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
	struct mlx5_ib_wq *wq = &qp->rq;
	int wqe_size = 1 << wq->wqe_shift;

	if (qp->ibqp.srq) {
		mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
		return -EFAULT;
	}

	if (qp->wq_sig) {
		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
		return -EFAULT;
	}

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
		      IB_ODP_SUPPORT_RECV))
			goto invalid_transport_or_opcode;
		break;
	default:
invalid_transport_or_opcode:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
			    qp->ibqp.qp_type);
		return -EFAULT;
	}

	*wqe_end = *wqe + wqe_size;

	return 0;
}

static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
					  struct mlx5_ib_pfault *pfault)
{
	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
	int ret;
	void *wqe, *wqe_end;
	u32 bytes_mapped, total_wqe_bytes;
	char *buffer = NULL;
	int resume_with_error = 0;
	u16 wqe_index = pfault->mpfault.wqe.wqe_index;
	int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR;

	buffer = (char *)__get_free_page(GFP_KERNEL);
	if (!buffer) {
		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
		resume_with_error = 1;
		goto resolve_page_fault;
	}

	ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
				    PAGE_SIZE);
	if (ret < 0) {
		mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n",
			    -ret, wqe_index, qp->mqp.qpn);
		resume_with_error = 1;
		goto resolve_page_fault;
	}

	wqe = buffer;
	if (requestor)
		ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe,
							  &wqe_end, ret);
	else
		ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe,
							  &wqe_end, ret);
	if (ret < 0) {
		resume_with_error = 1;
		goto resolve_page_fault;
	}

	if (wqe >= wqe_end) {
		mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
		resume_with_error = 1;
		goto resolve_page_fault;
	}

	ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped,
				      &total_wqe_bytes, !requestor);
	if (ret == -EAGAIN) {
		goto resolve_page_fault;
	} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
		mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n",
			    -ret);
		resume_with_error = 1;
		goto resolve_page_fault;
	}

resolve_page_fault:
	mlx5_ib_page_fault_resume(qp, pfault, resume_with_error);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n",
		    qp->mqp.qpn, resume_with_error, pfault->mpfault.flags);

	free_page((unsigned long)buffer);
}

static int pages_in_range(u64 address, u32 length)
{
	return (ALIGN(address + length, PAGE_SIZE) -
		(address & PAGE_MASK)) >> PAGE_SHIFT;
}

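/*
 * For example, with 4KB pages, pages_in_range(0x1ff0, 0x20) covers the byte
 * range [0x1ff0, 0x2010), which straddles a page boundary, so the function
 * returns 2 even though only 32 bytes are referenced.
 */
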
static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp,
					   struct mlx5_ib_pfault *pfault)
{
	struct mlx5_pagefault *mpfault = &pfault->mpfault;
	u64 address;
	u32 length;
	u32 prefetch_len = mpfault->bytes_committed;
	int prefetch_activated = 0;
	u32 rkey = mpfault->rdma.r_key;
	int ret;

	/* The RDMA responder handler handles the page fault in two parts.
	 * First it brings the necessary pages for the current packet
	 * (and uses the pfault context), and then (after resuming the QP)
	 * prefetches more pages. The second operation cannot use the pfault
	 * context and therefore uses the dummy_pfault context allocated on
	 * the stack. */
	struct mlx5_ib_pfault dummy_pfault = {};

	dummy_pfault.mpfault.bytes_committed = 0;

	mpfault->rdma.rdma_va += mpfault->bytes_committed;
	mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed,
					 mpfault->rdma.rdma_op_len);
	mpfault->bytes_committed = 0;

	address = mpfault->rdma.rdma_va;
	length  = mpfault->rdma.rdma_op_len;

	/* For some operations, the hardware cannot tell the exact message
	 * length, and in those cases it reports zero. Use prefetch
	 * logic. */
	if (length == 0) {
		prefetch_activated = 1;
		length = mpfault->rdma.packet_size;
		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
	}

	ret = pagefault_single_data_segment(qp, pfault, rkey, address, length,
					    NULL);
	if (ret == -EAGAIN) {
		/* We're racing with an invalidation, don't prefetch */
		prefetch_activated = 0;
	} else if (ret < 0 || pages_in_range(address, length) > ret) {
		mlx5_ib_page_fault_resume(qp, pfault, 1);
		return;
	}

	mlx5_ib_page_fault_resume(qp, pfault, 0);

	/* At this point, there might be a new pagefault already arriving in
	 * the eq, switch to the dummy pagefault for the rest of the
	 * processing. We're still OK with the objects being alive as the
	 * work-queue is being fenced. */
	if (prefetch_activated) {
		ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey,
						    address,
						    prefetch_len,
						    NULL);
		if (ret < 0) {
			pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n",
				ret, prefetch_activated,
				qp->ibqp.qp_num, address, prefetch_len);
		}
	}
}

void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
			       struct mlx5_ib_pfault *pfault)
{
	u8 event_subtype = pfault->mpfault.event_subtype;

	switch (event_subtype) {
	case MLX5_PFAULT_SUBTYPE_WQE:
		mlx5_ib_mr_wqe_pfault_handler(qp, pfault);
		break;
	case MLX5_PFAULT_SUBTYPE_RDMA:
		mlx5_ib_mr_rdma_pfault_handler(qp, pfault);
		break;
	default:
		pr_warn("Invalid page fault event subtype: 0x%x\n",
			event_subtype);
		mlx5_ib_page_fault_resume(qp, pfault, 1);
		break;
	}
}

static void mlx5_ib_qp_pfault_action(struct work_struct *work)
{
	struct mlx5_ib_pfault *pfault = container_of(work,
						     struct mlx5_ib_pfault,
						     work);
	enum mlx5_ib_pagefault_context context =
		mlx5_ib_get_pagefault_context(&pfault->mpfault);
	struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp,
					     pagefaults[context]);
	mlx5_ib_mr_pfault_handler(qp, pfault);
}

void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp)
{
	unsigned long flags;

	spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
	qp->disable_page_faults = 1;
	spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);

	/*
	 * Note that at this point, we are guaranteed that no more
	 * work queue elements will be posted to the work queue with
	 * the QP we are closing.
	 */
	flush_workqueue(mlx5_ib_page_fault_wq);
}

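/*
 * The flag/flush pairing above works with mlx5_ib_pfault_handler(): the
 * handler queues new work only while disable_page_faults is clear, and both
 * the flag update and the check take disable_page_faults_lock, so once the
 * flag is set and the workqueue is flushed no page-fault work can still be
 * running or pending for this QP.
 */
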
void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)
{
	unsigned long flags;

	spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
	qp->disable_page_faults = 0;
	spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
}

static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp,
				   struct mlx5_pagefault *pfault)
{
	/*
	 * Note that we will only get one fault event per QP per context
	 * (responder/initiator, read/write), until we resolve the page fault
	 * with the mlx5_ib_page_fault_resume command. Since this function is
	 * called from within the work element, there is no risk of missing
	 * events.
	 */
	struct mlx5_ib_qp *mibqp = to_mibqp(qp);
	enum mlx5_ib_pagefault_context context =
		mlx5_ib_get_pagefault_context(pfault);
	struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context];

	qp_pfault->mpfault = *pfault;

	/* No need to stop interrupts here since we are in an interrupt */
	spin_lock(&mibqp->disable_page_faults_lock);
	if (!mibqp->disable_page_faults)
		queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work);
	spin_unlock(&mibqp->disable_page_faults_lock);
}

void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
{
	int i;

	qp->disable_page_faults = 1;
	spin_lock_init(&qp->disable_page_faults_lock);

	qp->mqp.pfault_handler = mlx5_ib_pfault_handler;

	for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
		INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
}

int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
{
	int ret;

	ret = init_srcu_struct(&ibdev->mr_srcu);
	if (ret)
		return ret;

	return 0;
}

void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
{
	cleanup_srcu_struct(&ibdev->mr_srcu);
}

int __init mlx5_ib_odp_init(void)
{
	mlx5_ib_page_fault_wq =
		create_singlethread_workqueue("mlx5_ib_page_faults");
	if (!mlx5_ib_page_fault_wq)
		return -ENOMEM;

	return 0;
}

void mlx5_ib_odp_cleanup(void)
{
	destroy_workqueue(mlx5_ib_page_fault_wq);
}