/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#define DRIVER_VERSION  "0.1"
#define DRIVER_AUTHOR   "aik@ozlabs.ru"
#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);
static long try_increment_locked_vm(long npages)
{
	long ret = 0, locked, lock_limit;

	if (!current || !current->mm)
		return -ESRCH; /* process exited */

	if (!npages)
		return 0;

	down_write(&current->mm->mmap_sem);
	locked = current->mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM;
	else
		current->mm->locked_vm += npages;

	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
			npages << PAGE_SHIFT,
			current->mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

	up_write(&current->mm->mmap_sem);

	return ret;
}
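/*
 * Illustrative arithmetic for the check above (numbers are examples, not
 * taken from this driver): with 64K pages (PAGE_SHIFT == 16) and
 * RLIMIT_MEMLOCK == 64MB, lock_limit is 64MB >> 16 == 1024 pages, so
 * locking a 2GB window (32768 pages) fails with -ENOMEM unless the caller
 * has CAP_IPC_LOCK.
 */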
static void decrement_locked_vm(long npages)
{
	if (!current || !current->mm || !npages)
		return; /* process exited */

	down_write(&current->mm->mmap_sem);
	if (WARN_ON_ONCE(npages > current->mm->locked_vm))
		npages = current->mm->locked_vm;
	current->mm->locked_vm -= npages;
	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
			npages << PAGE_SHIFT,
			current->mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));
	up_write(&current->mm->mmap_sem);
}
/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	struct iommu_group *grp;
	bool enabled;
	unsigned long locked_pages;
};
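/*
 * For orientation, a hedged sketch (not part of this driver) of how
 * userspace is expected to drive an SPAPR TCE container; the group number
 * and error handling are illustrative:
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);	(hypothetical group)
 *	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *	ioctl(container, VFIO_IOMMU_ENABLE);
 *
 * after which VFIO_IOMMU_MAP_DMA / VFIO_IOMMU_UNMAP_DMA operate on the
 * 32-bit window reported by the GET_INFO call.
 */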
static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}
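/*
 * Worked example (illustrative numbers): with 4K system pages
 * (PAGE_SHIFT == 12), a plain order-0 page gives 12 + 0 == 12, so it may
 * not back a 64K TCE (page_shift == 16); a 16M hugepage (compound
 * order 12) gives 12 + 12 == 24 >= 16 and is accepted.
 */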
static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;
	struct iommu_table_group *table_group;

	table_group = iommu_group_get_iommudata(container->grp);
	if (!table_group)
		return -1;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}
static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;

	if (!current->mm)
		return -ESRCH; /* process exited */

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap.  For powerpc, the map unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult to impossible
	 * to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled.  The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
	 * that would effectively kill the guest at random points, much better
	 * enforcing the limit based on the max that the guest can map.
	 *
	 * Unfortunately at the moment it counts whole tables, no matter how
	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
	 * each with 2GB DMA window, 8GB will be counted here. The reason for
	 * this is that we cannot tell here the amount of RAM used by the guest
	 * as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (!container->grp)
		return -ENODEV;

	table_group = iommu_group_get_iommudata(container->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = try_increment_locked_vm(locked);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}
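/*
 * Concretely (illustrative numbers): for a group exposing a 2GB 32-bit DMA
 * window on a kernel with 64K pages, enabling the container accounts
 * 2GB >> 16 == 32768 pages against locked_vm up front, however much of the
 * window the guest ends up mapping.
 */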
static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	if (!current->mm)
		return;

	decrement_locked_vm(container->locked_pages);
}
static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if (arg != VFIO_SPAPR_TCE_IOMMU) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);

	return container;
}
static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;

	WARN_ON(container->grp);

	if (container->grp)
		tce_iommu_detach_group(iommu_data, container->grp);

	tce_iommu_disable(container);
	mutex_destroy(&container->lock);

	kfree(container);
}
static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	put_page(page);
}
static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;

	for ( ; pages; --pages, ++entry) {
		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		tce_iommu_unuse_page(container, oldhpa);
	}

	return 0;
}
static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (get_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE, &page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}
static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}
static long tce_iommu_create_table(struct tce_container *container,
			struct iommu_table_group *table_group,
			int num,
			__u32 page_shift,
			__u64 window_size,
			__u32 levels,
			struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = try_increment_locked_vm(table_size >> PAGE_SHIFT);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));

	if (ret)
		decrement_locked_vm(table_size >> PAGE_SHIFT);

	return ret;
}
static void tce_iommu_free_table(struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	tbl->it_ops->free(tbl);
	decrement_locked_vm(pages);
}
static long tce_iommu_ioctl(void *iommu_data,
				 unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;

	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct iommu_table_group *table_group;

		if (WARN_ON(!container->grp))
			return -ENXIO;

		table_group = iommu_group_get_iommudata(container->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		ret = tce_iommu_build(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.vaddr,
				param.size >> tbl->it_page_shift,
				direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP:
		if (!container->grp)
			return -ENODEV;

		return vfio_spapr_iommu_eeh_ioctl(container->grp,
						  cmd, arg);
	}

	return -ENOTTY;
}
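/*
 * A hedged userspace sketch of the map path above (the buffer, size and
 * IOVA are illustrative; vaddr, iova and size must be aligned to the IOMMU
 * page size, as checked in VFIO_IOMMU_MAP_DMA):
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(unsigned long)buf,
 *		.iova = info.dma32_window_start,
 *		.size = 1 << 20,
 *	};
 *
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */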
static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		if (tbl->it_map)
			iommu_release_ownership(tbl);
	}
}
static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = iommu_take_ownership(tbl);
		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	return 0;
}
static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		/* Store table pointer as unset_window resets it */
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl)
			continue;

		table_group->ops->unset_window(table_group, i);
		tce_iommu_clear(container, tbl,
				tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(tbl);
	}

	table_group->ops->release_ownership(table_group);
}
static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long ret;
	struct iommu_table *tbl = NULL;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	ret = tce_iommu_create_table(container,
			table_group,
			0, /* window number */
			IOMMU_PAGE_SHIFT_4K,
			table_group->tce32_size,
			1, /* default levels */
			&tbl);
	if (ret)
		goto release_exit;

	ret = table_group->ops->set_window(table_group, 0, tbl);
	if (ret) {
		tce_iommu_free_table(tbl);
		goto release_exit;
	}

	table_group->tables[0] = tbl;

	return 0;

release_exit:
	table_group->ops->release_ownership(table_group);

	return ret;
}
static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	if (container->grp) {
		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
				iommu_group_id(container->grp),
				iommu_group_id(iommu_group));
		ret = -EBUSY;
		goto unlock_exit;
	}

	if (container->enabled) {
		pr_err("tce_vfio: attaching group #%u to enabled container\n",
				iommu_group_id(iommu_group));
		ret = -EBUSY;
		goto unlock_exit;
	}

	table_group = iommu_group_get_iommudata(iommu_group);
	if (!table_group) {
		ret = -ENXIO;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)
		ret = tce_iommu_take_ownership(container, table_group);
	else
		ret = tce_iommu_take_ownership_ddw(container, table_group);

	if (!ret)
		container->grp = iommu_group;

unlock_exit:
	mutex_unlock(&container->lock);

	return ret;
}
static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;

	mutex_lock(&container->lock);
	if (iommu_group != container->grp) {
		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
				iommu_group_id(iommu_group),
				iommu_group_id(container->grp));
		goto unlock_exit;
	}

	if (container->enabled) {
		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
				iommu_group_id(container->grp));
		tce_iommu_disable(container);
	}

	/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
	   iommu_group_id(iommu_group), iommu_group); */
	container->grp = NULL;

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}
const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};
static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}
static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}
module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);