/*
 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
 * Author: Joerg Roedel <jroedel@suse.de>
 *         Leo Duran <leo.duran@amd.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include <linux/ratelimit.h>
#include <linux/pci.h>
#include <linux/pci-ats.h>
#include <linux/bitmap.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>
#include <linux/iommu-helper.h>
#include <linux/iommu.h>
#include <linux/delay.h>
#include <linux/amd-iommu.h>
#include <linux/notifier.h>
#include <linux/export.h>
#include <linux/irq.h>
#include <linux/msi.h>
#include <linux/dma-contiguous.h>
#include <asm/irq_remapping.h>
#include <asm/io_apic.h>
#include <asm/hw_irq.h>
#include <asm/msidef.h>
#include <asm/proto.h>
#include <asm/iommu.h>

#include "amd_iommu_proto.h"
#include "amd_iommu_types.h"
#include "irq_remapping.h"
#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))

#define LOOP_TIMEOUT	100000
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * 512GB pages are not supported due to a hardware bug.
 */
#define AMD_IOMMU_PGSIZES	((~0xFFFUL) & ~(2ULL << 38))
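/*
 * Spelled out: ~0xFFFUL sets every page-size bit from 4KB (bit 12)
 * upwards, and ~(2ULL << 38) clears bit 39, i.e. the 512GB page size
 * mentioned above, so the IOMMU core will never ask us to map a 512GB
 * page as a single PTE.
 */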
static DEFINE_RWLOCK(amd_iommu_devtable_lock);

/* List of all available dev_data structures */
static LIST_HEAD(dev_data_list);
static DEFINE_SPINLOCK(dev_data_list_lock);

LIST_HEAD(ioapic_map);

/*
 * Domain for untranslated devices - only allocated
 * if iommu=pt passed on kernel cmd line.
 */
static struct protection_domain *pt_domain;

static const struct iommu_ops amd_iommu_ops;

static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
int amd_iommu_max_glx_val = -1;

static struct dma_map_ops amd_iommu_dma_ops;
/*
 * This struct contains device specific data for the IOMMU
 */
struct iommu_dev_data {
	struct list_head list;		  /* For domain->dev_list */
	struct list_head dev_data_list;	  /* For global dev_data_list */
	struct list_head alias_list;      /* Link alias-groups together */
	struct iommu_dev_data *alias_data;/* The alias dev_data */
	struct protection_domain *domain; /* Domain the device is bound to */
	u16 devid;			  /* PCI Device ID */
	bool iommu_v2;			  /* Device can make use of IOMMUv2 */
	bool passthrough;		  /* Default for device is pt_domain */
	struct {
		bool enabled;
		int qdep;
	} ats;				  /* ATS state */
	bool pri_tlp;			  /* PASID TLB required for
					     PPR completions */
	u32 errata;			  /* Bitmap for errata to apply */
};
/*
 * general struct to manage commands sent to an IOMMU
 */
struct iommu_cmd {
	u32 data[4];
};

struct kmem_cache *amd_iommu_irq_cache;

static void update_domain(struct protection_domain *domain);
static int alloc_passthrough_domain(void);
/****************************************************************************
 *
 * Helper functions
 *
 ****************************************************************************/
static struct protection_domain *to_pdomain(struct iommu_domain *dom)
{
	return container_of(dom, struct protection_domain, domain);
}
static struct iommu_dev_data *alloc_dev_data(u16 devid)
{
	struct iommu_dev_data *dev_data;
	unsigned long flags;

	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
	if (!dev_data)
		return NULL;

	INIT_LIST_HEAD(&dev_data->alias_list);

	dev_data->devid = devid;

	spin_lock_irqsave(&dev_data_list_lock, flags);
	list_add_tail(&dev_data->dev_data_list, &dev_data_list);
	spin_unlock_irqrestore(&dev_data_list_lock, flags);

	return dev_data;
}
static void free_dev_data(struct iommu_dev_data *dev_data)
{
	unsigned long flags;

	spin_lock_irqsave(&dev_data_list_lock, flags);
	list_del(&dev_data->dev_data_list);
	spin_unlock_irqrestore(&dev_data_list_lock, flags);

	kfree(dev_data);
}
static struct iommu_dev_data *search_dev_data(u16 devid)
{
	struct iommu_dev_data *dev_data;
	unsigned long flags;

	spin_lock_irqsave(&dev_data_list_lock, flags);
	list_for_each_entry(dev_data, &dev_data_list, dev_data_list) {
		if (dev_data->devid == devid)
			goto out_unlock;
	}

	dev_data = NULL;

out_unlock:
	spin_unlock_irqrestore(&dev_data_list_lock, flags);

	return dev_data;
}
static struct iommu_dev_data *find_dev_data(u16 devid)
{
	struct iommu_dev_data *dev_data;

	dev_data = search_dev_data(devid);

	if (dev_data == NULL)
		dev_data = alloc_dev_data(devid);

	return dev_data;
}
static inline u16 get_device_id(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);

	return PCI_DEVID(pdev->bus->number, pdev->devfn);
}
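/*
 * Note: PCI_DEVID() packs the 16-bit requester ID used to index the
 * IOMMU tables: bus number in bits 15:8, devfn (slot/function) in bits
 * 7:0.  For example, device 0000:01:00.2 yields devid 0x0102.
 */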
static struct iommu_dev_data *get_dev_data(struct device *dev)
{
	return dev->archdata.iommu;
}
static bool pci_iommuv2_capable(struct pci_dev *pdev)
{
	static const int caps[] = {
		PCI_EXT_CAP_ID_ATS,
		PCI_EXT_CAP_ID_PRI,
		PCI_EXT_CAP_ID_PASID,
	};
	int i, pos;

	for (i = 0; i < 3; ++i) {
		pos = pci_find_ext_capability(pdev, caps[i]);
		if (pos == 0)
			return false;
	}

	return true;
}
static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum)
{
	struct iommu_dev_data *dev_data;

	dev_data = get_dev_data(&pdev->dev);

	return dev_data->errata & (1 << erratum) ? true : false;
}
/*
 * This function actually applies the mapping to the page table of the
 * dma_ops domain.
 */
static void alloc_unity_mapping(struct dma_ops_domain *dma_dom,
				struct unity_map_entry *e)
{
	u64 addr;

	for (addr = e->address_start; addr < e->address_end;
	     addr += PAGE_SIZE) {
		if (addr < dma_dom->aperture_size)
			__set_bit(addr >> PAGE_SHIFT,
				  dma_dom->aperture[0]->bitmap);
	}
}
/*
 * Inits the unity mappings required for a specific device
 */
static void init_unity_mappings_for_device(struct device *dev,
					   struct dma_ops_domain *dma_dom)
{
	struct unity_map_entry *e;
	u16 devid;

	devid = get_device_id(dev);

	list_for_each_entry(e, &amd_iommu_unity_map, list) {
		if (!(devid >= e->devid_start && devid <= e->devid_end))
			continue;
		alloc_unity_mapping(dma_dom, e);
	}
}
/*
 * This function checks if the driver got a valid device from the caller to
 * avoid dereferencing invalid pointers.
 */
static bool check_device(struct device *dev)
{
	u16 devid;

	if (!dev || !dev->dma_mask)
		return false;

	if (!dev_is_pci(dev))
		return false;

	devid = get_device_id(dev);

	/* Out of our scope? */
	if (devid > amd_iommu_last_bdf)
		return false;

	if (amd_iommu_rlookup_table[devid] == NULL)
		return false;

	return true;
}
static void init_iommu_group(struct device *dev)
{
	struct dma_ops_domain *dma_domain;
	struct iommu_domain *domain;
	struct iommu_group *group;

	group = iommu_group_get_for_dev(dev);
	if (IS_ERR(group))
		return;

	domain = iommu_group_default_domain(group);
	if (!domain)
		goto out;

	dma_domain = to_pdomain(domain)->priv;

	init_unity_mappings_for_device(dev, dma_domain);
out:
	iommu_group_put(group);
}
static int __last_alias(struct pci_dev *pdev, u16 alias, void *data)
{
	*(u16 *)data = alias;
	return 0;
}
static u16 get_alias(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	u16 devid, ivrs_alias, pci_alias;

	devid = get_device_id(dev);
	ivrs_alias = amd_iommu_alias_table[devid];
	pci_for_each_dma_alias(pdev, __last_alias, &pci_alias);

	if (ivrs_alias == pci_alias)
		return ivrs_alias;

	/*
	 * The IVRS is fairly reliable in telling us about aliases, but it
	 * can't know about every screwy device.  If we don't have an IVRS
	 * reported alias, use the PCI reported alias.  In that case we may
	 * still need to initialize the rlookup and dev_table entries if the
	 * alias is to a non-existent device.
	 */
	if (ivrs_alias == devid) {
		if (!amd_iommu_rlookup_table[pci_alias]) {
			amd_iommu_rlookup_table[pci_alias] =
				amd_iommu_rlookup_table[devid];
			memcpy(amd_iommu_dev_table[pci_alias].data,
			       amd_iommu_dev_table[devid].data,
			       sizeof(amd_iommu_dev_table[pci_alias].data));
		}

		return pci_alias;
	}

	pr_info("AMD-Vi: Using IVRS reported alias %02x:%02x.%d "
		"for device %s[%04x:%04x], kernel reported alias "
		"%02x:%02x.%d\n", PCI_BUS_NUM(ivrs_alias), PCI_SLOT(ivrs_alias),
		PCI_FUNC(ivrs_alias), dev_name(dev), pdev->vendor, pdev->device,
		PCI_BUS_NUM(pci_alias), PCI_SLOT(pci_alias),
		PCI_FUNC(pci_alias));

	/*
	 * If we don't have a PCI DMA alias and the IVRS alias is on the same
	 * bus, then the IVRS table may know about a quirk that we don't.
	 */
	if (pci_alias == devid &&
	    PCI_BUS_NUM(ivrs_alias) == pdev->bus->number) {
		pdev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN;
		pdev->dma_alias_devfn = ivrs_alias & 0xff;
		pr_info("AMD-Vi: Added PCI DMA alias %02x.%d for %s\n",
			PCI_SLOT(ivrs_alias), PCI_FUNC(ivrs_alias),
			dev_name(dev));
	}

	return ivrs_alias;
}
static int iommu_init_device(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct iommu_dev_data *dev_data;
	u16 alias;

	if (dev->archdata.iommu)
		return 0;

	dev_data = find_dev_data(get_device_id(dev));
	if (!dev_data)
		return -ENOMEM;

	alias = get_alias(dev);

	if (alias != dev_data->devid) {
		struct iommu_dev_data *alias_data;

		alias_data = find_dev_data(alias);
		if (alias_data == NULL) {
			pr_err("AMD-Vi: Warning: Unhandled device %s\n",
					dev_name(dev));
			free_dev_data(dev_data);
			return -ENOTSUPP;
		}
		dev_data->alias_data = alias_data;

		/* Add device to the alias_list */
		list_add(&dev_data->alias_list, &alias_data->alias_list);
	}

	if (pci_iommuv2_capable(pdev)) {
		struct amd_iommu *iommu;

		iommu              = amd_iommu_rlookup_table[dev_data->devid];
		dev_data->iommu_v2 = iommu->is_iommu_v2;
	}

	dev->archdata.iommu = dev_data;

	iommu_device_link(amd_iommu_rlookup_table[dev_data->devid]->iommu_dev,
			  dev);

	return 0;
}
static void iommu_ignore_device(struct device *dev)
{
	u16 devid, alias;

	devid = get_device_id(dev);
	alias = amd_iommu_alias_table[devid];

	memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
	memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));

	amd_iommu_rlookup_table[devid] = NULL;
	amd_iommu_rlookup_table[alias] = NULL;
}
static void iommu_uninit_device(struct device *dev)
{
	struct iommu_dev_data *dev_data = search_dev_data(get_device_id(dev));

	if (!dev_data)
		return;

	iommu_device_unlink(amd_iommu_rlookup_table[dev_data->devid]->iommu_dev,
			    dev);

	iommu_group_remove_device(dev);

	/* Unlink from alias, it may change if another device is re-plugged */
	dev_data->alias_data = NULL;

	dev->archdata.dma_ops = NULL;

	/*
	 * We keep dev_data around for unplugged devices and reuse it when the
	 * device is re-plugged - not doing so would introduce a ton of races.
	 */
}
#ifdef CONFIG_AMD_IOMMU_STATS

/*
 * Initialization code for statistics collection
 */

DECLARE_STATS_COUNTER(compl_wait);
DECLARE_STATS_COUNTER(cnt_map_single);
DECLARE_STATS_COUNTER(cnt_unmap_single);
DECLARE_STATS_COUNTER(cnt_map_sg);
DECLARE_STATS_COUNTER(cnt_unmap_sg);
DECLARE_STATS_COUNTER(cnt_alloc_coherent);
DECLARE_STATS_COUNTER(cnt_free_coherent);
DECLARE_STATS_COUNTER(cross_page);
DECLARE_STATS_COUNTER(domain_flush_single);
DECLARE_STATS_COUNTER(domain_flush_all);
DECLARE_STATS_COUNTER(alloced_io_mem);
DECLARE_STATS_COUNTER(total_map_requests);
DECLARE_STATS_COUNTER(complete_ppr);
DECLARE_STATS_COUNTER(invalidate_iotlb);
DECLARE_STATS_COUNTER(invalidate_iotlb_all);
DECLARE_STATS_COUNTER(pri_requests);

static struct dentry *stats_dir;
static struct dentry *de_fflush;

static void amd_iommu_stats_add(struct __iommu_counter *cnt)
{
	if (stats_dir == NULL)
		return;

	cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
				       &cnt->value);
}

static void amd_iommu_stats_init(void)
{
	stats_dir = debugfs_create_dir("amd-iommu", NULL);
	if (stats_dir == NULL)
		return;

	de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
					&amd_iommu_unmap_flush);

	amd_iommu_stats_add(&compl_wait);
	amd_iommu_stats_add(&cnt_map_single);
	amd_iommu_stats_add(&cnt_unmap_single);
	amd_iommu_stats_add(&cnt_map_sg);
	amd_iommu_stats_add(&cnt_unmap_sg);
	amd_iommu_stats_add(&cnt_alloc_coherent);
	amd_iommu_stats_add(&cnt_free_coherent);
	amd_iommu_stats_add(&cross_page);
	amd_iommu_stats_add(&domain_flush_single);
	amd_iommu_stats_add(&domain_flush_all);
	amd_iommu_stats_add(&alloced_io_mem);
	amd_iommu_stats_add(&total_map_requests);
	amd_iommu_stats_add(&complete_ppr);
	amd_iommu_stats_add(&invalidate_iotlb);
	amd_iommu_stats_add(&invalidate_iotlb_all);
	amd_iommu_stats_add(&pri_requests);
}

#endif
/****************************************************************************
 *
 * Interrupt handling functions
 *
 ****************************************************************************/
static void dump_dte_entry(u16 devid)
{
	int i;

	for (i = 0; i < 4; ++i)
		pr_err("AMD-Vi: DTE[%d]: %016llx\n", i,
			amd_iommu_dev_table[devid].data[i]);
}

static void dump_command(unsigned long phys_addr)
{
	struct iommu_cmd *cmd = phys_to_virt(phys_addr);
	int i;

	for (i = 0; i < 4; ++i)
		pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
}
static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
{
	int type, devid, domid, flags;
	volatile u32 *event = __evt;
	int count = 0;
	u64 address;

retry:
	type    = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
	domid   = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
	address = (u64)(((u64)event[3]) << 32) | event[2];

	if (type == 0) {
		/* Did we hit the erratum? */
		if (++count == LOOP_TIMEOUT) {
			pr_err("AMD-Vi: No event written to event log\n");
			return;
		}

		udelay(1);
		goto retry;
	}

	printk(KERN_ERR "AMD-Vi: Event logged [");

	switch (type) {
	case EVENT_TYPE_ILL_DEV:
		printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
		       "address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address, flags);
		dump_dte_entry(devid);
		break;
	case EVENT_TYPE_IO_FAULT:
		printk("IO_PAGE_FAULT device=%02x:%02x.%x "
		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       domid, address, flags);
		break;
	case EVENT_TYPE_DEV_TAB_ERR:
		printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
		       "address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address, flags);
		break;
	case EVENT_TYPE_PAGE_TAB_ERR:
		printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       domid, address, flags);
		break;
	case EVENT_TYPE_ILL_CMD:
		printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
		dump_command(address);
		break;
	case EVENT_TYPE_CMD_HARD_ERR:
		printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
		       "flags=0x%04x]\n", address, flags);
		break;
	case EVENT_TYPE_IOTLB_INV_TO:
		printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
		       "address=0x%016llx]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address);
		break;
	case EVENT_TYPE_INV_DEV_REQ:
		printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
		       "address=0x%016llx flags=0x%04x]\n",
		       PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
		       address, flags);
		break;
	default:
		printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
	}

	memset(__evt, 0, 4 * sizeof(u32));
}
static void iommu_poll_events(struct amd_iommu *iommu)
{
	u32 head, tail;

	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);

	while (head != tail) {
		iommu_print_event(iommu, iommu->evt_buf + head);
		head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
	}

	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
}
static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
{
	struct amd_iommu_fault fault;

	INC_STATS_COUNTER(pri_requests);

	if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) {
		pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n");
		return;
	}

	fault.address   = raw[1];
	fault.pasid     = PPR_PASID(raw[0]);
	fault.device_id = PPR_DEVID(raw[0]);
	fault.tag       = PPR_TAG(raw[0]);
	fault.flags     = PPR_FLAGS(raw[0]);

	atomic_notifier_call_chain(&ppr_notifier, 0, &fault);
}
static void iommu_poll_ppr_log(struct amd_iommu *iommu)
{
	u32 head, tail;

	if (iommu->ppr_log == NULL)
		return;

	head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
	tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);

	while (head != tail) {
		volatile u64 *raw;
		u64 entry[2];
		int i;

		raw = (u64 *)(iommu->ppr_log + head);

		/*
		 * Hardware bug: Interrupt may arrive before the entry is
		 * written to memory. If this happens we need to wait for the
		 * entry to arrive.
		 */
		for (i = 0; i < LOOP_TIMEOUT; ++i) {
			if (PPR_REQ_TYPE(raw[0]) != 0)
				break;
			udelay(1);
		}

		/* Avoid memcpy function-call overhead */
		entry[0] = raw[0];
		entry[1] = raw[1];

		/*
		 * To detect the hardware bug we need to clear the entry
		 * back to zero.
		 */
		raw[0] = raw[1] = 0UL;

		/* Update head pointer of hardware ring-buffer */
		head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
		writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);

		/* Handle PPR entry */
		iommu_handle_ppr_entry(iommu, entry);

		/* Refresh ring-buffer information */
		head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
		tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
	}
}
irqreturn_t amd_iommu_int_thread(int irq, void *data)
{
	struct amd_iommu *iommu = (struct amd_iommu *) data;
	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);

	while (status & (MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK)) {
		/* Enable EVT and PPR interrupts again */
		writel((MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK),
			iommu->mmio_base + MMIO_STATUS_OFFSET);

		if (status & MMIO_STATUS_EVT_INT_MASK) {
			pr_devel("AMD-Vi: Processing IOMMU Event Log\n");
			iommu_poll_events(iommu);
		}

		if (status & MMIO_STATUS_PPR_INT_MASK) {
			pr_devel("AMD-Vi: Processing IOMMU PPR Log\n");
			iommu_poll_ppr_log(iommu);
		}

		/*
		 * Hardware bug: ERBT1312
		 * When re-enabling an interrupt (by writing 1
		 * to clear the bit), the hardware might also try to set
		 * the interrupt bit in the event status register.
		 * In this scenario, the bit will be set, and disable
		 * subsequent interrupts.
		 *
		 * Workaround: The IOMMU driver should read back the
		 * status register and check if the interrupt bits are cleared.
		 * If not, the driver will need to go through the interrupt
		 * handler again and re-clear the bits.
		 */
		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
	}

	return IRQ_HANDLED;
}
irqreturn_t amd_iommu_int_handler(int irq, void *data)
{
	return IRQ_WAKE_THREAD;
}
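/*
 * The hard-irq handler above only wakes the IRQ thread; all of the actual
 * event-log and PPR-log processing happens in amd_iommu_int_thread(),
 * which runs in sleepable context.
 */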
/****************************************************************************
 *
 * IOMMU command queuing functions
 *
 ****************************************************************************/
static int wait_on_sem(volatile u64 *sem)
{
	int i = 0;

	while (*sem == 0 && i < LOOP_TIMEOUT) {
		udelay(1);
		i += 1;
	}

	if (i == LOOP_TIMEOUT) {
		pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
		return -EIO;
	}

	return 0;
}
static void copy_cmd_to_buffer(struct amd_iommu *iommu,
			       struct iommu_cmd *cmd,
			       u32 tail)
{
	u8 *target;

	target = iommu->cmd_buf + tail;
	tail   = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;

	/* Copy command to buffer */
	memcpy(target, cmd, sizeof(*cmd));

	/* Tell the IOMMU about it */
	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
}
static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
{
	WARN_ON(address & 0x7ULL);

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
	cmd->data[1] = upper_32_bits(__pa(address));
	cmd->data[2] = 1;
	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
}
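/*
 * How the completion-wait handshake works: the command built above carries
 * the physical address of a u64 semaphore on the caller's stack plus the
 * "store" flag.  Once the IOMMU has executed every command queued before
 * it, it writes a non-zero value to that address, and wait_on_sem() simply
 * polls the semaphore (for up to LOOP_TIMEOUT iterations) until the store
 * becomes visible.
 */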
static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = devid;
	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
}
static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
				  size_t size, u16 domid, int pde)
{
	u64 pages;
	bool s;

	pages = iommu_num_pages(address, size, PAGE_SIZE);
	s     = false;

	if (pages > 1) {
		/*
		 * If we have to flush more than one page, flush all
		 * TLB entries for this domain
		 */
		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
		s = true;
	}

	address &= PAGE_MASK;

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[1] |= domid;
	cmd->data[2]  = lower_32_bits(address);
	cmd->data[3]  = upper_32_bits(address);
	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
	if (s) /* size bit - we flush more than one 4kb page */
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
}
static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
				  u64 address, size_t size)
{
	u64 pages;
	bool s;

	pages = iommu_num_pages(address, size, PAGE_SIZE);
	s     = false;

	if (pages > 1) {
		/*
		 * If we have to flush more than one page, flush all
		 * TLB entries for this domain
		 */
		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
		s = true;
	}

	address &= PAGE_MASK;

	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0]  = devid;
	cmd->data[0] |= (qdep & 0xff) << 24;
	cmd->data[1]  = devid;
	cmd->data[2]  = lower_32_bits(address);
	cmd->data[3]  = upper_32_bits(address);
	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
	if (s)
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
}
static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid,
				  u64 address, bool size)
{
	memset(cmd, 0, sizeof(*cmd));

	address &= ~(0xfffULL);

	cmd->data[0]  = pasid;
	cmd->data[1]  = domid;
	cmd->data[2]  = lower_32_bits(address);
	cmd->data[3]  = upper_32_bits(address);
	cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
	cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
	if (size)
		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
}
*cmd
, u16 devid
, int pasid
,
895 int qdep
, u64 address
, bool size
)
897 memset(cmd
, 0, sizeof(*cmd
));
899 address
&= ~(0xfffULL
);
901 cmd
->data
[0] = devid
;
902 cmd
->data
[0] |= ((pasid
>> 8) & 0xff) << 16;
903 cmd
->data
[0] |= (qdep
& 0xff) << 24;
904 cmd
->data
[1] = devid
;
905 cmd
->data
[1] |= (pasid
& 0xff) << 16;
906 cmd
->data
[2] = lower_32_bits(address
);
907 cmd
->data
[2] |= CMD_INV_IOMMU_PAGES_GN_MASK
;
908 cmd
->data
[3] = upper_32_bits(address
);
910 cmd
->data
[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK
;
911 CMD_SET_TYPE(cmd
, CMD_INV_IOTLB_PAGES
);
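/*
 * Note on the encoding above: the INVALIDATE_IOTLB_PAGES command has no
 * contiguous PASID field, so the upper PASID bits (15:8) go into bits
 * 23:16 of data[0] and the lower bits (7:0) into bits 23:16 of data[1],
 * next to the device ID and the ATS queue depth.
 */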
static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, int pasid,
			       int status, int tag, bool gn)
{
	memset(cmd, 0, sizeof(*cmd));

	cmd->data[0]  = devid;
	if (gn) {
		cmd->data[1]  = pasid;
		cmd->data[2]  = CMD_INV_IOMMU_PAGES_GN_MASK;
	}
	cmd->data[3]  = tag & 0x1ff;
	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;

	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
}
static void build_inv_all(struct iommu_cmd *cmd)
{
	memset(cmd, 0, sizeof(*cmd));
	CMD_SET_TYPE(cmd, CMD_INV_ALL);
}

static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->data[0] = devid;
	CMD_SET_TYPE(cmd, CMD_INV_IRT);
}
/*
 * Writes the command to the IOMMU's command buffer and informs the
 * hardware about the new command.
 */
static int iommu_queue_command_sync(struct amd_iommu *iommu,
				    struct iommu_cmd *cmd,
				    bool sync)
{
	u32 left, tail, head, next_tail;
	unsigned long flags;

	WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);

again:
	spin_lock_irqsave(&iommu->lock, flags);

	head      = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
	tail      = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
	next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
	left      = (head - next_tail) % iommu->cmd_buf_size;
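	/*
	 * If the command ring is (nearly) full, the branch below queues a
	 * completion-wait command into the remaining space, drops the lock,
	 * spins until the IOMMU has drained the buffer and then retries the
	 * whole operation.
	 */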
	if (left <= 2) {
		struct iommu_cmd sync_cmd;
		volatile u64 sem = 0;
		int ret;

		build_completion_wait(&sync_cmd, (u64)&sem);
		copy_cmd_to_buffer(iommu, &sync_cmd, tail);

		spin_unlock_irqrestore(&iommu->lock, flags);

		if ((ret = wait_on_sem(&sem)) != 0)
			return ret;

		goto again;
	}

	copy_cmd_to_buffer(iommu, cmd, tail);

	/* We need to sync now to make sure all commands are processed */
	iommu->need_sync = sync;

	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}
static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
{
	return iommu_queue_command_sync(iommu, cmd, true);
}
/*
 * This function queues a completion wait command into the command
 * buffer of an IOMMU
 */
static int iommu_completion_wait(struct amd_iommu *iommu)
{
	struct iommu_cmd cmd;
	volatile u64 sem = 0;
	int ret;

	if (!iommu->need_sync)
		return 0;

	build_completion_wait(&cmd, (u64)&sem);

	ret = iommu_queue_command_sync(iommu, &cmd, false);
	if (ret)
		return ret;

	return wait_on_sem(&sem);
}
static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
{
	struct iommu_cmd cmd;

	build_inv_dte(&cmd, devid);

	return iommu_queue_command(iommu, &cmd);
}
static void iommu_flush_dte_all(struct amd_iommu *iommu)
{
	u32 devid;

	for (devid = 0; devid <= 0xffff; ++devid)
		iommu_flush_dte(iommu, devid);

	iommu_completion_wait(iommu);
}
/*
 * This function uses heavy locking and may disable irqs for some time. But
 * this is no issue because it is only called during resume.
 */
static void iommu_flush_tlb_all(struct amd_iommu *iommu)
{
	u32 dom_id;

	for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
		struct iommu_cmd cmd;
		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
				      dom_id, 1);
		iommu_queue_command(iommu, &cmd);
	}

	iommu_completion_wait(iommu);
}
static void iommu_flush_all(struct amd_iommu *iommu)
{
	struct iommu_cmd cmd;

	build_inv_all(&cmd);

	iommu_queue_command(iommu, &cmd);
	iommu_completion_wait(iommu);
}
static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
{
	struct iommu_cmd cmd;

	build_inv_irt(&cmd, devid);

	iommu_queue_command(iommu, &cmd);
}

static void iommu_flush_irt_all(struct amd_iommu *iommu)
{
	u32 devid;

	for (devid = 0; devid <= MAX_DEV_TABLE_ENTRIES; devid++)
		iommu_flush_irt(iommu, devid);

	iommu_completion_wait(iommu);
}
void iommu_flush_all_caches(struct amd_iommu *iommu)
{
	if (iommu_feature(iommu, FEATURE_IA)) {
		iommu_flush_all(iommu);
	} else {
		iommu_flush_dte_all(iommu);
		iommu_flush_irt_all(iommu);
		iommu_flush_tlb_all(iommu);
	}
}
1095 * Command send function for flushing on-device TLB
1097 static int device_flush_iotlb(struct iommu_dev_data
*dev_data
,
1098 u64 address
, size_t size
)
1100 struct amd_iommu
*iommu
;
1101 struct iommu_cmd cmd
;
1104 qdep
= dev_data
->ats
.qdep
;
1105 iommu
= amd_iommu_rlookup_table
[dev_data
->devid
];
1107 build_inv_iotlb_pages(&cmd
, dev_data
->devid
, qdep
, address
, size
);
1109 return iommu_queue_command(iommu
, &cmd
);
1113 * Command send function for invalidating a device table entry
1115 static int device_flush_dte(struct iommu_dev_data
*dev_data
)
1117 struct amd_iommu
*iommu
;
1120 iommu
= amd_iommu_rlookup_table
[dev_data
->devid
];
1122 ret
= iommu_flush_dte(iommu
, dev_data
->devid
);
1126 if (dev_data
->ats
.enabled
)
1127 ret
= device_flush_iotlb(dev_data
, 0, ~0UL);
1133 * TLB invalidation function which is called from the mapping functions.
1134 * It invalidates a single PTE if the range to flush is within a single
1135 * page. Otherwise it flushes the whole TLB of the IOMMU.
1137 static void __domain_flush_pages(struct protection_domain
*domain
,
1138 u64 address
, size_t size
, int pde
)
1140 struct iommu_dev_data
*dev_data
;
1141 struct iommu_cmd cmd
;
1144 build_inv_iommu_pages(&cmd
, address
, size
, domain
->id
, pde
);
1146 for (i
= 0; i
< amd_iommus_present
; ++i
) {
1147 if (!domain
->dev_iommu
[i
])
1151 * Devices of this domain are behind this IOMMU
1152 * We need a TLB flush
1154 ret
|= iommu_queue_command(amd_iommus
[i
], &cmd
);
1157 list_for_each_entry(dev_data
, &domain
->dev_list
, list
) {
1159 if (!dev_data
->ats
.enabled
)
1162 ret
|= device_flush_iotlb(dev_data
, address
, size
);
1168 static void domain_flush_pages(struct protection_domain
*domain
,
1169 u64 address
, size_t size
)
1171 __domain_flush_pages(domain
, address
, size
, 0);
1174 /* Flush the whole IO/TLB for a given protection domain */
1175 static void domain_flush_tlb(struct protection_domain
*domain
)
1177 __domain_flush_pages(domain
, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS
, 0);
1180 /* Flush the whole IO/TLB for a given protection domain - including PDE */
1181 static void domain_flush_tlb_pde(struct protection_domain
*domain
)
1183 __domain_flush_pages(domain
, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS
, 1);
1186 static void domain_flush_complete(struct protection_domain
*domain
)
1190 for (i
= 0; i
< amd_iommus_present
; ++i
) {
1191 if (!domain
->dev_iommu
[i
])
1195 * Devices of this domain are behind this IOMMU
1196 * We need to wait for completion of all commands.
1198 iommu_completion_wait(amd_iommus
[i
]);
1204 * This function flushes the DTEs for all devices in domain
1206 static void domain_flush_devices(struct protection_domain
*domain
)
1208 struct iommu_dev_data
*dev_data
;
1210 list_for_each_entry(dev_data
, &domain
->dev_list
, list
)
1211 device_flush_dte(dev_data
);
/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/
/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
1226 static bool increase_address_space(struct protection_domain
*domain
,
1231 if (domain
->mode
== PAGE_MODE_6_LEVEL
)
1232 /* address space already 64 bit large */
1235 pte
= (void *)get_zeroed_page(gfp
);
1239 *pte
= PM_LEVEL_PDE(domain
->mode
,
1240 virt_to_phys(domain
->pt_root
));
1241 domain
->pt_root
= pte
;
1243 domain
->updated
= true;
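/*
 * In short: increase_address_space() allocates a fresh top-level table
 * whose first entry points at the old pt_root, then installs that page as
 * the new root, growing the IO virtual address space by 9 bits (one
 * translation level).
 */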
1248 static u64
*alloc_pte(struct protection_domain
*domain
,
1249 unsigned long address
,
1250 unsigned long page_size
,
1257 BUG_ON(!is_power_of_2(page_size
));
1259 while (address
> PM_LEVEL_SIZE(domain
->mode
))
1260 increase_address_space(domain
, gfp
);
1262 level
= domain
->mode
- 1;
1263 pte
= &domain
->pt_root
[PM_LEVEL_INDEX(level
, address
)];
1264 address
= PAGE_SIZE_ALIGN(address
, page_size
);
1265 end_lvl
= PAGE_SIZE_LEVEL(page_size
);
1267 while (level
> end_lvl
) {
1268 if (!IOMMU_PTE_PRESENT(*pte
)) {
1269 page
= (u64
*)get_zeroed_page(gfp
);
1272 *pte
= PM_LEVEL_PDE(level
, virt_to_phys(page
));
1275 /* No level skipping support yet */
1276 if (PM_PTE_LEVEL(*pte
) != level
)
1281 pte
= IOMMU_PTE_PAGE(*pte
);
1283 if (pte_page
&& level
== end_lvl
)
1286 pte
= &pte
[PM_LEVEL_INDEX(level
, address
)];
1293 * This function checks if there is a PTE for a given dma address. If
1294 * there is one, it returns the pointer to it.
1296 static u64
*fetch_pte(struct protection_domain
*domain
,
1297 unsigned long address
,
1298 unsigned long *page_size
)
1303 if (address
> PM_LEVEL_SIZE(domain
->mode
))
1306 level
= domain
->mode
- 1;
1307 pte
= &domain
->pt_root
[PM_LEVEL_INDEX(level
, address
)];
1308 *page_size
= PTE_LEVEL_PAGE_SIZE(level
);
1313 if (!IOMMU_PTE_PRESENT(*pte
))
1317 if (PM_PTE_LEVEL(*pte
) == 7 ||
1318 PM_PTE_LEVEL(*pte
) == 0)
1321 /* No level skipping support yet */
1322 if (PM_PTE_LEVEL(*pte
) != level
)
1327 /* Walk to the next level */
1328 pte
= IOMMU_PTE_PAGE(*pte
);
1329 pte
= &pte
[PM_LEVEL_INDEX(level
, address
)];
1330 *page_size
= PTE_LEVEL_PAGE_SIZE(level
);
1333 if (PM_PTE_LEVEL(*pte
) == 0x07) {
1334 unsigned long pte_mask
;
1337 * If we have a series of large PTEs, make
1338 * sure to return a pointer to the first one.
1340 *page_size
= pte_mask
= PTE_PAGE_SIZE(*pte
);
1341 pte_mask
= ~((PAGE_SIZE_PTE_COUNT(pte_mask
) << 3) - 1);
1342 pte
= (u64
*)(((unsigned long)pte
) & pte_mask
);
1349 * Generic mapping functions. It maps a physical address into a DMA
1350 * address space. It allocates the page table pages if necessary.
1351 * In the future it can be extended to a generic mapping function
1352 * supporting all features of AMD IOMMU page tables like level skipping
1353 * and full 64 bit address spaces.
1355 static int iommu_map_page(struct protection_domain
*dom
,
1356 unsigned long bus_addr
,
1357 unsigned long phys_addr
,
1359 unsigned long page_size
)
1364 BUG_ON(!IS_ALIGNED(bus_addr
, page_size
));
1365 BUG_ON(!IS_ALIGNED(phys_addr
, page_size
));
1367 if (!(prot
& IOMMU_PROT_MASK
))
1370 count
= PAGE_SIZE_PTE_COUNT(page_size
);
1371 pte
= alloc_pte(dom
, bus_addr
, page_size
, NULL
, GFP_KERNEL
);
1376 for (i
= 0; i
< count
; ++i
)
1377 if (IOMMU_PTE_PRESENT(pte
[i
]))
1381 __pte
= PAGE_SIZE_PTE(phys_addr
, page_size
);
1382 __pte
|= PM_LEVEL_ENC(7) | IOMMU_PTE_P
| IOMMU_PTE_FC
;
1384 __pte
= phys_addr
| IOMMU_PTE_P
| IOMMU_PTE_FC
;
1386 if (prot
& IOMMU_PROT_IR
)
1387 __pte
|= IOMMU_PTE_IR
;
1388 if (prot
& IOMMU_PROT_IW
)
1389 __pte
|= IOMMU_PTE_IW
;
1391 for (i
= 0; i
< count
; ++i
)
1399 static unsigned long iommu_unmap_page(struct protection_domain
*dom
,
1400 unsigned long bus_addr
,
1401 unsigned long page_size
)
1403 unsigned long long unmapped
;
1404 unsigned long unmap_size
;
1407 BUG_ON(!is_power_of_2(page_size
));
1411 while (unmapped
< page_size
) {
1413 pte
= fetch_pte(dom
, bus_addr
, &unmap_size
);
1418 count
= PAGE_SIZE_PTE_COUNT(unmap_size
);
1419 for (i
= 0; i
< count
; i
++)
1423 bus_addr
= (bus_addr
& ~(unmap_size
- 1)) + unmap_size
;
1424 unmapped
+= unmap_size
;
1427 BUG_ON(unmapped
&& !is_power_of_2(unmapped
));
/****************************************************************************
 *
 * The next functions belong to the address allocator for the dma_ops
 * interface functions. They work like the allocators in the other IOMMU
 * drivers. It's basically a bitmap which marks the allocated pages in
 * the aperture. Maybe it could be enhanced in the future to a more
 * efficient allocator.
 *
 ****************************************************************************/
/*
 * The address allocator core functions.
 *
 * called with domain->lock held
 */
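/*
 * A rough sketch of the data structure, assuming the usual 128 MB
 * aperture range size: the dma_ops domain keeps an array of
 * aperture_range structures, each with its own allocation bitmap (one
 * bit per 4K page) and pre-allocated PTE pages, and the allocator scans
 * those bitmaps for a free run of pages.
 */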
1449 * Used to reserve address ranges in the aperture (e.g. for exclusion
1452 static void dma_ops_reserve_addresses(struct dma_ops_domain
*dom
,
1453 unsigned long start_page
,
1456 unsigned int i
, last_page
= dom
->aperture_size
>> PAGE_SHIFT
;
1458 if (start_page
+ pages
> last_page
)
1459 pages
= last_page
- start_page
;
1461 for (i
= start_page
; i
< start_page
+ pages
; ++i
) {
1462 int index
= i
/ APERTURE_RANGE_PAGES
;
1463 int page
= i
% APERTURE_RANGE_PAGES
;
1464 __set_bit(page
, dom
->aperture
[index
]->bitmap
);
1469 * This function is used to add a new aperture range to an existing
1470 * aperture in case of dma_ops domain allocation or address allocation
1473 static int alloc_new_range(struct dma_ops_domain
*dma_dom
,
1474 bool populate
, gfp_t gfp
)
1476 int index
= dma_dom
->aperture_size
>> APERTURE_RANGE_SHIFT
;
1477 struct amd_iommu
*iommu
;
1478 unsigned long i
, old_size
, pte_pgsize
;
1480 #ifdef CONFIG_IOMMU_STRESS
1484 if (index
>= APERTURE_MAX_RANGES
)
1487 dma_dom
->aperture
[index
] = kzalloc(sizeof(struct aperture_range
), gfp
);
1488 if (!dma_dom
->aperture
[index
])
1491 dma_dom
->aperture
[index
]->bitmap
= (void *)get_zeroed_page(gfp
);
1492 if (!dma_dom
->aperture
[index
]->bitmap
)
1495 dma_dom
->aperture
[index
]->offset
= dma_dom
->aperture_size
;
1498 unsigned long address
= dma_dom
->aperture_size
;
1499 int i
, num_ptes
= APERTURE_RANGE_PAGES
/ 512;
1500 u64
*pte
, *pte_page
;
1502 for (i
= 0; i
< num_ptes
; ++i
) {
1503 pte
= alloc_pte(&dma_dom
->domain
, address
, PAGE_SIZE
,
1508 dma_dom
->aperture
[index
]->pte_pages
[i
] = pte_page
;
1510 address
+= APERTURE_RANGE_SIZE
/ 64;
1514 old_size
= dma_dom
->aperture_size
;
1515 dma_dom
->aperture_size
+= APERTURE_RANGE_SIZE
;
1517 /* Reserve address range used for MSI messages */
1518 if (old_size
< MSI_ADDR_BASE_LO
&&
1519 dma_dom
->aperture_size
> MSI_ADDR_BASE_LO
) {
1520 unsigned long spage
;
1523 pages
= iommu_num_pages(MSI_ADDR_BASE_LO
, 0x10000, PAGE_SIZE
);
1524 spage
= MSI_ADDR_BASE_LO
>> PAGE_SHIFT
;
1526 dma_ops_reserve_addresses(dma_dom
, spage
, pages
);
1529 /* Initialize the exclusion range if necessary */
1530 for_each_iommu(iommu
) {
1531 if (iommu
->exclusion_start
&&
1532 iommu
->exclusion_start
>= dma_dom
->aperture
[index
]->offset
1533 && iommu
->exclusion_start
< dma_dom
->aperture_size
) {
1534 unsigned long startpage
;
1535 int pages
= iommu_num_pages(iommu
->exclusion_start
,
1536 iommu
->exclusion_length
,
1538 startpage
= iommu
->exclusion_start
>> PAGE_SHIFT
;
1539 dma_ops_reserve_addresses(dma_dom
, startpage
, pages
);
1544 * Check for areas already mapped as present in the new aperture
1545 * range and mark those pages as reserved in the allocator. Such
1546 * mappings may already exist as a result of requested unity
1547 * mappings for devices.
1549 for (i
= dma_dom
->aperture
[index
]->offset
;
1550 i
< dma_dom
->aperture_size
;
1552 u64
*pte
= fetch_pte(&dma_dom
->domain
, i
, &pte_pgsize
);
1553 if (!pte
|| !IOMMU_PTE_PRESENT(*pte
))
1556 dma_ops_reserve_addresses(dma_dom
, i
>> PAGE_SHIFT
,
1560 update_domain(&dma_dom
->domain
);
1565 update_domain(&dma_dom
->domain
);
1567 free_page((unsigned long)dma_dom
->aperture
[index
]->bitmap
);
1569 kfree(dma_dom
->aperture
[index
]);
1570 dma_dom
->aperture
[index
] = NULL
;
1575 static unsigned long dma_ops_area_alloc(struct device
*dev
,
1576 struct dma_ops_domain
*dom
,
1578 unsigned long align_mask
,
1580 unsigned long start
)
1582 unsigned long next_bit
= dom
->next_address
% APERTURE_RANGE_SIZE
;
1583 int max_index
= dom
->aperture_size
>> APERTURE_RANGE_SHIFT
;
1584 int i
= start
>> APERTURE_RANGE_SHIFT
;
1585 unsigned long boundary_size
;
1586 unsigned long address
= -1;
1587 unsigned long limit
;
1589 next_bit
>>= PAGE_SHIFT
;
1591 boundary_size
= ALIGN(dma_get_seg_boundary(dev
) + 1,
1592 PAGE_SIZE
) >> PAGE_SHIFT
;
1594 for (;i
< max_index
; ++i
) {
1595 unsigned long offset
= dom
->aperture
[i
]->offset
>> PAGE_SHIFT
;
1597 if (dom
->aperture
[i
]->offset
>= dma_mask
)
1600 limit
= iommu_device_max_index(APERTURE_RANGE_PAGES
, offset
,
1601 dma_mask
>> PAGE_SHIFT
);
1603 address
= iommu_area_alloc(dom
->aperture
[i
]->bitmap
,
1604 limit
, next_bit
, pages
, 0,
1605 boundary_size
, align_mask
);
1606 if (address
!= -1) {
1607 address
= dom
->aperture
[i
]->offset
+
1608 (address
<< PAGE_SHIFT
);
1609 dom
->next_address
= address
+ (pages
<< PAGE_SHIFT
);
1619 static unsigned long dma_ops_alloc_addresses(struct device
*dev
,
1620 struct dma_ops_domain
*dom
,
1622 unsigned long align_mask
,
1625 unsigned long address
;
1627 #ifdef CONFIG_IOMMU_STRESS
1628 dom
->next_address
= 0;
1629 dom
->need_flush
= true;
1632 address
= dma_ops_area_alloc(dev
, dom
, pages
, align_mask
,
1633 dma_mask
, dom
->next_address
);
1635 if (address
== -1) {
1636 dom
->next_address
= 0;
1637 address
= dma_ops_area_alloc(dev
, dom
, pages
, align_mask
,
1639 dom
->need_flush
= true;
1642 if (unlikely(address
== -1))
1643 address
= DMA_ERROR_CODE
;
1645 WARN_ON((address
+ (PAGE_SIZE
*pages
)) > dom
->aperture_size
);
1651 * The address free function.
1653 * called with domain->lock held
1655 static void dma_ops_free_addresses(struct dma_ops_domain
*dom
,
1656 unsigned long address
,
1659 unsigned i
= address
>> APERTURE_RANGE_SHIFT
;
1660 struct aperture_range
*range
= dom
->aperture
[i
];
1662 BUG_ON(i
>= APERTURE_MAX_RANGES
|| range
== NULL
);
1664 #ifdef CONFIG_IOMMU_STRESS
1669 if (address
>= dom
->next_address
)
1670 dom
->need_flush
= true;
1672 address
= (address
% APERTURE_RANGE_SIZE
) >> PAGE_SHIFT
;
1674 bitmap_clear(range
->bitmap
, address
, pages
);
/****************************************************************************
 *
 * The next functions belong to the domain allocation. A domain is
 * allocated for every IOMMU as the default domain. If device isolation
 * is enabled, every device gets its own domain. The most important thing
 * about domains is the page table mapping the DMA address space they
 * belong to.
 *
 ****************************************************************************/
1689 * This function adds a protection domain to the global protection domain list
1691 static void add_domain_to_list(struct protection_domain
*domain
)
1693 unsigned long flags
;
1695 spin_lock_irqsave(&amd_iommu_pd_lock
, flags
);
1696 list_add(&domain
->list
, &amd_iommu_pd_list
);
1697 spin_unlock_irqrestore(&amd_iommu_pd_lock
, flags
);
/*
 * This function removes a protection domain from the global
 * protection domain list
 */
1704 static void del_domain_from_list(struct protection_domain
*domain
)
1706 unsigned long flags
;
1708 spin_lock_irqsave(&amd_iommu_pd_lock
, flags
);
1709 list_del(&domain
->list
);
1710 spin_unlock_irqrestore(&amd_iommu_pd_lock
, flags
);
1713 static u16
domain_id_alloc(void)
1715 unsigned long flags
;
1718 write_lock_irqsave(&amd_iommu_devtable_lock
, flags
);
1719 id
= find_first_zero_bit(amd_iommu_pd_alloc_bitmap
, MAX_DOMAIN_ID
);
1721 if (id
> 0 && id
< MAX_DOMAIN_ID
)
1722 __set_bit(id
, amd_iommu_pd_alloc_bitmap
);
1725 write_unlock_irqrestore(&amd_iommu_devtable_lock
, flags
);
1730 static void domain_id_free(int id
)
1732 unsigned long flags
;
1734 write_lock_irqsave(&amd_iommu_devtable_lock
, flags
);
1735 if (id
> 0 && id
< MAX_DOMAIN_ID
)
1736 __clear_bit(id
, amd_iommu_pd_alloc_bitmap
);
1737 write_unlock_irqrestore(&amd_iommu_devtable_lock
, flags
);
1740 #define DEFINE_FREE_PT_FN(LVL, FN) \
1741 static void free_pt_##LVL (unsigned long __pt) \
1749 for (i = 0; i < 512; ++i) { \
1750 if (!IOMMU_PTE_PRESENT(pt[i])) \
1753 p = (unsigned long)IOMMU_PTE_PAGE(pt[i]); \
1756 free_page((unsigned long)pt); \
1759 DEFINE_FREE_PT_FN(l2
, free_page
)
1760 DEFINE_FREE_PT_FN(l3
, free_pt_l2
)
1761 DEFINE_FREE_PT_FN(l4
, free_pt_l3
)
1762 DEFINE_FREE_PT_FN(l5
, free_pt_l4
)
1763 DEFINE_FREE_PT_FN(l6
, free_pt_l5
)
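/*
 * The five instantiations above form a chain of helpers: free_pt_l2()
 * walks one table and frees every present entry with free_page(),
 * free_pt_l3() frees each present entry via free_pt_l2(), and so on up
 * to free_pt_l6().  free_pagetable() below only has to pick the entry
 * point that matches the domain's paging mode.
 */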
1765 static void free_pagetable(struct protection_domain
*domain
)
1767 unsigned long root
= (unsigned long)domain
->pt_root
;
1769 switch (domain
->mode
) {
1770 case PAGE_MODE_NONE
:
1772 case PAGE_MODE_1_LEVEL
:
1775 case PAGE_MODE_2_LEVEL
:
1778 case PAGE_MODE_3_LEVEL
:
1781 case PAGE_MODE_4_LEVEL
:
1784 case PAGE_MODE_5_LEVEL
:
1787 case PAGE_MODE_6_LEVEL
:
1795 static void free_gcr3_tbl_level1(u64
*tbl
)
1800 for (i
= 0; i
< 512; ++i
) {
1801 if (!(tbl
[i
] & GCR3_VALID
))
1804 ptr
= __va(tbl
[i
] & PAGE_MASK
);
1806 free_page((unsigned long)ptr
);
1810 static void free_gcr3_tbl_level2(u64
*tbl
)
1815 for (i
= 0; i
< 512; ++i
) {
1816 if (!(tbl
[i
] & GCR3_VALID
))
1819 ptr
= __va(tbl
[i
] & PAGE_MASK
);
1821 free_gcr3_tbl_level1(ptr
);
1825 static void free_gcr3_table(struct protection_domain
*domain
)
1827 if (domain
->glx
== 2)
1828 free_gcr3_tbl_level2(domain
->gcr3_tbl
);
1829 else if (domain
->glx
== 1)
1830 free_gcr3_tbl_level1(domain
->gcr3_tbl
);
1831 else if (domain
->glx
!= 0)
1834 free_page((unsigned long)domain
->gcr3_tbl
);
1838 * Free a domain, only used if something went wrong in the
1839 * allocation path and we need to free an already allocated page table
1841 static void dma_ops_domain_free(struct dma_ops_domain
*dom
)
1848 del_domain_from_list(&dom
->domain
);
1850 free_pagetable(&dom
->domain
);
1852 for (i
= 0; i
< APERTURE_MAX_RANGES
; ++i
) {
1853 if (!dom
->aperture
[i
])
1855 free_page((unsigned long)dom
->aperture
[i
]->bitmap
);
1856 kfree(dom
->aperture
[i
]);
1863 * Allocates a new protection domain usable for the dma_ops functions.
1864 * It also initializes the page table and the address allocator data
1865 * structures required for the dma_ops interface
1867 static struct dma_ops_domain
*dma_ops_domain_alloc(void)
1869 struct dma_ops_domain
*dma_dom
;
1871 dma_dom
= kzalloc(sizeof(struct dma_ops_domain
), GFP_KERNEL
);
1875 spin_lock_init(&dma_dom
->domain
.lock
);
1877 dma_dom
->domain
.id
= domain_id_alloc();
1878 if (dma_dom
->domain
.id
== 0)
1880 INIT_LIST_HEAD(&dma_dom
->domain
.dev_list
);
1881 dma_dom
->domain
.mode
= PAGE_MODE_2_LEVEL
;
1882 dma_dom
->domain
.pt_root
= (void *)get_zeroed_page(GFP_KERNEL
);
1883 dma_dom
->domain
.flags
= PD_DMA_OPS_MASK
;
1884 dma_dom
->domain
.priv
= dma_dom
;
1885 if (!dma_dom
->domain
.pt_root
)
1888 dma_dom
->need_flush
= false;
1890 add_domain_to_list(&dma_dom
->domain
);
1892 if (alloc_new_range(dma_dom
, true, GFP_KERNEL
))
1896 * mark the first page as allocated so we never return 0 as
1897 * a valid dma-address. So we can use 0 as error value
1899 dma_dom
->aperture
[0]->bitmap
[0] = 1;
1900 dma_dom
->next_address
= 0;
1906 dma_ops_domain_free(dma_dom
);
/*
 * little helper function to check whether a given protection domain is a
 * dma_ops domain
 */
1915 static bool dma_ops_domain(struct protection_domain
*domain
)
1917 return domain
->flags
& PD_DMA_OPS_MASK
;
1920 static void set_dte_entry(u16 devid
, struct protection_domain
*domain
, bool ats
)
1925 if (domain
->mode
!= PAGE_MODE_NONE
)
1926 pte_root
= virt_to_phys(domain
->pt_root
);
1928 pte_root
|= (domain
->mode
& DEV_ENTRY_MODE_MASK
)
1929 << DEV_ENTRY_MODE_SHIFT
;
1930 pte_root
|= IOMMU_PTE_IR
| IOMMU_PTE_IW
| IOMMU_PTE_P
| IOMMU_PTE_TV
;
1932 flags
= amd_iommu_dev_table
[devid
].data
[1];
1935 flags
|= DTE_FLAG_IOTLB
;
1937 if (domain
->flags
& PD_IOMMUV2_MASK
) {
1938 u64 gcr3
= __pa(domain
->gcr3_tbl
);
1939 u64 glx
= domain
->glx
;
1942 pte_root
|= DTE_FLAG_GV
;
1943 pte_root
|= (glx
& DTE_GLX_MASK
) << DTE_GLX_SHIFT
;
1945 /* First mask out possible old values for GCR3 table */
1946 tmp
= DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B
;
1949 tmp
= DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C
;
1952 /* Encode GCR3 table into DTE */
1953 tmp
= DTE_GCR3_VAL_A(gcr3
) << DTE_GCR3_SHIFT_A
;
1956 tmp
= DTE_GCR3_VAL_B(gcr3
) << DTE_GCR3_SHIFT_B
;
1959 tmp
= DTE_GCR3_VAL_C(gcr3
) << DTE_GCR3_SHIFT_C
;
1963 flags
&= ~(0xffffUL
);
1964 flags
|= domain
->id
;
1966 amd_iommu_dev_table
[devid
].data
[1] = flags
;
1967 amd_iommu_dev_table
[devid
].data
[0] = pte_root
;
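/*
 * Note on the GCR3 encoding in set_dte_entry() above: the device table
 * entry has no single field wide enough for the GCR3 table pointer, so
 * the address is split into three chunks (DTE_GCR3_VAL_A/B/C) that are
 * shifted into different parts of the DTE after the old values have been
 * masked out.
 */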
1970 static void clear_dte_entry(u16 devid
)
1972 /* remove entry from the device table seen by the hardware */
1973 amd_iommu_dev_table
[devid
].data
[0] = IOMMU_PTE_P
| IOMMU_PTE_TV
;
1974 amd_iommu_dev_table
[devid
].data
[1] = 0;
1976 amd_iommu_apply_erratum_63(devid
);
1979 static void do_attach(struct iommu_dev_data
*dev_data
,
1980 struct protection_domain
*domain
)
1982 struct amd_iommu
*iommu
;
1985 iommu
= amd_iommu_rlookup_table
[dev_data
->devid
];
1986 ats
= dev_data
->ats
.enabled
;
1988 /* Update data structures */
1989 dev_data
->domain
= domain
;
1990 list_add(&dev_data
->list
, &domain
->dev_list
);
1991 set_dte_entry(dev_data
->devid
, domain
, ats
);
1993 /* Do reference counting */
1994 domain
->dev_iommu
[iommu
->index
] += 1;
1995 domain
->dev_cnt
+= 1;
1997 /* Flush the DTE entry */
1998 device_flush_dte(dev_data
);
2001 static void do_detach(struct iommu_dev_data
*dev_data
)
2003 struct amd_iommu
*iommu
;
2005 iommu
= amd_iommu_rlookup_table
[dev_data
->devid
];
2007 /* decrease reference counters */
2008 dev_data
->domain
->dev_iommu
[iommu
->index
] -= 1;
2009 dev_data
->domain
->dev_cnt
-= 1;
2011 /* Update data structures */
2012 dev_data
->domain
= NULL
;
2013 list_del(&dev_data
->list
);
2014 clear_dte_entry(dev_data
->devid
);
2016 /* Flush the DTE entry */
2017 device_flush_dte(dev_data
);
/*
 * If a device is not yet associated with a domain, this function
 * assigns it to the domain and makes it visible to the hardware.
 */
2024 static int __attach_device(struct iommu_dev_data
*dev_data
,
2025 struct protection_domain
*domain
)
2027 struct iommu_dev_data
*head
, *entry
;
2031 spin_lock(&domain
->lock
);
2035 if (head
->alias_data
!= NULL
)
2036 head
= head
->alias_data
;
2038 /* Now we have the root of the alias group, if any */
2041 if (head
->domain
!= NULL
)
2044 /* Attach alias group root */
2045 do_attach(head
, domain
);
2047 /* Attach other devices in the alias group */
2048 list_for_each_entry(entry
, &head
->alias_list
, alias_list
)
2049 do_attach(entry
, domain
);
2056 spin_unlock(&domain
->lock
);
2062 static void pdev_iommuv2_disable(struct pci_dev
*pdev
)
2064 pci_disable_ats(pdev
);
2065 pci_disable_pri(pdev
);
2066 pci_disable_pasid(pdev
);
2069 /* FIXME: Change generic reset-function to do the same */
2070 static int pri_reset_while_enabled(struct pci_dev
*pdev
)
2075 pos
= pci_find_ext_capability(pdev
, PCI_EXT_CAP_ID_PRI
);
2079 pci_read_config_word(pdev
, pos
+ PCI_PRI_CTRL
, &control
);
2080 control
|= PCI_PRI_CTRL_RESET
;
2081 pci_write_config_word(pdev
, pos
+ PCI_PRI_CTRL
, control
);
2086 static int pdev_iommuv2_enable(struct pci_dev
*pdev
)
2091 /* FIXME: Hardcode number of outstanding requests for now */
2093 if (pdev_pri_erratum(pdev
, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE
))
2095 reset_enable
= pdev_pri_erratum(pdev
, AMD_PRI_DEV_ERRATUM_ENABLE_RESET
);
2097 /* Only allow access to user-accessible pages */
2098 ret
= pci_enable_pasid(pdev
, 0);
2102 /* First reset the PRI state of the device */
2103 ret
= pci_reset_pri(pdev
);
2108 ret
= pci_enable_pri(pdev
, reqs
);
2113 ret
= pri_reset_while_enabled(pdev
);
2118 ret
= pci_enable_ats(pdev
, PAGE_SHIFT
);
2125 pci_disable_pri(pdev
);
2126 pci_disable_pasid(pdev
);
2131 /* FIXME: Move this to PCI code */
2132 #define PCI_PRI_TLP_OFF (1 << 15)
2134 static bool pci_pri_tlp_required(struct pci_dev
*pdev
)
2139 pos
= pci_find_ext_capability(pdev
, PCI_EXT_CAP_ID_PRI
);
2143 pci_read_config_word(pdev
, pos
+ PCI_PRI_STATUS
, &status
);
2145 return (status
& PCI_PRI_TLP_OFF
) ? true : false;
/*
 * If a device is not yet associated with a domain, this function
 * assigns it to the domain and makes it visible to the hardware.
 */
2152 static int attach_device(struct device
*dev
,
2153 struct protection_domain
*domain
)
2155 struct pci_dev
*pdev
= to_pci_dev(dev
);
2156 struct iommu_dev_data
*dev_data
;
2157 unsigned long flags
;
2160 dev_data
= get_dev_data(dev
);
2162 if (domain
->flags
& PD_IOMMUV2_MASK
) {
2163 if (!dev_data
->iommu_v2
|| !dev_data
->passthrough
)
2166 if (pdev_iommuv2_enable(pdev
) != 0)
2169 dev_data
->ats
.enabled
= true;
2170 dev_data
->ats
.qdep
= pci_ats_queue_depth(pdev
);
2171 dev_data
->pri_tlp
= pci_pri_tlp_required(pdev
);
2172 } else if (amd_iommu_iotlb_sup
&&
2173 pci_enable_ats(pdev
, PAGE_SHIFT
) == 0) {
2174 dev_data
->ats
.enabled
= true;
2175 dev_data
->ats
.qdep
= pci_ats_queue_depth(pdev
);
2178 write_lock_irqsave(&amd_iommu_devtable_lock
, flags
);
2179 ret
= __attach_device(dev_data
, domain
);
2180 write_unlock_irqrestore(&amd_iommu_devtable_lock
, flags
);
2183 * We might boot into a crash-kernel here. The crashed kernel
2184 * left the caches in the IOMMU dirty. So we have to flush
2185 * here to evict all dirty stuff.
2187 domain_flush_tlb_pde(domain
);
2193 * Removes a device from a protection domain (unlocked)
2195 static void __detach_device(struct iommu_dev_data
*dev_data
)
2197 struct iommu_dev_data
*head
, *entry
;
2198 struct protection_domain
*domain
;
2199 unsigned long flags
;
2201 BUG_ON(!dev_data
->domain
);
2203 domain
= dev_data
->domain
;
2205 spin_lock_irqsave(&domain
->lock
, flags
);
2208 if (head
->alias_data
!= NULL
)
2209 head
= head
->alias_data
;
2211 list_for_each_entry(entry
, &head
->alias_list
, alias_list
)
2216 spin_unlock_irqrestore(&domain
->lock
, flags
);
2219 * If we run in passthrough mode the device must be assigned to the
2220 * passthrough domain if it is detached from any other domain.
2221 * Make sure we can deassign from the pt_domain itself.
2223 if (dev_data
->passthrough
&&
2224 (dev_data
->domain
== NULL
&& domain
!= pt_domain
))
2225 __attach_device(dev_data
, pt_domain
);
2229 * Removes a device from a protection domain (with devtable_lock held)
2231 static void detach_device(struct device
*dev
)
2233 struct protection_domain
*domain
;
2234 struct iommu_dev_data
*dev_data
;
2235 unsigned long flags
;
2237 dev_data
= get_dev_data(dev
);
2238 domain
= dev_data
->domain
;
2240 /* lock device table */
2241 write_lock_irqsave(&amd_iommu_devtable_lock
, flags
);
2242 __detach_device(dev_data
);
2243 write_unlock_irqrestore(&amd_iommu_devtable_lock
, flags
);
2245 if (domain
->flags
& PD_IOMMUV2_MASK
)
2246 pdev_iommuv2_disable(to_pci_dev(dev
));
2247 else if (dev_data
->ats
.enabled
)
2248 pci_disable_ats(to_pci_dev(dev
));
2250 dev_data
->ats
.enabled
= false;
2253 static int amd_iommu_add_device(struct device
*dev
)
2255 struct iommu_dev_data
*dev_data
;
2256 struct iommu_domain
*domain
;
2257 struct amd_iommu
*iommu
;
2261 if (!check_device(dev
) || get_dev_data(dev
))
2264 devid
= get_device_id(dev
);
2265 iommu
= amd_iommu_rlookup_table
[devid
];
2267 ret
= iommu_init_device(dev
);
2268 if (ret
== -ENOTSUPP
) {
2269 iommu_ignore_device(dev
);
2270 dev
->archdata
.dma_ops
= &nommu_dma_ops
;
2273 init_iommu_group(dev
);
2275 dev_data
= get_dev_data(dev
);
2276 if (dev_data
&& dev_data
->iommu_v2
)
2277 iommu_request_dm_for_dev(dev
);
2279 /* Domains are initialized for this device - have a look what we ended up with */
2280 domain
= iommu_get_domain_for_dev(dev
);
2281 if (domain
->type
== IOMMU_DOMAIN_IDENTITY
) {
2282 dev_data
->passthrough
= true;
2283 dev
->archdata
.dma_ops
= &nommu_dma_ops
;
2285 dev
->archdata
.dma_ops
= &amd_iommu_dma_ops
;
2289 iommu_completion_wait(iommu
);
2294 static void amd_iommu_remove_device(struct device
*dev
)
2296 struct amd_iommu
*iommu
;
2299 if (!check_device(dev
))
2302 devid
= get_device_id(dev
);
2303 iommu
= amd_iommu_rlookup_table
[devid
];
2305 iommu_uninit_device(dev
);
2306 iommu_completion_wait(iommu
);
2309 /*****************************************************************************
2311 * The next functions belong to the dma_ops mapping/unmapping code.
2313 *****************************************************************************/
/*
 * In the dma_ops path we only have the struct device. This function
 * finds the corresponding IOMMU, the protection domain and the
 * requestor id for a given device.
 * If the device is not yet associated with a domain this is also done
 * in this function.
 */
2322 static struct protection_domain
*get_domain(struct device
*dev
)
2324 struct protection_domain
*domain
;
2325 struct iommu_domain
*io_domain
;
2327 if (!check_device(dev
))
2328 return ERR_PTR(-EINVAL
);
2330 io_domain
= iommu_get_domain_for_dev(dev
);
2334 domain
= to_pdomain(io_domain
);
2335 if (!dma_ops_domain(domain
))
2336 return ERR_PTR(-EBUSY
);
2341 static void update_device_table(struct protection_domain
*domain
)
2343 struct iommu_dev_data
*dev_data
;
2345 list_for_each_entry(dev_data
, &domain
->dev_list
, list
)
2346 set_dte_entry(dev_data
->devid
, domain
, dev_data
->ats
.enabled
);
2349 static void update_domain(struct protection_domain
*domain
)
2351 if (!domain
->updated
)
2354 update_device_table(domain
);
2356 domain_flush_devices(domain
);
2357 domain_flush_tlb_pde(domain
);
2359 domain
->updated
= false;
2363 * This function fetches the PTE for a given address in the aperture
2365 static u64
* dma_ops_get_pte(struct dma_ops_domain
*dom
,
2366 unsigned long address
)
2368 struct aperture_range
*aperture
;
2369 u64
*pte
, *pte_page
;
2371 aperture
= dom
->aperture
[APERTURE_RANGE_INDEX(address
)];
2375 pte
= aperture
->pte_pages
[APERTURE_PAGE_INDEX(address
)];
2377 pte
= alloc_pte(&dom
->domain
, address
, PAGE_SIZE
, &pte_page
,
2379 aperture
->pte_pages
[APERTURE_PAGE_INDEX(address
)] = pte_page
;
2381 pte
+= PM_LEVEL_INDEX(0, address
);
2383 update_domain(&dom
->domain
);
2389 * This is the generic map function. It maps one 4kb page at paddr to
2390 * the given address in the DMA address space for the domain.
2392 static dma_addr_t
dma_ops_domain_map(struct dma_ops_domain
*dom
,
2393 unsigned long address
,
2399 WARN_ON(address
> dom
->aperture_size
);
2403 pte
= dma_ops_get_pte(dom
, address
);
2405 return DMA_ERROR_CODE
;
2407 __pte
= paddr
| IOMMU_PTE_P
| IOMMU_PTE_FC
;
2409 if (direction
== DMA_TO_DEVICE
)
2410 __pte
|= IOMMU_PTE_IR
;
2411 else if (direction
== DMA_FROM_DEVICE
)
2412 __pte
|= IOMMU_PTE_IW
;
2413 else if (direction
== DMA_BIDIRECTIONAL
)
2414 __pte
|= IOMMU_PTE_IR
| IOMMU_PTE_IW
;
2420 return (dma_addr_t
)address
;
/*
 * The generic unmapping function for one page in the DMA address space.
 */
2426 static void dma_ops_domain_unmap(struct dma_ops_domain
*dom
,
2427 unsigned long address
)
2429 struct aperture_range
*aperture
;
2432 if (address
>= dom
->aperture_size
)
2435 aperture
= dom
->aperture
[APERTURE_RANGE_INDEX(address
)];
2439 pte
= aperture
->pte_pages
[APERTURE_PAGE_INDEX(address
)];
2443 pte
+= PM_LEVEL_INDEX(0, address
);
/*
 * This function contains common code for mapping of a physically
 * contiguous memory region into DMA address space. It is used by all
 * mapping functions provided with this IOMMU driver.
 * Must be called with the domain lock held.
 */
static dma_addr_t __map_single(struct device *dev,
			       struct dma_ops_domain *dma_dom,
			       phys_addr_t paddr,
			       size_t size,
			       int dir,
			       bool align,
			       u64 dma_mask)
{
	dma_addr_t offset = paddr & ~PAGE_MASK;
	dma_addr_t address, start, ret;
	unsigned int pages;
	unsigned long align_mask = 0;
	int i;

	pages = iommu_num_pages(paddr, size, PAGE_SIZE);
	paddr &= PAGE_MASK;

	INC_STATS_COUNTER(total_map_requests);

	if (pages > 1)
		INC_STATS_COUNTER(cross_page);

	if (align)
		align_mask = (1UL << get_order(size)) - 1;

retry:
	address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
					  dma_mask);
	if (unlikely(address == DMA_ERROR_CODE)) {
		/*
		 * setting next_address here will let the address
		 * allocator only scan the new allocated range in the
		 * first run. This is a small optimization.
		 */
		dma_dom->next_address = dma_dom->aperture_size;

		if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
			goto out;

		/*
		 * aperture was successfully enlarged by 128 MB, try
		 * allocation again
		 */
		goto retry;
	}

	start = address;
	for (i = 0; i < pages; ++i) {
		ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
		if (ret == DMA_ERROR_CODE)
			goto out_unmap;

		paddr += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	address += offset;

	ADD_STATS_COUNTER(alloced_io_mem, size);

	if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
		domain_flush_tlb(&dma_dom->domain);
		dma_dom->need_flush = false;
	} else if (unlikely(amd_iommu_np_cache))
		domain_flush_pages(&dma_dom->domain, address, size);

out:
	return address;

out_unmap:

	for (--i; i >= 0; --i) {
		start -= PAGE_SIZE;
		dma_ops_domain_unmap(dma_dom, start);
	}

	dma_ops_free_addresses(dma_dom, address, pages);

	return DMA_ERROR_CODE;
}
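/*
 * Illustrative arithmetic for __map_single(), assuming 4 KiB pages: an
 * unaligned 16 KiB request with paddr = 0x10000800 and size = 0x4000 gives
 *
 *	offset     = 0x800
 *	pages      = iommu_num_pages(0x10000800, 0x4000, PAGE_SIZE) = 5
 *	align_mask = (1UL << get_order(0x4000)) - 1 = 3	(when align == true)
 *
 * so five IOMMU pages are mapped and, if alignment was requested, the
 * allocated IO virtual address is aligned to four pages.
 */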
/*
 * Does the reverse of the __map_single function. Must be called with
 * the domain lock held too.
 */
static void __unmap_single(struct dma_ops_domain *dma_dom,
			   dma_addr_t dma_addr,
			   size_t size,
			   int dir)
{
	dma_addr_t flush_addr;
	dma_addr_t i, start;
	unsigned int pages;

	if ((dma_addr == DMA_ERROR_CODE) ||
	    (dma_addr + size > dma_dom->aperture_size))
		return;

	flush_addr = dma_addr;
	pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
	dma_addr &= PAGE_MASK;
	start = dma_addr;

	for (i = 0; i < pages; ++i) {
		dma_ops_domain_unmap(dma_dom, start);
		start += PAGE_SIZE;
	}

	SUB_STATS_COUNTER(alloced_io_mem, size);

	dma_ops_free_addresses(dma_dom, dma_addr, pages);

	if (amd_iommu_unmap_flush || dma_dom->need_flush) {
		domain_flush_pages(&dma_dom->domain, flush_addr, size);
		dma_dom->need_flush = false;
	}
}
/*
 * The exported map_single function for dma_ops.
 */
static dma_addr_t map_page(struct device *dev, struct page *page,
			   unsigned long offset, size_t size,
			   enum dma_data_direction dir,
			   struct dma_attrs *attrs)
{
	unsigned long flags;
	struct protection_domain *domain;
	dma_addr_t addr;
	u64 dma_mask;
	phys_addr_t paddr = page_to_phys(page) + offset;

	INC_STATS_COUNTER(cnt_map_single);

	domain = get_domain(dev);
	if (PTR_ERR(domain) == -EINVAL)
		return (dma_addr_t)paddr;
	else if (IS_ERR(domain))
		return DMA_ERROR_CODE;

	dma_mask = *dev->dma_mask;

	spin_lock_irqsave(&domain->lock, flags);

	addr = __map_single(dev, domain->priv, paddr, size, dir, false,
			    dma_mask);
	if (addr == DMA_ERROR_CODE)
		goto out;

	domain_flush_complete(domain);

out:
	spin_unlock_irqrestore(&domain->lock, flags);

	return addr;
}
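/*
 * Illustrative sketch (not part of this driver): a PCI driver reaches
 * map_page()/unmap_page() above through the generic DMA API. The device,
 * 'buf_page' and the error handling shown are hypothetical.
 *
 *	dma_addr_t handle;
 *
 *	handle = dma_map_page(&pdev->dev, buf_page, 0, PAGE_SIZE,
 *			      DMA_TO_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, handle))
 *		return -ENOMEM;
 *	(program the device with 'handle')
 *	dma_unmap_page(&pdev->dev, handle, PAGE_SIZE, DMA_TO_DEVICE);
 */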
/*
 * The exported unmap_single function for dma_ops.
 */
static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
		       enum dma_data_direction dir, struct dma_attrs *attrs)
{
	unsigned long flags;
	struct protection_domain *domain;

	INC_STATS_COUNTER(cnt_unmap_single);

	domain = get_domain(dev);
	if (IS_ERR(domain))
		return;

	spin_lock_irqsave(&domain->lock, flags);

	__unmap_single(domain->priv, dma_addr, size, dir);

	domain_flush_complete(domain);

	spin_unlock_irqrestore(&domain->lock, flags);
}
/*
 * The exported map_sg function for dma_ops (handles scatter-gather
 * lists).
 */
static int map_sg(struct device *dev, struct scatterlist *sglist,
		  int nelems, enum dma_data_direction dir,
		  struct dma_attrs *attrs)
{
	unsigned long flags;
	struct protection_domain *domain;
	int i;
	struct scatterlist *s;
	phys_addr_t paddr;
	int mapped_elems = 0;
	u64 dma_mask;

	INC_STATS_COUNTER(cnt_map_sg);

	domain = get_domain(dev);
	if (IS_ERR(domain))
		return 0;

	dma_mask = *dev->dma_mask;

	spin_lock_irqsave(&domain->lock, flags);

	for_each_sg(sglist, s, nelems, i) {
		paddr = sg_phys(s);

		s->dma_address = __map_single(dev, domain->priv,
					      paddr, s->length, dir, false,
					      dma_mask);

		if (s->dma_address) {
			s->dma_length = s->length;
			mapped_elems++;
		} else
			goto unmap;
	}

	domain_flush_complete(domain);

out:
	spin_unlock_irqrestore(&domain->lock, flags);

	return mapped_elems;

unmap:
	for_each_sg(sglist, s, mapped_elems, i) {
		if (s->dma_address)
			__unmap_single(domain->priv, s->dma_address,
				       s->dma_length, dir);
		s->dma_address = s->dma_length = 0;
	}

	mapped_elems = 0;

	goto out;
}
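/*
 * Illustrative sketch (not part of this driver): how a driver typically
 * ends up in map_sg()/unmap_sg() above. 'sgt' is a hypothetical, already
 * populated struct sg_table.
 *
 *	int nents;
 *
 *	nents = dma_map_sg(&pdev->dev, sgt->sgl, sgt->nents, DMA_FROM_DEVICE);
 *	if (!nents)
 *		return -ENOMEM;
 *	(hand the mapped scatterlist to the device)
 *	dma_unmap_sg(&pdev->dev, sgt->sgl, sgt->nents, DMA_FROM_DEVICE);
 */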
/*
 * The exported unmap_sg function for dma_ops (handles scatter-gather
 * lists).
 */
static void unmap_sg(struct device *dev, struct scatterlist *sglist,
		     int nelems, enum dma_data_direction dir,
		     struct dma_attrs *attrs)
{
	unsigned long flags;
	struct protection_domain *domain;
	struct scatterlist *s;
	int i;

	INC_STATS_COUNTER(cnt_unmap_sg);

	domain = get_domain(dev);
	if (IS_ERR(domain))
		return;

	spin_lock_irqsave(&domain->lock, flags);

	for_each_sg(sglist, s, nelems, i) {
		__unmap_single(domain->priv, s->dma_address,
			       s->dma_length, dir);
		s->dma_address = s->dma_length = 0;
	}

	domain_flush_complete(domain);

	spin_unlock_irqrestore(&domain->lock, flags);
}
/*
 * The exported alloc_coherent function for dma_ops.
 */
static void *alloc_coherent(struct device *dev, size_t size,
			    dma_addr_t *dma_addr, gfp_t flag,
			    struct dma_attrs *attrs)
{
	u64 dma_mask = dev->coherent_dma_mask;
	struct protection_domain *domain;
	unsigned long flags;
	struct page *page;

	INC_STATS_COUNTER(cnt_alloc_coherent);

	domain = get_domain(dev);
	if (PTR_ERR(domain) == -EINVAL) {
		page = alloc_pages(flag, get_order(size));
		*dma_addr = page_to_phys(page);
		return page_address(page);
	} else if (IS_ERR(domain))
		return NULL;

	size	  = PAGE_ALIGN(size);
	dma_mask  = dev->coherent_dma_mask;
	flag     &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);

	page = alloc_pages(flag | __GFP_NOWARN, get_order(size));
	if (!page) {
		if (!(flag & __GFP_WAIT))
			return NULL;

		page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
						 get_order(size));
		if (!page)
			return NULL;
	}

	if (!dma_mask)
		dma_mask = *dev->dma_mask;

	spin_lock_irqsave(&domain->lock, flags);

	*dma_addr = __map_single(dev, domain->priv, page_to_phys(page),
				 size, DMA_BIDIRECTIONAL, true, dma_mask);

	if (*dma_addr == DMA_ERROR_CODE) {
		spin_unlock_irqrestore(&domain->lock, flags);
		goto out_free;
	}

	domain_flush_complete(domain);

	spin_unlock_irqrestore(&domain->lock, flags);

	return page_address(page);

out_free:

	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
		__free_pages(page, get_order(size));

	return NULL;
}
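/*
 * Illustrative sketch (not part of this driver): allocating a coherent
 * buffer that ends up in alloc_coherent()/free_coherent() above. Names are
 * hypothetical.
 *
 *	void *cpu_addr;
 *	dma_addr_t dma_handle;
 *
 *	cpu_addr = dma_alloc_coherent(&pdev->dev, SZ_4K, &dma_handle,
 *				      GFP_KERNEL);
 *	if (!cpu_addr)
 *		return -ENOMEM;
 *	(use the buffer)
 *	dma_free_coherent(&pdev->dev, SZ_4K, cpu_addr, dma_handle);
 */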
/*
 * The exported free_coherent function for dma_ops.
 */
static void free_coherent(struct device *dev, size_t size,
			  void *virt_addr, dma_addr_t dma_addr,
			  struct dma_attrs *attrs)
{
	struct protection_domain *domain;
	unsigned long flags;
	struct page *page;

	INC_STATS_COUNTER(cnt_free_coherent);

	page = virt_to_page(virt_addr);
	size = PAGE_ALIGN(size);

	domain = get_domain(dev);
	if (IS_ERR(domain))
		goto free_mem;

	spin_lock_irqsave(&domain->lock, flags);

	__unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);

	domain_flush_complete(domain);

	spin_unlock_irqrestore(&domain->lock, flags);

free_mem:
	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
		__free_pages(page, get_order(size));
}
/*
 * This function is called by the DMA layer to find out if we can handle a
 * particular device. It is part of the dma_ops.
 */
static int amd_iommu_dma_supported(struct device *dev, u64 mask)
{
	return check_device(dev);
}

static struct dma_map_ops amd_iommu_dma_ops = {
	.alloc		= alloc_coherent,
	.free		= free_coherent,
	.map_page	= map_page,
	.unmap_page	= unmap_page,
	.map_sg		= map_sg,
	.unmap_sg	= unmap_sg,
	.dma_supported	= amd_iommu_dma_supported,
};
int __init amd_iommu_init_api(void)
{
	return bus_set_iommu(&pci_bus_type, &amd_iommu_ops);
}

int __init amd_iommu_init_dma_ops(void)
{
	amd_iommu_stats_init();

	if (amd_iommu_unmap_flush)
		pr_info("AMD-Vi: IO/TLB flush on unmap enabled\n");
	else
		pr_info("AMD-Vi: Lazy IO/TLB flushing enabled\n");

	return 0;
}
/*****************************************************************************
 *
 * The following functions belong to the exported interface of AMD IOMMU
 *
 * This interface allows access to lower level functions of the IOMMU
 * like protection domain handling and assignment of devices to domains
 * which is not possible with the dma_ops interface.
 *
 *****************************************************************************/
static void cleanup_domain(struct protection_domain *domain)
{
	struct iommu_dev_data *entry;
	unsigned long flags;

	write_lock_irqsave(&amd_iommu_devtable_lock, flags);

	while (!list_empty(&domain->dev_list)) {
		entry = list_first_entry(&domain->dev_list,
					 struct iommu_dev_data, list);
		__detach_device(entry);
	}

	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
}
static void protection_domain_free(struct protection_domain *domain)
{
	if (!domain)
		return;

	del_domain_from_list(domain);

	if (domain->id)
		domain_id_free(domain->id);

	kfree(domain);
}

static struct protection_domain *protection_domain_alloc(void)
{
	struct protection_domain *domain;

	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	if (!domain)
		return NULL;

	spin_lock_init(&domain->lock);
	mutex_init(&domain->api_lock);
	domain->id = domain_id_alloc();
	if (!domain->id)
		goto out_err;
	INIT_LIST_HEAD(&domain->dev_list);

	add_domain_to_list(domain);

	return domain;

out_err:
	kfree(domain);

	return NULL;
}

static int alloc_passthrough_domain(void)
{
	if (pt_domain != NULL)
		return 0;

	/* allocate passthrough domain */
	pt_domain = protection_domain_alloc();
	if (!pt_domain)
		return -ENOMEM;

	pt_domain->mode = PAGE_MODE_NONE;

	return 0;
}
static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
{
	struct protection_domain *pdomain;
	struct dma_ops_domain *dma_domain;

	switch (type) {
	case IOMMU_DOMAIN_UNMANAGED:
		pdomain = protection_domain_alloc();
		if (!pdomain)
			return NULL;

		pdomain->mode    = PAGE_MODE_3_LEVEL;
		pdomain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
		if (!pdomain->pt_root) {
			protection_domain_free(pdomain);
			return NULL;
		}

		pdomain->domain.geometry.aperture_start = 0;
		pdomain->domain.geometry.aperture_end   = ~0ULL;
		pdomain->domain.geometry.force_aperture = true;

		break;
	case IOMMU_DOMAIN_DMA:
		dma_domain = dma_ops_domain_alloc();
		if (!dma_domain) {
			pr_err("AMD-Vi: Failed to allocate\n");
			return NULL;
		}
		pdomain = &dma_domain->domain;
		break;
	case IOMMU_DOMAIN_IDENTITY:
		pdomain = protection_domain_alloc();
		if (!pdomain)
			return NULL;

		pdomain->mode = PAGE_MODE_NONE;
		break;
	default:
		return NULL;
	}

	return &pdomain->domain;
}
static void amd_iommu_domain_free(struct iommu_domain *dom)
{
	struct protection_domain *domain;

	if (!dom)
		return;

	domain = to_pdomain(dom);

	if (domain->dev_cnt > 0)
		cleanup_domain(domain);

	BUG_ON(domain->dev_cnt != 0);

	if (domain->mode != PAGE_MODE_NONE)
		free_pagetable(domain);

	if (domain->flags & PD_IOMMUV2_MASK)
		free_gcr3_table(domain);

	protection_domain_free(domain);
}
static void amd_iommu_detach_device(struct iommu_domain *dom,
				    struct device *dev)
{
	struct iommu_dev_data *dev_data = dev->archdata.iommu;
	struct amd_iommu *iommu;
	u16 devid;

	if (!check_device(dev))
		return;

	devid = get_device_id(dev);

	if (dev_data->domain != NULL)
		detach_device(dev);

	iommu = amd_iommu_rlookup_table[devid];
	if (!iommu)
		return;

	iommu_completion_wait(iommu);
}
static int amd_iommu_attach_device(struct iommu_domain *dom,
				   struct device *dev)
{
	struct protection_domain *domain = to_pdomain(dom);
	struct iommu_dev_data *dev_data;
	struct amd_iommu *iommu;
	int ret;

	if (!check_device(dev))
		return -EINVAL;

	dev_data = dev->archdata.iommu;

	iommu = amd_iommu_rlookup_table[dev_data->devid];
	if (!iommu)
		return -EINVAL;

	if (dev_data->domain)
		detach_device(dev);

	ret = attach_device(dev, domain);

	iommu_completion_wait(iommu);

	return ret;
}
static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
			 phys_addr_t paddr, size_t page_size, int iommu_prot)
{
	struct protection_domain *domain = to_pdomain(dom);
	int prot = 0;
	int ret;

	if (domain->mode == PAGE_MODE_NONE)
		return -EINVAL;

	if (iommu_prot & IOMMU_READ)
		prot |= IOMMU_PROT_IR;
	if (iommu_prot & IOMMU_WRITE)
		prot |= IOMMU_PROT_IW;

	mutex_lock(&domain->api_lock);
	ret = iommu_map_page(domain, iova, paddr, prot, page_size);
	mutex_unlock(&domain->api_lock);

	return ret;
}
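/*
 * Illustrative sketch (not part of this driver): the generic IOMMU API path
 * that ends in amd_iommu_map()/amd_iommu_unmap(). The domain, iova and paddr
 * are hypothetical and error handling is omitted.
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map(dom, iova, paddr, SZ_2M, IOMMU_READ | IOMMU_WRITE);
 *	(device performs DMA to iova)
 *	iommu_unmap(dom, iova, SZ_2M);
 *	iommu_detach_device(dom, &pdev->dev);
 *	iommu_domain_free(dom);
 */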
static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
			      size_t page_size)
{
	struct protection_domain *domain = to_pdomain(dom);
	size_t unmap_size;

	if (domain->mode == PAGE_MODE_NONE)
		return -EINVAL;

	mutex_lock(&domain->api_lock);
	unmap_size = iommu_unmap_page(domain, iova, page_size);
	mutex_unlock(&domain->api_lock);

	domain_flush_tlb_pde(domain);

	return unmap_size;
}
static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
					  dma_addr_t iova)
{
	struct protection_domain *domain = to_pdomain(dom);
	unsigned long offset_mask, pte_pgsize;
	u64 *pte, __pte;

	if (domain->mode == PAGE_MODE_NONE)
		return iova;

	pte = fetch_pte(domain, iova, &pte_pgsize);

	if (!pte || !IOMMU_PTE_PRESENT(*pte))
		return 0;

	offset_mask = pte_pgsize - 1;
	__pte	    = *pte & PM_ADDR_MASK;

	return (__pte & ~offset_mask) | (iova & offset_mask);
}
static bool amd_iommu_capable(enum iommu_cap cap)
{
	switch (cap) {
	case IOMMU_CAP_CACHE_COHERENCY:
		return true;
	case IOMMU_CAP_INTR_REMAP:
		return (irq_remapping_enabled == 1);
	case IOMMU_CAP_NOEXEC:
		return false;
	}

	return false;
}
static void amd_iommu_get_dm_regions(struct device *dev,
				     struct list_head *head)
{
	struct unity_map_entry *entry;
	u16 devid;

	devid = get_device_id(dev);

	list_for_each_entry(entry, &amd_iommu_unity_map, list) {
		struct iommu_dm_region *region;

		if (devid < entry->devid_start || devid > entry->devid_end)
			continue;

		region = kzalloc(sizeof(*region), GFP_KERNEL);
		if (!region) {
			pr_err("Out of memory allocating dm-regions for %s\n",
				dev_name(dev));
			return;
		}

		region->start = entry->address_start;
		region->length = entry->address_end - entry->address_start;
		if (entry->prot & IOMMU_PROT_IR)
			region->prot |= IOMMU_READ;
		if (entry->prot & IOMMU_PROT_IW)
			region->prot |= IOMMU_WRITE;

		list_add_tail(&region->list, head);
	}
}

static void amd_iommu_put_dm_regions(struct device *dev,
				     struct list_head *head)
{
	struct iommu_dm_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list)
		kfree(entry);
}
static const struct iommu_ops amd_iommu_ops = {
	.capable	= amd_iommu_capable,
	.domain_alloc	= amd_iommu_domain_alloc,
	.domain_free	= amd_iommu_domain_free,
	.attach_dev	= amd_iommu_attach_device,
	.detach_dev	= amd_iommu_detach_device,
	.map		= amd_iommu_map,
	.unmap		= amd_iommu_unmap,
	.map_sg		= default_iommu_map_sg,
	.iova_to_phys	= amd_iommu_iova_to_phys,
	.add_device	= amd_iommu_add_device,
	.remove_device	= amd_iommu_remove_device,
	.get_dm_regions	= amd_iommu_get_dm_regions,
	.put_dm_regions	= amd_iommu_put_dm_regions,
	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
};
/*****************************************************************************
 *
 * The next functions do a basic initialization of IOMMU for pass through
 * mode
 *
 * In passthrough mode the IOMMU is initialized and enabled but not used for
 * DMA-API translation.
 *
 *****************************************************************************/

int __init amd_iommu_init_passthrough(void)
{
	struct iommu_dev_data *dev_data;
	struct pci_dev *dev = NULL;
	int ret;

	ret = alloc_passthrough_domain();
	if (ret)
		return ret;

	for_each_pci_dev(dev) {
		if (!check_device(&dev->dev))
			continue;

		dev_data = get_dev_data(&dev->dev);
		dev_data->passthrough = true;

		attach_device(&dev->dev, pt_domain);
	}

	amd_iommu_stats_init();

	pr_info("AMD-Vi: Initialized for Passthrough Mode\n");

	return 0;
}
/* IOMMUv2 specific functions */
int amd_iommu_register_ppr_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&ppr_notifier, nb);
}
EXPORT_SYMBOL(amd_iommu_register_ppr_notifier);

int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&ppr_notifier, nb);
}
EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier);
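/*
 * Illustrative sketch (not part of this driver): how a PPR consumer such as
 * the amd_iommu_v2 driver hooks into this notifier chain. The callback name
 * and its handling are hypothetical.
 *
 *	static int my_ppr_notifier(struct notifier_block *nb,
 *				   unsigned long e, void *data)
 *	{
 *		struct amd_iommu_fault *fault = data;
 *		(queue work to handle the fault, then)
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_ppr_nb = {
 *		.notifier_call = my_ppr_notifier,
 *	};
 *
 *	amd_iommu_register_ppr_notifier(&my_ppr_nb);
 */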
void amd_iommu_domain_direct_map(struct iommu_domain *dom)
{
	struct protection_domain *domain = to_pdomain(dom);
	unsigned long flags;

	spin_lock_irqsave(&domain->lock, flags);

	/* Update data structure */
	domain->mode    = PAGE_MODE_NONE;
	domain->updated = true;

	/* Make changes visible to IOMMUs */
	update_domain(domain);

	/* Page-table is not visible to IOMMU anymore, so free it */
	free_pagetable(domain);

	spin_unlock_irqrestore(&domain->lock, flags);
}
EXPORT_SYMBOL(amd_iommu_domain_direct_map);
int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
{
	struct protection_domain *domain = to_pdomain(dom);
	unsigned long flags;
	int levels, ret;

	if (pasids <= 0 || pasids > (PASID_MASK + 1))
		return -EINVAL;

	/* Number of GCR3 table levels required */
	for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9)
		levels += 1;

	if (levels > amd_iommu_max_glx_val)
		return -EINVAL;

	spin_lock_irqsave(&domain->lock, flags);

	/*
	 * Save us all sanity checks whether devices already in the
	 * domain support IOMMUv2. Just force that the domain has no
	 * devices attached when it is switched into IOMMUv2 mode.
	 */
	ret = -EBUSY;
	if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK)
		goto out;

	ret = -ENOMEM;
	domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC);
	if (domain->gcr3_tbl == NULL)
		goto out;

	domain->glx      = levels;
	domain->flags   |= PD_IOMMUV2_MASK;
	domain->updated  = true;

	update_domain(domain);

	ret = 0;

out:
	spin_unlock_irqrestore(&domain->lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_domain_enable_v2);
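/*
 * Illustrative sketch (not part of this driver): the rough sequence a
 * v2-capable consumer (such as the amd_iommu_v2 driver) follows to switch a
 * domain into IOMMUv2 mode and install a per-PASID page table. 'dom',
 * 'pasids', 'pasid' and 'cr3' are hypothetical.
 *
 *	amd_iommu_domain_direct_map(dom);
 *	if (amd_iommu_domain_enable_v2(dom, pasids))
 *		return -EINVAL;
 *	amd_iommu_domain_set_gcr3(dom, pasid, cr3);
 *	(later)
 *	amd_iommu_domain_clear_gcr3(dom, pasid);
 */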
static int __flush_pasid(struct protection_domain *domain, int pasid,
			 u64 address, bool size)
{
	struct iommu_dev_data *dev_data;
	struct iommu_cmd cmd;
	int i, ret;

	if (!(domain->flags & PD_IOMMUV2_MASK))
		return -EINVAL;

	build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size);

	/*
	 * IOMMU TLB needs to be flushed before Device TLB to
	 * prevent device TLB refill from IOMMU TLB
	 */
	for (i = 0; i < amd_iommus_present; ++i) {
		if (domain->dev_iommu[i] == 0)
			continue;

		ret = iommu_queue_command(amd_iommus[i], &cmd);
		if (ret != 0)
			goto out;
	}

	/* Wait until IOMMU TLB flushes are complete */
	domain_flush_complete(domain);

	/* Now flush device TLBs */
	list_for_each_entry(dev_data, &domain->dev_list, list) {
		struct amd_iommu *iommu;
		int qdep;

		BUG_ON(!dev_data->ats.enabled);

		qdep  = dev_data->ats.qdep;
		iommu = amd_iommu_rlookup_table[dev_data->devid];

		build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid,
				      qdep, address, size);

		ret = iommu_queue_command(iommu, &cmd);
		if (ret != 0)
			goto out;
	}

	/* Wait until all device TLBs are flushed */
	domain_flush_complete(domain);

	ret = 0;

out:
	return ret;
}
static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid,
				  u64 address)
{
	INC_STATS_COUNTER(invalidate_iotlb);

	return __flush_pasid(domain, pasid, address, false);
}

int amd_iommu_flush_page(struct iommu_domain *dom, int pasid,
			 u64 address)
{
	struct protection_domain *domain = to_pdomain(dom);
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&domain->lock, flags);
	ret = __amd_iommu_flush_page(domain, pasid, address);
	spin_unlock_irqrestore(&domain->lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_flush_page);
static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid)
{
	INC_STATS_COUNTER(invalidate_iotlb_all);

	return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
			     true);
}

int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid)
{
	struct protection_domain *domain = to_pdomain(dom);
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&domain->lock, flags);
	ret = __amd_iommu_flush_tlb(domain, pasid);
	spin_unlock_irqrestore(&domain->lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_flush_tlb);
static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc)
{
	int index;
	u64 *pte;

	while (true) {

		index = (pasid >> (9 * level)) & 0x1ff;
		pte   = &root[index];

		if (level == 0)
			break;

		if (!(*pte & GCR3_VALID)) {
			if (!alloc)
				return NULL;

			root = (void *)get_zeroed_page(GFP_ATOMIC);
			if (root == NULL)
				return NULL;

			*pte = __pa(root) | GCR3_VALID;
		}

		root = __va(*pte & PAGE_MASK);

		level -= 1;
	}

	return pte;
}
static int __set_gcr3(struct protection_domain *domain, int pasid,
		      unsigned long cr3)
{
	u64 *pte;

	if (domain->mode != PAGE_MODE_NONE)
		return -EINVAL;

	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
	if (pte == NULL)
		return -ENOMEM;

	*pte = (cr3 & PAGE_MASK) | GCR3_VALID;

	return __amd_iommu_flush_tlb(domain, pasid);
}

static int __clear_gcr3(struct protection_domain *domain, int pasid)
{
	u64 *pte;

	if (domain->mode != PAGE_MODE_NONE)
		return -EINVAL;

	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
	if (pte == NULL)
		return 0;

	*pte = 0;

	return __amd_iommu_flush_tlb(domain, pasid);
}
int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid,
			      unsigned long cr3)
{
	struct protection_domain *domain = to_pdomain(dom);
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&domain->lock, flags);
	ret = __set_gcr3(domain, pasid, cr3);
	spin_unlock_irqrestore(&domain->lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_domain_set_gcr3);

int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid)
{
	struct protection_domain *domain = to_pdomain(dom);
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&domain->lock, flags);
	ret = __clear_gcr3(domain, pasid);
	spin_unlock_irqrestore(&domain->lock, flags);

	return ret;
}
EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3);
int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid,
			   int status, int tag)
{
	struct iommu_dev_data *dev_data;
	struct amd_iommu *iommu;
	struct iommu_cmd cmd;

	INC_STATS_COUNTER(complete_ppr);

	dev_data = get_dev_data(&pdev->dev);
	iommu    = amd_iommu_rlookup_table[dev_data->devid];

	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
			   tag, dev_data->pri_tlp);

	return iommu_queue_command(iommu, &cmd);
}
EXPORT_SYMBOL(amd_iommu_complete_ppr);
struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev)
{
	struct protection_domain *pdomain;

	pdomain = get_domain(&pdev->dev);
	if (IS_ERR(pdomain))
		return NULL;

	/* Only return IOMMUv2 domains */
	if (!(pdomain->flags & PD_IOMMUV2_MASK))
		return NULL;

	return &pdomain->domain;
}
EXPORT_SYMBOL(amd_iommu_get_v2_domain);

void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum)
{
	struct iommu_dev_data *dev_data;

	if (!amd_iommu_v2_supported())
		return;

	dev_data = get_dev_data(&pdev->dev);
	dev_data->errata |= (1 << erratum);
}
EXPORT_SYMBOL(amd_iommu_enable_device_erratum);
int amd_iommu_device_info(struct pci_dev *pdev,
			  struct amd_iommu_device_info *info)
{
	int max_pasids;
	int pos;

	if (pdev == NULL || info == NULL)
		return -EINVAL;

	if (!amd_iommu_v2_supported())
		return -EINVAL;

	memset(info, 0, sizeof(*info));

	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS);
	if (pos)
		info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;

	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
	if (pos)
		info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;

	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
	if (pos) {
		int features;

		max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1));
		max_pasids = min(max_pasids, (1 << 20));

		info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
		info->max_pasids = min(pci_max_pasids(pdev), max_pasids);

		features = pci_pasid_features(pdev);
		if (features & PCI_PASID_CAP_EXEC)
			info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
		if (features & PCI_PASID_CAP_PRIV)
			info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
	}

	return 0;
}
EXPORT_SYMBOL(amd_iommu_device_info);
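/*
 * Illustrative sketch (not part of this driver): querying a device's
 * ATS/PRI/PASID capabilities before enabling IOMMUv2 features for it.
 * 'pasid_limit' is a hypothetical caller-side variable.
 *
 *	struct amd_iommu_device_info info;
 *
 *	if (amd_iommu_device_info(pdev, &info))
 *		return -ENODEV;
 *	if (!(info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP))
 *		return -EINVAL;
 *	pasid_limit = min(pasid_limit, info.max_pasids);
 */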
#ifdef CONFIG_IRQ_REMAP

/*****************************************************************************
 *
 * Interrupt Remapping Implementation
 *
 *****************************************************************************/

#define DTE_IRQ_PHYS_ADDR_MASK	(((1ULL << 45)-1) << 6)
#define DTE_IRQ_REMAP_INTCTL	(2ULL << 60)
#define DTE_IRQ_TABLE_LEN	(8ULL << 1)
#define DTE_IRQ_REMAP_ENABLE	1ULL
static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
{
	u64 dte;

	dte	= amd_iommu_dev_table[devid].data[2];
	dte	&= ~DTE_IRQ_PHYS_ADDR_MASK;
	dte	|= virt_to_phys(table->table);
	dte	|= DTE_IRQ_REMAP_INTCTL;
	dte	|= DTE_IRQ_TABLE_LEN;
	dte	|= DTE_IRQ_REMAP_ENABLE;

	amd_iommu_dev_table[devid].data[2] = dte;
}
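/*
 * Worked example (illustrative): with an interrupt remapping table whose
 * backing page sits at physical address 0x1234000, the resulting DTE[2] is
 *
 *	0x1234000		table base address
 *	| DTE_IRQ_REMAP_INTCTL	IntCtl = 2, use the remapping table
 *	| DTE_IRQ_TABLE_LEN	table length field (MAX_IRQS_PER_TABLE entries)
 *	| DTE_IRQ_REMAP_ENABLE	IV bit, interrupt remapping enabled
 */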
#define IRTE_ALLOCATED (~1U)

static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic)
{
	struct irq_remap_table *table = NULL;
	struct amd_iommu *iommu;
	unsigned long flags;
	u16 alias;

	write_lock_irqsave(&amd_iommu_devtable_lock, flags);

	iommu = amd_iommu_rlookup_table[devid];
	if (!iommu)
		goto out_unlock;

	table = irq_lookup_table[devid];
	if (table)
		goto out;

	alias = amd_iommu_alias_table[devid];
	table = irq_lookup_table[alias];
	if (table) {
		irq_lookup_table[devid] = table;
		set_dte_irq_entry(devid, table);
		iommu_flush_dte(iommu, devid);
		goto out;
	}

	/* Nothing there yet, allocate new irq remapping table */
	table = kzalloc(sizeof(*table), GFP_ATOMIC);
	if (!table)
		goto out;

	/* Initialize table spin-lock */
	spin_lock_init(&table->lock);

	if (ioapic)
		/* Keep the first 32 indexes free for IOAPIC interrupts */
		table->min_index = 32;

	table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC);
	if (!table->table) {
		kfree(table);
		table = NULL;
		goto out;
	}

	memset(table->table, 0, MAX_IRQS_PER_TABLE * sizeof(u32));

	if (ioapic) {
		int i;

		for (i = 0; i < 32; ++i)
			table->table[i] = IRTE_ALLOCATED;
	}

	irq_lookup_table[devid] = table;
	set_dte_irq_entry(devid, table);
	iommu_flush_dte(iommu, devid);
	if (devid != alias) {
		irq_lookup_table[alias] = table;
		set_dte_irq_entry(alias, table);
		iommu_flush_dte(iommu, alias);
	}

out:
	iommu_completion_wait(iommu);

out_unlock:
	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);

	return table;
}
static int alloc_irq_index(struct irq_cfg *cfg, u16 devid, int count)
{
	struct irq_remap_table *table;
	unsigned long flags;
	int index, c;

	table = get_irq_table(devid, false);
	if (!table)
		return -ENODEV;

	spin_lock_irqsave(&table->lock, flags);

	/* Scan table for free entries */
	for (c = 0, index = table->min_index;
	     index < MAX_IRQS_PER_TABLE;
	     ++index) {
		if (table->table[index] == 0)
			c += 1;
		else
			c = 0;

		if (c == count) {
			struct irq_2_irte *irte_info;

			for (; c != 0; --c)
				table->table[index - c + 1] = IRTE_ALLOCATED;

			index -= count - 1;

			irte_info        = &cfg->irq_2_irte;
			irte_info->devid = devid;
			irte_info->index = index;

			goto out;
		}
	}

	index = -ENOSPC;

out:
	spin_unlock_irqrestore(&table->lock, flags);

	return index;
}
static int get_irte(u16 devid, int index, union irte *irte)
{
	struct irq_remap_table *table;
	unsigned long flags;

	table = get_irq_table(devid, false);
	if (!table)
		return -ENOMEM;

	spin_lock_irqsave(&table->lock, flags);
	irte->val = table->table[index];
	spin_unlock_irqrestore(&table->lock, flags);

	return 0;
}

static int modify_irte(u16 devid, int index, union irte irte)
{
	struct irq_remap_table *table;
	struct amd_iommu *iommu;
	unsigned long flags;

	iommu = amd_iommu_rlookup_table[devid];
	if (iommu == NULL)
		return -EINVAL;

	table = get_irq_table(devid, false);
	if (!table)
		return -ENOMEM;

	spin_lock_irqsave(&table->lock, flags);
	table->table[index] = irte.val;
	spin_unlock_irqrestore(&table->lock, flags);

	iommu_flush_irt(iommu, devid);
	iommu_completion_wait(iommu);

	return 0;
}

static void free_irte(u16 devid, int index)
{
	struct irq_remap_table *table;
	struct amd_iommu *iommu;
	unsigned long flags;

	iommu = amd_iommu_rlookup_table[devid];
	if (iommu == NULL)
		return;

	table = get_irq_table(devid, false);
	if (!table)
		return;

	spin_lock_irqsave(&table->lock, flags);
	table->table[index] = 0;
	spin_unlock_irqrestore(&table->lock, flags);

	iommu_flush_irt(iommu, devid);
	iommu_completion_wait(iommu);
}
static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
			      unsigned int destination, int vector,
			      struct io_apic_irq_attr *attr)
{
	struct irq_remap_table *table;
	struct irq_2_irte *irte_info;
	struct irq_cfg *cfg;
	union irte irte;
	int ioapic_id;
	int index;
	int devid;
	int ret;

	cfg = irq_cfg(irq);
	if (!cfg)
		return -EINVAL;

	irte_info = &cfg->irq_2_irte;
	ioapic_id = mpc_ioapic_id(attr->ioapic);
	devid     = get_ioapic_devid(ioapic_id);

	if (devid < 0)
		return devid;

	table = get_irq_table(devid, true);
	if (table == NULL)
		return -ENODEV;

	index = attr->ioapic_pin;

	/* Setup IRQ remapping info */
	irte_info->devid = devid;
	irte_info->index = index;

	/* Setup IRTE for IOMMU */
	irte.val		= 0;
	irte.fields.vector	= vector;
	irte.fields.int_type	= apic->irq_delivery_mode;
	irte.fields.destination	= destination;
	irte.fields.dm		= apic->irq_dest_mode;
	irte.fields.valid	= 1;

	ret = modify_irte(devid, index, irte);
	if (ret)
		return ret;

	/* Setup IOAPIC entry */
	memset(entry, 0, sizeof(*entry));

	entry->vector		= index;
	entry->mask		= 0;
	entry->trigger		= attr->trigger;
	entry->polarity		= attr->polarity;

	/*
	 * Mask level triggered irqs.
	 */
	if (attr->trigger)
		entry->mask = 1;

	return 0;
}
static int set_affinity(struct irq_data *data, const struct cpumask *mask,
			bool force)
{
	struct irq_2_irte *irte_info;
	unsigned int dest, irq;
	struct irq_cfg *cfg;
	union irte irte;
	int err;

	if (!config_enabled(CONFIG_SMP))
		return -1;

	cfg       = irqd_cfg(data);
	irq       = data->irq;
	irte_info = &cfg->irq_2_irte;

	if (!cpumask_intersects(mask, cpu_online_mask))
		return -EINVAL;

	if (get_irte(irte_info->devid, irte_info->index, &irte))
		return -EBUSY;

	if (assign_irq_vector(irq, cfg, mask))
		return -EBUSY;

	err = apic->cpu_mask_to_apicid_and(cfg->domain, mask, &dest);
	if (err) {
		if (assign_irq_vector(irq, cfg, data->affinity))
			pr_err("AMD-Vi: Failed to recover vector for irq %d\n", irq);
		return err;
	}

	irte.fields.vector      = cfg->vector;
	irte.fields.destination = dest;

	modify_irte(irte_info->devid, irte_info->index, irte);

	if (cfg->move_in_progress)
		send_cleanup_vector(cfg);

	cpumask_copy(data->affinity, mask);

	return 0;
}
static int free_irq(int irq)
{
	struct irq_2_irte *irte_info;
	struct irq_cfg *cfg;

	cfg = irq_cfg(irq);
	if (!cfg)
		return -EINVAL;

	irte_info = &cfg->irq_2_irte;

	free_irte(irte_info->devid, irte_info->index);

	return 0;
}
static void compose_msi_msg(struct pci_dev *pdev,
			    unsigned int irq, unsigned int dest,
			    struct msi_msg *msg, u8 hpet_id)
{
	struct irq_2_irte *irte_info;
	struct irq_cfg *cfg;
	union irte irte;

	cfg = irq_cfg(irq);
	if (!cfg)
		return;

	irte_info = &cfg->irq_2_irte;

	irte.val		= 0;
	irte.fields.vector	= cfg->vector;
	irte.fields.int_type	= apic->irq_delivery_mode;
	irte.fields.destination	= dest;
	irte.fields.dm		= apic->irq_dest_mode;
	irte.fields.valid	= 1;

	modify_irte(irte_info->devid, irte_info->index, irte);

	msg->address_hi = MSI_ADDR_BASE_HI;
	msg->address_lo = MSI_ADDR_BASE_LO;
	msg->data       = irte_info->index;
}
static int msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec)
{
	struct irq_cfg *cfg;
	int index;
	u16 devid;

	if (!pdev)
		return -EINVAL;

	cfg = irq_cfg(irq);
	if (!cfg)
		return -EINVAL;

	devid = get_device_id(&pdev->dev);
	index = alloc_irq_index(cfg, devid, nvec);

	return index < 0 ? MAX_IRQS_PER_TABLE : index;
}

static int msi_setup_irq(struct pci_dev *pdev, unsigned int irq,
			 int index, int offset)
{
	struct irq_2_irte *irte_info;
	struct irq_cfg *cfg;
	u16 devid;

	if (!pdev)
		return -EINVAL;

	cfg = irq_cfg(irq);
	if (!cfg)
		return -EINVAL;

	if (index >= MAX_IRQS_PER_TABLE)
		return 0;

	devid		= get_device_id(&pdev->dev);
	irte_info	= &cfg->irq_2_irte;

	irte_info->devid = devid;
	irte_info->index = index + offset;

	return 0;
}

static int alloc_hpet_msi(unsigned int irq, unsigned int id)
{
	struct irq_2_irte *irte_info;
	struct irq_cfg *cfg;
	int index, devid;

	cfg = irq_cfg(irq);
	if (!cfg)
		return -EINVAL;

	irte_info = &cfg->irq_2_irte;
	devid     = get_hpet_devid(id);
	if (devid < 0)
		return devid;

	index = alloc_irq_index(cfg, devid, 1);
	if (index < 0)
		return index;

	irte_info->devid = devid;
	irte_info->index = index;

	return 0;
}
struct irq_remap_ops amd_iommu_irq_ops = {
	.prepare		= amd_iommu_prepare,
	.enable			= amd_iommu_enable,
	.disable		= amd_iommu_disable,
	.reenable		= amd_iommu_reenable,
	.enable_faulting	= amd_iommu_enable_faulting,
	.setup_ioapic_entry	= setup_ioapic_entry,
	.set_affinity		= set_affinity,
	.free_irq		= free_irq,
	.compose_msi_msg	= compose_msi_msg,
	.msi_alloc_irq		= msi_alloc_irq,
	.msi_setup_irq		= msi_setup_irq,
	.alloc_hpet_msi		= alloc_hpet_msi,
};
#endif