/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * Copyright (C) 2006-2008 Intel Corporation
 * Author: Ashok Raj <ashok.raj@intel.com>
 * Author: Shaohua Li <shaohua.li@intel.com>
 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 * Author: Fenghua Yu <fenghua.yu@intel.com>
 */
#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "irq_remapping.h"
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
#define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
#define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))
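/*
 * Illustrative note (added commentary, not from the original source): with
 * the default 48-bit guest address width and VTD_PAGE_SHIFT == 12,
 *
 *	__DOMAIN_MAX_PFN(48)  == (1ULL << 36) - 1 == 0xfffffffff
 *	__DOMAIN_MAX_ADDR(48) == (1ULL << 48) - 1 == 0xffffffffffff
 *
 * and DOMAIN_MAX_PFN(48) clamps the PFN to ULONG_MAX, which only matters on
 * 32-bit builds where unsigned long is narrower than u64.
 */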
/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
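/*
 * Illustrative note (added commentary): ~0xFFFUL has every bit from bit 12
 * upwards set, so the IOMMU core treats any power-of-two size >= 4KiB as
 * supported and may hand us regions of any such size; the driver then builds
 * the mapping out of 4KiB PTEs (or 2MiB/1GiB superpages where possible).
 */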
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}
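/*
 * Worked example (added commentary): SAGAW value 2 selects a 4-level page
 * table, so agaw_to_level(2) == 4 and agaw_to_width(2) == min(30 + 2 * 9, 64)
 * == 48 bits, and width_to_agaw(48) round-trips back to 2.  Each level
 * translates LEVEL_STRIDE == 9 bits of the DMA PFN.
 */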
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
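/*
 * Illustrative note (added commentary): on x86 both PAGE_SHIFT and
 * VTD_PAGE_SHIFT are 12, so these conversions are shifts by zero; they only
 * do real work on a configuration whose MM page is larger than the 4KiB
 * VT-d page (e.g. a hypothetical 64KiB MM page would give a shift of 4).
 */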
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/*
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
	u64	val;
	u64	rsvd1;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
static inline bool root_present(struct root_entry *root)
{
	return (root->val & 1);
}
static inline void set_root_present(struct root_entry *root)
{
	root->val |= 1;
}
static inline void set_root_value(struct root_entry *root, unsigned long value)
{
	root->val |= value & VTD_PAGE_MASK;
}

static inline struct context_entry *
get_context_addr_from_root(struct root_entry *root)
{
	return (struct context_entry *)
		(root_present(root) ? phys_to_virt(
		root->val & VTD_PAGE_MASK) :
		NULL);
}
/*
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
	u64 lo;
	u64 hi;
};

static inline bool context_present(struct context_entry *context)
{
	return (context->lo & 1);
}
static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
/*
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
	pte->val = 0;
}

static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
	return pte->val & VTD_PAGE_MASK;
#else
	/* Must have a full atomic 64-bit read */
	return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

static inline bool dma_pte_present(struct dma_pte *pte)
{
	return (pte->val & 3) != 0;
}

static inline bool dma_pte_superpage(struct dma_pte *pte)
{
	return (pte->val & (1 << 7));
}

static inline int first_pte_in_page(struct dma_pte *pte)
{
	return !((unsigned long)pte & ~VTD_PAGE_MASK);
}
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/* devices under the same p2p bridge are owned in one domain */
#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)

/* domain represents a virtual machine; more than one device
 * across iommus may be owned in one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 1)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 2)

/* define the limit of IOMMUs supported in each domain */
#ifdef CONFIG_X86
# define	IOMMU_UNITS_SUPPORTED	MAX_IO_APICS
#else
# define	IOMMU_UNITS_SUPPORTED	64
#endif
struct dmar_domain {
	int	id;			/* domain id */
	int	nid;			/* node id */
	DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
					/* bitmap of iommus this domain uses*/

	struct list_head devices;	/* all devices' list */
	struct iova_domain iovad;	/* iova's that belong to this domain */

	struct dma_pte	*pgd;		/* virtual address */
	int		gaw;		/* max guest address width */

	/* adjusted guest address width, 0 is level 2 30-bit */
	int		agaw;

	int		flags;		/* flags to find out type of domain */

	int		iommu_coherency;/* indicate coherency of iommu access */
	int		iommu_snooping; /* indicate snooping control feature*/
	int		iommu_count;	/* reference count of iommu */
	int		iommu_superpage;/* Level of superpages supported:
					   0 == 4KiB (no superpages), 1 == 2MiB,
					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
	spinlock_t	iommu_lock;	/* protect iommu set in domain */
	u64		max_addr;	/* maximum mapped address */
};
/* PCI domain-device relationship */
struct device_domain_info {
	struct list_head link;	/* link to domain siblings */
	struct list_head global; /* link to global list */
	int segment;		/* PCI domain */
	u8 bus;			/* PCI bus number */
	u8 devfn;		/* PCI devfn number */
	struct pci_dev *dev;	/* it's NULL for PCIe-to-PCI bridge */
	struct intel_iommu *iommu; /* IOMMU used by this device */
	struct dmar_domain *domain; /* pointer to domain */
};

struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct pci_dev __rcu **devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct pci_dev __rcu **devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
static void flush_unmaps_timeout(unsigned long data);

static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);

#define HIGH_WATER_MARK 250
struct deferred_flush_tables {
	int next;
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

static long list_size;
static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void domain_remove_one_dev_info(struct dmar_domain *domain,
				       struct pci_dev *pdev);
static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
					   struct pci_dev *pdev);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

static struct iommu_ops intel_iommu_ops;
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			printk(KERN_INFO "Intel-IOMMU: enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			printk(KERN_INFO "Intel-IOMMU: disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			printk(KERN_INFO
				"Intel-IOMMU: disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			printk(KERN_INFO
				"Intel-IOMMU: Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable supported super page\n");
			intel_iommu_superpage = 0;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;

static inline void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void * alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

struct iova *alloc_iova_mem(void)
{
	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
}

void free_iova_mem(struct iova *iova)
{
	kmem_cache_free(iommu_iova_cache, iova);
}
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, so use a default agaw and fall
 * back to a smaller supported agaw for iommus that don't support the
 * default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
/* This function only returns a single iommu in a domain */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
	BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);

	iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}
596 static void domain_update_iommu_coherency(struct dmar_domain
*domain
)
600 i
= find_first_bit(domain
->iommu_bmp
, g_num_of_iommus
);
602 domain
->iommu_coherency
= i
< g_num_of_iommus
? 1 : 0;
604 for_each_set_bit(i
, domain
->iommu_bmp
, g_num_of_iommus
) {
605 if (!ecap_coherent(g_iommus
[i
]->ecap
)) {
606 domain
->iommu_coherency
= 0;
612 static void domain_update_iommu_snooping(struct dmar_domain
*domain
)
616 domain
->iommu_snooping
= 1;
618 for_each_set_bit(i
, domain
->iommu_bmp
, g_num_of_iommus
) {
619 if (!ecap_sc_support(g_iommus
[i
]->ecap
)) {
620 domain
->iommu_snooping
= 0;
626 static void domain_update_iommu_superpage(struct dmar_domain
*domain
)
628 struct dmar_drhd_unit
*drhd
;
629 struct intel_iommu
*iommu
= NULL
;
632 if (!intel_iommu_superpage
) {
633 domain
->iommu_superpage
= 0;
637 /* set iommu_superpage to the smallest common denominator */
639 for_each_active_iommu(iommu
, drhd
) {
640 mask
&= cap_super_page_val(iommu
->cap
);
647 domain
->iommu_superpage
= fls(mask
);
650 /* Some capabilities may be different across iommus */
651 static void domain_update_iommu_cap(struct dmar_domain
*domain
)
653 domain_update_iommu_coherency(domain
);
654 domain_update_iommu_snooping(domain
);
655 domain_update_iommu_superpage(domain
);
658 static struct intel_iommu
*device_to_iommu(int segment
, u8 bus
, u8 devfn
)
660 struct dmar_drhd_unit
*drhd
= NULL
;
661 struct intel_iommu
*iommu
;
666 for_each_active_iommu(iommu
, drhd
) {
667 if (segment
!= drhd
->segment
)
670 for_each_active_dev_scope(drhd
->devices
,
671 drhd
->devices_cnt
, i
, dev
) {
672 if (dev
->bus
->number
== bus
&& dev
->devfn
== devfn
)
674 if (dev
->subordinate
&&
675 dev
->subordinate
->number
<= bus
&&
676 dev
->subordinate
->busn_res
.end
>= bus
)
680 if (drhd
->include_all
)
690 static void domain_flush_cache(struct dmar_domain
*domain
,
691 void *addr
, int size
)
693 if (!domain
->iommu_coherency
)
694 clflush_cache_range(addr
, size
);
697 /* Gets context entry for a given bus and devfn */
698 static struct context_entry
* device_to_context_entry(struct intel_iommu
*iommu
,
701 struct root_entry
*root
;
702 struct context_entry
*context
;
703 unsigned long phy_addr
;
706 spin_lock_irqsave(&iommu
->lock
, flags
);
707 root
= &iommu
->root_entry
[bus
];
708 context
= get_context_addr_from_root(root
);
710 context
= (struct context_entry
*)
711 alloc_pgtable_page(iommu
->node
);
713 spin_unlock_irqrestore(&iommu
->lock
, flags
);
716 __iommu_flush_cache(iommu
, (void *)context
, CONTEXT_SIZE
);
717 phy_addr
= virt_to_phys((void *)context
);
718 set_root_value(root
, phy_addr
);
719 set_root_present(root
);
720 __iommu_flush_cache(iommu
, root
, sizeof(*root
));
722 spin_unlock_irqrestore(&iommu
->lock
, flags
);
723 return &context
[devfn
];
726 static int device_context_mapped(struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
728 struct root_entry
*root
;
729 struct context_entry
*context
;
733 spin_lock_irqsave(&iommu
->lock
, flags
);
734 root
= &iommu
->root_entry
[bus
];
735 context
= get_context_addr_from_root(root
);
740 ret
= context_present(&context
[devfn
]);
742 spin_unlock_irqrestore(&iommu
->lock
, flags
);
746 static void clear_context_table(struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
748 struct root_entry
*root
;
749 struct context_entry
*context
;
752 spin_lock_irqsave(&iommu
->lock
, flags
);
753 root
= &iommu
->root_entry
[bus
];
754 context
= get_context_addr_from_root(root
);
756 context_clear_entry(&context
[devfn
]);
757 __iommu_flush_cache(iommu
, &context
[devfn
], \
760 spin_unlock_irqrestore(&iommu
->lock
, flags
);
763 static void free_context_table(struct intel_iommu
*iommu
)
765 struct root_entry
*root
;
768 struct context_entry
*context
;
770 spin_lock_irqsave(&iommu
->lock
, flags
);
771 if (!iommu
->root_entry
) {
774 for (i
= 0; i
< ROOT_ENTRY_NR
; i
++) {
775 root
= &iommu
->root_entry
[i
];
776 context
= get_context_addr_from_root(root
);
778 free_pgtable_page(context
);
780 free_pgtable_page(iommu
->root_entry
);
781 iommu
->root_entry
= NULL
;
783 spin_unlock_irqrestore(&iommu
->lock
, flags
);
786 static struct dma_pte
*pfn_to_dma_pte(struct dmar_domain
*domain
,
787 unsigned long pfn
, int *target_level
)
789 int addr_width
= agaw_to_width(domain
->agaw
) - VTD_PAGE_SHIFT
;
790 struct dma_pte
*parent
, *pte
= NULL
;
791 int level
= agaw_to_level(domain
->agaw
);
794 BUG_ON(!domain
->pgd
);
796 if (addr_width
< BITS_PER_LONG
&& pfn
>> addr_width
)
797 /* Address beyond IOMMU's addressing capabilities. */
800 parent
= domain
->pgd
;
805 offset
= pfn_level_offset(pfn
, level
);
806 pte
= &parent
[offset
];
807 if (!*target_level
&& (dma_pte_superpage(pte
) || !dma_pte_present(pte
)))
809 if (level
== *target_level
)
812 if (!dma_pte_present(pte
)) {
815 tmp_page
= alloc_pgtable_page(domain
->nid
);
820 domain_flush_cache(domain
, tmp_page
, VTD_PAGE_SIZE
);
821 pteval
= ((uint64_t)virt_to_dma_pfn(tmp_page
) << VTD_PAGE_SHIFT
) | DMA_PTE_READ
| DMA_PTE_WRITE
;
822 if (cmpxchg64(&pte
->val
, 0ULL, pteval
)) {
823 /* Someone else set it while we were thinking; use theirs. */
824 free_pgtable_page(tmp_page
);
827 domain_flush_cache(domain
, pte
, sizeof(*pte
));
833 parent
= phys_to_virt(dma_pte_addr(pte
));
838 *target_level
= level
;
844 /* return address's pte at specific level */
845 static struct dma_pte
*dma_pfn_level_pte(struct dmar_domain
*domain
,
847 int level
, int *large_page
)
849 struct dma_pte
*parent
, *pte
= NULL
;
850 int total
= agaw_to_level(domain
->agaw
);
853 parent
= domain
->pgd
;
854 while (level
<= total
) {
855 offset
= pfn_level_offset(pfn
, total
);
856 pte
= &parent
[offset
];
860 if (!dma_pte_present(pte
)) {
865 if (pte
->val
& DMA_PTE_LARGE_PAGE
) {
870 parent
= phys_to_virt(dma_pte_addr(pte
));
/* clear last level pte, a tlb flush should follow */
877 static void dma_pte_clear_range(struct dmar_domain
*domain
,
878 unsigned long start_pfn
,
879 unsigned long last_pfn
)
881 int addr_width
= agaw_to_width(domain
->agaw
) - VTD_PAGE_SHIFT
;
882 unsigned int large_page
= 1;
883 struct dma_pte
*first_pte
, *pte
;
885 BUG_ON(addr_width
< BITS_PER_LONG
&& start_pfn
>> addr_width
);
886 BUG_ON(addr_width
< BITS_PER_LONG
&& last_pfn
>> addr_width
);
887 BUG_ON(start_pfn
> last_pfn
);
889 /* we don't need lock here; nobody else touches the iova range */
892 first_pte
= pte
= dma_pfn_level_pte(domain
, start_pfn
, 1, &large_page
);
894 start_pfn
= align_to_level(start_pfn
+ 1, large_page
+ 1);
899 start_pfn
+= lvl_to_nr_pages(large_page
);
901 } while (start_pfn
<= last_pfn
&& !first_pte_in_page(pte
));
903 domain_flush_cache(domain
, first_pte
,
904 (void *)pte
- (void *)first_pte
);
906 } while (start_pfn
&& start_pfn
<= last_pfn
);
909 static void dma_pte_free_level(struct dmar_domain
*domain
, int level
,
910 struct dma_pte
*pte
, unsigned long pfn
,
911 unsigned long start_pfn
, unsigned long last_pfn
)
913 pfn
= max(start_pfn
, pfn
);
914 pte
= &pte
[pfn_level_offset(pfn
, level
)];
917 unsigned long level_pfn
;
918 struct dma_pte
*level_pte
;
920 if (!dma_pte_present(pte
) || dma_pte_superpage(pte
))
923 level_pfn
= pfn
& level_mask(level
- 1);
924 level_pte
= phys_to_virt(dma_pte_addr(pte
));
927 dma_pte_free_level(domain
, level
- 1, level_pte
,
928 level_pfn
, start_pfn
, last_pfn
);
930 /* If range covers entire pagetable, free it */
931 if (!(start_pfn
> level_pfn
||
932 last_pfn
< level_pfn
+ level_size(level
) - 1)) {
934 domain_flush_cache(domain
, pte
, sizeof(*pte
));
935 free_pgtable_page(level_pte
);
938 pfn
+= level_size(level
);
939 } while (!first_pte_in_page(++pte
) && pfn
<= last_pfn
);
942 /* free page table pages. last level pte should already be cleared */
943 static void dma_pte_free_pagetable(struct dmar_domain
*domain
,
944 unsigned long start_pfn
,
945 unsigned long last_pfn
)
947 int addr_width
= agaw_to_width(domain
->agaw
) - VTD_PAGE_SHIFT
;
949 BUG_ON(addr_width
< BITS_PER_LONG
&& start_pfn
>> addr_width
);
950 BUG_ON(addr_width
< BITS_PER_LONG
&& last_pfn
>> addr_width
);
951 BUG_ON(start_pfn
> last_pfn
);
953 /* We don't need lock here; nobody else touches the iova range */
954 dma_pte_free_level(domain
, agaw_to_level(domain
->agaw
),
955 domain
->pgd
, 0, start_pfn
, last_pfn
);
958 if (start_pfn
== 0 && last_pfn
== DOMAIN_MAX_PFN(domain
->gaw
)) {
959 free_pgtable_page(domain
->pgd
);
965 static int iommu_alloc_root_entry(struct intel_iommu
*iommu
)
967 struct root_entry
*root
;
970 root
= (struct root_entry
*)alloc_pgtable_page(iommu
->node
);
974 __iommu_flush_cache(iommu
, root
, ROOT_SIZE
);
976 spin_lock_irqsave(&iommu
->lock
, flags
);
977 iommu
->root_entry
= root
;
978 spin_unlock_irqrestore(&iommu
->lock
, flags
);
983 static void iommu_set_root_entry(struct intel_iommu
*iommu
)
989 addr
= iommu
->root_entry
;
991 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
992 dmar_writeq(iommu
->reg
+ DMAR_RTADDR_REG
, virt_to_phys(addr
));
994 writel(iommu
->gcmd
| DMA_GCMD_SRTP
, iommu
->reg
+ DMAR_GCMD_REG
);
	/* Make sure hardware completes it */
997 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
998 readl
, (sts
& DMA_GSTS_RTPS
), sts
);
1000 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1003 static void iommu_flush_write_buffer(struct intel_iommu
*iommu
)
1008 if (!rwbf_quirk
&& !cap_rwbf(iommu
->cap
))
1011 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1012 writel(iommu
->gcmd
| DMA_GCMD_WBF
, iommu
->reg
+ DMAR_GCMD_REG
);
	/* Make sure hardware completes it */
1015 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1016 readl
, (!(val
& DMA_GSTS_WBFS
)), val
);
1018 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
/* return value determines if we need a write buffer flush */
1022 static void __iommu_flush_context(struct intel_iommu
*iommu
,
1023 u16 did
, u16 source_id
, u8 function_mask
,
1030 case DMA_CCMD_GLOBAL_INVL
:
1031 val
= DMA_CCMD_GLOBAL_INVL
;
1033 case DMA_CCMD_DOMAIN_INVL
:
1034 val
= DMA_CCMD_DOMAIN_INVL
|DMA_CCMD_DID(did
);
1036 case DMA_CCMD_DEVICE_INVL
:
1037 val
= DMA_CCMD_DEVICE_INVL
|DMA_CCMD_DID(did
)
1038 | DMA_CCMD_SID(source_id
) | DMA_CCMD_FM(function_mask
);
1043 val
|= DMA_CCMD_ICC
;
1045 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1046 dmar_writeq(iommu
->reg
+ DMAR_CCMD_REG
, val
);
	/* Make sure hardware completes it */
1049 IOMMU_WAIT_OP(iommu
, DMAR_CCMD_REG
,
1050 dmar_readq
, (!(val
& DMA_CCMD_ICC
)), val
);
1052 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
/* return value determines if we need a write buffer flush */
1056 static void __iommu_flush_iotlb(struct intel_iommu
*iommu
, u16 did
,
1057 u64 addr
, unsigned int size_order
, u64 type
)
1059 int tlb_offset
= ecap_iotlb_offset(iommu
->ecap
);
1060 u64 val
= 0, val_iva
= 0;
1064 case DMA_TLB_GLOBAL_FLUSH
:
		/* global flush doesn't need to set IVA_REG */
1066 val
= DMA_TLB_GLOBAL_FLUSH
|DMA_TLB_IVT
;
1068 case DMA_TLB_DSI_FLUSH
:
1069 val
= DMA_TLB_DSI_FLUSH
|DMA_TLB_IVT
|DMA_TLB_DID(did
);
1071 case DMA_TLB_PSI_FLUSH
:
1072 val
= DMA_TLB_PSI_FLUSH
|DMA_TLB_IVT
|DMA_TLB_DID(did
);
1073 /* Note: always flush non-leaf currently */
1074 val_iva
= size_order
| addr
;
1079 /* Note: set drain read/write */
1082 * This is probably to be super secure.. Looks like we can
1083 * ignore it without any impact.
1085 if (cap_read_drain(iommu
->cap
))
1086 val
|= DMA_TLB_READ_DRAIN
;
1088 if (cap_write_drain(iommu
->cap
))
1089 val
|= DMA_TLB_WRITE_DRAIN
;
1091 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1092 /* Note: Only uses first TLB reg currently */
1094 dmar_writeq(iommu
->reg
+ tlb_offset
, val_iva
);
1095 dmar_writeq(iommu
->reg
+ tlb_offset
+ 8, val
);
	/* Make sure hardware completes it */
1098 IOMMU_WAIT_OP(iommu
, tlb_offset
+ 8,
1099 dmar_readq
, (!(val
& DMA_TLB_IVT
)), val
);
1101 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1103 /* check IOTLB invalidation granularity */
1104 if (DMA_TLB_IAIG(val
) == 0)
1105 printk(KERN_ERR
"IOMMU: flush IOTLB failed\n");
1106 if (DMA_TLB_IAIG(val
) != DMA_TLB_IIRG(type
))
1107 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1108 (unsigned long long)DMA_TLB_IIRG(type
),
1109 (unsigned long long)DMA_TLB_IAIG(val
));
1112 static struct device_domain_info
*iommu_support_dev_iotlb(
1113 struct dmar_domain
*domain
, int segment
, u8 bus
, u8 devfn
)
1116 unsigned long flags
;
1117 struct device_domain_info
*info
;
1118 struct intel_iommu
*iommu
= device_to_iommu(segment
, bus
, devfn
);
1120 if (!ecap_dev_iotlb_support(iommu
->ecap
))
1126 spin_lock_irqsave(&device_domain_lock
, flags
);
1127 list_for_each_entry(info
, &domain
->devices
, link
)
1128 if (info
->bus
== bus
&& info
->devfn
== devfn
) {
1132 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1134 if (!found
|| !info
->dev
)
1137 if (!pci_find_ext_capability(info
->dev
, PCI_EXT_CAP_ID_ATS
))
1140 if (!dmar_find_matched_atsr_unit(info
->dev
))
1143 info
->iommu
= iommu
;
1148 static void iommu_enable_dev_iotlb(struct device_domain_info
*info
)
1153 pci_enable_ats(info
->dev
, VTD_PAGE_SHIFT
);
1156 static void iommu_disable_dev_iotlb(struct device_domain_info
*info
)
1158 if (!info
->dev
|| !pci_ats_enabled(info
->dev
))
1161 pci_disable_ats(info
->dev
);
1164 static void iommu_flush_dev_iotlb(struct dmar_domain
*domain
,
1165 u64 addr
, unsigned mask
)
1168 unsigned long flags
;
1169 struct device_domain_info
*info
;
1171 spin_lock_irqsave(&device_domain_lock
, flags
);
1172 list_for_each_entry(info
, &domain
->devices
, link
) {
1173 if (!info
->dev
|| !pci_ats_enabled(info
->dev
))
1176 sid
= info
->bus
<< 8 | info
->devfn
;
1177 qdep
= pci_ats_queue_depth(info
->dev
);
1178 qi_flush_dev_iotlb(info
->iommu
, sid
, qdep
, addr
, mask
);
1180 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1183 static void iommu_flush_iotlb_psi(struct intel_iommu
*iommu
, u16 did
,
1184 unsigned long pfn
, unsigned int pages
, int map
)
1186 unsigned int mask
= ilog2(__roundup_pow_of_two(pages
));
1187 uint64_t addr
= (uint64_t)pfn
<< VTD_PAGE_SHIFT
;
	/*
	 * Fallback to domain selective flush if no PSI support or the size
	 * is too big.
	 * PSI requires the page size to be 2 ^ x, and the base address is
	 * naturally aligned to the size.  (A worked example follows this
	 * function.)
	 */
1197 if (!cap_pgsel_inv(iommu
->cap
) || mask
> cap_max_amask_val(iommu
->cap
))
1198 iommu
->flush
.flush_iotlb(iommu
, did
, 0, 0,
1201 iommu
->flush
.flush_iotlb(iommu
, did
, addr
, mask
,
1205 * In caching mode, changes of pages from non-present to present require
1206 * flush. However, device IOTLB doesn't need to be flushed in this case.
1208 if (!cap_caching_mode(iommu
->cap
) || !map
)
1209 iommu_flush_dev_iotlb(iommu
->domains
[did
], addr
, mask
);
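/*
 * Worked example (added commentary, assuming 4KiB VT-d pages): flushing
 * pages == 5 gives mask == ilog2(__roundup_pow_of_two(5)) == 3, i.e. an
 * 8-page (32KiB) naturally aligned invalidation; if mask exceeds
 * cap_max_amask_val() the function above falls back to a domain-selective
 * flush instead.
 */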
1212 static void iommu_disable_protect_mem_regions(struct intel_iommu
*iommu
)
1215 unsigned long flags
;
1217 raw_spin_lock_irqsave(&iommu
->register_lock
, flags
);
1218 pmen
= readl(iommu
->reg
+ DMAR_PMEN_REG
);
1219 pmen
&= ~DMA_PMEN_EPM
;
1220 writel(pmen
, iommu
->reg
+ DMAR_PMEN_REG
);
1222 /* wait for the protected region status bit to clear */
1223 IOMMU_WAIT_OP(iommu
, DMAR_PMEN_REG
,
1224 readl
, !(pmen
& DMA_PMEN_PRS
), pmen
);
1226 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flags
);
1229 static int iommu_enable_translation(struct intel_iommu
*iommu
)
1232 unsigned long flags
;
1234 raw_spin_lock_irqsave(&iommu
->register_lock
, flags
);
1235 iommu
->gcmd
|= DMA_GCMD_TE
;
1236 writel(iommu
->gcmd
, iommu
->reg
+ DMAR_GCMD_REG
);
	/* Make sure hardware completes it */
1239 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1240 readl
, (sts
& DMA_GSTS_TES
), sts
);
1242 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flags
);
1246 static int iommu_disable_translation(struct intel_iommu
*iommu
)
1251 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1252 iommu
->gcmd
&= ~DMA_GCMD_TE
;
1253 writel(iommu
->gcmd
, iommu
->reg
+ DMAR_GCMD_REG
);
	/* Make sure hardware completes it */
1256 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1257 readl
, (!(sts
& DMA_GSTS_TES
)), sts
);
1259 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1264 static int iommu_init_domains(struct intel_iommu
*iommu
)
1266 unsigned long ndomains
;
1267 unsigned long nlongs
;
1269 ndomains
= cap_ndoms(iommu
->cap
);
1270 pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1271 iommu
->seq_id
, ndomains
);
1272 nlongs
= BITS_TO_LONGS(ndomains
);
1274 spin_lock_init(&iommu
->lock
);
1276 /* TBD: there might be 64K domains,
1277 * consider other allocation for future chip
1279 iommu
->domain_ids
= kcalloc(nlongs
, sizeof(unsigned long), GFP_KERNEL
);
1280 if (!iommu
->domain_ids
) {
1281 pr_err("IOMMU%d: allocating domain id array failed\n",
1285 iommu
->domains
= kcalloc(ndomains
, sizeof(struct dmar_domain
*),
1287 if (!iommu
->domains
) {
1288 pr_err("IOMMU%d: allocating domain array failed\n",
1290 kfree(iommu
->domain_ids
);
1291 iommu
->domain_ids
= NULL
;
	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain id 0, hence we need to pre-allocate it.
	 */
1299 if (cap_caching_mode(iommu
->cap
))
1300 set_bit(0, iommu
->domain_ids
);
1304 static void free_dmar_iommu(struct intel_iommu
*iommu
)
1306 struct dmar_domain
*domain
;
1308 unsigned long flags
;
1310 if ((iommu
->domains
) && (iommu
->domain_ids
)) {
1311 for_each_set_bit(i
, iommu
->domain_ids
, cap_ndoms(iommu
->cap
)) {
1313 * Domain id 0 is reserved for invalid translation
1314 * if hardware supports caching mode.
1316 if (cap_caching_mode(iommu
->cap
) && i
== 0)
1319 domain
= iommu
->domains
[i
];
1320 clear_bit(i
, iommu
->domain_ids
);
1322 spin_lock_irqsave(&domain
->iommu_lock
, flags
);
1323 count
= --domain
->iommu_count
;
1324 spin_unlock_irqrestore(&domain
->iommu_lock
, flags
);
1326 domain_exit(domain
);
1330 if (iommu
->gcmd
& DMA_GCMD_TE
)
1331 iommu_disable_translation(iommu
);
1333 kfree(iommu
->domains
);
1334 kfree(iommu
->domain_ids
);
1335 iommu
->domains
= NULL
;
1336 iommu
->domain_ids
= NULL
;
1338 g_iommus
[iommu
->seq_id
] = NULL
;
1340 /* free context mapping */
1341 free_context_table(iommu
);
1344 static struct dmar_domain
*alloc_domain(bool vm
)
1346 /* domain id for virtual machine, it won't be set in context */
1347 static atomic_t vm_domid
= ATOMIC_INIT(0);
1348 struct dmar_domain
*domain
;
1350 domain
= alloc_domain_mem();
1355 domain
->iommu_count
= 0;
1356 memset(domain
->iommu_bmp
, 0, sizeof(domain
->iommu_bmp
));
1358 spin_lock_init(&domain
->iommu_lock
);
1359 INIT_LIST_HEAD(&domain
->devices
);
1361 domain
->id
= atomic_inc_return(&vm_domid
);
1362 domain
->flags
= DOMAIN_FLAG_VIRTUAL_MACHINE
;
1368 static int iommu_attach_domain(struct dmar_domain
*domain
,
1369 struct intel_iommu
*iommu
)
1372 unsigned long ndomains
;
1373 unsigned long flags
;
1375 ndomains
= cap_ndoms(iommu
->cap
);
1377 spin_lock_irqsave(&iommu
->lock
, flags
);
1379 num
= find_first_zero_bit(iommu
->domain_ids
, ndomains
);
1380 if (num
>= ndomains
) {
1381 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1382 printk(KERN_ERR
"IOMMU: no free domain ids\n");
1387 domain
->iommu_count
++;
1388 set_bit(num
, iommu
->domain_ids
);
1389 set_bit(iommu
->seq_id
, domain
->iommu_bmp
);
1390 iommu
->domains
[num
] = domain
;
1391 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1396 static void iommu_detach_domain(struct dmar_domain
*domain
,
1397 struct intel_iommu
*iommu
)
1399 unsigned long flags
;
1402 spin_lock_irqsave(&iommu
->lock
, flags
);
1403 ndomains
= cap_ndoms(iommu
->cap
);
1404 for_each_set_bit(num
, iommu
->domain_ids
, ndomains
) {
1405 if (iommu
->domains
[num
] == domain
) {
1406 clear_bit(num
, iommu
->domain_ids
);
1407 iommu
->domains
[num
] = NULL
;
1411 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1414 static struct iova_domain reserved_iova_list
;
1415 static struct lock_class_key reserved_rbtree_key
;
1417 static int dmar_init_reserved_ranges(void)
1419 struct pci_dev
*pdev
= NULL
;
1423 init_iova_domain(&reserved_iova_list
, DMA_32BIT_PFN
);
1425 lockdep_set_class(&reserved_iova_list
.iova_rbtree_lock
,
1426 &reserved_rbtree_key
);
1428 /* IOAPIC ranges shouldn't be accessed by DMA */
1429 iova
= reserve_iova(&reserved_iova_list
, IOVA_PFN(IOAPIC_RANGE_START
),
1430 IOVA_PFN(IOAPIC_RANGE_END
));
1432 printk(KERN_ERR
"Reserve IOAPIC range failed\n");
1436 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1437 for_each_pci_dev(pdev
) {
1440 for (i
= 0; i
< PCI_NUM_RESOURCES
; i
++) {
1441 r
= &pdev
->resource
[i
];
1442 if (!r
->flags
|| !(r
->flags
& IORESOURCE_MEM
))
1444 iova
= reserve_iova(&reserved_iova_list
,
1448 printk(KERN_ERR
"Reserve iova failed\n");
1456 static void domain_reserve_special_ranges(struct dmar_domain
*domain
)
1458 copy_reserved_iova(&reserved_iova_list
, &domain
->iovad
);
1461 static inline int guestwidth_to_adjustwidth(int gaw
)
1464 int r
= (gaw
- 12) % 9;
1475 static int domain_init(struct dmar_domain
*domain
, int guest_width
)
1477 struct intel_iommu
*iommu
;
1478 int adjust_width
, agaw
;
1479 unsigned long sagaw
;
1481 init_iova_domain(&domain
->iovad
, DMA_32BIT_PFN
);
1482 domain_reserve_special_ranges(domain
);
1484 /* calculate AGAW */
1485 iommu
= domain_get_iommu(domain
);
1486 if (guest_width
> cap_mgaw(iommu
->cap
))
1487 guest_width
= cap_mgaw(iommu
->cap
);
1488 domain
->gaw
= guest_width
;
1489 adjust_width
= guestwidth_to_adjustwidth(guest_width
);
1490 agaw
= width_to_agaw(adjust_width
);
1491 sagaw
= cap_sagaw(iommu
->cap
);
1492 if (!test_bit(agaw
, &sagaw
)) {
1493 /* hardware doesn't support it, choose a bigger one */
1494 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw
);
1495 agaw
= find_next_bit(&sagaw
, 5, agaw
);
1499 domain
->agaw
= agaw
;
1501 if (ecap_coherent(iommu
->ecap
))
1502 domain
->iommu_coherency
= 1;
1504 domain
->iommu_coherency
= 0;
1506 if (ecap_sc_support(iommu
->ecap
))
1507 domain
->iommu_snooping
= 1;
1509 domain
->iommu_snooping
= 0;
1511 domain
->iommu_superpage
= fls(cap_super_page_val(iommu
->cap
));
1512 domain
->nid
= iommu
->node
;
1514 /* always allocate the top pgd */
1515 domain
->pgd
= (struct dma_pte
*)alloc_pgtable_page(domain
->nid
);
1518 __iommu_flush_cache(iommu
, domain
->pgd
, PAGE_SIZE
);
1522 static void domain_exit(struct dmar_domain
*domain
)
1524 struct dmar_drhd_unit
*drhd
;
1525 struct intel_iommu
*iommu
;
	/* Domain 0 is reserved, so don't process it */
1531 /* Flush any lazy unmaps that may reference this domain */
1532 if (!intel_iommu_strict
)
1533 flush_unmaps_timeout(0);
1535 /* remove associated devices */
1536 domain_remove_dev_info(domain
);
1539 put_iova_domain(&domain
->iovad
);
1542 dma_pte_clear_range(domain
, 0, DOMAIN_MAX_PFN(domain
->gaw
));
1544 /* free page tables */
1545 dma_pte_free_pagetable(domain
, 0, DOMAIN_MAX_PFN(domain
->gaw
));
1547 /* clear attached or cached domains */
1549 for_each_active_iommu(iommu
, drhd
)
1550 if (domain
->flags
& DOMAIN_FLAG_VIRTUAL_MACHINE
||
1551 test_bit(iommu
->seq_id
, domain
->iommu_bmp
))
1552 iommu_detach_domain(domain
, iommu
);
1555 free_domain_mem(domain
);
1558 static int domain_context_mapping_one(struct dmar_domain
*domain
, int segment
,
1559 u8 bus
, u8 devfn
, int translation
)
1561 struct context_entry
*context
;
1562 unsigned long flags
;
1563 struct intel_iommu
*iommu
;
1564 struct dma_pte
*pgd
;
1566 unsigned long ndomains
;
1569 struct device_domain_info
*info
= NULL
;
1571 pr_debug("Set context mapping for %02x:%02x.%d\n",
1572 bus
, PCI_SLOT(devfn
), PCI_FUNC(devfn
));
1574 BUG_ON(!domain
->pgd
);
1575 BUG_ON(translation
!= CONTEXT_TT_PASS_THROUGH
&&
1576 translation
!= CONTEXT_TT_MULTI_LEVEL
);
1578 iommu
= device_to_iommu(segment
, bus
, devfn
);
1582 context
= device_to_context_entry(iommu
, bus
, devfn
);
1585 spin_lock_irqsave(&iommu
->lock
, flags
);
1586 if (context_present(context
)) {
1587 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1594 if (domain
->flags
& DOMAIN_FLAG_VIRTUAL_MACHINE
||
1595 domain
->flags
& DOMAIN_FLAG_STATIC_IDENTITY
) {
1598 /* find an available domain id for this device in iommu */
1599 ndomains
= cap_ndoms(iommu
->cap
);
1600 for_each_set_bit(num
, iommu
->domain_ids
, ndomains
) {
1601 if (iommu
->domains
[num
] == domain
) {
1609 num
= find_first_zero_bit(iommu
->domain_ids
, ndomains
);
1610 if (num
>= ndomains
) {
1611 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1612 printk(KERN_ERR
"IOMMU: no free domain ids\n");
1616 set_bit(num
, iommu
->domain_ids
);
1617 iommu
->domains
[num
] = domain
;
1621 /* Skip top levels of page tables for
1622 * iommu which has less agaw than default.
1623 * Unnecessary for PT mode.
1625 if (translation
!= CONTEXT_TT_PASS_THROUGH
) {
1626 for (agaw
= domain
->agaw
; agaw
!= iommu
->agaw
; agaw
--) {
1627 pgd
= phys_to_virt(dma_pte_addr(pgd
));
1628 if (!dma_pte_present(pgd
)) {
1629 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1636 context_set_domain_id(context
, id
);
1638 if (translation
!= CONTEXT_TT_PASS_THROUGH
) {
1639 info
= iommu_support_dev_iotlb(domain
, segment
, bus
, devfn
);
1640 translation
= info
? CONTEXT_TT_DEV_IOTLB
:
1641 CONTEXT_TT_MULTI_LEVEL
;
1644 * In pass through mode, AW must be programmed to indicate the largest
1645 * AGAW value supported by hardware. And ASR is ignored by hardware.
1647 if (unlikely(translation
== CONTEXT_TT_PASS_THROUGH
))
1648 context_set_address_width(context
, iommu
->msagaw
);
1650 context_set_address_root(context
, virt_to_phys(pgd
));
1651 context_set_address_width(context
, iommu
->agaw
);
1654 context_set_translation_type(context
, translation
);
1655 context_set_fault_enable(context
);
1656 context_set_present(context
);
1657 domain_flush_cache(domain
, context
, sizeof(*context
));
	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
1665 if (cap_caching_mode(iommu
->cap
)) {
1666 iommu
->flush
.flush_context(iommu
, 0,
1667 (((u16
)bus
) << 8) | devfn
,
1668 DMA_CCMD_MASK_NOBIT
,
1669 DMA_CCMD_DEVICE_INVL
);
1670 iommu
->flush
.flush_iotlb(iommu
, domain
->id
, 0, 0, DMA_TLB_DSI_FLUSH
);
1672 iommu_flush_write_buffer(iommu
);
1674 iommu_enable_dev_iotlb(info
);
1675 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1677 spin_lock_irqsave(&domain
->iommu_lock
, flags
);
1678 if (!test_and_set_bit(iommu
->seq_id
, domain
->iommu_bmp
)) {
1679 domain
->iommu_count
++;
1680 if (domain
->iommu_count
== 1)
1681 domain
->nid
= iommu
->node
;
1682 domain_update_iommu_cap(domain
);
1684 spin_unlock_irqrestore(&domain
->iommu_lock
, flags
);
1689 domain_context_mapping(struct dmar_domain
*domain
, struct pci_dev
*pdev
,
1693 struct pci_dev
*tmp
, *parent
;
1695 ret
= domain_context_mapping_one(domain
, pci_domain_nr(pdev
->bus
),
1696 pdev
->bus
->number
, pdev
->devfn
,
1701 /* dependent device mapping */
1702 tmp
= pci_find_upstream_pcie_bridge(pdev
);
1705 /* Secondary interface's bus number and devfn 0 */
1706 parent
= pdev
->bus
->self
;
1707 while (parent
!= tmp
) {
1708 ret
= domain_context_mapping_one(domain
,
1709 pci_domain_nr(parent
->bus
),
1710 parent
->bus
->number
,
1711 parent
->devfn
, translation
);
1714 parent
= parent
->bus
->self
;
1716 if (pci_is_pcie(tmp
)) /* this is a PCIe-to-PCI bridge */
1717 return domain_context_mapping_one(domain
,
1718 pci_domain_nr(tmp
->subordinate
),
1719 tmp
->subordinate
->number
, 0,
1721 else /* this is a legacy PCI bridge */
1722 return domain_context_mapping_one(domain
,
1723 pci_domain_nr(tmp
->bus
),
1729 static int domain_context_mapped(struct pci_dev
*pdev
)
1732 struct pci_dev
*tmp
, *parent
;
1733 struct intel_iommu
*iommu
;
1735 iommu
= device_to_iommu(pci_domain_nr(pdev
->bus
), pdev
->bus
->number
,
1740 ret
= device_context_mapped(iommu
, pdev
->bus
->number
, pdev
->devfn
);
1743 /* dependent device mapping */
1744 tmp
= pci_find_upstream_pcie_bridge(pdev
);
1747 /* Secondary interface's bus number and devfn 0 */
1748 parent
= pdev
->bus
->self
;
1749 while (parent
!= tmp
) {
1750 ret
= device_context_mapped(iommu
, parent
->bus
->number
,
1754 parent
= parent
->bus
->self
;
1756 if (pci_is_pcie(tmp
))
1757 return device_context_mapped(iommu
, tmp
->subordinate
->number
,
1760 return device_context_mapped(iommu
, tmp
->bus
->number
,
1764 /* Returns a number of VTD pages, but aligned to MM page size */
1765 static inline unsigned long aligned_nrpages(unsigned long host_addr
,
1768 host_addr
&= ~PAGE_MASK
;
1769 return PAGE_ALIGN(host_addr
+ size
) >> VTD_PAGE_SHIFT
;
1772 /* Return largest possible superpage level for a given mapping */
1773 static inline int hardware_largepage_caps(struct dmar_domain
*domain
,
1774 unsigned long iov_pfn
,
1775 unsigned long phy_pfn
,
1776 unsigned long pages
)
1778 int support
, level
= 1;
1779 unsigned long pfnmerge
;
1781 support
= domain
->iommu_superpage
;
1783 /* To use a large page, the virtual *and* physical addresses
1784 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1785 of them will mean we have to use smaller pages. So just
1786 merge them and check both at once. */
1787 pfnmerge
= iov_pfn
| phy_pfn
;
1789 while (support
&& !(pfnmerge
& ~VTD_STRIDE_MASK
)) {
1790 pages
>>= VTD_STRIDE_SHIFT
;
1793 pfnmerge
>>= VTD_STRIDE_SHIFT
;
1800 static int __domain_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
1801 struct scatterlist
*sg
, unsigned long phys_pfn
,
1802 unsigned long nr_pages
, int prot
)
1804 struct dma_pte
*first_pte
= NULL
, *pte
= NULL
;
1805 phys_addr_t
uninitialized_var(pteval
);
1806 int addr_width
= agaw_to_width(domain
->agaw
) - VTD_PAGE_SHIFT
;
1807 unsigned long sg_res
;
1808 unsigned int largepage_lvl
= 0;
1809 unsigned long lvl_pages
= 0;
1811 BUG_ON(addr_width
< BITS_PER_LONG
&& (iov_pfn
+ nr_pages
- 1) >> addr_width
);
1813 if ((prot
& (DMA_PTE_READ
|DMA_PTE_WRITE
)) == 0)
1816 prot
&= DMA_PTE_READ
| DMA_PTE_WRITE
| DMA_PTE_SNP
;
1821 sg_res
= nr_pages
+ 1;
1822 pteval
= ((phys_addr_t
)phys_pfn
<< VTD_PAGE_SHIFT
) | prot
;
1825 while (nr_pages
> 0) {
1829 sg_res
= aligned_nrpages(sg
->offset
, sg
->length
);
1830 sg
->dma_address
= ((dma_addr_t
)iov_pfn
<< VTD_PAGE_SHIFT
) + sg
->offset
;
1831 sg
->dma_length
= sg
->length
;
1832 pteval
= page_to_phys(sg_page(sg
)) | prot
;
1833 phys_pfn
= pteval
>> VTD_PAGE_SHIFT
;
1837 largepage_lvl
= hardware_largepage_caps(domain
, iov_pfn
, phys_pfn
, sg_res
);
1839 first_pte
= pte
= pfn_to_dma_pte(domain
, iov_pfn
, &largepage_lvl
);
		/* It is a large page */
1843 if (largepage_lvl
> 1) {
1844 pteval
|= DMA_PTE_LARGE_PAGE
;
1845 /* Ensure that old small page tables are removed to make room
1846 for superpage, if they exist. */
1847 dma_pte_clear_range(domain
, iov_pfn
,
1848 iov_pfn
+ lvl_to_nr_pages(largepage_lvl
) - 1);
1849 dma_pte_free_pagetable(domain
, iov_pfn
,
1850 iov_pfn
+ lvl_to_nr_pages(largepage_lvl
) - 1);
1852 pteval
&= ~(uint64_t)DMA_PTE_LARGE_PAGE
;
1856 /* We don't need lock here, nobody else
1857 * touches the iova range
1859 tmp
= cmpxchg64_local(&pte
->val
, 0ULL, pteval
);
1861 static int dumps
= 5;
1862 printk(KERN_CRIT
"ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1863 iov_pfn
, tmp
, (unsigned long long)pteval
);
1866 debug_dma_dump_mappings(NULL
);
1871 lvl_pages
= lvl_to_nr_pages(largepage_lvl
);
1873 BUG_ON(nr_pages
< lvl_pages
);
1874 BUG_ON(sg_res
< lvl_pages
);
1876 nr_pages
-= lvl_pages
;
1877 iov_pfn
+= lvl_pages
;
1878 phys_pfn
+= lvl_pages
;
1879 pteval
+= lvl_pages
* VTD_PAGE_SIZE
;
1880 sg_res
-= lvl_pages
;
1882 /* If the next PTE would be the first in a new page, then we
1883 need to flush the cache on the entries we've just written.
1884 And then we'll need to recalculate 'pte', so clear it and
1885 let it get set again in the if (!pte) block above.
1887 If we're done (!nr_pages) we need to flush the cache too.
1889 Also if we've been setting superpages, we may need to
1890 recalculate 'pte' and switch back to smaller pages for the
1891 end of the mapping, if the trailing size is not enough to
1892 use another superpage (i.e. sg_res < lvl_pages). */
1894 if (!nr_pages
|| first_pte_in_page(pte
) ||
1895 (largepage_lvl
> 1 && sg_res
< lvl_pages
)) {
1896 domain_flush_cache(domain
, first_pte
,
1897 (void *)pte
- (void *)first_pte
);
1901 if (!sg_res
&& nr_pages
)
1907 static inline int domain_sg_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
1908 struct scatterlist
*sg
, unsigned long nr_pages
,
1911 return __domain_mapping(domain
, iov_pfn
, sg
, 0, nr_pages
, prot
);
1914 static inline int domain_pfn_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
1915 unsigned long phys_pfn
, unsigned long nr_pages
,
1918 return __domain_mapping(domain
, iov_pfn
, NULL
, phys_pfn
, nr_pages
, prot
);
1921 static void iommu_detach_dev(struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
1926 clear_context_table(iommu
, bus
, devfn
);
1927 iommu
->flush
.flush_context(iommu
, 0, 0, 0,
1928 DMA_CCMD_GLOBAL_INVL
);
1929 iommu
->flush
.flush_iotlb(iommu
, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH
);
1932 static inline void unlink_domain_info(struct device_domain_info
*info
)
1934 assert_spin_locked(&device_domain_lock
);
1935 list_del(&info
->link
);
1936 list_del(&info
->global
);
1938 info
->dev
->dev
.archdata
.iommu
= NULL
;
1941 static void domain_remove_dev_info(struct dmar_domain
*domain
)
1943 struct device_domain_info
*info
;
1944 unsigned long flags
, flags2
;
1945 struct intel_iommu
*iommu
;
1947 spin_lock_irqsave(&device_domain_lock
, flags
);
1948 while (!list_empty(&domain
->devices
)) {
1949 info
= list_entry(domain
->devices
.next
,
1950 struct device_domain_info
, link
);
1951 unlink_domain_info(info
);
1952 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1954 iommu_disable_dev_iotlb(info
);
1955 iommu
= device_to_iommu(info
->segment
, info
->bus
, info
->devfn
);
1956 iommu_detach_dev(iommu
, info
->bus
, info
->devfn
);
1958 if (domain
->flags
& DOMAIN_FLAG_VIRTUAL_MACHINE
) {
1959 iommu_detach_dependent_devices(iommu
, info
->dev
);
1960 /* clear this iommu in iommu_bmp, update iommu count
1963 spin_lock_irqsave(&domain
->iommu_lock
, flags2
);
1964 if (test_and_clear_bit(iommu
->seq_id
,
1965 domain
->iommu_bmp
)) {
1966 domain
->iommu_count
--;
1967 domain_update_iommu_cap(domain
);
1969 spin_unlock_irqrestore(&domain
->iommu_lock
, flags2
);
1972 free_devinfo_mem(info
);
1973 spin_lock_irqsave(&device_domain_lock
, flags
);
1975 spin_unlock_irqrestore(&device_domain_lock
, flags
);
/*
 * find_domain
 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
 */
1982 static struct dmar_domain
*
1983 find_domain(struct pci_dev
*pdev
)
1985 struct device_domain_info
*info
;
1987 /* No lock here, assumes no domain exit in normal case */
1988 info
= pdev
->dev
.archdata
.iommu
;
1990 return info
->domain
;
1994 static inline struct dmar_domain
*
1995 dmar_search_domain_by_dev_info(int segment
, int bus
, int devfn
)
1997 struct device_domain_info
*info
;
1999 list_for_each_entry(info
, &device_domain_list
, global
)
2000 if (info
->segment
== segment
&& info
->bus
== bus
&&
2001 info
->devfn
== devfn
)
2002 return info
->domain
;
2007 static int dmar_insert_dev_info(int segment
, int bus
, int devfn
,
2008 struct pci_dev
*dev
, struct dmar_domain
**domp
)
2010 struct dmar_domain
*found
, *domain
= *domp
;
2011 struct device_domain_info
*info
;
2012 unsigned long flags
;
2014 info
= alloc_devinfo_mem();
2018 info
->segment
= segment
;
2020 info
->devfn
= devfn
;
2022 info
->domain
= domain
;
2024 domain
->flags
|= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES
;
2026 spin_lock_irqsave(&device_domain_lock
, flags
);
2028 found
= find_domain(dev
);
2030 found
= dmar_search_domain_by_dev_info(segment
, bus
, devfn
);
2032 spin_unlock_irqrestore(&device_domain_lock
, flags
);
2033 free_devinfo_mem(info
);
2034 if (found
!= domain
) {
2035 domain_exit(domain
);
2039 list_add(&info
->link
, &domain
->devices
);
2040 list_add(&info
->global
, &device_domain_list
);
2042 dev
->dev
.archdata
.iommu
= info
;
2043 spin_unlock_irqrestore(&device_domain_lock
, flags
);
2049 /* domain is initialized */
2050 static struct dmar_domain
*get_domain_for_dev(struct pci_dev
*pdev
, int gaw
)
2052 struct dmar_domain
*domain
, *free
= NULL
;
2053 struct intel_iommu
*iommu
;
2054 struct dmar_drhd_unit
*drhd
;
2055 struct pci_dev
*dev_tmp
;
2056 unsigned long flags
;
2057 int bus
= 0, devfn
= 0;
2060 domain
= find_domain(pdev
);
2064 segment
= pci_domain_nr(pdev
->bus
);
2066 dev_tmp
= pci_find_upstream_pcie_bridge(pdev
);
2068 if (pci_is_pcie(dev_tmp
)) {
2069 bus
= dev_tmp
->subordinate
->number
;
2072 bus
= dev_tmp
->bus
->number
;
2073 devfn
= dev_tmp
->devfn
;
2075 spin_lock_irqsave(&device_domain_lock
, flags
);
2076 domain
= dmar_search_domain_by_dev_info(segment
, bus
, devfn
);
2077 spin_unlock_irqrestore(&device_domain_lock
, flags
);
	/* pcie-pci bridge already has a domain, use it */
2083 drhd
= dmar_find_matched_drhd_unit(pdev
);
2085 printk(KERN_ERR
"IOMMU: can't find DMAR for device %s\n",
2089 iommu
= drhd
->iommu
;
	/* Allocate and initialize a new domain for the device */
2092 domain
= alloc_domain(false);
2095 if (iommu_attach_domain(domain
, iommu
)) {
2096 free_domain_mem(domain
);
2100 if (domain_init(domain
, gaw
))
2103 /* register pcie-to-pci device */
2105 if (dmar_insert_dev_info(segment
, bus
, devfn
, NULL
, &domain
))
2112 if (dmar_insert_dev_info(segment
, pdev
->bus
->number
, pdev
->devfn
,
2113 pdev
, &domain
) == 0)
2118 /* recheck it here, maybe others set it */
2119 return find_domain(pdev
);
2122 static int iommu_identity_mapping
;
2123 #define IDENTMAP_ALL 1
2124 #define IDENTMAP_GFX 2
2125 #define IDENTMAP_AZALIA 4
2127 static int iommu_domain_identity_map(struct dmar_domain
*domain
,
2128 unsigned long long start
,
2129 unsigned long long end
)
2131 unsigned long first_vpfn
= start
>> VTD_PAGE_SHIFT
;
2132 unsigned long last_vpfn
= end
>> VTD_PAGE_SHIFT
;
2134 if (!reserve_iova(&domain
->iovad
, dma_to_mm_pfn(first_vpfn
),
2135 dma_to_mm_pfn(last_vpfn
))) {
2136 printk(KERN_ERR
"IOMMU: reserve iova failed\n");
2140 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2141 start
, end
, domain
->id
);
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
2146 dma_pte_clear_range(domain
, first_vpfn
, last_vpfn
);
2148 return domain_pfn_mapping(domain
, first_vpfn
, first_vpfn
,
2149 last_vpfn
- first_vpfn
+ 1,
2150 DMA_PTE_READ
|DMA_PTE_WRITE
);
2153 static int iommu_prepare_identity_map(struct pci_dev
*pdev
,
2154 unsigned long long start
,
2155 unsigned long long end
)
2157 struct dmar_domain
*domain
;
2160 domain
= get_domain_for_dev(pdev
, DEFAULT_DOMAIN_ADDRESS_WIDTH
);
2164 /* For _hardware_ passthrough, don't bother. But for software
2165 passthrough, we do it anyway -- it may indicate a memory
2166 range which is reserved in E820, so which didn't get set
2167 up to start with in si_domain */
2168 if (domain
== si_domain
&& hw_pass_through
) {
2169 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2170 pci_name(pdev
), start
, end
);
2175 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2176 pci_name(pdev
), start
, end
);
2179 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2180 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2181 dmi_get_system_info(DMI_BIOS_VENDOR
),
2182 dmi_get_system_info(DMI_BIOS_VERSION
),
2183 dmi_get_system_info(DMI_PRODUCT_VERSION
));
2188 if (end
>> agaw_to_width(domain
->agaw
)) {
2189 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2190 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2191 agaw_to_width(domain
->agaw
),
2192 dmi_get_system_info(DMI_BIOS_VENDOR
),
2193 dmi_get_system_info(DMI_BIOS_VERSION
),
2194 dmi_get_system_info(DMI_PRODUCT_VERSION
));
2199 ret
= iommu_domain_identity_map(domain
, start
, end
);
2203 /* context entry init */
2204 ret
= domain_context_mapping(domain
, pdev
, CONTEXT_TT_MULTI_LEVEL
);
2211 domain_exit(domain
);
2215 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit
*rmrr
,
2216 struct pci_dev
*pdev
)
2218 if (pdev
->dev
.archdata
.iommu
== DUMMY_DEVICE_DOMAIN_INFO
)
2220 return iommu_prepare_identity_map(pdev
, rmrr
->base_address
,
2224 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2225 static inline void iommu_prepare_isa(void)
2227 struct pci_dev
*pdev
;
2230 pdev
= pci_get_class(PCI_CLASS_BRIDGE_ISA
<< 8, NULL
);
2234 printk(KERN_INFO
"IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2235 ret
= iommu_prepare_identity_map(pdev
, 0, 16*1024*1024 - 1);
2238 printk(KERN_ERR
"IOMMU: Failed to create 0-16MiB identity map; "
2239 "floppy might not work\n");
2243 static inline void iommu_prepare_isa(void)
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2249 static int md_domain_init(struct dmar_domain
*domain
, int guest_width
);
2251 static int __init
si_domain_init(int hw
)
2253 struct dmar_drhd_unit
*drhd
;
2254 struct intel_iommu
*iommu
;
2257 si_domain
= alloc_domain(false);
2261 si_domain
->flags
= DOMAIN_FLAG_STATIC_IDENTITY
;
2263 for_each_active_iommu(iommu
, drhd
) {
2264 ret
= iommu_attach_domain(si_domain
, iommu
);
2266 domain_exit(si_domain
);
2271 if (md_domain_init(si_domain
, DEFAULT_DOMAIN_ADDRESS_WIDTH
)) {
2272 domain_exit(si_domain
);
2276 pr_debug("IOMMU: identity mapping domain is domain %d\n",
2282 for_each_online_node(nid
) {
2283 unsigned long start_pfn
, end_pfn
;
2286 for_each_mem_pfn_range(i
, nid
, &start_pfn
, &end_pfn
, NULL
) {
2287 ret
= iommu_domain_identity_map(si_domain
,
2288 PFN_PHYS(start_pfn
), PFN_PHYS(end_pfn
));
static int identity_mapping(struct pci_dev *pdev)
{
	struct device_domain_info *info;

	if (likely(!iommu_identity_mapping))
		return 0;

	info = pdev->dev.archdata.iommu;
	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
		return (info->domain == si_domain);

	return 0;
}
static int domain_add_dev_info(struct dmar_domain *domain,
			       struct pci_dev *pdev,
			       int translation)
{
	struct device_domain_info *info;
	unsigned long flags;
	int ret;

	info = alloc_devinfo_mem();
	if (!info)
		return -ENOMEM;

	info->segment = pci_domain_nr(pdev->bus);
	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->domain = domain;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	pdev->dev.archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	ret = domain_context_mapping(domain, pdev, translation);
	if (ret) {
		spin_lock_irqsave(&device_domain_lock, flags);
		unlink_domain_info(info);
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		return ret;
	}

	return 0;
}
static bool device_has_rmrr(struct pci_dev *dev)
{
	struct dmar_rmrr_unit *rmrr;
	struct pci_dev *tmp;
	int i;

	for_each_rmrr_units(rmrr) {
		/*
		 * Return TRUE if this RMRR contains the device that
		 * is passed in.
		 */
		for_each_active_dev_scope(rmrr->devices,
					  rmrr->devices_cnt, i, tmp)
			if (tmp == dev)
				return true;
	}

	return false;
}
static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
{
	/*
	 * We want to prevent any device associated with an RMRR from
	 * getting placed into the SI Domain. This is done because
	 * problems exist when devices are moved in and out of domains
	 * and their respective RMRR info is lost. We exempt USB devices
	 * from this process due to their usage of RMRRs that are known
	 * to not be needed after BIOS hand-off to OS.
	 */
	if (device_has_rmrr(pdev) &&
	    (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
		return 0;

	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
		return 1;

	if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
		return 1;

	if (!(iommu_identity_mapping & IDENTMAP_ALL))
		return 0;

	/*
	 * We want to start off with all devices in the 1:1 domain, and
	 * take them out later if we find they can't access all of memory.
	 *
	 * However, we can't do this for PCI devices behind bridges,
	 * because all PCI devices behind the same bridge will end up
	 * with the same source-id on their transactions.
	 *
	 * Practically speaking, we can't change things around for these
	 * devices at run-time, because we can't be sure there'll be no
	 * DMA transactions in flight for any of their siblings.
	 *
	 * So PCI devices (unless they're on the root bus) as well as
	 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
	 * the 1:1 domain, just in _case_ one of their siblings turns out
	 * not to be able to map all of memory.
	 */
	if (!pci_is_pcie(pdev)) {
		if (!pci_is_root_bus(pdev->bus))
			return 0;
		if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
			return 0;
	} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
		return 0;

	/*
	 * At boot time, we don't yet know if devices will be 64-bit capable.
	 * Assume that they will -- if they turn out not to be, then we can
	 * take them out of the 1:1 domain later.
	 */
	if (!startup) {
		/*
		 * If the device's dma_mask is less than the system's memory
		 * size then this is not a candidate for identity mapping.
		 */
		u64 dma_mask = pdev->dma_mask;

		if (pdev->dev.coherent_dma_mask &&
		    pdev->dev.coherent_dma_mask < dma_mask)
			dma_mask = pdev->dev.coherent_dma_mask;

		return dma_mask >= dma_get_required_mask(&pdev->dev);
	}

	return 1;
}
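
/*
 * Summary of the policy above: devices with RMRRs (other than USB class
 * devices) never go into the identity domain; Azalia audio and integrated
 * graphics are forced in when the corresponding IDENTMAP_* bits are set;
 * with IDENTMAP_ALL, PCIe endpoints and root-bus PCI devices are
 * candidates, and after boot a device additionally needs a DMA mask
 * covering all of memory to stay in the 1:1 domain.
 */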
static int __init iommu_prepare_static_identity_mapping(int hw)
{
	struct pci_dev *pdev = NULL;
	int ret;

	ret = si_domain_init(hw);
	if (ret)
		return -EFAULT;

	for_each_pci_dev(pdev) {
		if (iommu_should_identity_map(pdev, 1)) {
			ret = domain_add_dev_info(si_domain, pdev,
					hw ? CONTEXT_TT_PASS_THROUGH :
					     CONTEXT_TT_MULTI_LEVEL);
			if (ret) {
				/* device not associated with an iommu */
				if (ret == -ENODEV)
					continue;
				return ret;
			}
			pr_info("IOMMU: %s identity mapping for device %s\n",
				hw ? "hardware" : "software", pci_name(pdev));
		}
	}

	return 0;
}
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct dmar_rmrr_unit *rmrr;
	struct pci_dev *pdev;
	struct intel_iommu *iommu;
	int i, ret;

	/*
	 * for each drhd
	 *    allocate root
	 *    initialize and program root entry to not present
	 * endfor
	 */
	for_each_drhd_unit(drhd) {
		/*
		 * lock not needed as this is only incremented in the single
		 * threaded kernel __init code path all other access are read
		 * only
		 */
		if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
			g_num_of_iommus++;
			continue;
		}
		printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
			    IOMMU_UNITS_SUPPORTED);
	}

	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
			   GFP_KERNEL);
	if (!g_iommus) {
		printk(KERN_ERR "Allocating global iommu array failed\n");
		ret = -ENOMEM;
		goto error;
	}

	deferred_flush = kzalloc(g_num_of_iommus *
		sizeof(struct deferred_flush_tables), GFP_KERNEL);
	if (!deferred_flush) {
		ret = -ENOMEM;
		goto free_g_iommus;
	}

	for_each_active_iommu(iommu, drhd) {
		g_iommus[iommu->seq_id] = iommu;

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret) {
			printk(KERN_ERR "IOMMU: allocate root entry failed\n");
			goto free_iommu;
		}
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
	}

	/*
	 * Start from the sane iommu hardware state.
	 */
	for_each_active_iommu(iommu, drhd) {
		/*
		 * If the queued invalidation is already initialized by us
		 * (for example, while enabling interrupt-remapping) then
		 * we got the things already rolling from a sane state.
		 */
		if (iommu->qi)
			continue;

		/*
		 * Clear any previous faults.
		 */
		dmar_fault(-1, iommu);
		/*
		 * Disable queued invalidation if supported and already enabled
		 * before OS handover.
		 */
		dmar_disable_qi(iommu);
	}

	for_each_active_iommu(iommu, drhd) {
		if (dmar_enable_qi(iommu)) {
			/*
			 * Queued Invalidate not enabled, use Register Based
			 * Invalidate
			 */
			iommu->flush.flush_context = __iommu_flush_context;
			iommu->flush.flush_iotlb = __iommu_flush_iotlb;
			printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
			       "invalidation\n",
			       iommu->seq_id,
			       (unsigned long long)drhd->reg_base_addr);
		} else {
			iommu->flush.flush_context = qi_flush_context;
			iommu->flush.flush_iotlb = qi_flush_iotlb;
			printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
			       "invalidation\n",
			       iommu->seq_id,
			       (unsigned long long)drhd->reg_base_addr);
		}
	}

	if (iommu_pass_through)
		iommu_identity_mapping |= IDENTMAP_ALL;

#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
	iommu_identity_mapping |= IDENTMAP_GFX;
#endif

	check_tylersburg_isoch();

	/*
	 * If pass through is not set or not enabled, setup context entries for
	 * identity mappings for rmrr, gfx, and isa and may fall back to static
	 * identity mapping if iommu_identity_mapping is set.
	 */
	if (iommu_identity_mapping) {
		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
		if (ret) {
			printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
			goto free_iommu;
		}
	}
	/*
	 * For each rmrr
	 *   for each dev attached to rmrr
	 *   do
	 *     locate drhd for dev, alloc domain for dev
	 *     allocate free domain
	 *     allocate page table entries for rmrr
	 *     if context not allocated for bus
	 *           allocate and init context
	 *           set present in root table for this bus
	 *     init context with domain, translation etc
	 *    endfor
	 * endfor
	 */
	printk(KERN_INFO "IOMMU: Setting RMRR:\n");
	for_each_rmrr_units(rmrr) {
		/* some BIOS lists non-existent devices in DMAR table. */
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, pdev) {
			ret = iommu_prepare_rmrr_dev(rmrr, pdev);
			if (ret)
				printk(KERN_ERR
				       "IOMMU: mapping reserved region failed\n");
		}
	}

	iommu_prepare_isa();

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;

		iommu_set_root_entry(iommu);

		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);

		ret = iommu_enable_translation(iommu);
		if (ret)
			goto free_iommu;

		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;

free_iommu:
	for_each_active_iommu(iommu, drhd)
		free_dmar_iommu(iommu);
	kfree(deferred_flush);
free_g_iommus:
	kfree(g_iommus);
error:
	return ret;
}
/* This takes a number of _MM_ pages, not VTD pages */
static struct iova *intel_alloc_iova(struct device *dev,
				     struct dmar_domain *domain,
				     unsigned long nrpages, uint64_t dma_mask)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct iova *iova = NULL;

	/* Restrict dma_mask to the width that the iommu can handle */
	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);

	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
		/*
		 * First try to allocate an io virtual address in
		 * DMA_BIT_MASK(32) and if that fails then try allocating
		 * from higher range
		 */
		iova = alloc_iova(&domain->iovad, nrpages,
				  IOVA_PFN(DMA_BIT_MASK(32)), 1);
		if (iova)
			return iova;
	}
	iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
	if (unlikely(!iova)) {
		printk(KERN_ERR "Allocating %ld-page iova for %s failed",
		       nrpages, pci_name(pdev));
		return NULL;
	}

	return iova;
}
static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
{
	struct dmar_domain *domain;
	int ret;

	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain) {
		printk(KERN_ERR "Allocating domain for %s failed",
		       pci_name(pdev));
		return NULL;
	}

	/* make sure context mapping is ok */
	if (unlikely(!domain_context_mapped(pdev))) {
		ret = domain_context_mapping(domain, pdev,
					     CONTEXT_TT_MULTI_LEVEL);
		if (ret) {
			printk(KERN_ERR "Domain context map for %s failed",
			       pci_name(pdev));
			return NULL;
		}
	}

	return domain;
}

static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
{
	struct device_domain_info *info;

	/* No lock here, assumes no domain exit in normal case */
	info = dev->dev.archdata.iommu;
	if (likely(info))
		return info->domain;

	return __get_valid_domain_for_dev(dev);
}

static int iommu_dummy(struct pci_dev *pdev)
{
	return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}
/* Check if the pdev needs to go through non-identity map and unmap process.*/
static int iommu_no_mapping(struct device *dev)
{
	struct pci_dev *pdev;
	int found;

	if (unlikely(!dev_is_pci(dev)))
		return 1;

	pdev = to_pci_dev(dev);
	if (iommu_dummy(pdev))
		return 1;

	if (!iommu_identity_mapping)
		return 0;

	found = identity_mapping(pdev);
	if (found) {
		if (iommu_should_identity_map(pdev, 0))
			return 1;

		/*
		 * 32 bit DMA is removed from si_domain and fall back
		 * to non-identity mapping.
		 */
		domain_remove_one_dev_info(si_domain, pdev);
		printk(KERN_INFO "32bit %s uses non-identity mapping\n",
		       pci_name(pdev));
		return 0;
	}

	/*
	 * In case of a detached 64 bit DMA device from vm, the device
	 * is put into si_domain for identity mapping.
	 */
	if (iommu_should_identity_map(pdev, 0)) {
		int ret;

		ret = domain_add_dev_info(si_domain, pdev,
					  hw_pass_through ?
					  CONTEXT_TT_PASS_THROUGH :
					  CONTEXT_TT_MULTI_LEVEL);
		if (!ret) {
			printk(KERN_INFO "64bit %s uses identity mapping\n",
			       pci_name(pdev));
			return 1;
		}
	}

	return 0;
}
static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	struct iova *iova;
	int prot = 0;
	int ret;
	struct intel_iommu *iommu;
	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;

	BUG_ON(dir == DMA_NONE);

	if (iommu_no_mapping(hwdev))
		return paddr;

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);
	size = aligned_nrpages(paddr, size);

	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
	if (!iova)
		goto error;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr - (paddr + size) might be partial page, we should map the whole
	 * page.  Note: if two part of one page are separately mapped, we
	 * might have two guest_addr mapping to the same host paddr, but this
	 * is not a big problem
	 */
	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
				 mm_to_dma_pfn(paddr_pfn), size, prot);
	if (ret)
		goto error;

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
	else
		iommu_flush_write_buffer(iommu);

	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
	start_paddr += paddr & ~PAGE_MASK;
	return start_paddr;

error:
	if (iova)
		__free_iova(&domain->iovad, iova);
	printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
	       pci_name(pdev), size, (unsigned long long)paddr, dir);
	return 0;
}
static dma_addr_t intel_map_page(struct device *dev, struct page *page,
				 unsigned long offset, size_t size,
				 enum dma_data_direction dir,
				 struct dma_attrs *attrs)
{
	return __intel_map_single(dev, page_to_phys(page) + offset, size,
				  dir, to_pci_dev(dev)->dma_mask);
}
static void flush_unmaps(void)
{
	int i, j;

	timer_on = 0;

	/* just flush them all */
	for (i = 0; i < g_num_of_iommus; i++) {
		struct intel_iommu *iommu = g_iommus[i];
		if (!iommu)
			continue;

		if (!deferred_flush[i].next)
			continue;

		/* In caching mode, global flushes turn emulation expensive */
		if (!cap_caching_mode(iommu->cap))
			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
						 DMA_TLB_GLOBAL_FLUSH);
		for (j = 0; j < deferred_flush[i].next; j++) {
			unsigned long mask;
			struct iova *iova = deferred_flush[i].iova[j];
			struct dmar_domain *domain = deferred_flush[i].domain[j];

			/* On real hardware multiple invalidations are expensive */
			if (cap_caching_mode(iommu->cap))
				iommu_flush_iotlb_psi(iommu, domain->id,
					iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
			else {
				mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
					(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
			}
			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
		}
		deferred_flush[i].next = 0;
	}

	list_size = 0;
}
static void flush_unmaps_timeout(unsigned long data)
{
	unsigned long flags;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	flush_unmaps();
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}

static void add_unmap(struct dmar_domain *dom, struct iova *iova)
{
	unsigned long flags;
	int next, iommu_id;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	if (list_size == HIGH_WATER_MARK)
		flush_unmaps();

	iommu = domain_get_iommu(dom);
	iommu_id = iommu->seq_id;

	next = deferred_flush[iommu_id].next;
	deferred_flush[iommu_id].domain[next] = dom;
	deferred_flush[iommu_id].iova[next] = iova;
	deferred_flush[iommu_id].next++;

	if (!timer_on) {
		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
		timer_on = 1;
	}
	list_size++;
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir,
			     struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;

	if (iommu_no_mapping(dev))
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
		      (unsigned long long)dev_addr))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
		 pci_name(pdev), start_pfn, last_pfn);

	/* clear the whole page */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* free page tables */
	dma_pte_free_pagetable(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
				      last_pfn - start_pfn + 1, 0);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
static void *intel_alloc_coherent(struct device *hwdev, size_t size,
				  dma_addr_t *dma_handle, gfp_t flags,
				  struct dma_attrs *attrs)
{
	void *vaddr;
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	if (!iommu_no_mapping(hwdev))
		flags &= ~(GFP_DMA | GFP_DMA32);
	else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
		if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
			flags |= GFP_DMA;
		else
			flags |= GFP_DMA32;
	}

	vaddr = (void *)__get_free_pages(flags, order);
	if (!vaddr)
		return NULL;
	memset(vaddr, 0, size);

	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
					 DMA_BIDIRECTIONAL,
					 hwdev->coherent_dma_mask);
	if (*dma_handle)
		return vaddr;
	free_pages((unsigned long)vaddr, order);
	return NULL;
}

static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
				dma_addr_t dma_handle, struct dma_attrs *attrs)
{
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
	free_pages((unsigned long)vaddr, order);
}
static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
			   int nelems, enum dma_data_direction dir,
			   struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;

	if (iommu_no_mapping(hwdev))
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
		      (unsigned long long)sglist[0].dma_address))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	/* clear the whole page */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* free page tables */
	dma_pte_free_pagetable(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
				      last_pfn - start_pfn + 1, 0);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}

static int intel_nontranslate_map_sg(struct device *hddev,
	struct scatterlist *sglist, int nelems, int dir)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sglist, sg, nelems, i) {
		BUG_ON(!sg_page(sg));
		sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
		sg->dma_length = sg->length;
	}
	return nelems;
}
static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
			enum dma_data_direction dir, struct dma_attrs *attrs)
{
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	struct iova *iova = NULL;
	int ret;
	struct scatterlist *sg;
	unsigned long start_vpfn;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	if (iommu_no_mapping(hwdev))
		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	for_each_sg(sglist, sg, nelems, i)
		size += aligned_nrpages(sg->offset, sg->length);

	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
				pdev->dma_mask);
	if (!iova) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);

	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
	if (unlikely(ret)) {
		/* clear the page */
		dma_pte_clear_range(domain, start_vpfn,
				    start_vpfn + size - 1);
		/* free page tables */
		dma_pte_free_pagetable(domain, start_vpfn,
				       start_vpfn + size - 1);
		/* free iova */
		__free_iova(&domain->iovad, iova);
		return 0;
	}

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
	else
		iommu_flush_write_buffer(iommu);

	return nelems;
}

static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
	return !dma_addr;
}
struct dma_map_ops intel_dma_ops = {
	.alloc = intel_alloc_coherent,
	.free = intel_free_coherent,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
	.map_page = intel_map_page,
	.unmap_page = intel_unmap_page,
	.mapping_error = intel_mapping_error,
};
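
/*
 * intel_iommu_init() installs this structure as the global dma_ops, so a
 * driver's ordinary DMA-API calls (dma_map_page(), dma_map_sg(),
 * dma_alloc_coherent(), ...) are routed through the intel_* helpers above
 * once translation is enabled.
 */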
static inline int iommu_domain_cache_init(void)
{
	int ret = 0;

	iommu_domain_cache = kmem_cache_create("iommu_domain",
					       sizeof(struct dmar_domain),
					       0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_domain_cache) {
		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static inline int iommu_devinfo_cache_init(void)
{
	int ret = 0;

	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
						sizeof(struct device_domain_info),
						0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_devinfo_cache) {
		printk(KERN_ERR "Couldn't create devinfo cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static inline int iommu_iova_cache_init(void)
{
	int ret = 0;

	iommu_iova_cache = kmem_cache_create("iommu_iova",
					     sizeof(struct iova),
					     0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_iova_cache) {
		printk(KERN_ERR "Couldn't create iova cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static int __init iommu_init_mempool(void)
{
	int ret;

	ret = iommu_iova_cache_init();
	if (ret)
		return ret;

	ret = iommu_domain_cache_init();
	if (ret)
		goto domain_error;

	ret = iommu_devinfo_cache_init();
	if (!ret)
		return ret;

	kmem_cache_destroy(iommu_domain_cache);
domain_error:
	kmem_cache_destroy(iommu_iova_cache);

	return -ENOMEM;
}

static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	kmem_cache_destroy(iommu_iova_cache);
}
static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return;
	}
	vtbar &= 0xffff0000;

	/* we know that this IOMMU should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
			    TAINT_FIRMWARE_WORKAROUND,
			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
}
DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;
	struct pci_dev *dev;
	int i;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				break;
			/* ignore DMAR unit if no pci devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_active_drhd_unit(drhd) {
		if (drhd->include_all)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev)
			if (!IS_GFX_DEVICE(dev))
				break;
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		if (dmar_map_gfx) {
			intel_iommu_gfx_mapped = 1;
		} else {
			drhd->ignored = 1;
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				dev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}
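
/*
 * Suspend/resume support: translation is disabled and the fault-event
 * registers are saved on suspend, then restored (and translation re-enabled
 * via init_iommu_hw()) on resume through the syscore ops registered below.
 */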
#ifdef CONFIG_SUSPEND
static int init_iommu_hw(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;

	for_each_active_iommu(iommu, drhd)
		if (iommu->qi)
			dmar_reenable_qi(iommu);

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

		iommu_set_root_entry(iommu);

		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
		if (iommu_enable_translation(iommu))
			return 1;
		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;
}

static void iommu_flush_all(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	for_each_active_iommu(iommu, drhd) {
		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
	}
}
static int iommu_suspend(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	for_each_active_iommu(iommu, drhd) {
		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
					     GFP_ATOMIC);
		if (!iommu->iommu_state)
			goto nomem;
	}

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;

nomem:
	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);

	return -ENOMEM;
}

static void iommu_resume(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	if (init_iommu_hw()) {
		if (force_on)
			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
		else
			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
		return;
	}

	for_each_active_iommu(iommu, drhd) {
		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
		       iommu->reg + DMAR_FECTL_REG);
		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
		       iommu->reg + DMAR_FEDATA_REG);
		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
		       iommu->reg + DMAR_FEADDR_REG);
		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
		       iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}

	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);
}

static struct syscore_ops iommu_syscore_ops = {
	.resume = iommu_resume,
	.suspend = iommu_suspend,
};

static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}

#else
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_PM */
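
/*
 * RMRR (Reserved Memory Region Reporting) and ATSR (Address Translation
 * Services Reporting) structures from the ACPI DMAR table are parsed into
 * the dmar_rmrr_units and dmar_atsr_units lists below; the device-scope
 * entries attached to them are updated on PCI hotplug via
 * dmar_iommu_notify_scope_dev().
 */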
int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
{
	struct acpi_dmar_reserved_memory *rmrr;
	struct dmar_rmrr_unit *rmrru;

	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
	if (!rmrru)
		return -ENOMEM;

	rmrru->hdr = header;
	rmrr = (struct acpi_dmar_reserved_memory *)header;
	rmrru->base_address = rmrr->base_address;
	rmrru->end_address = rmrr->end_address;
	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				&rmrru->devices_cnt);
	if (rmrru->devices_cnt && rmrru->devices == NULL) {
		kfree(rmrru);
		return -ENOMEM;
	}

	list_add(&rmrru->list, &dmar_rmrr_units);

	return 0;
}
int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
	if (!atsru)
		return -ENOMEM;

	atsru->hdr = hdr;
	atsru->include_all = atsr->flags & 0x1;
	if (!atsru->include_all) {
		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
				(void *)atsr + atsr->header.length,
				&atsru->devices_cnt);
		if (atsru->devices_cnt && atsru->devices == NULL) {
			kfree(atsru);
			return -ENOMEM;
		}
	}

	list_add_rcu(&atsru->list, &dmar_atsr_units);

	return 0;
}

static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}
static void intel_iommu_free_dmars(void)
{
	struct dmar_rmrr_unit *rmrru, *rmrr_n;
	struct dmar_atsr_unit *atsru, *atsr_n;

	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
		list_del(&rmrru->list);
		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
		kfree(rmrru);
	}

	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
		list_del(&atsru->list);
		intel_iommu_free_atsr(atsru);
	}
}
int dmar_find_matched_atsr_unit(struct pci_dev *dev)
{
	int i, ret = 1;
	struct pci_bus *bus;
	struct pci_dev *bridge = NULL, *tmp;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	dev = pci_physfn(dev);
	for (bus = dev->bus; bus; bus = bus->parent) {
		bridge = bus->self;
		if (!bridge || !pci_is_pcie(bridge) ||
		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
			break;
	}
	if (!bridge)
		return 0;

	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (atsr->segment != pci_domain_nr(dev->bus))
			continue;

		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
			if (tmp == &bridge->dev)
				goto out;

		if (atsru->include_all)
			goto out;
	}
	ret = 0;
out:
	return ret;
}
int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
{
	int ret = 0;
	struct dmar_rmrr_unit *rmrru;
	struct dmar_atsr_unit *atsru;
	struct acpi_dmar_atsr *atsr;
	struct acpi_dmar_reserved_memory *rmrr;

	if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
		return 0;

	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
		rmrr = container_of(rmrru->hdr,
				    struct acpi_dmar_reserved_memory, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				rmrr->segment, rmrru->devices,
				rmrru->devices_cnt);
			if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
			if (dmar_remove_dev_scope(info, rmrr->segment,
				rmrru->devices, rmrru->devices_cnt))
				return 0;
		}
	}

	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		if (atsru->include_all)
			continue;

		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
					(void *)atsr + atsr->header.length,
					atsr->segment, atsru->devices,
					atsru->devices_cnt);
			if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
			if (dmar_remove_dev_scope(info, atsr->segment,
					atsru->devices, atsru->devices_cnt))
				return 0;
		}
	}

	return 0;
}
/*
 * Here we only respond to action of unbound device from driver.
 *
 * Added device is not attached to its DMAR domain here yet. That will happen
 * when mapping the device to iova.
 */
static int device_notifier(struct notifier_block *nb,
			   unsigned long action, void *data)
{
	struct device *dev = data;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;

	if (iommu_dummy(pdev))
		return 0;

	if (action != BUS_NOTIFY_UNBOUND_DRIVER &&
	    action != BUS_NOTIFY_DEL_DEVICE)
		return 0;

	domain = find_domain(pdev);
	if (!domain)
		return 0;

	down_read(&dmar_global_lock);
	domain_remove_one_dev_info(domain, pdev);
	if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
	    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
	    list_empty(&domain->devices))
		domain_exit(domain);
	up_read(&dmar_global_lock);

	return 0;
}

static struct notifier_block device_nb = {
	.notifier_call = device_notifier,
};
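
/*
 * Memory hotplug: when hardware pass-through is not in use, newly onlined
 * memory must be added to si_domain's identity mapping, and ranges whose
 * onlining is cancelled must have their IOVAs, page-table entries and
 * IOTLB entries torn down again.
 */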
static int intel_iommu_memory_notifier(struct notifier_block *nb,
				       unsigned long val, void *v)
{
	struct memory_notify *mhp = v;
	unsigned long long start, end;
	unsigned long start_vpfn, last_vpfn;

	switch (val) {
	case MEM_GOING_ONLINE:
		start = mhp->start_pfn << PAGE_SHIFT;
		end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
		if (iommu_domain_identity_map(si_domain, start, end)) {
			pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
				start, end);
			return NOTIFY_BAD;
		}
		break;

	case MEM_CANCEL_ONLINE:
		start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
		last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
		while (start_vpfn <= last_vpfn) {
			struct iova *iova;
			struct dmar_drhd_unit *drhd;
			struct intel_iommu *iommu;

			iova = find_iova(&si_domain->iovad, start_vpfn);
			if (iova == NULL) {
				pr_debug("dmar: failed get IOVA for PFN %lx\n",
					 start_vpfn);
				break;
			}

			iova = split_and_remove_iova(&si_domain->iovad, iova,
						     start_vpfn, last_vpfn);
			if (iova == NULL) {
				pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
					start_vpfn, last_vpfn);
				return NOTIFY_BAD;
			}

			for_each_active_iommu(iommu, drhd)
				iommu_flush_iotlb_psi(iommu, si_domain->id,
					iova->pfn_lo,
					iova->pfn_hi - iova->pfn_lo + 1, 0);

			dma_pte_clear_range(si_domain, iova->pfn_lo,
					    iova->pfn_hi);
			dma_pte_free_pagetable(si_domain, iova->pfn_lo,
					       iova->pfn_hi);

			start_vpfn = iova->pfn_hi + 1;
			free_iova_mem(iova);
		}
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block intel_iommu_memory_nb = {
	.notifier_call = intel_iommu_memory_notifier,
};
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/* VT-d is required for a TXT/tboot launch, so enforce that */
	force_on = tboot_force_iommu();

	if (iommu_init_mempool()) {
		if (force_on)
			panic("tboot: Failed to initialize iommu memory\n");
		return -ENOMEM;
	}

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	for_each_active_iommu(iommu, drhd)
		if (iommu->gcmd & DMA_GCMD_TE)
			iommu_disable_translation(iommu);

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	if (no_iommu || dmar_disabled)
		goto out_free_dmar;

	if (list_empty(&dmar_rmrr_units))
		printk(KERN_INFO "DMAR: No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		printk(KERN_INFO "DMAR: No ATSR found\n");

	if (dmar_init_reserved_ranges()) {
		if (force_on)
			panic("tboot: Failed to reserve iommu ranges\n");
		goto out_free_reserved_range;
	}

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		printk(KERN_ERR "IOMMU: dmar init failed\n");
		goto out_free_reserved_range;
	}
	up_write(&dmar_global_lock);
	printk(KERN_INFO
	       "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

	init_timer(&unmap_timer);
#ifdef CONFIG_SWIOTLB
	swiotlb = 0;
#endif
	dma_ops = &intel_dma_ops;

	init_iommu_pm_ops();

	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
	bus_register_notifier(&pci_bus_type, &device_nb);
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);

	intel_iommu_enabled = 1;

	return 0;

out_free_reserved_range:
	put_iova_domain(&reserved_iova_list);
out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	iommu_exit_mempool();
	return ret;
}
static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
					   struct pci_dev *pdev)
{
	struct pci_dev *tmp, *parent;

	if (!iommu || !pdev)
		return;

	/* dependent device detach */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	/* Secondary interface's bus number and devfn 0 */
	if (tmp) {
		parent = pdev->bus->self;
		while (parent != tmp) {
			iommu_detach_dev(iommu, parent->bus->number,
					 parent->devfn);
			parent = parent->bus->self;
		}
		if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
			iommu_detach_dev(iommu,
					 tmp->subordinate->number, 0);
		else /* this is a legacy PCI bridge */
			iommu_detach_dev(iommu, tmp->bus->number,
					 tmp->devfn);
	}
}
static void domain_remove_one_dev_info(struct dmar_domain *domain,
				       struct pci_dev *pdev)
{
	struct device_domain_info *info, *tmp;
	struct intel_iommu *iommu;
	unsigned long flags;
	int found = 0;

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &domain->devices, link) {
		if (info->segment == pci_domain_nr(pdev->bus) &&
		    info->bus == pdev->bus->number &&
		    info->devfn == pdev->devfn) {
			unlink_domain_info(info);
			spin_unlock_irqrestore(&device_domain_lock, flags);

			iommu_disable_dev_iotlb(info);
			iommu_detach_dev(iommu, info->bus, info->devfn);
			iommu_detach_dependent_devices(iommu, pdev);
			free_devinfo_mem(info);

			spin_lock_irqsave(&device_domain_lock, flags);

			if (found)
				break;
			else
				continue;
		}

		/* if there is no other devices under the same iommu
		 * owned by this domain, clear this iommu in iommu_bmp
		 * update iommu count and coherency
		 */
		if (iommu == device_to_iommu(info->segment, info->bus,
					     info->devfn))
			found = 1;
	}

	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (found == 0) {
		unsigned long tmp_flags;
		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
		clear_bit(iommu->seq_id, domain->iommu_bmp);
		domain->iommu_count--;
		domain_update_iommu_cap(domain);
		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);

		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
			spin_lock_irqsave(&iommu->lock, tmp_flags);
			clear_bit(domain->id, iommu->domain_ids);
			iommu->domains[domain->id] = NULL;
			spin_unlock_irqrestore(&iommu->lock, tmp_flags);
		}
	}
}
static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	domain->iommu_coherency = 0;
	domain->iommu_snooping = 0;
	domain->iommu_superpage = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
static int intel_iommu_domain_init(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain;

	dmar_domain = alloc_domain(true);
	if (!dmar_domain) {
		printk(KERN_ERR
			"intel_iommu_domain_init: dmar_domain == NULL\n");
		return -ENOMEM;
	}
	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		printk(KERN_ERR
			"intel_iommu_domain_init() failed\n");
		domain_exit(dmar_domain);
		return -ENOMEM;
	}
	domain_update_iommu_cap(dmar_domain);
	domain->priv = dmar_domain;

	domain->geometry.aperture_start = 0;
	domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
	domain->geometry.force_aperture = true;

	return 0;
}

static void intel_iommu_domain_destroy(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = domain->priv;

	domain->priv = NULL;
	domain_exit(dmar_domain);
}
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct intel_iommu *iommu;
	int addr_width;

	/* normally pdev is not mapped */
	if (unlikely(domain_context_mapped(pdev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(pdev);
		if (old_domain) {
			if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
			    dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
				domain_remove_one_dev_info(old_domain, pdev);
			else
				domain_remove_dev_info(old_domain);
		}
	}

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		printk(KERN_ERR "%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
}

static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);

	domain_remove_one_dev_info(dmar_domain, pdev);
}
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = domain->priv;
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			printk(KERN_ERR "%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}

static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = domain->priv;
	int level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
		BUG();

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
			    (iova + size - 1) >> VTD_PAGE_SHIFT);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct dma_pte *pte;
	u64 phys = 0;
	int level = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}

static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
				      unsigned long cap)
{
	struct dmar_domain *dmar_domain = domain->priv;

	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return dmar_domain->iommu_snooping;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled;

	return 0;
}
#define REQ_ACS_FLAGS	(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)

static int intel_iommu_add_device(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct pci_dev *bridge, *dma_pdev = NULL;
	struct iommu_group *group;
	int ret;

	if (!device_to_iommu(pci_domain_nr(pdev->bus),
			     pdev->bus->number, pdev->devfn))
		return -ENODEV;

	bridge = pci_find_upstream_pcie_bridge(pdev);
	if (bridge) {
		if (pci_is_pcie(bridge))
			dma_pdev = pci_get_domain_bus_and_slot(
						pci_domain_nr(pdev->bus),
						bridge->subordinate->number, 0);
		if (!dma_pdev)
			dma_pdev = pci_dev_get(bridge);
	} else
		dma_pdev = pci_dev_get(pdev);

	/* Account for quirked devices */
	swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));

	/*
	 * If it's a multifunction device that does not support our
	 * required ACS flags, add to the same group as lowest numbered
	 * function that also does not support the required ACS flags.
	 */
	if (dma_pdev->multifunction &&
	    !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
		u8 i, slot = PCI_SLOT(dma_pdev->devfn);

		for (i = 0; i < 8; i++) {
			struct pci_dev *tmp;

			tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
			if (!tmp)
				continue;

			if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
				swap_pci_ref(&dma_pdev, tmp);
				break;
			}
			pci_dev_put(tmp);
		}
	}

	/*
	 * Devices on the root bus go through the iommu.  If that's not us,
	 * find the next upstream device and test ACS up to the root bus.
	 * Finding the next device may require skipping virtual buses.
	 */
	while (!pci_is_root_bus(dma_pdev->bus)) {
		struct pci_bus *bus = dma_pdev->bus;

		while (!bus->self) {
			if (!pci_is_root_bus(bus))
				bus = bus->parent;
			else
				goto root_bus;
		}

		if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
			break;

		swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
	}

root_bus:
	group = iommu_group_get(&dma_pdev->dev);
	pci_dev_put(dma_pdev);
	if (!group) {
		group = iommu_group_alloc();
		if (IS_ERR(group))
			return PTR_ERR(group);
	}

	ret = iommu_group_add_device(group, dev);

	iommu_group_put(group);
	return ret;
}

static void intel_iommu_remove_device(struct device *dev)
{
	iommu_group_remove_device(dev);
}

static struct iommu_ops intel_iommu_ops = {
	.domain_init	= intel_iommu_domain_init,
	.domain_destroy	= intel_iommu_domain_destroy,
	.attach_dev	= intel_iommu_attach_device,
	.detach_dev	= intel_iommu_detach_device,
	.map		= intel_iommu_map,
	.unmap		= intel_iommu_unmap,
	.iova_to_phys	= intel_iommu_iova_to_phys,
	.domain_has_cap	= intel_iommu_domain_has_cap,
	.add_device	= intel_iommu_add_device,
	.remove_device	= intel_iommu_remove_device,
	.pgsize_bitmap	= INTEL_IOMMU_PGSIZES,
};
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that.  We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",