/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * Copyright (C) 2006-2008 Intel Corporation
 * Author: Ashok Raj <ashok.raj@intel.com>
 * Author: Shaohua Li <shaohua.li@intel.com>
 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 * Author: Fenghua Yu <fenghua.yu@intel.com>
 */
#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define MAX_AGAW_WIDTH 64

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
#define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
#define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
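/*
 * Worked example (editorial note, not part of the original source): with the
 * default 48-bit guest address width and VTD_PAGE_SHIFT == 12,
 * __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1, so a domain can address 2^36
 * VT-d page frames (a 256TiB address space); DOMAIN_MAX_ADDR(48) is simply
 * that PFN shifted back up by VTD_PAGE_SHIFT.
 */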
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return 30 + agaw * LEVEL_STRIDE;
}

static inline int width_to_agaw(int width)
{
	return (width - 30) / LEVEL_STRIDE;
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}
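/*
 * Worked example (editorial note, not part of the original source): agaw 2
 * gives agaw_to_level() == 4 and agaw_to_width() == 48, i.e. a 4-level page
 * table covering a 48-bit address space.  pfn_level_offset(pfn, 4) then
 * extracts pfn bits 27..35 (address bits 39..47) as the top-level index,
 * while level 1 indexes the leaf table with pfn bits 0..8.
 */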
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/*
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
	u64	val;
	u64	rsvd1;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
static inline bool root_present(struct root_entry *root)
{
	return (root->val & 1);
}
static inline void set_root_present(struct root_entry *root)
{
	root->val |= 1;
}
static inline void set_root_value(struct root_entry *root, unsigned long value)
{
	root->val |= value & VTD_PAGE_MASK;
}

static inline struct context_entry *
get_context_addr_from_root(struct root_entry *root)
{
	return (struct context_entry *)
		(root_present(root)?phys_to_virt(
		root->val & VTD_PAGE_MASK) :
		NULL);
}
/*
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
	u64 lo;
	u64 hi;
};

static inline bool context_present(struct context_entry *context)
{
	return (context->lo & 1);
}
static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
/*
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
	pte->val = 0;
}

static inline void dma_set_pte_readable(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_READ;
}

static inline void dma_set_pte_writable(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_WRITE;
}

static inline void dma_set_pte_snp(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_SNP;
}

static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
{
	pte->val = (pte->val & ~3) | (prot & 3);
}

static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
	return pte->val & VTD_PAGE_MASK;
#else
	/* Must have a full atomic 64-bit read */
	return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
{
	pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
}

static inline bool dma_pte_present(struct dma_pte *pte)
{
	return (pte->val & 3) != 0;
}

static inline int first_pte_in_page(struct dma_pte *pte)
{
	return !((unsigned long)pte & ~VTD_PAGE_MASK);
}
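/*
 * Worked example (editorial note, not part of the original source): assuming
 * DMA_PTE_READ and DMA_PTE_WRITE are bits 0 and 1 as the layout comment above
 * describes, a read/write mapping of pfn 0x12345 ends up with
 * pte->val == (0x12345ULL << VTD_PAGE_SHIFT) | 3 == 0x12345003.
 */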
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/* devices under the same p2p bridge are owned in one domain */
#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)

/* domain represents a virtual machine, more than one device
 * across iommus may be owned in one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 1)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 2)

struct dmar_domain {
	int	id;			/* domain id */
	int	nid;			/* node id */
	unsigned long iommu_bmp;	/* bitmap of iommus this domain uses*/

	struct list_head devices;	/* all devices' list */
	struct iova_domain iovad;	/* iova's that belong to this domain */

	struct dma_pte	*pgd;		/* virtual address */
	int		gaw;		/* max guest address width */

	/* adjusted guest address width, 0 is level 2 30-bit */
	int		agaw;

	int		flags;		/* flags to find out type of domain */

	int		iommu_coherency;/* indicate coherency of iommu access */
	int		iommu_snooping; /* indicate snooping control feature*/
	int		iommu_count;	/* reference count of iommu */
	spinlock_t	iommu_lock;	/* protect iommu set in domain */
	u64		max_addr;	/* maximum mapped address */
};
/* PCI domain-device relationship */
struct device_domain_info {
	struct list_head link;	/* link to domain siblings */
	struct list_head global; /* link to global list */
	int segment;		/* PCI domain */
	u8 bus;			/* PCI bus number */
	u8 devfn;		/* PCI devfn number */
	struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
	struct intel_iommu *iommu; /* IOMMU used by this device */
	struct dmar_domain *domain; /* pointer to domain */
};
static void flush_unmaps_timeout(unsigned long data);

DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);

#define HIGH_WATER_MARK 250
struct deferred_flush_tables {
	int next;
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

static int timer_on;
static long list_size;

static void domain_remove_dev_info(struct dmar_domain *domain);

#ifdef CONFIG_DMAR_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /*CONFIG_DMAR_DEFAULT_ON*/

static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

static struct iommu_ops intel_iommu_ops;
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			printk(KERN_INFO "Intel-IOMMU: enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			printk(KERN_INFO "Intel-IOMMU: disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			printk(KERN_INFO
				"Intel-IOMMU: disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			printk(KERN_INFO
				"Intel-IOMMU: Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
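/*
 * Usage note (editorial, not from the original source): the option string is
 * comma separated, so booting with e.g. "intel_iommu=on,strict,igfx_off"
 * enables the IOMMU, disables batched IOTLB flushing and skips mapping of
 * integrated graphics, matching the strncmp() branches above.
 */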
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;

static inline void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void * alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

struct iova *alloc_iova_mem(void)
{
	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
}

void free_iova_mem(struct iova *iova)
{
	kmem_cache_free(iommu_iova_cache, iova);
}
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, use a default agaw, and
 * get a supported less agaw for iommus that don't support the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
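/*
 * Worked example (editorial note, not part of the original source): for the
 * default 48-bit width, width_to_agaw(48) == 2, so the probe starts at SAGAW
 * bit 2 (4-level tables) and walks down to smaller agaw values until it
 * finds one the hardware reports as supported.
 */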
/* This function only returns a single iommu in a domain */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
	BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);

	iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	int i;

	domain->iommu_coherency = 1;

	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
		if (!ecap_coherent(g_iommus[i]->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
}

static void domain_update_iommu_snooping(struct dmar_domain *domain)
{
	int i;

	domain->iommu_snooping = 1;

	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
		if (!ecap_sc_support(g_iommus[i]->ecap)) {
			domain->iommu_snooping = 0;
			break;
		}
	}
}

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain_update_iommu_snooping(domain);
}
static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	int i;

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		if (segment != drhd->segment)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++) {
			if (drhd->devices[i] &&
			    drhd->devices[i]->bus->number == bus &&
			    drhd->devices[i]->devfn == devfn)
				return drhd->iommu;
			if (drhd->devices[i] &&
			    drhd->devices[i]->subordinate &&
			    drhd->devices[i]->subordinate->number <= bus &&
			    drhd->devices[i]->subordinate->subordinate >= bus)
				return drhd->iommu;
		}

		if (drhd->include_all)
			return drhd->iommu;
	}

	return NULL;
}
static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}
607 /* Gets context entry for a given bus and devfn */
608 static struct context_entry
* device_to_context_entry(struct intel_iommu
*iommu
,
611 struct root_entry
*root
;
612 struct context_entry
*context
;
613 unsigned long phy_addr
;
616 spin_lock_irqsave(&iommu
->lock
, flags
);
617 root
= &iommu
->root_entry
[bus
];
618 context
= get_context_addr_from_root(root
);
620 context
= (struct context_entry
*)
621 alloc_pgtable_page(iommu
->node
);
623 spin_unlock_irqrestore(&iommu
->lock
, flags
);
626 __iommu_flush_cache(iommu
, (void *)context
, CONTEXT_SIZE
);
627 phy_addr
= virt_to_phys((void *)context
);
628 set_root_value(root
, phy_addr
);
629 set_root_present(root
);
630 __iommu_flush_cache(iommu
, root
, sizeof(*root
));
632 spin_unlock_irqrestore(&iommu
->lock
, flags
);
633 return &context
[devfn
];
636 static int device_context_mapped(struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
638 struct root_entry
*root
;
639 struct context_entry
*context
;
643 spin_lock_irqsave(&iommu
->lock
, flags
);
644 root
= &iommu
->root_entry
[bus
];
645 context
= get_context_addr_from_root(root
);
650 ret
= context_present(&context
[devfn
]);
652 spin_unlock_irqrestore(&iommu
->lock
, flags
);
656 static void clear_context_table(struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
658 struct root_entry
*root
;
659 struct context_entry
*context
;
662 spin_lock_irqsave(&iommu
->lock
, flags
);
663 root
= &iommu
->root_entry
[bus
];
664 context
= get_context_addr_from_root(root
);
666 context_clear_entry(&context
[devfn
]);
667 __iommu_flush_cache(iommu
, &context
[devfn
], \
670 spin_unlock_irqrestore(&iommu
->lock
, flags
);
673 static void free_context_table(struct intel_iommu
*iommu
)
675 struct root_entry
*root
;
678 struct context_entry
*context
;
680 spin_lock_irqsave(&iommu
->lock
, flags
);
681 if (!iommu
->root_entry
) {
684 for (i
= 0; i
< ROOT_ENTRY_NR
; i
++) {
685 root
= &iommu
->root_entry
[i
];
686 context
= get_context_addr_from_root(root
);
688 free_pgtable_page(context
);
690 free_pgtable_page(iommu
->root_entry
);
691 iommu
->root_entry
= NULL
;
693 spin_unlock_irqrestore(&iommu
->lock
, flags
);
696 static struct dma_pte
*pfn_to_dma_pte(struct dmar_domain
*domain
,
699 int addr_width
= agaw_to_width(domain
->agaw
) - VTD_PAGE_SHIFT
;
700 struct dma_pte
*parent
, *pte
= NULL
;
701 int level
= agaw_to_level(domain
->agaw
);
704 BUG_ON(!domain
->pgd
);
705 BUG_ON(addr_width
< BITS_PER_LONG
&& pfn
>> addr_width
);
706 parent
= domain
->pgd
;
711 offset
= pfn_level_offset(pfn
, level
);
712 pte
= &parent
[offset
];
716 if (!dma_pte_present(pte
)) {
719 tmp_page
= alloc_pgtable_page(domain
->nid
);
724 domain_flush_cache(domain
, tmp_page
, VTD_PAGE_SIZE
);
725 pteval
= ((uint64_t)virt_to_dma_pfn(tmp_page
) << VTD_PAGE_SHIFT
) | DMA_PTE_READ
| DMA_PTE_WRITE
;
726 if (cmpxchg64(&pte
->val
, 0ULL, pteval
)) {
727 /* Someone else set it while we were thinking; use theirs. */
728 free_pgtable_page(tmp_page
);
731 domain_flush_cache(domain
, pte
, sizeof(*pte
));
734 parent
= phys_to_virt(dma_pte_addr(pte
));
741 /* return address's pte at specific level */
742 static struct dma_pte
*dma_pfn_level_pte(struct dmar_domain
*domain
,
746 struct dma_pte
*parent
, *pte
= NULL
;
747 int total
= agaw_to_level(domain
->agaw
);
750 parent
= domain
->pgd
;
751 while (level
<= total
) {
752 offset
= pfn_level_offset(pfn
, total
);
753 pte
= &parent
[offset
];
757 if (!dma_pte_present(pte
))
759 parent
= phys_to_virt(dma_pte_addr(pte
));
765 /* clear last level pte, a tlb flush should be followed */
766 static void dma_pte_clear_range(struct dmar_domain
*domain
,
767 unsigned long start_pfn
,
768 unsigned long last_pfn
)
770 int addr_width
= agaw_to_width(domain
->agaw
) - VTD_PAGE_SHIFT
;
771 struct dma_pte
*first_pte
, *pte
;
773 BUG_ON(addr_width
< BITS_PER_LONG
&& start_pfn
>> addr_width
);
774 BUG_ON(addr_width
< BITS_PER_LONG
&& last_pfn
>> addr_width
);
775 BUG_ON(start_pfn
> last_pfn
);
777 /* we don't need lock here; nobody else touches the iova range */
779 first_pte
= pte
= dma_pfn_level_pte(domain
, start_pfn
, 1);
781 start_pfn
= align_to_level(start_pfn
+ 1, 2);
788 } while (start_pfn
<= last_pfn
&& !first_pte_in_page(pte
));
790 domain_flush_cache(domain
, first_pte
,
791 (void *)pte
- (void *)first_pte
);
793 } while (start_pfn
&& start_pfn
<= last_pfn
);
796 /* free page table pages. last level pte should already be cleared */
797 static void dma_pte_free_pagetable(struct dmar_domain
*domain
,
798 unsigned long start_pfn
,
799 unsigned long last_pfn
)
801 int addr_width
= agaw_to_width(domain
->agaw
) - VTD_PAGE_SHIFT
;
802 struct dma_pte
*first_pte
, *pte
;
803 int total
= agaw_to_level(domain
->agaw
);
807 BUG_ON(addr_width
< BITS_PER_LONG
&& start_pfn
>> addr_width
);
808 BUG_ON(addr_width
< BITS_PER_LONG
&& last_pfn
>> addr_width
);
809 BUG_ON(start_pfn
> last_pfn
);
811 /* We don't need lock here; nobody else touches the iova range */
813 while (level
<= total
) {
814 tmp
= align_to_level(start_pfn
, level
);
816 /* If we can't even clear one PTE at this level, we're done */
817 if (tmp
+ level_size(level
) - 1 > last_pfn
)
821 first_pte
= pte
= dma_pfn_level_pte(domain
, tmp
, level
);
823 tmp
= align_to_level(tmp
+ 1, level
+ 1);
827 if (dma_pte_present(pte
)) {
828 free_pgtable_page(phys_to_virt(dma_pte_addr(pte
)));
832 tmp
+= level_size(level
);
833 } while (!first_pte_in_page(pte
) &&
834 tmp
+ level_size(level
) - 1 <= last_pfn
);
836 domain_flush_cache(domain
, first_pte
,
837 (void *)pte
- (void *)first_pte
);
839 } while (tmp
&& tmp
+ level_size(level
) - 1 <= last_pfn
);
843 if (start_pfn
== 0 && last_pfn
== DOMAIN_MAX_PFN(domain
->gaw
)) {
844 free_pgtable_page(domain
->pgd
);
850 static int iommu_alloc_root_entry(struct intel_iommu
*iommu
)
852 struct root_entry
*root
;
855 root
= (struct root_entry
*)alloc_pgtable_page(iommu
->node
);
859 __iommu_flush_cache(iommu
, root
, ROOT_SIZE
);
861 spin_lock_irqsave(&iommu
->lock
, flags
);
862 iommu
->root_entry
= root
;
863 spin_unlock_irqrestore(&iommu
->lock
, flags
);
868 static void iommu_set_root_entry(struct intel_iommu
*iommu
)
874 addr
= iommu
->root_entry
;
876 spin_lock_irqsave(&iommu
->register_lock
, flag
);
877 dmar_writeq(iommu
->reg
+ DMAR_RTADDR_REG
, virt_to_phys(addr
));
879 writel(iommu
->gcmd
| DMA_GCMD_SRTP
, iommu
->reg
+ DMAR_GCMD_REG
);
881 /* Make sure hardware complete it */
882 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
883 readl
, (sts
& DMA_GSTS_RTPS
), sts
);
885 spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
888 static void iommu_flush_write_buffer(struct intel_iommu
*iommu
)
893 if (!rwbf_quirk
&& !cap_rwbf(iommu
->cap
))
896 spin_lock_irqsave(&iommu
->register_lock
, flag
);
897 writel(iommu
->gcmd
| DMA_GCMD_WBF
, iommu
->reg
+ DMAR_GCMD_REG
);
899 /* Make sure hardware complete it */
900 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
901 readl
, (!(val
& DMA_GSTS_WBFS
)), val
);
903 spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
/* return value determines if we need a write buffer flush */
907 static void __iommu_flush_context(struct intel_iommu
*iommu
,
908 u16 did
, u16 source_id
, u8 function_mask
,
915 case DMA_CCMD_GLOBAL_INVL
:
916 val
= DMA_CCMD_GLOBAL_INVL
;
918 case DMA_CCMD_DOMAIN_INVL
:
919 val
= DMA_CCMD_DOMAIN_INVL
|DMA_CCMD_DID(did
);
921 case DMA_CCMD_DEVICE_INVL
:
922 val
= DMA_CCMD_DEVICE_INVL
|DMA_CCMD_DID(did
)
923 | DMA_CCMD_SID(source_id
) | DMA_CCMD_FM(function_mask
);
930 spin_lock_irqsave(&iommu
->register_lock
, flag
);
931 dmar_writeq(iommu
->reg
+ DMAR_CCMD_REG
, val
);
933 /* Make sure hardware complete it */
934 IOMMU_WAIT_OP(iommu
, DMAR_CCMD_REG
,
935 dmar_readq
, (!(val
& DMA_CCMD_ICC
)), val
);
937 spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
/* return value determines if we need a write buffer flush */
941 static void __iommu_flush_iotlb(struct intel_iommu
*iommu
, u16 did
,
942 u64 addr
, unsigned int size_order
, u64 type
)
944 int tlb_offset
= ecap_iotlb_offset(iommu
->ecap
);
945 u64 val
= 0, val_iva
= 0;
949 case DMA_TLB_GLOBAL_FLUSH
:
950 /* global flush doesn't need set IVA_REG */
951 val
= DMA_TLB_GLOBAL_FLUSH
|DMA_TLB_IVT
;
953 case DMA_TLB_DSI_FLUSH
:
954 val
= DMA_TLB_DSI_FLUSH
|DMA_TLB_IVT
|DMA_TLB_DID(did
);
956 case DMA_TLB_PSI_FLUSH
:
957 val
= DMA_TLB_PSI_FLUSH
|DMA_TLB_IVT
|DMA_TLB_DID(did
);
958 /* Note: always flush non-leaf currently */
959 val_iva
= size_order
| addr
;
964 /* Note: set drain read/write */
 * This is probably meant to be extra-safe; it looks like we can
 * ignore it without any impact.
970 if (cap_read_drain(iommu
->cap
))
971 val
|= DMA_TLB_READ_DRAIN
;
973 if (cap_write_drain(iommu
->cap
))
974 val
|= DMA_TLB_WRITE_DRAIN
;
976 spin_lock_irqsave(&iommu
->register_lock
, flag
);
977 /* Note: Only uses first TLB reg currently */
979 dmar_writeq(iommu
->reg
+ tlb_offset
, val_iva
);
980 dmar_writeq(iommu
->reg
+ tlb_offset
+ 8, val
);
982 /* Make sure hardware complete it */
983 IOMMU_WAIT_OP(iommu
, tlb_offset
+ 8,
984 dmar_readq
, (!(val
& DMA_TLB_IVT
)), val
);
986 spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
988 /* check IOTLB invalidation granularity */
989 if (DMA_TLB_IAIG(val
) == 0)
990 printk(KERN_ERR
"IOMMU: flush IOTLB failed\n");
991 if (DMA_TLB_IAIG(val
) != DMA_TLB_IIRG(type
))
992 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
993 (unsigned long long)DMA_TLB_IIRG(type
),
994 (unsigned long long)DMA_TLB_IAIG(val
));
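/*
 * Editorial note (assumption based on the checks above): DMA_TLB_IIRG() is
 * the invalidation granularity that was requested and DMA_TLB_IAIG() the
 * granularity the hardware actually performed; e.g. a page-selective request
 * may be answered with a coarser domain-selective flush, which the pr_debug
 * above records.
 */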
997 static struct device_domain_info
*iommu_support_dev_iotlb(
998 struct dmar_domain
*domain
, int segment
, u8 bus
, u8 devfn
)
1001 unsigned long flags
;
1002 struct device_domain_info
*info
;
1003 struct intel_iommu
*iommu
= device_to_iommu(segment
, bus
, devfn
);
1005 if (!ecap_dev_iotlb_support(iommu
->ecap
))
1011 spin_lock_irqsave(&device_domain_lock
, flags
);
1012 list_for_each_entry(info
, &domain
->devices
, link
)
1013 if (info
->bus
== bus
&& info
->devfn
== devfn
) {
1017 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1019 if (!found
|| !info
->dev
)
1022 if (!pci_find_ext_capability(info
->dev
, PCI_EXT_CAP_ID_ATS
))
1025 if (!dmar_find_matched_atsr_unit(info
->dev
))
1028 info
->iommu
= iommu
;
1033 static void iommu_enable_dev_iotlb(struct device_domain_info
*info
)
1038 pci_enable_ats(info
->dev
, VTD_PAGE_SHIFT
);
1041 static void iommu_disable_dev_iotlb(struct device_domain_info
*info
)
1043 if (!info
->dev
|| !pci_ats_enabled(info
->dev
))
1046 pci_disable_ats(info
->dev
);
1049 static void iommu_flush_dev_iotlb(struct dmar_domain
*domain
,
1050 u64 addr
, unsigned mask
)
1053 unsigned long flags
;
1054 struct device_domain_info
*info
;
1056 spin_lock_irqsave(&device_domain_lock
, flags
);
1057 list_for_each_entry(info
, &domain
->devices
, link
) {
1058 if (!info
->dev
|| !pci_ats_enabled(info
->dev
))
1061 sid
= info
->bus
<< 8 | info
->devfn
;
1062 qdep
= pci_ats_queue_depth(info
->dev
);
1063 qi_flush_dev_iotlb(info
->iommu
, sid
, qdep
, addr
, mask
);
1065 spin_unlock_irqrestore(&device_domain_lock
, flags
);
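/*
 * Worked example (editorial note, not part of the original source): the
 * source-id passed to qi_flush_dev_iotlb() is bus:devfn packed into 16 bits,
 * so a device at bus 0x03, devfn 0x10 yields sid == (0x03 << 8) | 0x10 ==
 * 0x0310.
 */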
1068 static void iommu_flush_iotlb_psi(struct intel_iommu
*iommu
, u16 did
,
1069 unsigned long pfn
, unsigned int pages
, int map
)
1071 unsigned int mask
= ilog2(__roundup_pow_of_two(pages
));
1072 uint64_t addr
= (uint64_t)pfn
<< VTD_PAGE_SHIFT
;
 * Fallback to domain selective flush if no PSI support or the size is
 * too big.
 * PSI requires page size to be 2 ^ x, and the base address is naturally
 * aligned to the size
1082 if (!cap_pgsel_inv(iommu
->cap
) || mask
> cap_max_amask_val(iommu
->cap
))
1083 iommu
->flush
.flush_iotlb(iommu
, did
, 0, 0,
1086 iommu
->flush
.flush_iotlb(iommu
, did
, addr
, mask
,
1090 * In caching mode, changes of pages from non-present to present require
1091 * flush. However, device IOTLB doesn't need to be flushed in this case.
1093 if (!cap_caching_mode(iommu
->cap
) || !map
)
1094 iommu_flush_dev_iotlb(iommu
->domains
[did
], addr
, mask
);
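/*
 * Worked example (editorial note, not part of the original source): for a
 * 5-page mapping, __roundup_pow_of_two(5) == 8 and ilog2(8) == 3, so the PSI
 * flush uses mask 3 and invalidates a naturally aligned block of 2^3 VT-d
 * pages containing the mapping.
 */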
1097 static void iommu_disable_protect_mem_regions(struct intel_iommu
*iommu
)
1100 unsigned long flags
;
1102 spin_lock_irqsave(&iommu
->register_lock
, flags
);
1103 pmen
= readl(iommu
->reg
+ DMAR_PMEN_REG
);
1104 pmen
&= ~DMA_PMEN_EPM
;
1105 writel(pmen
, iommu
->reg
+ DMAR_PMEN_REG
);
1107 /* wait for the protected region status bit to clear */
1108 IOMMU_WAIT_OP(iommu
, DMAR_PMEN_REG
,
1109 readl
, !(pmen
& DMA_PMEN_PRS
), pmen
);
1111 spin_unlock_irqrestore(&iommu
->register_lock
, flags
);
1114 static int iommu_enable_translation(struct intel_iommu
*iommu
)
1117 unsigned long flags
;
1119 spin_lock_irqsave(&iommu
->register_lock
, flags
);
1120 iommu
->gcmd
|= DMA_GCMD_TE
;
1121 writel(iommu
->gcmd
, iommu
->reg
+ DMAR_GCMD_REG
);
1123 /* Make sure hardware complete it */
1124 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1125 readl
, (sts
& DMA_GSTS_TES
), sts
);
1127 spin_unlock_irqrestore(&iommu
->register_lock
, flags
);
1131 static int iommu_disable_translation(struct intel_iommu
*iommu
)
1136 spin_lock_irqsave(&iommu
->register_lock
, flag
);
1137 iommu
->gcmd
&= ~DMA_GCMD_TE
;
1138 writel(iommu
->gcmd
, iommu
->reg
+ DMAR_GCMD_REG
);
1140 /* Make sure hardware complete it */
1141 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1142 readl
, (!(sts
& DMA_GSTS_TES
)), sts
);
1144 spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1149 static int iommu_init_domains(struct intel_iommu
*iommu
)
1151 unsigned long ndomains
;
1152 unsigned long nlongs
;
1154 ndomains
= cap_ndoms(iommu
->cap
);
1155 pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu
->seq_id
,
1157 nlongs
= BITS_TO_LONGS(ndomains
);
1159 spin_lock_init(&iommu
->lock
);
1161 /* TBD: there might be 64K domains,
1162 * consider other allocation for future chip
1164 iommu
->domain_ids
= kcalloc(nlongs
, sizeof(unsigned long), GFP_KERNEL
);
1165 if (!iommu
->domain_ids
) {
1166 printk(KERN_ERR
"Allocating domain id array failed\n");
1169 iommu
->domains
= kcalloc(ndomains
, sizeof(struct dmar_domain
*),
1171 if (!iommu
->domains
) {
1172 printk(KERN_ERR
"Allocating domain array failed\n");
1177 * if Caching mode is set, then invalid translations are tagged
1178 * with domainid 0. Hence we need to pre-allocate it.
1180 if (cap_caching_mode(iommu
->cap
))
1181 set_bit(0, iommu
->domain_ids
);
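/*
 * Editorial note (assumption based on the comment above): with Caching Mode
 * set, the hardware tags cached entries for invalid translations with domain
 * id 0, so bit 0 of domain_ids is reserved here and never handed out by
 * iommu_attach_domain().
 */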
1186 static void domain_exit(struct dmar_domain
*domain
);
1187 static void vm_domain_exit(struct dmar_domain
*domain
);
1189 void free_dmar_iommu(struct intel_iommu
*iommu
)
1191 struct dmar_domain
*domain
;
1193 unsigned long flags
;
1195 if ((iommu
->domains
) && (iommu
->domain_ids
)) {
1196 for_each_set_bit(i
, iommu
->domain_ids
, cap_ndoms(iommu
->cap
)) {
1197 domain
= iommu
->domains
[i
];
1198 clear_bit(i
, iommu
->domain_ids
);
1200 spin_lock_irqsave(&domain
->iommu_lock
, flags
);
1201 if (--domain
->iommu_count
== 0) {
1202 if (domain
->flags
& DOMAIN_FLAG_VIRTUAL_MACHINE
)
1203 vm_domain_exit(domain
);
1205 domain_exit(domain
);
1207 spin_unlock_irqrestore(&domain
->iommu_lock
, flags
);
1211 if (iommu
->gcmd
& DMA_GCMD_TE
)
1212 iommu_disable_translation(iommu
);
1215 irq_set_handler_data(iommu
->irq
, NULL
);
1216 /* This will mask the irq */
1217 free_irq(iommu
->irq
, iommu
);
1218 destroy_irq(iommu
->irq
);
1221 kfree(iommu
->domains
);
1222 kfree(iommu
->domain_ids
);
1224 g_iommus
[iommu
->seq_id
] = NULL
;
1226 /* if all iommus are freed, free g_iommus */
1227 for (i
= 0; i
< g_num_of_iommus
; i
++) {
1232 if (i
== g_num_of_iommus
)
1235 /* free context mapping */
1236 free_context_table(iommu
);
1239 static struct dmar_domain
*alloc_domain(void)
1241 struct dmar_domain
*domain
;
1243 domain
= alloc_domain_mem();
1248 memset(&domain
->iommu_bmp
, 0, sizeof(unsigned long));
1254 static int iommu_attach_domain(struct dmar_domain
*domain
,
1255 struct intel_iommu
*iommu
)
1258 unsigned long ndomains
;
1259 unsigned long flags
;
1261 ndomains
= cap_ndoms(iommu
->cap
);
1263 spin_lock_irqsave(&iommu
->lock
, flags
);
1265 num
= find_first_zero_bit(iommu
->domain_ids
, ndomains
);
1266 if (num
>= ndomains
) {
1267 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1268 printk(KERN_ERR
"IOMMU: no free domain ids\n");
1273 set_bit(num
, iommu
->domain_ids
);
1274 set_bit(iommu
->seq_id
, &domain
->iommu_bmp
);
1275 iommu
->domains
[num
] = domain
;
1276 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1281 static void iommu_detach_domain(struct dmar_domain
*domain
,
1282 struct intel_iommu
*iommu
)
1284 unsigned long flags
;
1288 spin_lock_irqsave(&iommu
->lock
, flags
);
1289 ndomains
= cap_ndoms(iommu
->cap
);
1290 for_each_set_bit(num
, iommu
->domain_ids
, ndomains
) {
1291 if (iommu
->domains
[num
] == domain
) {
1298 clear_bit(num
, iommu
->domain_ids
);
1299 clear_bit(iommu
->seq_id
, &domain
->iommu_bmp
);
1300 iommu
->domains
[num
] = NULL
;
1302 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1305 static struct iova_domain reserved_iova_list
;
1306 static struct lock_class_key reserved_rbtree_key
;
1308 static int dmar_init_reserved_ranges(void)
1310 struct pci_dev
*pdev
= NULL
;
1314 init_iova_domain(&reserved_iova_list
, DMA_32BIT_PFN
);
1316 lockdep_set_class(&reserved_iova_list
.iova_rbtree_lock
,
1317 &reserved_rbtree_key
);
1319 /* IOAPIC ranges shouldn't be accessed by DMA */
1320 iova
= reserve_iova(&reserved_iova_list
, IOVA_PFN(IOAPIC_RANGE_START
),
1321 IOVA_PFN(IOAPIC_RANGE_END
));
1323 printk(KERN_ERR
"Reserve IOAPIC range failed\n");
1327 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1328 for_each_pci_dev(pdev
) {
1331 for (i
= 0; i
< PCI_NUM_RESOURCES
; i
++) {
1332 r
= &pdev
->resource
[i
];
1333 if (!r
->flags
|| !(r
->flags
& IORESOURCE_MEM
))
1335 iova
= reserve_iova(&reserved_iova_list
,
1339 printk(KERN_ERR
"Reserve iova failed\n");
1347 static void domain_reserve_special_ranges(struct dmar_domain
*domain
)
1349 copy_reserved_iova(&reserved_iova_list
, &domain
->iovad
);
1352 static inline int guestwidth_to_adjustwidth(int gaw
)
1355 int r
= (gaw
- 12) % 9;
1366 static int domain_init(struct dmar_domain
*domain
, int guest_width
)
1368 struct intel_iommu
*iommu
;
1369 int adjust_width
, agaw
;
1370 unsigned long sagaw
;
1372 init_iova_domain(&domain
->iovad
, DMA_32BIT_PFN
);
1373 spin_lock_init(&domain
->iommu_lock
);
1375 domain_reserve_special_ranges(domain
);
1377 /* calculate AGAW */
1378 iommu
= domain_get_iommu(domain
);
1379 if (guest_width
> cap_mgaw(iommu
->cap
))
1380 guest_width
= cap_mgaw(iommu
->cap
);
1381 domain
->gaw
= guest_width
;
1382 adjust_width
= guestwidth_to_adjustwidth(guest_width
);
1383 agaw
= width_to_agaw(adjust_width
);
1384 sagaw
= cap_sagaw(iommu
->cap
);
1385 if (!test_bit(agaw
, &sagaw
)) {
1386 /* hardware doesn't support it, choose a bigger one */
1387 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw
);
1388 agaw
= find_next_bit(&sagaw
, 5, agaw
);
1392 domain
->agaw
= agaw
;
1393 INIT_LIST_HEAD(&domain
->devices
);
1395 if (ecap_coherent(iommu
->ecap
))
1396 domain
->iommu_coherency
= 1;
1398 domain
->iommu_coherency
= 0;
1400 if (ecap_sc_support(iommu
->ecap
))
1401 domain
->iommu_snooping
= 1;
1403 domain
->iommu_snooping
= 0;
1405 domain
->iommu_count
= 1;
1406 domain
->nid
= iommu
->node
;
1408 /* always allocate the top pgd */
1409 domain
->pgd
= (struct dma_pte
*)alloc_pgtable_page(domain
->nid
);
1412 __iommu_flush_cache(iommu
, domain
->pgd
, PAGE_SIZE
);
1416 static void domain_exit(struct dmar_domain
*domain
)
1418 struct dmar_drhd_unit
*drhd
;
1419 struct intel_iommu
*iommu
;
/* Domain 0 is reserved, so don't process it */
1425 domain_remove_dev_info(domain
);
1427 put_iova_domain(&domain
->iovad
);
1430 dma_pte_clear_range(domain
, 0, DOMAIN_MAX_PFN(domain
->gaw
));
1432 /* free page tables */
1433 dma_pte_free_pagetable(domain
, 0, DOMAIN_MAX_PFN(domain
->gaw
));
1435 for_each_active_iommu(iommu
, drhd
)
1436 if (test_bit(iommu
->seq_id
, &domain
->iommu_bmp
))
1437 iommu_detach_domain(domain
, iommu
);
1439 free_domain_mem(domain
);
1442 static int domain_context_mapping_one(struct dmar_domain
*domain
, int segment
,
1443 u8 bus
, u8 devfn
, int translation
)
1445 struct context_entry
*context
;
1446 unsigned long flags
;
1447 struct intel_iommu
*iommu
;
1448 struct dma_pte
*pgd
;
1450 unsigned long ndomains
;
1453 struct device_domain_info
*info
= NULL
;
1455 pr_debug("Set context mapping for %02x:%02x.%d\n",
1456 bus
, PCI_SLOT(devfn
), PCI_FUNC(devfn
));
1458 BUG_ON(!domain
->pgd
);
1459 BUG_ON(translation
!= CONTEXT_TT_PASS_THROUGH
&&
1460 translation
!= CONTEXT_TT_MULTI_LEVEL
);
1462 iommu
= device_to_iommu(segment
, bus
, devfn
);
1466 context
= device_to_context_entry(iommu
, bus
, devfn
);
1469 spin_lock_irqsave(&iommu
->lock
, flags
);
1470 if (context_present(context
)) {
1471 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1478 if (domain
->flags
& DOMAIN_FLAG_VIRTUAL_MACHINE
||
1479 domain
->flags
& DOMAIN_FLAG_STATIC_IDENTITY
) {
1482 /* find an available domain id for this device in iommu */
1483 ndomains
= cap_ndoms(iommu
->cap
);
1484 for_each_set_bit(num
, iommu
->domain_ids
, ndomains
) {
1485 if (iommu
->domains
[num
] == domain
) {
1493 num
= find_first_zero_bit(iommu
->domain_ids
, ndomains
);
1494 if (num
>= ndomains
) {
1495 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1496 printk(KERN_ERR
"IOMMU: no free domain ids\n");
1500 set_bit(num
, iommu
->domain_ids
);
1501 iommu
->domains
[num
] = domain
;
1505 /* Skip top levels of page tables for
1506 * iommu which has less agaw than default.
1507 * Unnecessary for PT mode.
1509 if (translation
!= CONTEXT_TT_PASS_THROUGH
) {
1510 for (agaw
= domain
->agaw
; agaw
!= iommu
->agaw
; agaw
--) {
1511 pgd
= phys_to_virt(dma_pte_addr(pgd
));
1512 if (!dma_pte_present(pgd
)) {
1513 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1520 context_set_domain_id(context
, id
);
1522 if (translation
!= CONTEXT_TT_PASS_THROUGH
) {
1523 info
= iommu_support_dev_iotlb(domain
, segment
, bus
, devfn
);
1524 translation
= info
? CONTEXT_TT_DEV_IOTLB
:
1525 CONTEXT_TT_MULTI_LEVEL
;
1528 * In pass through mode, AW must be programmed to indicate the largest
1529 * AGAW value supported by hardware. And ASR is ignored by hardware.
1531 if (unlikely(translation
== CONTEXT_TT_PASS_THROUGH
))
1532 context_set_address_width(context
, iommu
->msagaw
);
1534 context_set_address_root(context
, virt_to_phys(pgd
));
1535 context_set_address_width(context
, iommu
->agaw
);
1538 context_set_translation_type(context
, translation
);
1539 context_set_fault_enable(context
);
1540 context_set_present(context
);
1541 domain_flush_cache(domain
, context
, sizeof(*context
));
 * It's a non-present to present mapping. If hardware doesn't cache
 * non-present entries we only need to flush the write-buffer. If it
 * _does_ cache non-present entries, then it does so in the special
 * domain #0, which we have to flush:
1549 if (cap_caching_mode(iommu
->cap
)) {
1550 iommu
->flush
.flush_context(iommu
, 0,
1551 (((u16
)bus
) << 8) | devfn
,
1552 DMA_CCMD_MASK_NOBIT
,
1553 DMA_CCMD_DEVICE_INVL
);
1554 iommu
->flush
.flush_iotlb(iommu
, domain
->id
, 0, 0, DMA_TLB_DSI_FLUSH
);
1556 iommu_flush_write_buffer(iommu
);
1558 iommu_enable_dev_iotlb(info
);
1559 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1561 spin_lock_irqsave(&domain
->iommu_lock
, flags
);
1562 if (!test_and_set_bit(iommu
->seq_id
, &domain
->iommu_bmp
)) {
1563 domain
->iommu_count
++;
1564 if (domain
->iommu_count
== 1)
1565 domain
->nid
= iommu
->node
;
1566 domain_update_iommu_cap(domain
);
1568 spin_unlock_irqrestore(&domain
->iommu_lock
, flags
);
1573 domain_context_mapping(struct dmar_domain
*domain
, struct pci_dev
*pdev
,
1577 struct pci_dev
*tmp
, *parent
;
1579 ret
= domain_context_mapping_one(domain
, pci_domain_nr(pdev
->bus
),
1580 pdev
->bus
->number
, pdev
->devfn
,
1585 /* dependent device mapping */
1586 tmp
= pci_find_upstream_pcie_bridge(pdev
);
1589 /* Secondary interface's bus number and devfn 0 */
1590 parent
= pdev
->bus
->self
;
1591 while (parent
!= tmp
) {
1592 ret
= domain_context_mapping_one(domain
,
1593 pci_domain_nr(parent
->bus
),
1594 parent
->bus
->number
,
1595 parent
->devfn
, translation
);
1598 parent
= parent
->bus
->self
;
1600 if (pci_is_pcie(tmp
)) /* this is a PCIe-to-PCI bridge */
1601 return domain_context_mapping_one(domain
,
1602 pci_domain_nr(tmp
->subordinate
),
1603 tmp
->subordinate
->number
, 0,
1605 else /* this is a legacy PCI bridge */
1606 return domain_context_mapping_one(domain
,
1607 pci_domain_nr(tmp
->bus
),
1613 static int domain_context_mapped(struct pci_dev
*pdev
)
1616 struct pci_dev
*tmp
, *parent
;
1617 struct intel_iommu
*iommu
;
1619 iommu
= device_to_iommu(pci_domain_nr(pdev
->bus
), pdev
->bus
->number
,
1624 ret
= device_context_mapped(iommu
, pdev
->bus
->number
, pdev
->devfn
);
1627 /* dependent device mapping */
1628 tmp
= pci_find_upstream_pcie_bridge(pdev
);
1631 /* Secondary interface's bus number and devfn 0 */
1632 parent
= pdev
->bus
->self
;
1633 while (parent
!= tmp
) {
1634 ret
= device_context_mapped(iommu
, parent
->bus
->number
,
1638 parent
= parent
->bus
->self
;
1640 if (pci_is_pcie(tmp
))
1641 return device_context_mapped(iommu
, tmp
->subordinate
->number
,
1644 return device_context_mapped(iommu
, tmp
->bus
->number
,
1648 /* Returns a number of VTD pages, but aligned to MM page size */
1649 static inline unsigned long aligned_nrpages(unsigned long host_addr
,
1652 host_addr
&= ~PAGE_MASK
;
1653 return PAGE_ALIGN(host_addr
+ size
) >> VTD_PAGE_SHIFT
;
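/*
 * Worked example (editorial note, not part of the original source): with 4KiB
 * MM pages and 4KiB VT-d pages, a buffer starting 0x800 bytes into a page
 * with size 0x1800 gives PAGE_ALIGN(0x800 + 0x1800) >> VTD_PAGE_SHIFT == 2,
 * i.e. the mapping spans two VT-d pages.
 */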
1656 static int __domain_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
1657 struct scatterlist
*sg
, unsigned long phys_pfn
,
1658 unsigned long nr_pages
, int prot
)
1660 struct dma_pte
*first_pte
= NULL
, *pte
= NULL
;
1661 phys_addr_t
uninitialized_var(pteval
);
1662 int addr_width
= agaw_to_width(domain
->agaw
) - VTD_PAGE_SHIFT
;
1663 unsigned long sg_res
;
1665 BUG_ON(addr_width
< BITS_PER_LONG
&& (iov_pfn
+ nr_pages
- 1) >> addr_width
);
1667 if ((prot
& (DMA_PTE_READ
|DMA_PTE_WRITE
)) == 0)
1670 prot
&= DMA_PTE_READ
| DMA_PTE_WRITE
| DMA_PTE_SNP
;
1675 sg_res
= nr_pages
+ 1;
1676 pteval
= ((phys_addr_t
)phys_pfn
<< VTD_PAGE_SHIFT
) | prot
;
1679 while (nr_pages
--) {
1683 sg_res
= aligned_nrpages(sg
->offset
, sg
->length
);
1684 sg
->dma_address
= ((dma_addr_t
)iov_pfn
<< VTD_PAGE_SHIFT
) + sg
->offset
;
1685 sg
->dma_length
= sg
->length
;
1686 pteval
= page_to_phys(sg_page(sg
)) | prot
;
1689 first_pte
= pte
= pfn_to_dma_pte(domain
, iov_pfn
);
1693 /* We don't need lock here, nobody else
1694 * touches the iova range
1696 tmp
= cmpxchg64_local(&pte
->val
, 0ULL, pteval
);
1698 static int dumps
= 5;
1699 printk(KERN_CRIT
"ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1700 iov_pfn
, tmp
, (unsigned long long)pteval
);
1703 debug_dma_dump_mappings(NULL
);
1708 if (!nr_pages
|| first_pte_in_page(pte
)) {
1709 domain_flush_cache(domain
, first_pte
,
1710 (void *)pte
- (void *)first_pte
);
1714 pteval
+= VTD_PAGE_SIZE
;
1722 static inline int domain_sg_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
1723 struct scatterlist
*sg
, unsigned long nr_pages
,
1726 return __domain_mapping(domain
, iov_pfn
, sg
, 0, nr_pages
, prot
);
1729 static inline int domain_pfn_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
1730 unsigned long phys_pfn
, unsigned long nr_pages
,
1733 return __domain_mapping(domain
, iov_pfn
, NULL
, phys_pfn
, nr_pages
, prot
);
1736 static void iommu_detach_dev(struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
1741 clear_context_table(iommu
, bus
, devfn
);
1742 iommu
->flush
.flush_context(iommu
, 0, 0, 0,
1743 DMA_CCMD_GLOBAL_INVL
);
1744 iommu
->flush
.flush_iotlb(iommu
, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH
);
1747 static void domain_remove_dev_info(struct dmar_domain
*domain
)
1749 struct device_domain_info
*info
;
1750 unsigned long flags
;
1751 struct intel_iommu
*iommu
;
1753 spin_lock_irqsave(&device_domain_lock
, flags
);
1754 while (!list_empty(&domain
->devices
)) {
1755 info
= list_entry(domain
->devices
.next
,
1756 struct device_domain_info
, link
);
1757 list_del(&info
->link
);
1758 list_del(&info
->global
);
1760 info
->dev
->dev
.archdata
.iommu
= NULL
;
1761 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1763 iommu_disable_dev_iotlb(info
);
1764 iommu
= device_to_iommu(info
->segment
, info
->bus
, info
->devfn
);
1765 iommu_detach_dev(iommu
, info
->bus
, info
->devfn
);
1766 free_devinfo_mem(info
);
1768 spin_lock_irqsave(&device_domain_lock
, flags
);
1770 spin_unlock_irqrestore(&device_domain_lock
, flags
);
 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1777 static struct dmar_domain
*
1778 find_domain(struct pci_dev
*pdev
)
1780 struct device_domain_info
*info
;
1782 /* No lock here, assumes no domain exit in normal case */
1783 info
= pdev
->dev
.archdata
.iommu
;
1785 return info
->domain
;
1789 /* domain is initialized */
1790 static struct dmar_domain
*get_domain_for_dev(struct pci_dev
*pdev
, int gaw
)
1792 struct dmar_domain
*domain
, *found
= NULL
;
1793 struct intel_iommu
*iommu
;
1794 struct dmar_drhd_unit
*drhd
;
1795 struct device_domain_info
*info
, *tmp
;
1796 struct pci_dev
*dev_tmp
;
1797 unsigned long flags
;
1798 int bus
= 0, devfn
= 0;
1802 domain
= find_domain(pdev
);
1806 segment
= pci_domain_nr(pdev
->bus
);
1808 dev_tmp
= pci_find_upstream_pcie_bridge(pdev
);
1810 if (pci_is_pcie(dev_tmp
)) {
1811 bus
= dev_tmp
->subordinate
->number
;
1814 bus
= dev_tmp
->bus
->number
;
1815 devfn
= dev_tmp
->devfn
;
1817 spin_lock_irqsave(&device_domain_lock
, flags
);
1818 list_for_each_entry(info
, &device_domain_list
, global
) {
1819 if (info
->segment
== segment
&&
1820 info
->bus
== bus
&& info
->devfn
== devfn
) {
1821 found
= info
->domain
;
1825 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1826 /* pcie-pci bridge already has a domain, uses it */
1833 domain
= alloc_domain();
1837 /* Allocate new domain for the device */
1838 drhd
= dmar_find_matched_drhd_unit(pdev
);
1840 printk(KERN_ERR
"IOMMU: can't find DMAR for device %s\n",
1844 iommu
= drhd
->iommu
;
1846 ret
= iommu_attach_domain(domain
, iommu
);
1848 free_domain_mem(domain
);
1852 if (domain_init(domain
, gaw
)) {
1853 domain_exit(domain
);
1857 /* register pcie-to-pci device */
1859 info
= alloc_devinfo_mem();
1861 domain_exit(domain
);
1864 info
->segment
= segment
;
1866 info
->devfn
= devfn
;
1868 info
->domain
= domain
;
1869 /* This domain is shared by devices under p2p bridge */
1870 domain
->flags
|= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES
;
1872 /* pcie-to-pci bridge already has a domain, uses it */
1874 spin_lock_irqsave(&device_domain_lock
, flags
);
1875 list_for_each_entry(tmp
, &device_domain_list
, global
) {
1876 if (tmp
->segment
== segment
&&
1877 tmp
->bus
== bus
&& tmp
->devfn
== devfn
) {
1878 found
= tmp
->domain
;
1883 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1884 free_devinfo_mem(info
);
1885 domain_exit(domain
);
1888 list_add(&info
->link
, &domain
->devices
);
1889 list_add(&info
->global
, &device_domain_list
);
1890 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1895 info
= alloc_devinfo_mem();
1898 info
->segment
= segment
;
1899 info
->bus
= pdev
->bus
->number
;
1900 info
->devfn
= pdev
->devfn
;
1902 info
->domain
= domain
;
1903 spin_lock_irqsave(&device_domain_lock
, flags
);
1904 /* somebody is fast */
1905 found
= find_domain(pdev
);
1906 if (found
!= NULL
) {
1907 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1908 if (found
!= domain
) {
1909 domain_exit(domain
);
1912 free_devinfo_mem(info
);
1915 list_add(&info
->link
, &domain
->devices
);
1916 list_add(&info
->global
, &device_domain_list
);
1917 pdev
->dev
.archdata
.iommu
= info
;
1918 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1921 /* recheck it here, maybe others set it */
1922 return find_domain(pdev
);
1925 static int iommu_identity_mapping
;
1926 #define IDENTMAP_ALL 1
1927 #define IDENTMAP_GFX 2
1928 #define IDENTMAP_AZALIA 4
1930 static int iommu_domain_identity_map(struct dmar_domain
*domain
,
1931 unsigned long long start
,
1932 unsigned long long end
)
1934 unsigned long first_vpfn
= start
>> VTD_PAGE_SHIFT
;
1935 unsigned long last_vpfn
= end
>> VTD_PAGE_SHIFT
;
1937 if (!reserve_iova(&domain
->iovad
, dma_to_mm_pfn(first_vpfn
),
1938 dma_to_mm_pfn(last_vpfn
))) {
1939 printk(KERN_ERR
"IOMMU: reserve iova failed\n");
1943 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1944 start
, end
, domain
->id
);
 * RMRR range might have overlap with physical memory range; clear it first
1949 dma_pte_clear_range(domain
, first_vpfn
, last_vpfn
);
1951 return domain_pfn_mapping(domain
, first_vpfn
, first_vpfn
,
1952 last_vpfn
- first_vpfn
+ 1,
1953 DMA_PTE_READ
|DMA_PTE_WRITE
);
1956 static int iommu_prepare_identity_map(struct pci_dev
*pdev
,
1957 unsigned long long start
,
1958 unsigned long long end
)
1960 struct dmar_domain
*domain
;
1963 domain
= get_domain_for_dev(pdev
, DEFAULT_DOMAIN_ADDRESS_WIDTH
);
1967 /* For _hardware_ passthrough, don't bother. But for software
1968 passthrough, we do it anyway -- it may indicate a memory
1969 range which is reserved in E820, so which didn't get set
1970 up to start with in si_domain */
1971 if (domain
== si_domain
&& hw_pass_through
) {
1972 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1973 pci_name(pdev
), start
, end
);
1978 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1979 pci_name(pdev
), start
, end
);
1982 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1983 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1984 dmi_get_system_info(DMI_BIOS_VENDOR
),
1985 dmi_get_system_info(DMI_BIOS_VERSION
),
1986 dmi_get_system_info(DMI_PRODUCT_VERSION
));
1991 if (end
>> agaw_to_width(domain
->agaw
)) {
1992 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1993 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1994 agaw_to_width(domain
->agaw
),
1995 dmi_get_system_info(DMI_BIOS_VENDOR
),
1996 dmi_get_system_info(DMI_BIOS_VERSION
),
1997 dmi_get_system_info(DMI_PRODUCT_VERSION
));
2002 ret
= iommu_domain_identity_map(domain
, start
, end
);
2006 /* context entry init */
2007 ret
= domain_context_mapping(domain
, pdev
, CONTEXT_TT_MULTI_LEVEL
);
2014 domain_exit(domain
);
2018 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit
*rmrr
,
2019 struct pci_dev
*pdev
)
2021 if (pdev
->dev
.archdata
.iommu
== DUMMY_DEVICE_DOMAIN_INFO
)
2023 return iommu_prepare_identity_map(pdev
, rmrr
->base_address
,
2024 rmrr
->end_address
+ 1);
2027 #ifdef CONFIG_DMAR_FLOPPY_WA
2028 static inline void iommu_prepare_isa(void)
2030 struct pci_dev
*pdev
;
2033 pdev
= pci_get_class(PCI_CLASS_BRIDGE_ISA
<< 8, NULL
);
2037 printk(KERN_INFO
"IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2038 ret
= iommu_prepare_identity_map(pdev
, 0, 16*1024*1024);
2041 printk(KERN_ERR
"IOMMU: Failed to create 0-16MiB identity map; "
2042 "floppy might not work\n");
2046 static inline void iommu_prepare_isa(void)
#endif /* !CONFIG_DMAR_FLOPPY_WA */
2052 static int md_domain_init(struct dmar_domain
*domain
, int guest_width
);
2054 static int __init
si_domain_work_fn(unsigned long start_pfn
,
2055 unsigned long end_pfn
, void *datax
)
2059 *ret
= iommu_domain_identity_map(si_domain
,
2060 (uint64_t)start_pfn
<< PAGE_SHIFT
,
2061 (uint64_t)end_pfn
<< PAGE_SHIFT
);
2066 static int __init
si_domain_init(int hw
)
2068 struct dmar_drhd_unit
*drhd
;
2069 struct intel_iommu
*iommu
;
2072 si_domain
= alloc_domain();
2076 pr_debug("Identity mapping domain is domain %d\n", si_domain
->id
);
2078 for_each_active_iommu(iommu
, drhd
) {
2079 ret
= iommu_attach_domain(si_domain
, iommu
);
2081 domain_exit(si_domain
);
2086 if (md_domain_init(si_domain
, DEFAULT_DOMAIN_ADDRESS_WIDTH
)) {
2087 domain_exit(si_domain
);
2091 si_domain
->flags
= DOMAIN_FLAG_STATIC_IDENTITY
;
2096 for_each_online_node(nid
) {
2097 work_with_active_regions(nid
, si_domain_work_fn
, &ret
);
2105 static void domain_remove_one_dev_info(struct dmar_domain
*domain
,
2106 struct pci_dev
*pdev
);
2107 static int identity_mapping(struct pci_dev
*pdev
)
2109 struct device_domain_info
*info
;
2111 if (likely(!iommu_identity_mapping
))
2115 list_for_each_entry(info
, &si_domain
->devices
, link
)
2116 if (info
->dev
== pdev
)
2121 static int domain_add_dev_info(struct dmar_domain
*domain
,
2122 struct pci_dev
*pdev
,
2125 struct device_domain_info
*info
;
2126 unsigned long flags
;
2129 info
= alloc_devinfo_mem();
2133 ret
= domain_context_mapping(domain
, pdev
, translation
);
2135 free_devinfo_mem(info
);
2139 info
->segment
= pci_domain_nr(pdev
->bus
);
2140 info
->bus
= pdev
->bus
->number
;
2141 info
->devfn
= pdev
->devfn
;
2143 info
->domain
= domain
;
2145 spin_lock_irqsave(&device_domain_lock
, flags
);
2146 list_add(&info
->link
, &domain
->devices
);
2147 list_add(&info
->global
, &device_domain_list
);
2148 pdev
->dev
.archdata
.iommu
= info
;
2149 spin_unlock_irqrestore(&device_domain_lock
, flags
);
2154 static int iommu_should_identity_map(struct pci_dev
*pdev
, int startup
)
2156 if ((iommu_identity_mapping
& IDENTMAP_AZALIA
) && IS_AZALIA(pdev
))
2159 if ((iommu_identity_mapping
& IDENTMAP_GFX
) && IS_GFX_DEVICE(pdev
))
2162 if (!(iommu_identity_mapping
& IDENTMAP_ALL
))
2166 * We want to start off with all devices in the 1:1 domain, and
2167 * take them out later if we find they can't access all of memory.
2169 * However, we can't do this for PCI devices behind bridges,
2170 * because all PCI devices behind the same bridge will end up
2171 * with the same source-id on their transactions.
2173 * Practically speaking, we can't change things around for these
2174 * devices at run-time, because we can't be sure there'll be no
2175 * DMA transactions in flight for any of their siblings.
2177 * So PCI devices (unless they're on the root bus) as well as
2178 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2179 * the 1:1 domain, just in _case_ one of their siblings turns out
2180 * not to be able to map all of memory.
2182 if (!pci_is_pcie(pdev
)) {
2183 if (!pci_is_root_bus(pdev
->bus
))
2185 if (pdev
->class >> 8 == PCI_CLASS_BRIDGE_PCI
)
2187 } else if (pdev
->pcie_type
== PCI_EXP_TYPE_PCI_BRIDGE
)
2191 * At boot time, we don't yet know if devices will be 64-bit capable.
2192 * Assume that they will -- if they turn out not to be, then we can
2193 * take them out of the 1:1 domain later.
2196 return pdev
->dma_mask
> DMA_BIT_MASK(32);
2201 static int __init
iommu_prepare_static_identity_mapping(int hw
)
2203 struct pci_dev
*pdev
= NULL
;
2206 ret
= si_domain_init(hw
);
2210 for_each_pci_dev(pdev
) {
2211 if (iommu_should_identity_map(pdev
, 1)) {
2212 printk(KERN_INFO
"IOMMU: %s identity mapping for device %s\n",
2213 hw
? "hardware" : "software", pci_name(pdev
));
2215 ret
= domain_add_dev_info(si_domain
, pdev
,
2216 hw
? CONTEXT_TT_PASS_THROUGH
:
2217 CONTEXT_TT_MULTI_LEVEL
);
2226 static int __init
init_dmars(void)
2228 struct dmar_drhd_unit
*drhd
;
2229 struct dmar_rmrr_unit
*rmrr
;
2230 struct pci_dev
*pdev
;
2231 struct intel_iommu
*iommu
;
2237 * initialize and program root entry to not present
2240 for_each_drhd_unit(drhd
) {
2243 * lock not needed as this is only incremented in the single
2244 * threaded kernel __init code path all other access are read
2249 g_iommus
= kcalloc(g_num_of_iommus
, sizeof(struct intel_iommu
*),
2252 printk(KERN_ERR
"Allocating global iommu array failed\n");
2257 deferred_flush
= kzalloc(g_num_of_iommus
*
2258 sizeof(struct deferred_flush_tables
), GFP_KERNEL
);
2259 if (!deferred_flush
) {
2264 for_each_drhd_unit(drhd
) {
2268 iommu
= drhd
->iommu
;
2269 g_iommus
[iommu
->seq_id
] = iommu
;
2271 ret
= iommu_init_domains(iommu
);
2277 * we could share the same root & context tables
2278 * among all IOMMU's. Need to Split it later.
2280 ret
= iommu_alloc_root_entry(iommu
);
2282 printk(KERN_ERR
"IOMMU: allocate root entry failed\n");
2285 if (!ecap_pass_through(iommu
->ecap
))
2286 hw_pass_through
= 0;
2290 * Start from the sane iommu hardware state.
2292 for_each_drhd_unit(drhd
) {
2296 iommu
= drhd
->iommu
;
2299 * If the queued invalidation is already initialized by us
2300 * (for example, while enabling interrupt-remapping) then
2301 * we got the things already rolling from a sane state.
2307 * Clear any previous faults.
2309 dmar_fault(-1, iommu
);
2311 * Disable queued invalidation if supported and already enabled
2312 * before OS handover.
2314 dmar_disable_qi(iommu
);
2317 for_each_drhd_unit(drhd
) {
2321 iommu
= drhd
->iommu
;
2323 if (dmar_enable_qi(iommu
)) {
2325 * Queued Invalidate not enabled, use Register Based
2328 iommu
->flush
.flush_context
= __iommu_flush_context
;
2329 iommu
->flush
.flush_iotlb
= __iommu_flush_iotlb
;
2330 printk(KERN_INFO
"IOMMU %d 0x%Lx: using Register based "
2333 (unsigned long long)drhd
->reg_base_addr
);
2335 iommu
->flush
.flush_context
= qi_flush_context
;
2336 iommu
->flush
.flush_iotlb
= qi_flush_iotlb
;
2337 printk(KERN_INFO
"IOMMU %d 0x%Lx: using Queued "
2340 (unsigned long long)drhd
->reg_base_addr
);
2344 if (iommu_pass_through
)
2345 iommu_identity_mapping
|= IDENTMAP_ALL
;
2347 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2348 iommu_identity_mapping
|= IDENTMAP_GFX
;
2351 check_tylersburg_isoch();
2354 * If pass through is not set or not enabled, setup context entries for
2355 * identity mappings for rmrr, gfx, and isa and may fall back to static
2356 * identity mapping if iommu_identity_mapping is set.
2358 if (iommu_identity_mapping
) {
2359 ret
= iommu_prepare_static_identity_mapping(hw_pass_through
);
2361 printk(KERN_CRIT
"Failed to setup IOMMU pass-through\n");
2367 * for each dev attached to rmrr
2369 * locate drhd for dev, alloc domain for dev
2370 * allocate free domain
2371 * allocate page table entries for rmrr
2372 * if context not allocated for bus
2373 * allocate and init context
2374 * set present in root table for this bus
2375 * init context with domain, translation etc
2379 printk(KERN_INFO
"IOMMU: Setting RMRR:\n");
2380 for_each_rmrr_units(rmrr
) {
2381 for (i
= 0; i
< rmrr
->devices_cnt
; i
++) {
2382 pdev
= rmrr
->devices
[i
];
 * some BIOSes list nonexistent devices in the DMAR table
2389 ret
= iommu_prepare_rmrr_dev(rmrr
, pdev
);
2392 "IOMMU: mapping reserved region failed\n");
2396 iommu_prepare_isa();
2401 * global invalidate context cache
2402 * global invalidate iotlb
2403 * enable translation
2405 for_each_drhd_unit(drhd
) {
2406 if (drhd
->ignored
) {
2408 * we always have to disable PMRs or DMA may fail on
2412 iommu_disable_protect_mem_regions(drhd
->iommu
);
2415 iommu
= drhd
->iommu
;
2417 iommu_flush_write_buffer(iommu
);
2419 ret
= dmar_set_interrupt(iommu
);
2423 iommu_set_root_entry(iommu
);
2425 iommu
->flush
.flush_context(iommu
, 0, 0, 0, DMA_CCMD_GLOBAL_INVL
);
2426 iommu
->flush
.flush_iotlb(iommu
, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH
);
2428 ret
= iommu_enable_translation(iommu
);
2432 iommu_disable_protect_mem_regions(iommu
);
2437 for_each_drhd_unit(drhd
) {
2440 iommu
= drhd
->iommu
;
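
/*
 * Everything below this point implements the DMA API proper: IOVA
 * allocation, the map/unmap entry points wired up through intel_dma_ops,
 * and the deferred-flush machinery used by the non-strict unmap path.
 */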
/* This takes a number of _MM_ pages, not VTD pages */
static struct iova *intel_alloc_iova(struct device *dev,
				     struct dmar_domain *domain,
				     unsigned long nrpages, uint64_t dma_mask)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct iova *iova = NULL;

	/* Restrict dma_mask to the width that the iommu can handle */
	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);

	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
		/*
		 * First try to allocate an io virtual address in
		 * DMA_BIT_MASK(32) and if that fails then try allocating
		 * from higher range
		 */
		iova = alloc_iova(&domain->iovad, nrpages,
				  IOVA_PFN(DMA_BIT_MASK(32)), 1);
		if (iova)
			return iova;
	}
	iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
	if (unlikely(!iova)) {
		printk(KERN_ERR "Allocating %ld-page iova for %s failed",
		       nrpages, pci_name(pdev));
		return NULL;
	}

	return iova;
}
static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
{
	struct dmar_domain *domain;
	int ret;

	domain = get_domain_for_dev(pdev,
			DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain) {
		printk(KERN_ERR
			"Allocating domain for %s failed", pci_name(pdev));
		return NULL;
	}

	/* make sure context mapping is ok */
	if (unlikely(!domain_context_mapped(pdev))) {
		ret = domain_context_mapping(domain, pdev,
					     CONTEXT_TT_MULTI_LEVEL);
		if (ret) {
			printk(KERN_ERR
				"Domain context map for %s failed",
				pci_name(pdev));
			return NULL;
		}
	}

	return domain;
}

static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
{
	struct device_domain_info *info;

	/* No lock here, assumes no domain exit in normal case */
	info = dev->dev.archdata.iommu;
	if (likely(info))
		return info->domain;

	return __get_valid_domain_for_dev(dev);
}
static int iommu_dummy(struct pci_dev *pdev)
{
	return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}
/* Check if the pdev needs to go through non-identity map and unmap process.*/
static int iommu_no_mapping(struct device *dev)
{
	struct pci_dev *pdev;
	int found;

	if (unlikely(dev->bus != &pci_bus_type))
		return 1;

	pdev = to_pci_dev(dev);
	if (iommu_dummy(pdev))
		return 1;

	if (!iommu_identity_mapping)
		return 0;

	found = identity_mapping(pdev);
	if (found) {
		if (iommu_should_identity_map(pdev, 0))
			return 1;
		else {
			/*
			 * 32 bit DMA is removed from si_domain and fall back
			 * to non-identity mapping.
			 */
			domain_remove_one_dev_info(si_domain, pdev);
			printk(KERN_INFO "32bit %s uses non-identity mapping\n",
			       pci_name(pdev));
			return 0;
		}
	} else {
		/*
		 * In case of a detached 64 bit DMA device from vm, the device
		 * is put into si_domain for identity mapping.
		 */
		if (iommu_should_identity_map(pdev, 0)) {
			int ret;
			ret = domain_add_dev_info(si_domain, pdev,
						  hw_pass_through ?
						  CONTEXT_TT_PASS_THROUGH :
						  CONTEXT_TT_MULTI_LEVEL);
			if (!ret) {
				printk(KERN_INFO "64bit %s uses identity mapping\n",
				       pci_name(pdev));
				return 1;
			}
		}
	}

	return 0;
}
static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	struct iova *iova;
	int prot = 0;
	int ret;
	struct intel_iommu *iommu;
	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;

	BUG_ON(dir == DMA_NONE);

	if (iommu_no_mapping(hwdev))
		return paddr;

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);
	size = aligned_nrpages(paddr, size);

	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
				dma_mask);
	if (!iova)
		goto error;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr - (paddr + size) might be partial page, we should map the whole
	 * page.  Note: if two part of one page are separately mapped, we
	 * might have two guest_addr mapping to the same host paddr, but this
	 * is not a big problem
	 */
	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
				 mm_to_dma_pfn(paddr_pfn), size, prot);
	if (ret)
		goto error;

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
	else
		iommu_flush_write_buffer(iommu);

	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
	start_paddr += paddr & ~PAGE_MASK;
	return start_paddr;

error:
	if (iova)
		__free_iova(&domain->iovad, iova);
	printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
		pci_name(pdev), size, (unsigned long long)paddr, dir);
	return 0;
}
static dma_addr_t intel_map_page(struct device *dev, struct page *page,
				 unsigned long offset, size_t size,
				 enum dma_data_direction dir,
				 struct dma_attrs *attrs)
{
	return __intel_map_single(dev, page_to_phys(page) + offset, size,
				  dir, to_pci_dev(dev)->dma_mask);
}
static void flush_unmaps(void)
{
	int i, j;

	timer_on = 0;

	/* just flush them all */
	for (i = 0; i < g_num_of_iommus; i++) {
		struct intel_iommu *iommu = g_iommus[i];
		if (!iommu)
			continue;

		if (!deferred_flush[i].next)
			continue;

		/* In caching mode, global flushes turn emulation expensive */
		if (!cap_caching_mode(iommu->cap))
			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
		for (j = 0; j < deferred_flush[i].next; j++) {
			unsigned long mask;
			struct iova *iova = deferred_flush[i].iova[j];
			struct dmar_domain *domain = deferred_flush[i].domain[j];

			/* On real hardware multiple invalidations are expensive */
			if (cap_caching_mode(iommu->cap))
				iommu_flush_iotlb_psi(iommu, domain->id,
				iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
			else {
				mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
			}
			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
		}
		deferred_flush[i].next = 0;
	}

	list_size = 0;
}
static void flush_unmaps_timeout(unsigned long data)
{
	unsigned long flags;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	flush_unmaps();
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
static void add_unmap(struct dmar_domain *dom, struct iova *iova)
{
	unsigned long flags;
	int next, iommu_id;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	if (list_size == HIGH_WATER_MARK)
		flush_unmaps();

	iommu = domain_get_iommu(dom);
	iommu_id = iommu->seq_id;

	next = deferred_flush[iommu_id].next;
	deferred_flush[iommu_id].domain[next] = dom;
	deferred_flush[iommu_id].iova[next] = iova;
	deferred_flush[iommu_id].next++;

	if (!timer_on) {
		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
		timer_on = 1;
	}
	list_size++;
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir,
			     struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;

	if (iommu_no_mapping(dev))
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
		      (unsigned long long)dev_addr))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
		 pci_name(pdev), start_pfn, last_pfn);

	/*  clear the whole page */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* free page tables */
	dma_pte_free_pagetable(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
				      last_pfn - start_pfn + 1, 0);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
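
/*
 * Coherent allocations are plain pages obtained from the page allocator and
 * pushed through the same __intel_map_single() path with DMA_BIDIRECTIONAL,
 * restricted by the device's coherent_dma_mask.
 */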
static void *intel_alloc_coherent(struct device *hwdev, size_t size,
				  dma_addr_t *dma_handle, gfp_t flags)
{
	void *vaddr;
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	if (!iommu_no_mapping(hwdev))
		flags &= ~(GFP_DMA | GFP_DMA32);
	else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
		if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
			flags |= GFP_DMA;
		else
			flags |= GFP_DMA32;
	}

	vaddr = (void *)__get_free_pages(flags, order);
	if (!vaddr)
		return NULL;
	memset(vaddr, 0, size);

	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
					 DMA_BIDIRECTIONAL,
					 hwdev->coherent_dma_mask);
	if (*dma_handle)
		return vaddr;
	free_pages((unsigned long)vaddr, order);
	return NULL;
}
static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
				dma_addr_t dma_handle)
{
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
	free_pages((unsigned long)vaddr, order);
}
static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
			   int nelems, enum dma_data_direction dir,
			   struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;

	if (iommu_no_mapping(hwdev))
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
		      (unsigned long long)sglist[0].dma_address))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	/*  clear the whole page */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* free page tables */
	dma_pte_free_pagetable(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
				      last_pfn - start_pfn + 1, 0);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
static int intel_nontranslate_map_sg(struct device *hddev,
	struct scatterlist *sglist, int nelems, int dir)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sglist, sg, nelems, i) {
		BUG_ON(!sg_page(sg));
		sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
		sg->dma_length = sg->length;
	}
	return nelems;
}
static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
			enum dma_data_direction dir, struct dma_attrs *attrs)
{
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	struct iova *iova = NULL;
	int ret;
	struct scatterlist *sg;
	unsigned long start_vpfn;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	if (iommu_no_mapping(hwdev))
		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	for_each_sg(sglist, sg, nelems, i)
		size += aligned_nrpages(sg->offset, sg->length);

	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
				pdev->dma_mask);
	if (!iova) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);

	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
	if (unlikely(ret)) {
		/*  clear the page */
		dma_pte_clear_range(domain, start_vpfn,
				    start_vpfn + size - 1);
		/* free page tables */
		dma_pte_free_pagetable(domain, start_vpfn,
				       start_vpfn + size - 1);
		/* free iova */
		__free_iova(&domain->iovad, iova);
		return 0;
	}

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
	else
		iommu_flush_write_buffer(iommu);

	return nelems;
}
static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
	return !dma_addr;
}
struct dma_map_ops intel_dma_ops = {
	.alloc_coherent = intel_alloc_coherent,
	.free_coherent = intel_free_coherent,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
	.map_page = intel_map_page,
	.unmap_page = intel_unmap_page,
	.mapping_error = intel_mapping_error,
};
static inline int iommu_domain_cache_init(void)
{
	int ret = 0;

	iommu_domain_cache = kmem_cache_create("iommu_domain",
					       sizeof(struct dmar_domain),
					       0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_domain_cache) {
		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static inline int iommu_devinfo_cache_init(void)
{
	int ret = 0;

	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
						sizeof(struct device_domain_info),
						0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_devinfo_cache) {
		printk(KERN_ERR "Couldn't create devinfo cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static inline int iommu_iova_cache_init(void)
{
	int ret = 0;

	iommu_iova_cache = kmem_cache_create("iommu_iova",
					     sizeof(struct iova),
					     0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_iova_cache) {
		printk(KERN_ERR "Couldn't create iova cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static int __init iommu_init_mempool(void)
{
	int ret;

	ret = iommu_iova_cache_init();
	if (ret)
		return ret;

	ret = iommu_domain_cache_init();
	if (ret)
		goto domain_error;

	ret = iommu_devinfo_cache_init();
	if (!ret)
		return ret;

	kmem_cache_destroy(iommu_domain_cache);
domain_error:
	kmem_cache_destroy(iommu_iova_cache);

	return -ENOMEM;
}

static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	kmem_cache_destroy(iommu_iova_cache);
}
static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return;
	}
	vtbar &= 0xffff0000;

	/* we know that the this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
			    TAINT_FIRMWARE_WORKAROUND,
			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
}
DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			int i;
			for (i = 0; i < drhd->devices_cnt; i++)
				if (drhd->devices[i] != NULL)
					break;
			/* ignore DMAR unit if no pci devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	if (dmar_map_gfx)
		return;

	for_each_drhd_unit(drhd) {
		int i;
		if (drhd->ignored || drhd->include_all)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++)
			if (drhd->devices[i] &&
			    !IS_GFX_DEVICE(drhd->devices[i]))
				break;

		if (i < drhd->devices_cnt)
			continue;

		/* bypass IOMMU if it is just for gfx devices */
		drhd->ignored = 1;
		for (i = 0; i < drhd->devices_cnt; i++) {
			if (!drhd->devices[i])
				continue;
			drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}
#ifdef CONFIG_SUSPEND
static int init_iommu_hw(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;

	for_each_active_iommu(iommu, drhd)
		if (iommu->qi)
			dmar_reenable_qi(iommu);

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

		iommu_set_root_entry(iommu);

		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
		if (iommu_enable_translation(iommu))
			return 1;
		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;
}
static void iommu_flush_all(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	for_each_active_iommu(iommu, drhd) {
		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
	}
}
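
/*
 * Suspend/resume support: iommu_suspend() saves the fault-event control,
 * data and address registers of every active IOMMU after disabling
 * translation; iommu_resume() re-runs init_iommu_hw() and writes the saved
 * register values back.
 */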
static int iommu_suspend(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	for_each_active_iommu(iommu, drhd) {
		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
						 GFP_ATOMIC);
		if (!iommu->iommu_state)
			goto nomem;
	}

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		spin_lock_irqsave(&iommu->register_lock, flag);

		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;

nomem:
	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);

	return -ENOMEM;
}
static void iommu_resume(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	if (init_iommu_hw()) {
		if (force_on)
			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
		else
			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
		return;
	}

	for_each_active_iommu(iommu, drhd) {

		spin_lock_irqsave(&iommu->register_lock, flag);

		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
			iommu->reg + DMAR_FECTL_REG);
		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
			iommu->reg + DMAR_FEDATA_REG);
		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
			iommu->reg + DMAR_FEADDR_REG);
		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
			iommu->reg + DMAR_FEUADDR_REG);

		spin_unlock_irqrestore(&iommu->register_lock, flag);
	}

	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);
}
static struct syscore_ops iommu_syscore_ops = {
	.resume		= iommu_resume,
	.suspend	= iommu_suspend,
};

static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}

#else
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_PM */
/*
 * Here we only respond to action of unbound device from driver.
 *
 * Added device is not attached to its DMAR domain here yet. That will happen
 * when mapping the device to iova.
 */
static int device_notifier(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	struct device *dev = data;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;

	if (iommu_no_mapping(dev))
		return 0;

	domain = find_domain(pdev);
	if (!domain)
		return 0;

	if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
		domain_remove_one_dev_info(domain, pdev);

		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
		    list_empty(&domain->devices))
			domain_exit(domain);
	}

	return 0;
}

static struct notifier_block device_nb = {
	.notifier_call = device_notifier,
};
int __init intel_iommu_init(void)
{
	int ret = 0;

	/* VT-d is required for a TXT/tboot launch, so enforce that */
	force_on = tboot_force_iommu();

	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		return	-ENODEV;
	}

	if (dmar_dev_scope_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		return	-ENODEV;
	}

	/*
	 * Check the need for DMA-remapping initialization now.
	 * Above initialization will also be used by Interrupt-remapping.
	 */
	if (no_iommu || dmar_disabled)
		return -ENODEV;

	if (iommu_init_mempool()) {
		if (force_on)
			panic("tboot: Failed to initialize iommu memory\n");
		return	-ENODEV;
	}

	if (dmar_init_reserved_ranges()) {
		if (force_on)
			panic("tboot: Failed to reserve iommu ranges\n");
		return	-ENODEV;
	}

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		printk(KERN_ERR "IOMMU: dmar init failed\n");
		put_iova_domain(&reserved_iova_list);
		iommu_exit_mempool();
		return ret;
	}
	printk(KERN_INFO
	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

	init_timer(&unmap_timer);
#ifdef CONFIG_SWIOTLB
	swiotlb = 0;
#endif
	dma_ops = &intel_dma_ops;

	init_iommu_pm_ops();

	register_iommu(&intel_iommu_ops);

	bus_register_notifier(&pci_bus_type, &device_nb);

	return 0;
}
static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
					   struct pci_dev *pdev)
{
	struct pci_dev *tmp, *parent;

	if (!iommu || !pdev)
		return;

	/* dependent device detach */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	/* Secondary interface's bus number and devfn 0 */
	if (tmp) {
		parent = pdev->bus->self;
		while (parent != tmp) {
			iommu_detach_dev(iommu, parent->bus->number,
					 parent->devfn);
			parent = parent->bus->self;
		}
		if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
			iommu_detach_dev(iommu,
				tmp->subordinate->number, 0);
		else /* this is a legacy PCI bridge */
			iommu_detach_dev(iommu, tmp->bus->number,
					 tmp->devfn);
	}
}
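
/*
 * Detach one device from a domain: remove its device_domain_info, tear down
 * the context entries (including those of any PCIe-to-PCI bridge in front
 * of it), and if no other device behind the same IOMMU remains in the
 * domain, drop that IOMMU from the domain's bitmap and release the domain
 * id on that unit.
 */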
static void domain_remove_one_dev_info(struct dmar_domain *domain,
					  struct pci_dev *pdev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	int found = 0;
	struct list_head *entry, *tmp;

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_safe(entry, tmp, &domain->devices) {
		info = list_entry(entry, struct device_domain_info, link);
		/* No need to compare PCI domain; it has to be the same */
		if (info->bus == pdev->bus->number &&
		    info->devfn == pdev->devfn) {
			list_del(&info->link);
			list_del(&info->global);
			if (info->dev)
				info->dev->dev.archdata.iommu = NULL;
			spin_unlock_irqrestore(&device_domain_lock, flags);

			iommu_disable_dev_iotlb(info);
			iommu_detach_dev(iommu, info->bus, info->devfn);
			iommu_detach_dependent_devices(iommu, pdev);
			free_devinfo_mem(info);

			spin_lock_irqsave(&device_domain_lock, flags);

			if (found)
				break;
			else
				continue;
		}

		/* if there is no other devices under the same iommu
		 * owned by this domain, clear this iommu in iommu_bmp
		 * update iommu count and coherency
		 */
		if (iommu == device_to_iommu(info->segment, info->bus,
					     info->devfn))
			found = 1;
	}

	if (found == 0) {
		unsigned long tmp_flags;
		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
		clear_bit(iommu->seq_id, &domain->iommu_bmp);
		domain->iommu_count--;
		domain_update_iommu_cap(domain);
		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);

		spin_lock_irqsave(&iommu->lock, tmp_flags);
		clear_bit(domain->id, iommu->domain_ids);
		iommu->domains[domain->id] = NULL;
		spin_unlock_irqrestore(&iommu->lock, tmp_flags);
	}

	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags1, flags2;

	spin_lock_irqsave(&device_domain_lock, flags1);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;

		spin_unlock_irqrestore(&device_domain_lock, flags1);

		iommu_disable_dev_iotlb(info);
		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
		iommu_detach_dev(iommu, info->bus, info->devfn);
		iommu_detach_dependent_devices(iommu, info->dev);

		/* clear this iommu in iommu_bmp, update iommu count
		 * and capabilities
		 */
		spin_lock_irqsave(&domain->iommu_lock, flags2);
		if (test_and_clear_bit(iommu->seq_id,
				       &domain->iommu_bmp)) {
			domain->iommu_count--;
			domain_update_iommu_cap(domain);
		}
		spin_unlock_irqrestore(&domain->iommu_lock, flags2);

		free_devinfo_mem(info);
		spin_lock_irqsave(&device_domain_lock, flags1);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags1);
}
/* domain id for virtual machine, it won't be set in context */
static unsigned long vm_domid;

static struct dmar_domain *iommu_alloc_vm_domain(void)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	domain->id = vm_domid++;
	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
	domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;

	return domain;
}
static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->iommu_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	INIT_LIST_HEAD(&domain->devices);

	domain->iommu_count = 0;
	domain->iommu_coherency = 0;
	domain->iommu_snooping = 0;
	domain->max_addr = 0;
	domain->nid = -1;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
static void iommu_free_vm_domain(struct dmar_domain *domain)
{
	unsigned long flags;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	unsigned long i;
	unsigned long ndomains;

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;

		ndomains = cap_ndoms(iommu->cap);
		for_each_set_bit(i, iommu->domain_ids, ndomains) {
			if (iommu->domains[i] == domain) {
				spin_lock_irqsave(&iommu->lock, flags);
				clear_bit(i, iommu->domain_ids);
				iommu->domains[i] = NULL;
				spin_unlock_irqrestore(&iommu->lock, flags);
				break;
			}
		}
	}
}
static void vm_domain_exit(struct dmar_domain *domain)
{
	/* Domain 0 is reserved, so dont process it */
	if (!domain)
		return;

	vm_domain_remove_all_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	iommu_free_vm_domain(domain);
	free_domain_mem(domain);
}
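
/*
 * The callbacks below plug into the generic IOMMU API (registered via
 * register_iommu() in intel_iommu_init()). A consumer such as KVM device
 * assignment would use them roughly as follows -- illustrative sketch only,
 * using the generic API of this kernel generation:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map(dom, iova, phys, get_order(size), IOMMU_READ | IOMMU_WRITE);
 */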
static int intel_iommu_domain_init(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain;

	dmar_domain = iommu_alloc_vm_domain();
	if (!dmar_domain) {
		printk(KERN_ERR
			"intel_iommu_domain_init: dmar_domain == NULL\n");
		return -ENOMEM;
	}
	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		printk(KERN_ERR
			"intel_iommu_domain_init() failed\n");
		vm_domain_exit(dmar_domain);
		return -ENOMEM;
	}
	domain->priv = dmar_domain;

	return 0;
}

static void intel_iommu_domain_destroy(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = domain->priv;

	domain->priv = NULL;
	vm_domain_exit(dmar_domain);
}
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct intel_iommu *iommu;
	int addr_width;

	/* normally pdev is not mapped */
	if (unlikely(domain_context_mapped(pdev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(pdev);
		if (old_domain) {
			if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
			    dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
				domain_remove_one_dev_info(old_domain, pdev);
			else
				domain_remove_dev_info(old_domain);
		}
	}

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		printk(KERN_ERR "%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
}
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);

	domain_remove_one_dev_info(dmar_domain, pdev);
}
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   int gfp_order, int iommu_prot)
{
	struct dmar_domain *dmar_domain = domain->priv;
	u64 max_addr;
	int prot = 0;
	size_t size;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	size     = PAGE_SIZE << gfp_order;
	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			printk(KERN_ERR "%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
static int intel_iommu_unmap(struct iommu_domain *domain,
			     unsigned long iova, int gfp_order)
{
	struct dmar_domain *dmar_domain = domain->priv;
	size_t size = PAGE_SIZE << gfp_order;

	dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
			    (iova + size - 1) >> VTD_PAGE_SHIFT);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return gfp_order;
}
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    unsigned long iova)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct dma_pte *pte;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}
static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
				      unsigned long cap)
{
	struct dmar_domain *dmar_domain = domain->priv;

	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return dmar_domain->iommu_snooping;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return intr_remapping_enabled;

	return 0;
}
static struct iommu_ops intel_iommu_ops = {
	.domain_init	= intel_iommu_domain_init,
	.domain_destroy = intel_iommu_domain_destroy,
	.attach_dev	= intel_iommu_attach_device,
	.detach_dev	= intel_iommu_detach_device,
	.map		= intel_iommu_map,
	.unmap		= intel_iommu_unmap,
	.iova_to_phys	= intel_iommu_iova_to_phys,
	.domain_has_cap = intel_iommu_domain_has_cap,
};
static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it:
	 */
	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;

	/* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
	if (dev->revision == 0x07) {
		printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
		dmar_map_gfx = 0;
	}
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that.  We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",