iommu/vt-d: Make identity_mapping() take struct device not struct pci_dev
drivers/iommu/intel-iommu.c
1 /*
2 * Copyright © 2006-2014 Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
18 */
19
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <asm/irq_remapping.h>
43 #include <asm/cacheflush.h>
44 #include <asm/iommu.h>
45
46 #include "irq_remapping.h"
47 #include "pci.h"
48
49 #define ROOT_SIZE VTD_PAGE_SIZE
50 #define CONTEXT_SIZE VTD_PAGE_SIZE
51
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START (0xfee00000)
57 #define IOAPIC_RANGE_END (0xfeefffff)
58 #define IOVA_START_ADDR (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
64
65 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
66 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
67
68 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
69 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
70 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
71 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
72 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
73
74 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
75 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
76 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
77
78 /* page table handling */
79 #define LEVEL_STRIDE (9)
80 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
81
82 /*
83 * This bitmap is used to advertise the page sizes our hardware supports
84 * to the IOMMU core, which will then use this information to split
85 * physically contiguous memory regions it is mapping into page sizes
86 * that we support.
87 *
88 * Traditionally the IOMMU core just handed us the mappings directly,
89 * after making sure the size is a power-of-two multiple of 4KiB and that the
90 * mapping has natural alignment.
91 *
92 * To retain this behavior, we currently advertise that we support
93 * all page sizes that are power-of-two multiples of 4KiB.
94 *
95 * If at some point we'd like to utilize the IOMMU core's new behavior,
96 * we could change this to advertise the real page sizes we support.
97 */
98 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
99
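/*
 * AGAW (adjusted guest address width) helpers. Each page-table level
 * resolves LEVEL_STRIDE (9) bits on top of the 12-bit page offset, and
 * agaw 0 corresponds to a 2-level, 30-bit table. For example, agaw 2
 * (the 48-bit DEFAULT_DOMAIN_ADDRESS_WIDTH) uses a 4-level table:
 * agaw_to_level(2) == 4 and agaw_to_width(2) == 48.
 */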
100 static inline int agaw_to_level(int agaw)
101 {
102 return agaw + 2;
103 }
104
105 static inline int agaw_to_width(int agaw)
106 {
107 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
108 }
109
110 static inline int width_to_agaw(int width)
111 {
112 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
113 }
114
115 static inline unsigned int level_to_offset_bits(int level)
116 {
117 return (level - 1) * LEVEL_STRIDE;
118 }
119
120 static inline int pfn_level_offset(unsigned long pfn, int level)
121 {
122 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
123 }
124
125 static inline unsigned long level_mask(int level)
126 {
127 return -1UL << level_to_offset_bits(level);
128 }
129
130 static inline unsigned long level_size(int level)
131 {
132 return 1UL << level_to_offset_bits(level);
133 }
134
135 static inline unsigned long align_to_level(unsigned long pfn, int level)
136 {
137 return (pfn + level_size(level) - 1) & level_mask(level);
138 }
139
140 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
141 {
142 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
143 }
144
145 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
146 are never going to work. */
147 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
148 {
149 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
150 }
151
152 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
153 {
154 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
155 }
156 static inline unsigned long page_to_dma_pfn(struct page *pg)
157 {
158 return mm_to_dma_pfn(page_to_pfn(pg));
159 }
160 static inline unsigned long virt_to_dma_pfn(void *p)
161 {
162 return page_to_dma_pfn(virt_to_page(p));
163 }
164
165 /* global iommu list, set NULL for ignored DMAR units */
166 static struct intel_iommu **g_iommus;
167
168 static void __init check_tylersburg_isoch(void);
169 static int rwbf_quirk;
170
171 /*
172 * set to 1 to panic kernel if can't successfully enable VT-d
173 * (used when kernel is launched w/ TXT)
174 */
175 static int force_on = 0;
176
177 /*
178 * 0: Present
179 * 1-11: Reserved
180 * 12-63: Context Ptr (12 - (haw-1))
181 * 64-127: Reserved
182 */
183 struct root_entry {
184 u64 val;
185 u64 rsvd1;
186 };
187 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 static inline bool root_present(struct root_entry *root)
189 {
190 return (root->val & 1);
191 }
192 static inline void set_root_present(struct root_entry *root)
193 {
194 root->val |= 1;
195 }
196 static inline void set_root_value(struct root_entry *root, unsigned long value)
197 {
198 root->val |= value & VTD_PAGE_MASK;
199 }
200
201 static inline struct context_entry *
202 get_context_addr_from_root(struct root_entry *root)
203 {
204 return (struct context_entry *)
205 (root_present(root) ? phys_to_virt(
206 root->val & VTD_PAGE_MASK) :
207 NULL);
208 }
209
210 /*
211 * low 64 bits:
212 * 0: present
213 * 1: fault processing disable
214 * 2-3: translation type
215 * 12-63: address space root
216 * high 64 bits:
217 * 0-2: address width
218 * 3-6: aval
219 * 8-23: domain id
220 */
221 struct context_entry {
222 u64 lo;
223 u64 hi;
224 };
225
226 static inline bool context_present(struct context_entry *context)
227 {
228 return (context->lo & 1);
229 }
230 static inline void context_set_present(struct context_entry *context)
231 {
232 context->lo |= 1;
233 }
234
235 static inline void context_set_fault_enable(struct context_entry *context)
236 {
237 context->lo &= (((u64)-1) << 2) | 1;
238 }
239
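/*
 * Bits 2-3 of the low qword select the translation type (one of the
 * CONTEXT_TT_* values used by domain_context_mapping_one() below); the
 * helper masks the old value before installing the new one.
 */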
240 static inline void context_set_translation_type(struct context_entry *context,
241 unsigned long value)
242 {
243 context->lo &= (((u64)-1) << 4) | 3;
244 context->lo |= (value & 3) << 2;
245 }
246
247 static inline void context_set_address_root(struct context_entry *context,
248 unsigned long value)
249 {
250 context->lo |= value & VTD_PAGE_MASK;
251 }
252
253 static inline void context_set_address_width(struct context_entry *context,
254 unsigned long value)
255 {
256 context->hi |= value & 7;
257 }
258
259 static inline void context_set_domain_id(struct context_entry *context,
260 unsigned long value)
261 {
262 context->hi |= (value & ((1 << 16) - 1)) << 8;
263 }
264
265 static inline void context_clear_entry(struct context_entry *context)
266 {
267 context->lo = 0;
268 context->hi = 0;
269 }
270
271 /*
272 * 0: readable
273 * 1: writable
274 * 2-6: reserved
275 * 7: super page
276 * 8-10: available
277 * 11: snoop behavior
278 * 12-63: Host physical address
279 */
280 struct dma_pte {
281 u64 val;
282 };
283
284 static inline void dma_clear_pte(struct dma_pte *pte)
285 {
286 pte->val = 0;
287 }
288
289 static inline u64 dma_pte_addr(struct dma_pte *pte)
290 {
291 #ifdef CONFIG_64BIT
292 return pte->val & VTD_PAGE_MASK;
293 #else
294 /* Must have a full atomic 64-bit read */
295 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
296 #endif
297 }
298
299 static inline bool dma_pte_present(struct dma_pte *pte)
300 {
301 return (pte->val & 3) != 0;
302 }
303
304 static inline bool dma_pte_superpage(struct dma_pte *pte)
305 {
306 return (pte->val & (1 << 7));
307 }
308
309 static inline int first_pte_in_page(struct dma_pte *pte)
310 {
311 return !((unsigned long)pte & ~VTD_PAGE_MASK);
312 }
313
314 /*
315 * This domain is a static identity mapping domain.
316 * 1. This domain creates a static 1:1 mapping of all usable memory.
317 * 2. It maps to each iommu if successful.
318 * 3. Each iommu maps to this domain if successful.
319 */
320 static struct dmar_domain *si_domain;
321 static int hw_pass_through = 1;
322
323 /* devices under the same p2p bridge are owned in one domain */
324 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
325
326 /* domain represents a virtual machine; more than one device
327 * across iommus may be owned by one domain, e.g. a kvm guest.
328 */
329 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
330
331 /* si_domain contains multiple devices */
332 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
333
334 /* define the limit of IOMMUs supported in each domain */
335 #ifdef CONFIG_X86
336 # define IOMMU_UNITS_SUPPORTED MAX_IO_APICS
337 #else
338 # define IOMMU_UNITS_SUPPORTED 64
339 #endif
340
341 struct dmar_domain {
342 int id; /* domain id */
343 int nid; /* node id */
344 DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
345 /* bitmap of iommus this domain uses */
346
347 struct list_head devices; /* all devices' list */
348 struct iova_domain iovad; /* iova's that belong to this domain */
349
350 struct dma_pte *pgd; /* virtual address */
351 int gaw; /* max guest address width */
352
353 /* adjusted guest address width, 0 is level 2 30-bit */
354 int agaw;
355
356 int flags; /* flags to find out type of domain */
357
358 int iommu_coherency;/* indicate coherency of iommu access */
359 int iommu_snooping; /* indicate snooping control feature*/
360 int iommu_count; /* reference count of iommu */
361 int iommu_superpage;/* Level of superpages supported:
362 0 == 4KiB (no superpages), 1 == 2MiB,
363 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
364 spinlock_t iommu_lock; /* protect iommu set in domain */
365 u64 max_addr; /* maximum mapped address */
366 };
367
368 /* PCI domain-device relationship */
369 struct device_domain_info {
370 struct list_head link; /* link to domain siblings */
371 struct list_head global; /* link to global list */
372 u8 bus; /* PCI bus number */
373 u8 devfn; /* PCI devfn number */
374 struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
375 struct intel_iommu *iommu; /* IOMMU used by this device */
376 struct dmar_domain *domain; /* pointer to domain */
377 };
378
379 struct dmar_rmrr_unit {
380 struct list_head list; /* list of rmrr units */
381 struct acpi_dmar_header *hdr; /* ACPI header */
382 u64 base_address; /* reserved base address*/
383 u64 end_address; /* reserved end address */
384 struct dmar_dev_scope *devices; /* target devices */
385 int devices_cnt; /* target device count */
386 };
387
388 struct dmar_atsr_unit {
389 struct list_head list; /* list of ATSR units */
390 struct acpi_dmar_header *hdr; /* ACPI header */
391 struct dmar_dev_scope *devices; /* target devices */
392 int devices_cnt; /* target device count */
393 u8 include_all:1; /* include all ports */
394 };
395
396 static LIST_HEAD(dmar_atsr_units);
397 static LIST_HEAD(dmar_rmrr_units);
398
399 #define for_each_rmrr_units(rmrr) \
400 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
401
402 static void flush_unmaps_timeout(unsigned long data);
403
404 static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
405
406 #define HIGH_WATER_MARK 250
407 struct deferred_flush_tables {
408 int next;
409 struct iova *iova[HIGH_WATER_MARK];
410 struct dmar_domain *domain[HIGH_WATER_MARK];
411 struct page *freelist[HIGH_WATER_MARK];
412 };
413
414 static struct deferred_flush_tables *deferred_flush;
415
416 /* bitmap for indexing intel_iommus */
417 static int g_num_of_iommus;
418
419 static DEFINE_SPINLOCK(async_umap_flush_lock);
420 static LIST_HEAD(unmaps_to_do);
421
422 static int timer_on;
423 static long list_size;
424
425 static void domain_exit(struct dmar_domain *domain);
426 static void domain_remove_dev_info(struct dmar_domain *domain);
427 static void domain_remove_one_dev_info(struct dmar_domain *domain,
428 struct pci_dev *pdev);
429 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
430 struct device *dev);
431
432 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
433 int dmar_disabled = 0;
434 #else
435 int dmar_disabled = 1;
436 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
437
438 int intel_iommu_enabled = 0;
439 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
440
441 static int dmar_map_gfx = 1;
442 static int dmar_forcedac;
443 static int intel_iommu_strict;
444 static int intel_iommu_superpage = 1;
445
446 int intel_iommu_gfx_mapped;
447 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
448
449 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
450 static DEFINE_SPINLOCK(device_domain_lock);
451 static LIST_HEAD(device_domain_list);
452
453 static struct iommu_ops intel_iommu_ops;
454
455 static int __init intel_iommu_setup(char *str)
456 {
457 if (!str)
458 return -EINVAL;
459 while (*str) {
460 if (!strncmp(str, "on", 2)) {
461 dmar_disabled = 0;
462 printk(KERN_INFO "Intel-IOMMU: enabled\n");
463 } else if (!strncmp(str, "off", 3)) {
464 dmar_disabled = 1;
465 printk(KERN_INFO "Intel-IOMMU: disabled\n");
466 } else if (!strncmp(str, "igfx_off", 8)) {
467 dmar_map_gfx = 0;
468 printk(KERN_INFO
469 "Intel-IOMMU: disable GFX device mapping\n");
470 } else if (!strncmp(str, "forcedac", 8)) {
471 printk(KERN_INFO
472 "Intel-IOMMU: Forcing DAC for PCI devices\n");
473 dmar_forcedac = 1;
474 } else if (!strncmp(str, "strict", 6)) {
475 printk(KERN_INFO
476 "Intel-IOMMU: disable batched IOTLB flush\n");
477 intel_iommu_strict = 1;
478 } else if (!strncmp(str, "sp_off", 6)) {
479 printk(KERN_INFO
480 "Intel-IOMMU: disable supported super page\n");
481 intel_iommu_superpage = 0;
482 }
483
484 str += strcspn(str, ",");
485 while (*str == ',')
486 str++;
487 }
488 return 0;
489 }
490 __setup("intel_iommu=", intel_iommu_setup);
491
492 static struct kmem_cache *iommu_domain_cache;
493 static struct kmem_cache *iommu_devinfo_cache;
494 static struct kmem_cache *iommu_iova_cache;
495
496 static inline void *alloc_pgtable_page(int node)
497 {
498 struct page *page;
499 void *vaddr = NULL;
500
501 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
502 if (page)
503 vaddr = page_address(page);
504 return vaddr;
505 }
506
507 static inline void free_pgtable_page(void *vaddr)
508 {
509 free_page((unsigned long)vaddr);
510 }
511
512 static inline void *alloc_domain_mem(void)
513 {
514 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
515 }
516
517 static void free_domain_mem(void *vaddr)
518 {
519 kmem_cache_free(iommu_domain_cache, vaddr);
520 }
521
522 static inline void * alloc_devinfo_mem(void)
523 {
524 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
525 }
526
527 static inline void free_devinfo_mem(void *vaddr)
528 {
529 kmem_cache_free(iommu_devinfo_cache, vaddr);
530 }
531
532 struct iova *alloc_iova_mem(void)
533 {
534 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
535 }
536
537 void free_iova_mem(struct iova *iova)
538 {
539 kmem_cache_free(iommu_iova_cache, iova);
540 }
541
542
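/*
 * Starting from the AGAW needed to cover max_gaw, walk downwards until
 * we find an AGAW that the IOMMU's SAGAW capability field advertises.
 * Returns -1 if none of the supported values fit.
 */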
543 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
544 {
545 unsigned long sagaw;
546 int agaw = -1;
547
548 sagaw = cap_sagaw(iommu->cap);
549 for (agaw = width_to_agaw(max_gaw);
550 agaw >= 0; agaw--) {
551 if (test_bit(agaw, &sagaw))
552 break;
553 }
554
555 return agaw;
556 }
557
558 /*
559 * Calculate max SAGAW for each iommu.
560 */
561 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
562 {
563 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
564 }
565
566 /*
567 * calculate agaw for each iommu.
568 * "SAGAW" may be different across iommus, use a default agaw, and
569 * get a supported less agaw for iommus that don't support the default agaw.
570 */
571 int iommu_calculate_agaw(struct intel_iommu *iommu)
572 {
573 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
574 }
575
576 /* This function only returns a single iommu in a domain */
577 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
578 {
579 int iommu_id;
580
581 /* si_domain and vm domain should not get here. */
582 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
583 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
584
585 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
586 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
587 return NULL;
588
589 return g_iommus[iommu_id];
590 }
591
592 static void domain_update_iommu_coherency(struct dmar_domain *domain)
593 {
594 struct dmar_drhd_unit *drhd;
595 struct intel_iommu *iommu;
596 int i, found = 0;
597
598 domain->iommu_coherency = 1;
599
600 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
601 found = 1;
602 if (!ecap_coherent(g_iommus[i]->ecap)) {
603 domain->iommu_coherency = 0;
604 break;
605 }
606 }
607 if (found)
608 return;
609
610 /* No hardware attached; use lowest common denominator */
611 rcu_read_lock();
612 for_each_active_iommu(iommu, drhd) {
613 if (!ecap_coherent(iommu->ecap)) {
614 domain->iommu_coherency = 0;
615 break;
616 }
617 }
618 rcu_read_unlock();
619 }
620
621 static void domain_update_iommu_snooping(struct dmar_domain *domain)
622 {
623 int i;
624
625 domain->iommu_snooping = 1;
626
627 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
628 if (!ecap_sc_support(g_iommus[i]->ecap)) {
629 domain->iommu_snooping = 0;
630 break;
631 }
632 }
633 }
634
635 static void domain_update_iommu_superpage(struct dmar_domain *domain)
636 {
637 struct dmar_drhd_unit *drhd;
638 struct intel_iommu *iommu = NULL;
639 int mask = 0xf;
640
641 if (!intel_iommu_superpage) {
642 domain->iommu_superpage = 0;
643 return;
644 }
645
646 /* set iommu_superpage to the smallest common denominator */
647 rcu_read_lock();
648 for_each_active_iommu(iommu, drhd) {
649 mask &= cap_super_page_val(iommu->cap);
650 if (!mask) {
651 break;
652 }
653 }
654 rcu_read_unlock();
655
656 domain->iommu_superpage = fls(mask);
657 }
658
659 /* Some capabilities may be different across iommus */
660 static void domain_update_iommu_cap(struct dmar_domain *domain)
661 {
662 domain_update_iommu_coherency(domain);
663 domain_update_iommu_snooping(domain);
664 domain_update_iommu_superpage(domain);
665 }
666
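/*
 * Find the IOMMU (DRHD unit) responsible for a PCI device identified by
 * segment/bus/devfn.  A device matches if it appears in a unit's device
 * scope, sits behind a bridge listed there, or if the unit is marked
 * INCLUDE_ALL for its segment.  Returns NULL if no unit claims it.
 */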
667 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
668 {
669 struct dmar_drhd_unit *drhd = NULL;
670 struct intel_iommu *iommu;
671 struct device *dev;
672 struct pci_dev *pdev;
673 int i;
674
675 rcu_read_lock();
676 for_each_active_iommu(iommu, drhd) {
677 if (segment != drhd->segment)
678 continue;
679
680 for_each_active_dev_scope(drhd->devices,
681 drhd->devices_cnt, i, dev) {
682 if (!dev_is_pci(dev))
683 continue;
684 pdev = to_pci_dev(dev);
685 if (pdev->bus->number == bus && pdev->devfn == devfn)
686 goto out;
687 if (pdev->subordinate &&
688 pdev->subordinate->number <= bus &&
689 pdev->subordinate->busn_res.end >= bus)
690 goto out;
691 }
692
693 if (drhd->include_all)
694 goto out;
695 }
696 iommu = NULL;
697 out:
698 rcu_read_unlock();
699
700 return iommu;
701 }
702
703 static void domain_flush_cache(struct dmar_domain *domain,
704 void *addr, int size)
705 {
706 if (!domain->iommu_coherency)
707 clflush_cache_range(addr, size);
708 }
709
710 /* Gets context entry for a given bus and devfn */
711 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
712 u8 bus, u8 devfn)
713 {
714 struct root_entry *root;
715 struct context_entry *context;
716 unsigned long phy_addr;
717 unsigned long flags;
718
719 spin_lock_irqsave(&iommu->lock, flags);
720 root = &iommu->root_entry[bus];
721 context = get_context_addr_from_root(root);
722 if (!context) {
723 context = (struct context_entry *)
724 alloc_pgtable_page(iommu->node);
725 if (!context) {
726 spin_unlock_irqrestore(&iommu->lock, flags);
727 return NULL;
728 }
729 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
730 phy_addr = virt_to_phys((void *)context);
731 set_root_value(root, phy_addr);
732 set_root_present(root);
733 __iommu_flush_cache(iommu, root, sizeof(*root));
734 }
735 spin_unlock_irqrestore(&iommu->lock, flags);
736 return &context[devfn];
737 }
738
739 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
740 {
741 struct root_entry *root;
742 struct context_entry *context;
743 int ret;
744 unsigned long flags;
745
746 spin_lock_irqsave(&iommu->lock, flags);
747 root = &iommu->root_entry[bus];
748 context = get_context_addr_from_root(root);
749 if (!context) {
750 ret = 0;
751 goto out;
752 }
753 ret = context_present(&context[devfn]);
754 out:
755 spin_unlock_irqrestore(&iommu->lock, flags);
756 return ret;
757 }
758
759 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
760 {
761 struct root_entry *root;
762 struct context_entry *context;
763 unsigned long flags;
764
765 spin_lock_irqsave(&iommu->lock, flags);
766 root = &iommu->root_entry[bus];
767 context = get_context_addr_from_root(root);
768 if (context) {
769 context_clear_entry(&context[devfn]);
770 __iommu_flush_cache(iommu, &context[devfn], \
771 sizeof(*context));
772 }
773 spin_unlock_irqrestore(&iommu->lock, flags);
774 }
775
776 static void free_context_table(struct intel_iommu *iommu)
777 {
778 struct root_entry *root;
779 int i;
780 unsigned long flags;
781 struct context_entry *context;
782
783 spin_lock_irqsave(&iommu->lock, flags);
784 if (!iommu->root_entry) {
785 goto out;
786 }
787 for (i = 0; i < ROOT_ENTRY_NR; i++) {
788 root = &iommu->root_entry[i];
789 context = get_context_addr_from_root(root);
790 if (context)
791 free_pgtable_page(context);
792 }
793 free_pgtable_page(iommu->root_entry);
794 iommu->root_entry = NULL;
795 out:
796 spin_unlock_irqrestore(&iommu->lock, flags);
797 }
798
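/*
 * Return the PTE for @pfn at *target_level, allocating any missing
 * intermediate page-table pages on the way down.  If *target_level is 0,
 * the walk instead stops at the first superpage or non-present entry,
 * and *target_level is updated to report the level actually reached.
 * Returns NULL if @pfn is beyond the domain's address width or if an
 * allocation fails.
 */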
799 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
800 unsigned long pfn, int *target_level)
801 {
802 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
803 struct dma_pte *parent, *pte = NULL;
804 int level = agaw_to_level(domain->agaw);
805 int offset;
806
807 BUG_ON(!domain->pgd);
808
809 if (addr_width < BITS_PER_LONG && pfn >> addr_width)
810 /* Address beyond IOMMU's addressing capabilities. */
811 return NULL;
812
813 parent = domain->pgd;
814
815 while (1) {
816 void *tmp_page;
817
818 offset = pfn_level_offset(pfn, level);
819 pte = &parent[offset];
820 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
821 break;
822 if (level == *target_level)
823 break;
824
825 if (!dma_pte_present(pte)) {
826 uint64_t pteval;
827
828 tmp_page = alloc_pgtable_page(domain->nid);
829
830 if (!tmp_page)
831 return NULL;
832
833 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
834 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
835 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
836 /* Someone else set it while we were thinking; use theirs. */
837 free_pgtable_page(tmp_page);
838 } else {
839 dma_pte_addr(pte);
840 domain_flush_cache(domain, pte, sizeof(*pte));
841 }
842 }
843 if (level == 1)
844 break;
845
846 parent = phys_to_virt(dma_pte_addr(pte));
847 level--;
848 }
849
850 if (!*target_level)
851 *target_level = level;
852
853 return pte;
854 }
855
856
857 /* return address's pte at specific level */
858 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
859 unsigned long pfn,
860 int level, int *large_page)
861 {
862 struct dma_pte *parent, *pte = NULL;
863 int total = agaw_to_level(domain->agaw);
864 int offset;
865
866 parent = domain->pgd;
867 while (level <= total) {
868 offset = pfn_level_offset(pfn, total);
869 pte = &parent[offset];
870 if (level == total)
871 return pte;
872
873 if (!dma_pte_present(pte)) {
874 *large_page = total;
875 break;
876 }
877
878 if (pte->val & DMA_PTE_LARGE_PAGE) {
879 *large_page = total;
880 return pte;
881 }
882
883 parent = phys_to_virt(dma_pte_addr(pte));
884 total--;
885 }
886 return NULL;
887 }
888
889 /* clear last-level ptes; a tlb flush should follow */
890 static void dma_pte_clear_range(struct dmar_domain *domain,
891 unsigned long start_pfn,
892 unsigned long last_pfn)
893 {
894 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
895 unsigned int large_page = 1;
896 struct dma_pte *first_pte, *pte;
897
898 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
899 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
900 BUG_ON(start_pfn > last_pfn);
901
902 /* we don't need lock here; nobody else touches the iova range */
903 do {
904 large_page = 1;
905 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
906 if (!pte) {
907 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
908 continue;
909 }
910 do {
911 dma_clear_pte(pte);
912 start_pfn += lvl_to_nr_pages(large_page);
913 pte++;
914 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
915
916 domain_flush_cache(domain, first_pte,
917 (void *)pte - (void *)first_pte);
918
919 } while (start_pfn && start_pfn <= last_pfn);
920 }
921
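/*
 * Recursively free page-table pages that are entirely covered by the
 * [start_pfn, last_pfn] range, clearing and flushing the parent PTE
 * before each page is released.  Leaf PTEs are expected to have been
 * cleared already by dma_pte_clear_range().
 */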
922 static void dma_pte_free_level(struct dmar_domain *domain, int level,
923 struct dma_pte *pte, unsigned long pfn,
924 unsigned long start_pfn, unsigned long last_pfn)
925 {
926 pfn = max(start_pfn, pfn);
927 pte = &pte[pfn_level_offset(pfn, level)];
928
929 do {
930 unsigned long level_pfn;
931 struct dma_pte *level_pte;
932
933 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
934 goto next;
935
936 level_pfn = pfn & level_mask(level - 1);
937 level_pte = phys_to_virt(dma_pte_addr(pte));
938
939 if (level > 2)
940 dma_pte_free_level(domain, level - 1, level_pte,
941 level_pfn, start_pfn, last_pfn);
942
943 /* If range covers entire pagetable, free it */
944 if (!(start_pfn > level_pfn ||
945 last_pfn < level_pfn + level_size(level) - 1)) {
946 dma_clear_pte(pte);
947 domain_flush_cache(domain, pte, sizeof(*pte));
948 free_pgtable_page(level_pte);
949 }
950 next:
951 pfn += level_size(level);
952 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
953 }
954
955 /* free page table pages. last level pte should already be cleared */
956 static void dma_pte_free_pagetable(struct dmar_domain *domain,
957 unsigned long start_pfn,
958 unsigned long last_pfn)
959 {
960 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
961
962 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
963 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
964 BUG_ON(start_pfn > last_pfn);
965
966 /* We don't need lock here; nobody else touches the iova range */
967 dma_pte_free_level(domain, agaw_to_level(domain->agaw),
968 domain->pgd, 0, start_pfn, last_pfn);
969
970 /* free pgd */
971 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
972 free_pgtable_page(domain->pgd);
973 domain->pgd = NULL;
974 }
975 }
976
977 /* When a page at a given level is being unlinked from its parent, we don't
978 need to *modify* it at all. All we need to do is make a list of all the
979 pages which can be freed just as soon as we've flushed the IOTLB and we
980 know the hardware page-walk will no longer touch them.
981 The 'pte' argument is the *parent* PTE, pointing to the page that is to
982 be freed. */
983 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
984 int level, struct dma_pte *pte,
985 struct page *freelist)
986 {
987 struct page *pg;
988
989 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
990 pg->freelist = freelist;
991 freelist = pg;
992
993 if (level == 1)
994 return freelist;
995
996 for (pte = page_address(pg); !first_pte_in_page(pte); pte++) {
997 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
998 freelist = dma_pte_list_pagetables(domain, level - 1,
999 pte, freelist);
1000 }
1001
1002 return freelist;
1003 }
1004
1005 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1006 struct dma_pte *pte, unsigned long pfn,
1007 unsigned long start_pfn,
1008 unsigned long last_pfn,
1009 struct page *freelist)
1010 {
1011 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1012
1013 pfn = max(start_pfn, pfn);
1014 pte = &pte[pfn_level_offset(pfn, level)];
1015
1016 do {
1017 unsigned long level_pfn;
1018
1019 if (!dma_pte_present(pte))
1020 goto next;
1021
1022 level_pfn = pfn & level_mask(level);
1023
1024 /* If range covers entire pagetable, free it */
1025 if (start_pfn <= level_pfn &&
1026 last_pfn >= level_pfn + level_size(level) - 1) {
1027 /* These subordinate page tables are going away entirely. Don't
1028 bother to clear them; we're just going to *free* them. */
1029 if (level > 1 && !dma_pte_superpage(pte))
1030 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1031
1032 dma_clear_pte(pte);
1033 if (!first_pte)
1034 first_pte = pte;
1035 last_pte = pte;
1036 } else if (level > 1) {
1037 /* Recurse down into a level that isn't *entirely* obsolete */
1038 freelist = dma_pte_clear_level(domain, level - 1,
1039 phys_to_virt(dma_pte_addr(pte)),
1040 level_pfn, start_pfn, last_pfn,
1041 freelist);
1042 }
1043 next:
1044 pfn += level_size(level);
1045 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1046
1047 if (first_pte)
1048 domain_flush_cache(domain, first_pte,
1049 (void *)++last_pte - (void *)first_pte);
1050
1051 return freelist;
1052 }
1053
1054 /* We can't just free the pages because the IOMMU may still be walking
1055 the page tables, and may have cached the intermediate levels. The
1056 pages can only be freed after the IOTLB flush has been done. */
1057 struct page *domain_unmap(struct dmar_domain *domain,
1058 unsigned long start_pfn,
1059 unsigned long last_pfn)
1060 {
1061 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1062 struct page *freelist = NULL;
1063
1064 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
1065 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
1066 BUG_ON(start_pfn > last_pfn);
1067
1068 /* we don't need lock here; nobody else touches the iova range */
1069 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1070 domain->pgd, 0, start_pfn, last_pfn, NULL);
1071
1072 /* free pgd */
1073 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1074 struct page *pgd_page = virt_to_page(domain->pgd);
1075 pgd_page->freelist = freelist;
1076 freelist = pgd_page;
1077
1078 domain->pgd = NULL;
1079 }
1080
1081 return freelist;
1082 }
1083
1084 void dma_free_pagelist(struct page *freelist)
1085 {
1086 struct page *pg;
1087
1088 while ((pg = freelist)) {
1089 freelist = pg->freelist;
1090 free_pgtable_page(page_address(pg));
1091 }
1092 }
1093
1094 /* iommu handling */
1095 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1096 {
1097 struct root_entry *root;
1098 unsigned long flags;
1099
1100 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1101 if (!root)
1102 return -ENOMEM;
1103
1104 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1105
1106 spin_lock_irqsave(&iommu->lock, flags);
1107 iommu->root_entry = root;
1108 spin_unlock_irqrestore(&iommu->lock, flags);
1109
1110 return 0;
1111 }
1112
1113 static void iommu_set_root_entry(struct intel_iommu *iommu)
1114 {
1115 void *addr;
1116 u32 sts;
1117 unsigned long flag;
1118
1119 addr = iommu->root_entry;
1120
1121 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1122 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1123
1124 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1125
1126 /* Make sure hardware complete it */
1127 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1128 readl, (sts & DMA_GSTS_RTPS), sts);
1129
1130 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1131 }
1132
1133 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1134 {
1135 u32 val;
1136 unsigned long flag;
1137
1138 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1139 return;
1140
1141 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1142 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1143
1144 /* Make sure hardware complete it */
1145 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1146 readl, (!(val & DMA_GSTS_WBFS)), val);
1147
1148 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1149 }
1150
1151 /* return value determines if we need a write buffer flush */
1152 static void __iommu_flush_context(struct intel_iommu *iommu,
1153 u16 did, u16 source_id, u8 function_mask,
1154 u64 type)
1155 {
1156 u64 val = 0;
1157 unsigned long flag;
1158
1159 switch (type) {
1160 case DMA_CCMD_GLOBAL_INVL:
1161 val = DMA_CCMD_GLOBAL_INVL;
1162 break;
1163 case DMA_CCMD_DOMAIN_INVL:
1164 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1165 break;
1166 case DMA_CCMD_DEVICE_INVL:
1167 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1168 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1169 break;
1170 default:
1171 BUG();
1172 }
1173 val |= DMA_CCMD_ICC;
1174
1175 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1176 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1177
1178 /* Make sure hardware complete it */
1179 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1180 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1181
1182 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1183 }
1184
1185 /* return value determines if we need a write buffer flush */
1186 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1187 u64 addr, unsigned int size_order, u64 type)
1188 {
1189 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1190 u64 val = 0, val_iva = 0;
1191 unsigned long flag;
1192
1193 switch (type) {
1194 case DMA_TLB_GLOBAL_FLUSH:
1195 /* global flush doesn't need to set IVA_REG */
1196 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1197 break;
1198 case DMA_TLB_DSI_FLUSH:
1199 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1200 break;
1201 case DMA_TLB_PSI_FLUSH:
1202 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1203 /* IH bit is passed in as part of address */
1204 val_iva = size_order | addr;
1205 break;
1206 default:
1207 BUG();
1208 }
1209 /* Note: set drain read/write */
1210 #if 0
1211 /*
1212 * This is probably just being extra cautious. It looks like we can
1213 * ignore it without any impact.
1214 */
1215 if (cap_read_drain(iommu->cap))
1216 val |= DMA_TLB_READ_DRAIN;
1217 #endif
1218 if (cap_write_drain(iommu->cap))
1219 val |= DMA_TLB_WRITE_DRAIN;
1220
1221 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1222 /* Note: Only uses first TLB reg currently */
1223 if (val_iva)
1224 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1225 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1226
1227 /* Make sure hardware complete it */
1228 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1229 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1230
1231 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1232
1233 /* check IOTLB invalidation granularity */
1234 if (DMA_TLB_IAIG(val) == 0)
1235 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1236 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1237 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1238 (unsigned long long)DMA_TLB_IIRG(type),
1239 (unsigned long long)DMA_TLB_IAIG(val));
1240 }
1241
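/*
 * Check whether the device at (bus, devfn) in this domain can use a
 * device IOTLB: the IOMMU must support Device-TLB invalidation and have
 * queued invalidation enabled, the device must expose the PCIe ATS
 * capability, and a matching ATSR unit must exist.  Returns the
 * device_domain_info on success, NULL otherwise.
 */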
1242 static struct device_domain_info *
1243 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1244 u8 bus, u8 devfn)
1245 {
1246 int found = 0;
1247 unsigned long flags;
1248 struct device_domain_info *info;
1249 struct pci_dev *pdev;
1250
1251 if (!ecap_dev_iotlb_support(iommu->ecap))
1252 return NULL;
1253
1254 if (!iommu->qi)
1255 return NULL;
1256
1257 spin_lock_irqsave(&device_domain_lock, flags);
1258 list_for_each_entry(info, &domain->devices, link)
1259 if (info->bus == bus && info->devfn == devfn) {
1260 found = 1;
1261 break;
1262 }
1263 spin_unlock_irqrestore(&device_domain_lock, flags);
1264
1265 if (!found || !info->dev || !dev_is_pci(info->dev))
1266 return NULL;
1267
1268 pdev = to_pci_dev(info->dev);
1269
1270 if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1271 return NULL;
1272
1273 if (!dmar_find_matched_atsr_unit(pdev))
1274 return NULL;
1275
1276 return info;
1277 }
1278
1279 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1280 {
1281 if (!info || !dev_is_pci(info->dev))
1282 return;
1283
1284 pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1285 }
1286
1287 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1288 {
1289 if (!info->dev || !dev_is_pci(info->dev) ||
1290 !pci_ats_enabled(to_pci_dev(info->dev)))
1291 return;
1292
1293 pci_disable_ats(to_pci_dev(info->dev));
1294 }
1295
1296 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1297 u64 addr, unsigned mask)
1298 {
1299 u16 sid, qdep;
1300 unsigned long flags;
1301 struct device_domain_info *info;
1302
1303 spin_lock_irqsave(&device_domain_lock, flags);
1304 list_for_each_entry(info, &domain->devices, link) {
1305 struct pci_dev *pdev;
1306 if (!info->dev || !dev_is_pci(info->dev))
1307 continue;
1308
1309 pdev = to_pci_dev(info->dev);
1310 if (!pci_ats_enabled(pdev))
1311 continue;
1312
1313 sid = info->bus << 8 | info->devfn;
1314 qdep = pci_ats_queue_depth(pdev);
1315 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1316 }
1317 spin_unlock_irqrestore(&device_domain_lock, flags);
1318 }
1319
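/*
 * Flush the IOTLB for @pages pages starting at @pfn in domain @did.
 * A page-selective invalidation is used when the hardware supports it
 * and the power-of-two rounded range fits; otherwise we fall back to a
 * domain-selective flush.  Device IOTLBs are flushed as well, except
 * for caching-mode map operations.
 */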
1320 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1321 unsigned long pfn, unsigned int pages, int ih, int map)
1322 {
1323 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1324 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1325
1326 BUG_ON(pages == 0);
1327
1328 if (ih)
1329 ih = 1 << 6;
1330 /*
1331 * Fall back to domain-selective flush if there is no PSI support or the
1332 * size is too big.
1333 * PSI requires the page size to be 2 ^ x, and the base address to be
1334 * naturally aligned to that size.
1335 */
1336 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1337 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1338 DMA_TLB_DSI_FLUSH);
1339 else
1340 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1341 DMA_TLB_PSI_FLUSH);
1342
1343 /*
1344 * In caching mode, changes of pages from non-present to present require
1345 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1346 */
1347 if (!cap_caching_mode(iommu->cap) || !map)
1348 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1349 }
1350
1351 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1352 {
1353 u32 pmen;
1354 unsigned long flags;
1355
1356 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1357 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1358 pmen &= ~DMA_PMEN_EPM;
1359 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1360
1361 /* wait for the protected region status bit to clear */
1362 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1363 readl, !(pmen & DMA_PMEN_PRS), pmen);
1364
1365 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1366 }
1367
1368 static int iommu_enable_translation(struct intel_iommu *iommu)
1369 {
1370 u32 sts;
1371 unsigned long flags;
1372
1373 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1374 iommu->gcmd |= DMA_GCMD_TE;
1375 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1376
1377 /* Make sure hardware complete it */
1378 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1379 readl, (sts & DMA_GSTS_TES), sts);
1380
1381 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1382 return 0;
1383 }
1384
1385 static int iommu_disable_translation(struct intel_iommu *iommu)
1386 {
1387 u32 sts;
1388 unsigned long flag;
1389
1390 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1391 iommu->gcmd &= ~DMA_GCMD_TE;
1392 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1393
1394 /* Make sure hardware complete it */
1395 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1396 readl, (!(sts & DMA_GSTS_TES)), sts);
1397
1398 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1399 return 0;
1400 }
1401
1402
1403 static int iommu_init_domains(struct intel_iommu *iommu)
1404 {
1405 unsigned long ndomains;
1406 unsigned long nlongs;
1407
1408 ndomains = cap_ndoms(iommu->cap);
1409 pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1410 iommu->seq_id, ndomains);
1411 nlongs = BITS_TO_LONGS(ndomains);
1412
1413 spin_lock_init(&iommu->lock);
1414
1415 /* TBD: there might be 64K domains,
1416 * consider other allocation for future chip
1417 */
1418 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1419 if (!iommu->domain_ids) {
1420 pr_err("IOMMU%d: allocating domain id array failed\n",
1421 iommu->seq_id);
1422 return -ENOMEM;
1423 }
1424 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1425 GFP_KERNEL);
1426 if (!iommu->domains) {
1427 pr_err("IOMMU%d: allocating domain array failed\n",
1428 iommu->seq_id);
1429 kfree(iommu->domain_ids);
1430 iommu->domain_ids = NULL;
1431 return -ENOMEM;
1432 }
1433
1434 /*
1435 * if Caching mode is set, then invalid translations are tagged
1436 * with domainid 0. Hence we need to pre-allocate it.
1437 */
1438 if (cap_caching_mode(iommu->cap))
1439 set_bit(0, iommu->domain_ids);
1440 return 0;
1441 }
1442
1443 static void free_dmar_iommu(struct intel_iommu *iommu)
1444 {
1445 struct dmar_domain *domain;
1446 int i, count;
1447 unsigned long flags;
1448
1449 if ((iommu->domains) && (iommu->domain_ids)) {
1450 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1451 /*
1452 * Domain id 0 is reserved for invalid translation
1453 * if hardware supports caching mode.
1454 */
1455 if (cap_caching_mode(iommu->cap) && i == 0)
1456 continue;
1457
1458 domain = iommu->domains[i];
1459 clear_bit(i, iommu->domain_ids);
1460
1461 spin_lock_irqsave(&domain->iommu_lock, flags);
1462 count = --domain->iommu_count;
1463 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1464 if (count == 0)
1465 domain_exit(domain);
1466 }
1467 }
1468
1469 if (iommu->gcmd & DMA_GCMD_TE)
1470 iommu_disable_translation(iommu);
1471
1472 kfree(iommu->domains);
1473 kfree(iommu->domain_ids);
1474 iommu->domains = NULL;
1475 iommu->domain_ids = NULL;
1476
1477 g_iommus[iommu->seq_id] = NULL;
1478
1479 /* free context mapping */
1480 free_context_table(iommu);
1481 }
1482
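/*
 * Allocate and minimally initialise a dmar_domain.  When @vm is true the
 * domain gets a private "virtual machine" id and the
 * DOMAIN_FLAG_VIRTUAL_MACHINE flag; otherwise the id is assigned later
 * by iommu_attach_domain().
 */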
1483 static struct dmar_domain *alloc_domain(bool vm)
1484 {
1485 /* domain id for virtual machine, it won't be set in context */
1486 static atomic_t vm_domid = ATOMIC_INIT(0);
1487 struct dmar_domain *domain;
1488
1489 domain = alloc_domain_mem();
1490 if (!domain)
1491 return NULL;
1492
1493 domain->nid = -1;
1494 domain->iommu_count = 0;
1495 memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1496 domain->flags = 0;
1497 spin_lock_init(&domain->iommu_lock);
1498 INIT_LIST_HEAD(&domain->devices);
1499 if (vm) {
1500 domain->id = atomic_inc_return(&vm_domid);
1501 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
1502 }
1503
1504 return domain;
1505 }
1506
1507 static int iommu_attach_domain(struct dmar_domain *domain,
1508 struct intel_iommu *iommu)
1509 {
1510 int num;
1511 unsigned long ndomains;
1512 unsigned long flags;
1513
1514 ndomains = cap_ndoms(iommu->cap);
1515
1516 spin_lock_irqsave(&iommu->lock, flags);
1517
1518 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1519 if (num >= ndomains) {
1520 spin_unlock_irqrestore(&iommu->lock, flags);
1521 printk(KERN_ERR "IOMMU: no free domain ids\n");
1522 return -ENOMEM;
1523 }
1524
1525 domain->id = num;
1526 domain->iommu_count++;
1527 set_bit(num, iommu->domain_ids);
1528 set_bit(iommu->seq_id, domain->iommu_bmp);
1529 iommu->domains[num] = domain;
1530 spin_unlock_irqrestore(&iommu->lock, flags);
1531
1532 return 0;
1533 }
1534
1535 static void iommu_detach_domain(struct dmar_domain *domain,
1536 struct intel_iommu *iommu)
1537 {
1538 unsigned long flags;
1539 int num, ndomains;
1540
1541 spin_lock_irqsave(&iommu->lock, flags);
1542 ndomains = cap_ndoms(iommu->cap);
1543 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1544 if (iommu->domains[num] == domain) {
1545 clear_bit(num, iommu->domain_ids);
1546 iommu->domains[num] = NULL;
1547 break;
1548 }
1549 }
1550 spin_unlock_irqrestore(&iommu->lock, flags);
1551 }
1552
1553 static struct iova_domain reserved_iova_list;
1554 static struct lock_class_key reserved_rbtree_key;
1555
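/*
 * Build the global list of IOVA ranges that must never be handed out for
 * DMA: the IOAPIC MMIO window and every PCI device's MMIO resources (to
 * avoid peer-to-peer accesses).  The list is later copied into each new
 * domain by domain_reserve_special_ranges().
 */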
1556 static int dmar_init_reserved_ranges(void)
1557 {
1558 struct pci_dev *pdev = NULL;
1559 struct iova *iova;
1560 int i;
1561
1562 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1563
1564 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1565 &reserved_rbtree_key);
1566
1567 /* IOAPIC ranges shouldn't be accessed by DMA */
1568 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1569 IOVA_PFN(IOAPIC_RANGE_END));
1570 if (!iova) {
1571 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1572 return -ENODEV;
1573 }
1574
1575 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1576 for_each_pci_dev(pdev) {
1577 struct resource *r;
1578
1579 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1580 r = &pdev->resource[i];
1581 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1582 continue;
1583 iova = reserve_iova(&reserved_iova_list,
1584 IOVA_PFN(r->start),
1585 IOVA_PFN(r->end));
1586 if (!iova) {
1587 printk(KERN_ERR "Reserve iova failed\n");
1588 return -ENODEV;
1589 }
1590 }
1591 }
1592 return 0;
1593 }
1594
1595 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1596 {
1597 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1598 }
1599
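/*
 * Round a guest address width up to the next width the page tables can
 * actually represent: 12 offset bits plus a whole number of 9-bit levels
 * (i.e. 21, 30, 39, 48, 57, ...), capped at 64.  For example, a 40-bit
 * guest width is adjusted to 48 bits.
 */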
1600 static inline int guestwidth_to_adjustwidth(int gaw)
1601 {
1602 int agaw;
1603 int r = (gaw - 12) % 9;
1604
1605 if (r == 0)
1606 agaw = gaw;
1607 else
1608 agaw = gaw + 9 - r;
1609 if (agaw > 64)
1610 agaw = 64;
1611 return agaw;
1612 }
1613
1614 static int domain_init(struct dmar_domain *domain, int guest_width)
1615 {
1616 struct intel_iommu *iommu;
1617 int adjust_width, agaw;
1618 unsigned long sagaw;
1619
1620 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1621 domain_reserve_special_ranges(domain);
1622
1623 /* calculate AGAW */
1624 iommu = domain_get_iommu(domain);
1625 if (guest_width > cap_mgaw(iommu->cap))
1626 guest_width = cap_mgaw(iommu->cap);
1627 domain->gaw = guest_width;
1628 adjust_width = guestwidth_to_adjustwidth(guest_width);
1629 agaw = width_to_agaw(adjust_width);
1630 sagaw = cap_sagaw(iommu->cap);
1631 if (!test_bit(agaw, &sagaw)) {
1632 /* hardware doesn't support it, choose a bigger one */
1633 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1634 agaw = find_next_bit(&sagaw, 5, agaw);
1635 if (agaw >= 5)
1636 return -ENODEV;
1637 }
1638 domain->agaw = agaw;
1639
1640 if (ecap_coherent(iommu->ecap))
1641 domain->iommu_coherency = 1;
1642 else
1643 domain->iommu_coherency = 0;
1644
1645 if (ecap_sc_support(iommu->ecap))
1646 domain->iommu_snooping = 1;
1647 else
1648 domain->iommu_snooping = 0;
1649
1650 if (intel_iommu_superpage)
1651 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1652 else
1653 domain->iommu_superpage = 0;
1654
1655 domain->nid = iommu->node;
1656
1657 /* always allocate the top pgd */
1658 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1659 if (!domain->pgd)
1660 return -ENOMEM;
1661 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1662 return 0;
1663 }
1664
1665 static void domain_exit(struct dmar_domain *domain)
1666 {
1667 struct dmar_drhd_unit *drhd;
1668 struct intel_iommu *iommu;
1669 struct page *freelist = NULL;
1670
1671 /* Domain 0 is reserved, so don't process it */
1672 if (!domain)
1673 return;
1674
1675 /* Flush any lazy unmaps that may reference this domain */
1676 if (!intel_iommu_strict)
1677 flush_unmaps_timeout(0);
1678
1679 /* remove associated devices */
1680 domain_remove_dev_info(domain);
1681
1682 /* destroy iovas */
1683 put_iova_domain(&domain->iovad);
1684
1685 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1686
1687 /* clear attached or cached domains */
1688 rcu_read_lock();
1689 for_each_active_iommu(iommu, drhd)
1690 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1691 test_bit(iommu->seq_id, domain->iommu_bmp))
1692 iommu_detach_domain(domain, iommu);
1693 rcu_read_unlock();
1694
1695 dma_free_pagelist(freelist);
1696
1697 free_domain_mem(domain);
1698 }
1699
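/*
 * Install the context entry for (bus, devfn) on @iommu so that DMA from
 * that requester is translated through @domain's page tables (or passed
 * through, depending on @translation).  For VM and static-identity
 * domains a per-iommu domain id is found or allocated, since the global
 * domain id is not valid on every unit.  The context and IOTLB caches
 * are flushed as required by caching mode.
 */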
1700 static int domain_context_mapping_one(struct dmar_domain *domain,
1701 struct intel_iommu *iommu,
1702 u8 bus, u8 devfn, int translation)
1703 {
1704 struct context_entry *context;
1705 unsigned long flags;
1706 struct dma_pte *pgd;
1707 unsigned long num;
1708 unsigned long ndomains;
1709 int id;
1710 int agaw;
1711 struct device_domain_info *info = NULL;
1712
1713 pr_debug("Set context mapping for %02x:%02x.%d\n",
1714 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1715
1716 BUG_ON(!domain->pgd);
1717 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1718 translation != CONTEXT_TT_MULTI_LEVEL);
1719
1720 context = device_to_context_entry(iommu, bus, devfn);
1721 if (!context)
1722 return -ENOMEM;
1723 spin_lock_irqsave(&iommu->lock, flags);
1724 if (context_present(context)) {
1725 spin_unlock_irqrestore(&iommu->lock, flags);
1726 return 0;
1727 }
1728
1729 id = domain->id;
1730 pgd = domain->pgd;
1731
1732 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1733 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1734 int found = 0;
1735
1736 /* find an available domain id for this device in iommu */
1737 ndomains = cap_ndoms(iommu->cap);
1738 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1739 if (iommu->domains[num] == domain) {
1740 id = num;
1741 found = 1;
1742 break;
1743 }
1744 }
1745
1746 if (found == 0) {
1747 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1748 if (num >= ndomains) {
1749 spin_unlock_irqrestore(&iommu->lock, flags);
1750 printk(KERN_ERR "IOMMU: no free domain ids\n");
1751 return -EFAULT;
1752 }
1753
1754 set_bit(num, iommu->domain_ids);
1755 iommu->domains[num] = domain;
1756 id = num;
1757 }
1758
1759 /* Skip top levels of page tables for
1760 * iommus which have a smaller agaw than the default.
1761 * Unnecessary for PT mode.
1762 */
1763 if (translation != CONTEXT_TT_PASS_THROUGH) {
1764 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1765 pgd = phys_to_virt(dma_pte_addr(pgd));
1766 if (!dma_pte_present(pgd)) {
1767 spin_unlock_irqrestore(&iommu->lock, flags);
1768 return -ENOMEM;
1769 }
1770 }
1771 }
1772 }
1773
1774 context_set_domain_id(context, id);
1775
1776 if (translation != CONTEXT_TT_PASS_THROUGH) {
1777 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1778 translation = info ? CONTEXT_TT_DEV_IOTLB :
1779 CONTEXT_TT_MULTI_LEVEL;
1780 }
1781 /*
1782 * In pass-through mode, AW must be programmed to indicate the largest
1783 * AGAW value supported by hardware, and ASR is ignored by hardware.
1784 */
1785 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1786 context_set_address_width(context, iommu->msagaw);
1787 else {
1788 context_set_address_root(context, virt_to_phys(pgd));
1789 context_set_address_width(context, iommu->agaw);
1790 }
1791
1792 context_set_translation_type(context, translation);
1793 context_set_fault_enable(context);
1794 context_set_present(context);
1795 domain_flush_cache(domain, context, sizeof(*context));
1796
1797 /*
1798 * It's a non-present to present mapping. If hardware doesn't cache
1799 * non-present entries we only need to flush the write-buffer. If it
1800 * _does_ cache non-present entries, then it does so in the special
1801 * domain #0, which we have to flush:
1802 */
1803 if (cap_caching_mode(iommu->cap)) {
1804 iommu->flush.flush_context(iommu, 0,
1805 (((u16)bus) << 8) | devfn,
1806 DMA_CCMD_MASK_NOBIT,
1807 DMA_CCMD_DEVICE_INVL);
1808 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1809 } else {
1810 iommu_flush_write_buffer(iommu);
1811 }
1812 iommu_enable_dev_iotlb(info);
1813 spin_unlock_irqrestore(&iommu->lock, flags);
1814
1815 spin_lock_irqsave(&domain->iommu_lock, flags);
1816 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1817 domain->iommu_count++;
1818 if (domain->iommu_count == 1)
1819 domain->nid = iommu->node;
1820 domain_update_iommu_cap(domain);
1821 }
1822 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1823 return 0;
1824 }
1825
1826 static int
1827 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1828 int translation)
1829 {
1830 int ret;
1831 struct pci_dev *tmp, *parent;
1832 struct intel_iommu *iommu;
1833
1834 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1835 pdev->devfn);
1836 if (!iommu)
1837 return -ENODEV;
1838
1839 ret = domain_context_mapping_one(domain, iommu,
1840 pdev->bus->number, pdev->devfn,
1841 translation);
1842 if (ret)
1843 return ret;
1844
1845 /* dependent device mapping */
1846 tmp = pci_find_upstream_pcie_bridge(pdev);
1847 if (!tmp)
1848 return 0;
1849 /* Secondary interface's bus number and devfn 0 */
1850 parent = pdev->bus->self;
1851 while (parent != tmp) {
1852 ret = domain_context_mapping_one(domain, iommu,
1853 parent->bus->number,
1854 parent->devfn, translation);
1855 if (ret)
1856 return ret;
1857 parent = parent->bus->self;
1858 }
1859 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1860 return domain_context_mapping_one(domain, iommu,
1861 tmp->subordinate->number, 0,
1862 translation);
1863 else /* this is a legacy PCI bridge */
1864 return domain_context_mapping_one(domain, iommu,
1865 tmp->bus->number,
1866 tmp->devfn,
1867 translation);
1868 }
1869
1870 static int domain_context_mapped(struct pci_dev *pdev)
1871 {
1872 int ret;
1873 struct pci_dev *tmp, *parent;
1874 struct intel_iommu *iommu;
1875
1876 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1877 pdev->devfn);
1878 if (!iommu)
1879 return -ENODEV;
1880
1881 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1882 if (!ret)
1883 return ret;
1884 /* dependent device mapping */
1885 tmp = pci_find_upstream_pcie_bridge(pdev);
1886 if (!tmp)
1887 return ret;
1888 /* Secondary interface's bus number and devfn 0 */
1889 parent = pdev->bus->self;
1890 while (parent != tmp) {
1891 ret = device_context_mapped(iommu, parent->bus->number,
1892 parent->devfn);
1893 if (!ret)
1894 return ret;
1895 parent = parent->bus->self;
1896 }
1897 if (pci_is_pcie(tmp))
1898 return device_context_mapped(iommu, tmp->subordinate->number,
1899 0);
1900 else
1901 return device_context_mapped(iommu, tmp->bus->number,
1902 tmp->devfn);
1903 }
1904
1905 /* Returns a number of VTD pages, but aligned to MM page size */
1906 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1907 size_t size)
1908 {
1909 host_addr &= ~PAGE_MASK;
1910 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1911 }
1912
1913 /* Return largest possible superpage level for a given mapping */
1914 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1915 unsigned long iov_pfn,
1916 unsigned long phy_pfn,
1917 unsigned long pages)
1918 {
1919 int support, level = 1;
1920 unsigned long pfnmerge;
1921
1922 support = domain->iommu_superpage;
1923
1924 /* To use a large page, the virtual *and* physical addresses
1925 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1926 of them will mean we have to use smaller pages. So just
1927 merge them and check both at once. */
1928 pfnmerge = iov_pfn | phy_pfn;
1929
1930 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1931 pages >>= VTD_STRIDE_SHIFT;
1932 if (!pages)
1933 break;
1934 pfnmerge >>= VTD_STRIDE_SHIFT;
1935 level++;
1936 support--;
1937 }
1938 return level;
1939 }
1940
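/*
 * Core mapping routine: map @nr_pages VT-d pages starting at @iov_pfn,
 * either from a scatterlist (@sg) or from a contiguous physical range
 * starting at @phys_pfn.  Superpage PTEs are used whenever alignment and
 * remaining length allow, and the CPU cache is flushed for each
 * page-table page that gets filled.
 */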
1941 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1942 struct scatterlist *sg, unsigned long phys_pfn,
1943 unsigned long nr_pages, int prot)
1944 {
1945 struct dma_pte *first_pte = NULL, *pte = NULL;
1946 phys_addr_t uninitialized_var(pteval);
1947 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1948 unsigned long sg_res;
1949 unsigned int largepage_lvl = 0;
1950 unsigned long lvl_pages = 0;
1951
1952 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1953
1954 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1955 return -EINVAL;
1956
1957 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1958
1959 if (sg)
1960 sg_res = 0;
1961 else {
1962 sg_res = nr_pages + 1;
1963 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1964 }
1965
1966 while (nr_pages > 0) {
1967 uint64_t tmp;
1968
1969 if (!sg_res) {
1970 sg_res = aligned_nrpages(sg->offset, sg->length);
1971 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1972 sg->dma_length = sg->length;
1973 pteval = page_to_phys(sg_page(sg)) | prot;
1974 phys_pfn = pteval >> VTD_PAGE_SHIFT;
1975 }
1976
1977 if (!pte) {
1978 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1979
1980 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
1981 if (!pte)
1982 return -ENOMEM;
1983 /* It is a large page */
1984 if (largepage_lvl > 1) {
1985 pteval |= DMA_PTE_LARGE_PAGE;
1986 /* Ensure that old small page tables are removed to make room
1987 for superpage, if they exist. */
1988 dma_pte_clear_range(domain, iov_pfn,
1989 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1990 dma_pte_free_pagetable(domain, iov_pfn,
1991 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1992 } else {
1993 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1994 }
1995
1996 }
1997 /* We don't need a lock here; nobody else
1998 * touches this iova range.
1999 */
2000 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2001 if (tmp) {
2002 static int dumps = 5;
2003 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2004 iov_pfn, tmp, (unsigned long long)pteval);
2005 if (dumps) {
2006 dumps--;
2007 debug_dma_dump_mappings(NULL);
2008 }
2009 WARN_ON(1);
2010 }
2011
2012 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2013
2014 BUG_ON(nr_pages < lvl_pages);
2015 BUG_ON(sg_res < lvl_pages);
2016
2017 nr_pages -= lvl_pages;
2018 iov_pfn += lvl_pages;
2019 phys_pfn += lvl_pages;
2020 pteval += lvl_pages * VTD_PAGE_SIZE;
2021 sg_res -= lvl_pages;
2022
2023 /* If the next PTE would be the first in a new page, then we
2024 need to flush the cache on the entries we've just written.
2025 And then we'll need to recalculate 'pte', so clear it and
2026 let it get set again in the if (!pte) block above.
2027
2028 If we're done (!nr_pages) we need to flush the cache too.
2029
2030 Also if we've been setting superpages, we may need to
2031 recalculate 'pte' and switch back to smaller pages for the
2032 end of the mapping, if the trailing size is not enough to
2033 use another superpage (i.e. sg_res < lvl_pages). */
2034 pte++;
2035 if (!nr_pages || first_pte_in_page(pte) ||
2036 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2037 domain_flush_cache(domain, first_pte,
2038 (void *)pte - (void *)first_pte);
2039 pte = NULL;
2040 }
2041
2042 if (!sg_res && nr_pages)
2043 sg = sg_next(sg);
2044 }
2045 return 0;
2046 }
2047
2048 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2049 struct scatterlist *sg, unsigned long nr_pages,
2050 int prot)
2051 {
2052 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2053 }
2054
2055 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2056 unsigned long phys_pfn, unsigned long nr_pages,
2057 int prot)
2058 {
2059 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2060 }
2061
2062 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2063 {
2064 if (!iommu)
2065 return;
2066
2067 clear_context_table(iommu, bus, devfn);
2068 iommu->flush.flush_context(iommu, 0, 0, 0,
2069 DMA_CCMD_GLOBAL_INVL);
2070 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2071 }
2072
2073 static inline void unlink_domain_info(struct device_domain_info *info)
2074 {
2075 assert_spin_locked(&device_domain_lock);
2076 list_del(&info->link);
2077 list_del(&info->global);
2078 if (info->dev)
2079 info->dev->archdata.iommu = NULL;
2080 }
2081
2082 static void domain_remove_dev_info(struct dmar_domain *domain)
2083 {
2084 struct device_domain_info *info;
2085 unsigned long flags, flags2;
2086
2087 spin_lock_irqsave(&device_domain_lock, flags);
2088 while (!list_empty(&domain->devices)) {
2089 info = list_entry(domain->devices.next,
2090 struct device_domain_info, link);
2091 unlink_domain_info(info);
2092 spin_unlock_irqrestore(&device_domain_lock, flags);
2093
2094 iommu_disable_dev_iotlb(info);
2095 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2096
2097 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
2098 iommu_detach_dependent_devices(info->iommu, info->dev);
2099 /* clear this iommu in iommu_bmp, update iommu count
2100 * and capabilities
2101 */
2102 spin_lock_irqsave(&domain->iommu_lock, flags2);
2103 if (test_and_clear_bit(info->iommu->seq_id,
2104 domain->iommu_bmp)) {
2105 domain->iommu_count--;
2106 domain_update_iommu_cap(domain);
2107 }
2108 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2109 }
2110
2111 free_devinfo_mem(info);
2112 spin_lock_irqsave(&device_domain_lock, flags);
2113 }
2114 spin_unlock_irqrestore(&device_domain_lock, flags);
2115 }
2116
2117 /*
2118 * find_domain
2119 * Note: we use struct device->archdata.iommu to store the info
2120 */
2121 static struct dmar_domain *find_domain(struct device *dev)
2122 {
2123 struct device_domain_info *info;
2124
2125 /* No lock here, assumes no domain exit in normal case */
2126 info = dev->archdata.iommu;
2127 if (info)
2128 return info->domain;
2129 return NULL;
2130 }
2131
2132 static inline struct device_domain_info *
2133 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2134 {
2135 struct device_domain_info *info;
2136
2137 list_for_each_entry(info, &device_domain_list, global)
2138 if (info->iommu->segment == segment && info->bus == bus &&
2139 info->devfn == devfn)
2140 return info;
2141
2142 return NULL;
2143 }
2144
2145 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2146 int bus, int devfn,
2147 struct device *dev,
2148 struct dmar_domain *domain)
2149 {
2150 struct dmar_domain *found = NULL;
2151 struct device_domain_info *info;
2152 unsigned long flags;
2153
2154 info = alloc_devinfo_mem();
2155 if (!info)
2156 return NULL;
2157
2158 info->bus = bus;
2159 info->devfn = devfn;
2160 info->dev = dev;
2161 info->domain = domain;
2162 info->iommu = iommu;
2163 if (!dev)
2164 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2165
2166 spin_lock_irqsave(&device_domain_lock, flags);
2167 if (dev)
2168 found = find_domain(dev);
2169 else {
2170 struct device_domain_info *info2;
2171 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2172 if (info2)
2173 found = info2->domain;
2174 }
2175 if (found) {
2176 spin_unlock_irqrestore(&device_domain_lock, flags);
2177 free_devinfo_mem(info);
2178 /* Caller must free the original domain */
2179 return found;
2180 }
2181
2182 list_add(&info->link, &domain->devices);
2183 list_add(&info->global, &device_domain_list);
2184 if (dev)
2185 dev->archdata.iommu = info;
2186 spin_unlock_irqrestore(&device_domain_lock, flags);
2187
2188 return domain;
2189 }
2190
2191 /* domain is initialized */
2192 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
2193 {
2194 struct dmar_domain *domain, *free = NULL;
2195 struct intel_iommu *iommu = NULL;
2196 struct device_domain_info *info;
2197 struct dmar_drhd_unit *drhd;
2198 struct pci_dev *dev_tmp;
2199 unsigned long flags;
2200 int bus = 0, devfn = 0;
2201 int segment;
2202
2203 domain = find_domain(&pdev->dev);
2204 if (domain)
2205 return domain;
2206
2207 segment = pci_domain_nr(pdev->bus);
2208
2209 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
2210 if (dev_tmp) {
2211 if (pci_is_pcie(dev_tmp)) {
2212 bus = dev_tmp->subordinate->number;
2213 devfn = 0;
2214 } else {
2215 bus = dev_tmp->bus->number;
2216 devfn = dev_tmp->devfn;
2217 }
2218 spin_lock_irqsave(&device_domain_lock, flags);
2219 info = dmar_search_domain_by_dev_info(segment, bus, devfn);
2220 if (info) {
2221 iommu = info->iommu;
2222 domain = info->domain;
2223 }
2224 spin_unlock_irqrestore(&device_domain_lock, flags);
2225 if (info)
2226 goto found_domain;
2227 }
2228
2229 drhd = dmar_find_matched_drhd_unit(pdev);
2230 if (!drhd) {
2231 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2232 pci_name(pdev));
2233 return NULL;
2234 }
2235 iommu = drhd->iommu;
2236
2237 /* Allocate and initialize a new domain for the device */
2238 domain = alloc_domain(false);
2239 if (!domain)
2240 goto error;
2241 if (iommu_attach_domain(domain, iommu)) {
2242 free_domain_mem(domain);
2243 goto error;
2244 }
2245 free = domain;
2246 if (domain_init(domain, gaw))
2247 goto error;
2248
2249 /* register pcie-to-pci device */
2250 if (dev_tmp) {
2251 domain = dmar_insert_dev_info(iommu, bus, devfn, NULL,
2252 domain);
2253 if (!domain)
2254 goto error;
2255 }
2256
2257 found_domain:
2258 domain = dmar_insert_dev_info(iommu, pdev->bus->number,
2259 pdev->devfn, &pdev->dev, domain);
2260 error:
2261 if (free != domain)
2262 domain_exit(free);
2263
2264 return domain;
2265 }
2266
2267 static int iommu_identity_mapping;
2268 #define IDENTMAP_ALL 1
2269 #define IDENTMAP_GFX 2
2270 #define IDENTMAP_AZALIA 4
2271
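/*
 * Identity-map the range [@start, @end] into @domain: reserve the
 * matching IOVA range and install 1:1 PTEs, clearing any existing
 * mappings first since an RMRR may overlap normal memory.
 */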
2272 static int iommu_domain_identity_map(struct dmar_domain *domain,
2273 unsigned long long start,
2274 unsigned long long end)
2275 {
2276 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2277 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2278
2279 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2280 dma_to_mm_pfn(last_vpfn))) {
2281 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2282 return -ENOMEM;
2283 }
2284
2285 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2286 start, end, domain->id);
2287 /*
2288 * The RMRR range might overlap with a physical memory range,
2289 * so clear it first.
2290 */
2291 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2292
2293 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2294 last_vpfn - first_vpfn + 1,
2295 DMA_PTE_READ|DMA_PTE_WRITE);
2296 }
2297
2298 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2299 unsigned long long start,
2300 unsigned long long end)
2301 {
2302 struct dmar_domain *domain;
2303 int ret;
2304
2305 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2306 if (!domain)
2307 return -ENOMEM;
2308
2309 /* For _hardware_ passthrough, don't bother. But for software
2310 passthrough, we do it anyway -- it may indicate a memory
2311 range which is reserved in E820 and therefore didn't get
2312 set up in si_domain to start with */
2313 if (domain == si_domain && hw_pass_through) {
2314 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2315 pci_name(pdev), start, end);
2316 return 0;
2317 }
2318
2319 printk(KERN_INFO
2320 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2321 pci_name(pdev), start, end);
2322
2323 if (end < start) {
2324 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2325 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2326 dmi_get_system_info(DMI_BIOS_VENDOR),
2327 dmi_get_system_info(DMI_BIOS_VERSION),
2328 dmi_get_system_info(DMI_PRODUCT_VERSION));
2329 ret = -EIO;
2330 goto error;
2331 }
2332
2333 if (end >> agaw_to_width(domain->agaw)) {
2334 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2335 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2336 agaw_to_width(domain->agaw),
2337 dmi_get_system_info(DMI_BIOS_VENDOR),
2338 dmi_get_system_info(DMI_BIOS_VERSION),
2339 dmi_get_system_info(DMI_PRODUCT_VERSION));
2340 ret = -EIO;
2341 goto error;
2342 }
2343
2344 ret = iommu_domain_identity_map(domain, start, end);
2345 if (ret)
2346 goto error;
2347
2348 /* context entry init */
2349 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2350 if (ret)
2351 goto error;
2352
2353 return 0;
2354
2355 error:
2356 domain_exit(domain);
2357 return ret;
2358 }
2359
2360 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2361 struct pci_dev *pdev)
2362 {
2363 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2364 return 0;
2365 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2366 rmrr->end_address);
2367 }
2368
2369 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2370 static inline void iommu_prepare_isa(void)
2371 {
2372 struct pci_dev *pdev;
2373 int ret;
2374
2375 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2376 if (!pdev)
2377 return;
2378
2379 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2380 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2381
2382 if (ret)
2383 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2384 "floppy might not work\n");
2385
2386 }
2387 #else
2388 static inline void iommu_prepare_isa(void)
2389 {
2390 return;
2391 }
2392 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2393
2394 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2395
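/*
 * Set up the static identity (si) domain.  With hardware pass-through no
 * page tables are needed; otherwise every online memory range is
 * identity-mapped into the domain.
 */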
2396 static int __init si_domain_init(int hw)
2397 {
2398 struct dmar_drhd_unit *drhd;
2399 struct intel_iommu *iommu;
2400 int nid, ret = 0;
2401
2402 si_domain = alloc_domain(false);
2403 if (!si_domain)
2404 return -EFAULT;
2405
2406 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2407
2408 for_each_active_iommu(iommu, drhd) {
2409 ret = iommu_attach_domain(si_domain, iommu);
2410 if (ret) {
2411 domain_exit(si_domain);
2412 return -EFAULT;
2413 }
2414 }
2415
2416 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2417 domain_exit(si_domain);
2418 return -EFAULT;
2419 }
2420
2421 pr_debug("IOMMU: identity mapping domain is domain %d\n",
2422 si_domain->id);
2423
2424 if (hw)
2425 return 0;
2426
2427 for_each_online_node(nid) {
2428 unsigned long start_pfn, end_pfn;
2429 int i;
2430
2431 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2432 ret = iommu_domain_identity_map(si_domain,
2433 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2434 if (ret)
2435 return ret;
2436 }
2437 }
2438
2439 return 0;
2440 }
2441
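/*
 * Returns non-zero if @dev is currently attached to the static identity
 * (1:1) domain.
 */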
2442 static int identity_mapping(struct device *dev)
2443 {
2444 struct device_domain_info *info;
2445
2446 if (likely(!iommu_identity_mapping))
2447 return 0;
2448
2449 info = dev->archdata.iommu;
2450 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2451 return (info->domain == si_domain);
2452
2453 return 0;
2454 }
2455
2456 static int domain_add_dev_info(struct dmar_domain *domain,
2457 struct pci_dev *pdev,
2458 int translation)
2459 {
2460 struct dmar_domain *ndomain;
2461 struct intel_iommu *iommu;
2462 int ret;
2463
2464 iommu = device_to_iommu(pci_domain_nr(pdev->bus),
2465 pdev->bus->number, pdev->devfn);
2466 if (!iommu)
2467 return -ENODEV;
2468
2469 ndomain = dmar_insert_dev_info(iommu, pdev->bus->number, pdev->devfn,
2470 &pdev->dev, domain);
2471 if (ndomain != domain)
2472 return -EBUSY;
2473
2474 ret = domain_context_mapping(domain, pdev, translation);
2475 if (ret) {
2476 domain_remove_one_dev_info(domain, pdev);
2477 return ret;
2478 }
2479
2480 return 0;
2481 }
2482
2483 static bool device_has_rmrr(struct pci_dev *dev)
2484 {
2485 struct dmar_rmrr_unit *rmrr;
2486 struct device *tmp;
2487 int i;
2488
2489 rcu_read_lock();
2490 for_each_rmrr_units(rmrr) {
2491 /*
2492 * Return TRUE if this RMRR contains the device that
2493 * is passed in.
2494 */
2495 for_each_active_dev_scope(rmrr->devices,
2496 rmrr->devices_cnt, i, tmp)
2497 if (tmp == &dev->dev) {
2498 rcu_read_unlock();
2499 return true;
2500 }
2501 }
2502 rcu_read_unlock();
2503 return false;
2504 }
2505
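/*
 * Decide whether @pdev should live in the static identity domain.  At
 * startup the DMA mask is not yet known, so candidates are assumed to be
 * 64-bit capable and may be taken back out of the 1:1 domain later.
 */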
2506 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2507 {
2508
2509 /*
2510 * We want to prevent any device associated with an RMRR from
2511 * getting placed into the SI Domain. This is done because
2512 * problems exist when devices are moved in and out of domains
2513 * and their respective RMRR info is lost. We exempt USB devices
2514 * from this process due to their usage of RMRRs that are known
2515 * to not be needed after BIOS hand-off to OS.
2516 */
2517 if (device_has_rmrr(pdev) &&
2518 (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2519 return 0;
2520
2521 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2522 return 1;
2523
2524 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2525 return 1;
2526
2527 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2528 return 0;
2529
2530 /*
2531 * We want to start off with all devices in the 1:1 domain, and
2532 * take them out later if we find they can't access all of memory.
2533 *
2534 * However, we can't do this for PCI devices behind bridges,
2535 * because all PCI devices behind the same bridge will end up
2536 * with the same source-id on their transactions.
2537 *
2538 * Practically speaking, we can't change things around for these
2539 * devices at run-time, because we can't be sure there'll be no
2540 * DMA transactions in flight for any of their siblings.
2541 *
2542 * So PCI devices (unless they're on the root bus) as well as
2543 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2544 * the 1:1 domain, just in _case_ one of their siblings turns out
2545 * not to be able to map all of memory.
2546 */
2547 if (!pci_is_pcie(pdev)) {
2548 if (!pci_is_root_bus(pdev->bus))
2549 return 0;
2550 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2551 return 0;
2552 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2553 return 0;
2554
2555 /*
2556 * At boot time, we don't yet know if devices will be 64-bit capable.
2557 * Assume that they will -- if they turn out not to be, then we can
2558 * take them out of the 1:1 domain later.
2559 */
2560 if (!startup) {
2561 /*
2562 * If the device's dma_mask is less than the system's memory
2563 * size then this is not a candidate for identity mapping.
2564 */
2565 u64 dma_mask = pdev->dma_mask;
2566
2567 if (pdev->dev.coherent_dma_mask &&
2568 pdev->dev.coherent_dma_mask < dma_mask)
2569 dma_mask = pdev->dev.coherent_dma_mask;
2570
2571 return dma_mask >= dma_get_required_mask(&pdev->dev);
2572 }
2573
2574 return 1;
2575 }
2576
2577 static int __init iommu_prepare_static_identity_mapping(int hw)
2578 {
2579 struct pci_dev *pdev = NULL;
2580 int ret;
2581
2582 ret = si_domain_init(hw);
2583 if (ret)
2584 return -EFAULT;
2585
2586 for_each_pci_dev(pdev) {
2587 if (iommu_should_identity_map(pdev, 1)) {
2588 ret = domain_add_dev_info(si_domain, pdev,
2589 hw ? CONTEXT_TT_PASS_THROUGH :
2590 CONTEXT_TT_MULTI_LEVEL);
2591 if (ret) {
2592 /* device not associated with an iommu */
2593 if (ret == -ENODEV)
2594 continue;
2595 return ret;
2596 }
2597 pr_info("IOMMU: %s identity mapping for device %s\n",
2598 hw ? "hardware" : "software", pci_name(pdev));
2599 }
2600 }
2601
2602 return 0;
2603 }
2604
2605 static int __init init_dmars(void)
2606 {
2607 struct dmar_drhd_unit *drhd;
2608 struct dmar_rmrr_unit *rmrr;
2609 struct device *dev;
2610 struct intel_iommu *iommu;
2611 int i, ret;
2612
2613 /*
2614 * for each drhd
2615 * allocate root
2616 * initialize and program root entry to not present
2617 * endfor
2618 */
2619 for_each_drhd_unit(drhd) {
2620 /*
2621 * lock not needed as this is only incremented in the single-
2622 * threaded kernel __init code path; all other accesses are
2623 * read only
2624 */
2625 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2626 g_num_of_iommus++;
2627 continue;
2628 }
2629 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2630 IOMMU_UNITS_SUPPORTED);
2631 }
2632
2633 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2634 GFP_KERNEL);
2635 if (!g_iommus) {
2636 printk(KERN_ERR "Allocating global iommu array failed\n");
2637 ret = -ENOMEM;
2638 goto error;
2639 }
2640
2641 deferred_flush = kzalloc(g_num_of_iommus *
2642 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2643 if (!deferred_flush) {
2644 ret = -ENOMEM;
2645 goto free_g_iommus;
2646 }
2647
2648 for_each_active_iommu(iommu, drhd) {
2649 g_iommus[iommu->seq_id] = iommu;
2650
2651 ret = iommu_init_domains(iommu);
2652 if (ret)
2653 goto free_iommu;
2654
2655 /*
2656 * TBD:
2657 * we could share the same root & context tables
2658 * among all IOMMUs. Needs to be split later.
2659 */
2660 ret = iommu_alloc_root_entry(iommu);
2661 if (ret) {
2662 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2663 goto free_iommu;
2664 }
2665 if (!ecap_pass_through(iommu->ecap))
2666 hw_pass_through = 0;
2667 }
2668
2669 /*
2670 * Start from a sane IOMMU hardware state.
2671 */
2672 for_each_active_iommu(iommu, drhd) {
2673 /*
2674 * If the queued invalidation is already initialized by us
2675 * (for example, while enabling interrupt-remapping) then
2676 * things are already rolling from a sane state.
2677 */
2678 if (iommu->qi)
2679 continue;
2680
2681 /*
2682 * Clear any previous faults.
2683 */
2684 dmar_fault(-1, iommu);
2685 /*
2686 * Disable queued invalidation if supported and already enabled
2687 * before OS handover.
2688 */
2689 dmar_disable_qi(iommu);
2690 }
2691
2692 for_each_active_iommu(iommu, drhd) {
2693 if (dmar_enable_qi(iommu)) {
2694 /*
2695 * Queued Invalidate not enabled, use Register Based
2696 * Invalidate
2697 */
2698 iommu->flush.flush_context = __iommu_flush_context;
2699 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2700 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2701 "invalidation\n",
2702 iommu->seq_id,
2703 (unsigned long long)drhd->reg_base_addr);
2704 } else {
2705 iommu->flush.flush_context = qi_flush_context;
2706 iommu->flush.flush_iotlb = qi_flush_iotlb;
2707 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2708 "invalidation\n",
2709 iommu->seq_id,
2710 (unsigned long long)drhd->reg_base_addr);
2711 }
2712 }
2713
2714 if (iommu_pass_through)
2715 iommu_identity_mapping |= IDENTMAP_ALL;
2716
2717 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2718 iommu_identity_mapping |= IDENTMAP_GFX;
2719 #endif
2720
2721 check_tylersburg_isoch();
2722
2723 /*
2724 * If pass-through is not set or not enabled, set up context entries
2725 * for identity mappings for RMRR, GFX and ISA, and possibly fall back
2726 * to static identity mapping if iommu_identity_mapping is set.
2727 */
2728 if (iommu_identity_mapping) {
2729 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2730 if (ret) {
2731 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2732 goto free_iommu;
2733 }
2734 }
2735 /*
2736 * For each rmrr
2737 * for each dev attached to rmrr
2738 * do
2739 * locate drhd for dev, alloc domain for dev
2740 * allocate free domain
2741 * allocate page table entries for rmrr
2742 * if context not allocated for bus
2743 * allocate and init context
2744 * set present in root table for this bus
2745 * init context with domain, translation etc
2746 * endfor
2747 * endfor
2748 */
2749 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2750 for_each_rmrr_units(rmrr) {
2751 /* some BIOSes list non-existent devices in the DMAR table. */
2752 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2753 i, dev) {
2754 if (!dev_is_pci(dev))
2755 continue;
2756 ret = iommu_prepare_rmrr_dev(rmrr, to_pci_dev(dev));
2757 if (ret)
2758 printk(KERN_ERR
2759 "IOMMU: mapping reserved region failed\n");
2760 }
2761 }
2762
2763 iommu_prepare_isa();
2764
2765 /*
2766 * for each drhd
2767 * enable fault log
2768 * global invalidate context cache
2769 * global invalidate iotlb
2770 * enable translation
2771 */
2772 for_each_iommu(iommu, drhd) {
2773 if (drhd->ignored) {
2774 /*
2775 * we always have to disable PMRs or DMA may fail on
2776 * this device
2777 */
2778 if (force_on)
2779 iommu_disable_protect_mem_regions(iommu);
2780 continue;
2781 }
2782
2783 iommu_flush_write_buffer(iommu);
2784
2785 ret = dmar_set_interrupt(iommu);
2786 if (ret)
2787 goto free_iommu;
2788
2789 iommu_set_root_entry(iommu);
2790
2791 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2792 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2793
2794 ret = iommu_enable_translation(iommu);
2795 if (ret)
2796 goto free_iommu;
2797
2798 iommu_disable_protect_mem_regions(iommu);
2799 }
2800
2801 return 0;
2802
2803 free_iommu:
2804 for_each_active_iommu(iommu, drhd)
2805 free_dmar_iommu(iommu);
2806 kfree(deferred_flush);
2807 free_g_iommus:
2808 kfree(g_iommus);
2809 error:
2810 return ret;
2811 }
2812
2813 /* This takes a number of _MM_ pages, not VTD pages */
2814 static struct iova *intel_alloc_iova(struct device *dev,
2815 struct dmar_domain *domain,
2816 unsigned long nrpages, uint64_t dma_mask)
2817 {
2818 struct pci_dev *pdev = to_pci_dev(dev);
2819 struct iova *iova = NULL;
2820
2821 /* Restrict dma_mask to the width that the iommu can handle */
2822 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2823
2824 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2825 /*
2826 * First try to allocate an io virtual address in
2827 * DMA_BIT_MASK(32) and if that fails then try allocating
2828 * from higher range
2829 */
2830 iova = alloc_iova(&domain->iovad, nrpages,
2831 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2832 if (iova)
2833 return iova;
2834 }
2835 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2836 if (unlikely(!iova)) {
2837 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2838 nrpages, pci_name(pdev));
2839 return NULL;
2840 }
2841
2842 return iova;
2843 }
2844
2845 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2846 {
2847 struct dmar_domain *domain;
2848 int ret;
2849
2850 domain = get_domain_for_dev(pdev,
2851 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2852 if (!domain) {
2853 printk(KERN_ERR
2854 "Allocating domain for %s failed", pci_name(pdev));
2855 return NULL;
2856 }
2857
2858 /* make sure context mapping is ok */
2859 if (unlikely(!domain_context_mapped(pdev))) {
2860 ret = domain_context_mapping(domain, pdev,
2861 CONTEXT_TT_MULTI_LEVEL);
2862 if (ret) {
2863 printk(KERN_ERR
2864 "Domain context map for %s failed",
2865 pci_name(pdev));
2866 return NULL;
2867 }
2868 }
2869
2870 return domain;
2871 }
2872
2873 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2874 {
2875 struct device_domain_info *info;
2876
2877 /* No lock here, assumes no domain exit in normal case */
2878 info = dev->dev.archdata.iommu;
2879 if (likely(info))
2880 return info->domain;
2881
2882 return __get_valid_domain_for_dev(dev);
2883 }
2884
2885 static int iommu_dummy(struct device *dev)
2886 {
2887 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2888 }
2889
2890 /* Check if the device needs to go through the non-identity map and unmap process. */
2891 static int iommu_no_mapping(struct device *dev)
2892 {
2893 struct pci_dev *pdev;
2894 int found;
2895
2896 if (unlikely(!dev_is_pci(dev)))
2897 return 1;
2898
2899 if (iommu_dummy(dev))
2900 return 1;
2901
2902 if (!iommu_identity_mapping)
2903 return 0;
2904
2905 pdev = to_pci_dev(dev);
2906 found = identity_mapping(dev);
2907 if (found) {
2908 if (iommu_should_identity_map(pdev, 0))
2909 return 1;
2910 else {
2911 /*
2912 * The 32 bit DMA device is removed from si_domain; fall
2913 * back to non-identity mapping.
2914 */
2915 domain_remove_one_dev_info(si_domain, pdev);
2916 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2917 pci_name(pdev));
2918 return 0;
2919 }
2920 } else {
2921 /*
2922 * A 64 bit DMA device detached from a VM is put back
2923 * into si_domain for identity mapping.
2924 */
2925 if (iommu_should_identity_map(pdev, 0)) {
2926 int ret;
2927 ret = domain_add_dev_info(si_domain, pdev,
2928 hw_pass_through ?
2929 CONTEXT_TT_PASS_THROUGH :
2930 CONTEXT_TT_MULTI_LEVEL);
2931 if (!ret) {
2932 printk(KERN_INFO "64bit %s uses identity mapping\n",
2933 pci_name(pdev));
2934 return 1;
2935 }
2936 }
2937 }
2938
2939 return 0;
2940 }
2941
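/*
 * Map a physically contiguous buffer for DMA: allocate an IOVA range
 * below @dma_mask, install the PTEs, flush the IOTLB (caching mode) or
 * the write buffer, and return the bus address plus the original
 * in-page offset.
 */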
2942 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2943 size_t size, int dir, u64 dma_mask)
2944 {
2945 struct pci_dev *pdev = to_pci_dev(hwdev);
2946 struct dmar_domain *domain;
2947 phys_addr_t start_paddr;
2948 struct iova *iova;
2949 int prot = 0;
2950 int ret;
2951 struct intel_iommu *iommu;
2952 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2953
2954 BUG_ON(dir == DMA_NONE);
2955
2956 if (iommu_no_mapping(hwdev))
2957 return paddr;
2958
2959 domain = get_valid_domain_for_dev(pdev);
2960 if (!domain)
2961 return 0;
2962
2963 iommu = domain_get_iommu(domain);
2964 size = aligned_nrpages(paddr, size);
2965
2966 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2967 if (!iova)
2968 goto error;
2969
2970 /*
2971 * Check if DMAR supports zero-length reads on write only
2972 * mappings..
2973 */
2974 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2975 !cap_zlr(iommu->cap))
2976 prot |= DMA_PTE_READ;
2977 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2978 prot |= DMA_PTE_WRITE;
2979 /*
2980 * paddr .. (paddr + size) might span partial pages, so map whole
2981 * pages. Note: if two parts of one page are mapped separately, we
2982 * might have two guest addresses mapping to the same host paddr,
2983 * but this is not a big problem
2984 */
2985 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2986 mm_to_dma_pfn(paddr_pfn), size, prot);
2987 if (ret)
2988 goto error;
2989
2990 /* it's a non-present to present mapping. Only flush if caching mode */
2991 if (cap_caching_mode(iommu->cap))
2992 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
2993 else
2994 iommu_flush_write_buffer(iommu);
2995
2996 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2997 start_paddr += paddr & ~PAGE_MASK;
2998 return start_paddr;
2999
3000 error:
3001 if (iova)
3002 __free_iova(&domain->iovad, iova);
3003 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
3004 pci_name(pdev), size, (unsigned long long)paddr, dir);
3005 return 0;
3006 }
3007
3008 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3009 unsigned long offset, size_t size,
3010 enum dma_data_direction dir,
3011 struct dma_attrs *attrs)
3012 {
3013 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3014 dir, to_pci_dev(dev)->dma_mask);
3015 }
3016
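/*
 * Drain the per-IOMMU deferred-unmap queues: perform the batched IOTLB
 * invalidations, then free the queued IOVAs and page-table freelists.
 * Called with async_umap_flush_lock held.
 */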
3017 static void flush_unmaps(void)
3018 {
3019 int i, j;
3020
3021 timer_on = 0;
3022
3023 /* just flush them all */
3024 for (i = 0; i < g_num_of_iommus; i++) {
3025 struct intel_iommu *iommu = g_iommus[i];
3026 if (!iommu)
3027 continue;
3028
3029 if (!deferred_flush[i].next)
3030 continue;
3031
3032 /* In caching mode, global flushes make emulation expensive */
3033 if (!cap_caching_mode(iommu->cap))
3034 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3035 DMA_TLB_GLOBAL_FLUSH);
3036 for (j = 0; j < deferred_flush[i].next; j++) {
3037 unsigned long mask;
3038 struct iova *iova = deferred_flush[i].iova[j];
3039 struct dmar_domain *domain = deferred_flush[i].domain[j];
3040
3041 /* On real hardware multiple invalidations are expensive */
3042 if (cap_caching_mode(iommu->cap))
3043 iommu_flush_iotlb_psi(iommu, domain->id,
3044 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1,
3045 !deferred_flush[i].freelist[j], 0);
3046 else {
3047 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
3048 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3049 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3050 }
3051 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3052 if (deferred_flush[i].freelist[j])
3053 dma_free_pagelist(deferred_flush[i].freelist[j]);
3054 }
3055 deferred_flush[i].next = 0;
3056 }
3057
3058 list_size = 0;
3059 }
3060
3061 static void flush_unmaps_timeout(unsigned long data)
3062 {
3063 unsigned long flags;
3064
3065 spin_lock_irqsave(&async_umap_flush_lock, flags);
3066 flush_unmaps();
3067 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3068 }
3069
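/*
 * Queue an IOVA range (and the page-table pages freed with it) for
 * deferred unmapping; the actual flush happens from the timer or once
 * the queue reaches HIGH_WATER_MARK.
 */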
3070 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3071 {
3072 unsigned long flags;
3073 int next, iommu_id;
3074 struct intel_iommu *iommu;
3075
3076 spin_lock_irqsave(&async_umap_flush_lock, flags);
3077 if (list_size == HIGH_WATER_MARK)
3078 flush_unmaps();
3079
3080 iommu = domain_get_iommu(dom);
3081 iommu_id = iommu->seq_id;
3082
3083 next = deferred_flush[iommu_id].next;
3084 deferred_flush[iommu_id].domain[next] = dom;
3085 deferred_flush[iommu_id].iova[next] = iova;
3086 deferred_flush[iommu_id].freelist[next] = freelist;
3087 deferred_flush[iommu_id].next++;
3088
3089 if (!timer_on) {
3090 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3091 timer_on = 1;
3092 }
3093 list_size++;
3094 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3095 }
3096
3097 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3098 size_t size, enum dma_data_direction dir,
3099 struct dma_attrs *attrs)
3100 {
3101 struct pci_dev *pdev = to_pci_dev(dev);
3102 struct dmar_domain *domain;
3103 unsigned long start_pfn, last_pfn;
3104 struct iova *iova;
3105 struct intel_iommu *iommu;
3106 struct page *freelist;
3107
3108 if (iommu_no_mapping(dev))
3109 return;
3110
3111 domain = find_domain(dev);
3112 BUG_ON(!domain);
3113
3114 iommu = domain_get_iommu(domain);
3115
3116 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3117 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3118 (unsigned long long)dev_addr))
3119 return;
3120
3121 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3122 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3123
3124 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3125 pci_name(pdev), start_pfn, last_pfn);
3126
3127 freelist = domain_unmap(domain, start_pfn, last_pfn);
3128
3129 if (intel_iommu_strict) {
3130 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3131 last_pfn - start_pfn + 1, !freelist, 0);
3132 /* free iova */
3133 __free_iova(&domain->iovad, iova);
3134 dma_free_pagelist(freelist);
3135 } else {
3136 add_unmap(domain, iova, freelist);
3137 /*
3138 * queue up the release of the unmap to save the roughly 1/6 of
3139 * the cpu time otherwise used up by the iotlb flush operation...
3140 */
3141 }
3142 }
3143
3144 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3145 dma_addr_t *dma_handle, gfp_t flags,
3146 struct dma_attrs *attrs)
3147 {
3148 void *vaddr;
3149 int order;
3150
3151 size = PAGE_ALIGN(size);
3152 order = get_order(size);
3153
3154 if (!iommu_no_mapping(hwdev))
3155 flags &= ~(GFP_DMA | GFP_DMA32);
3156 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3157 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3158 flags |= GFP_DMA;
3159 else
3160 flags |= GFP_DMA32;
3161 }
3162
3163 vaddr = (void *)__get_free_pages(flags, order);
3164 if (!vaddr)
3165 return NULL;
3166 memset(vaddr, 0, size);
3167
3168 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3169 DMA_BIDIRECTIONAL,
3170 hwdev->coherent_dma_mask);
3171 if (*dma_handle)
3172 return vaddr;
3173 free_pages((unsigned long)vaddr, order);
3174 return NULL;
3175 }
3176
3177 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3178 dma_addr_t dma_handle, struct dma_attrs *attrs)
3179 {
3180 int order;
3181
3182 size = PAGE_ALIGN(size);
3183 order = get_order(size);
3184
3185 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3186 free_pages((unsigned long)vaddr, order);
3187 }
3188
3189 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3190 int nelems, enum dma_data_direction dir,
3191 struct dma_attrs *attrs)
3192 {
3193 struct dmar_domain *domain;
3194 unsigned long start_pfn, last_pfn;
3195 struct iova *iova;
3196 struct intel_iommu *iommu;
3197 struct page *freelist;
3198
3199 if (iommu_no_mapping(hwdev))
3200 return;
3201
3202 domain = find_domain(hwdev);
3203 BUG_ON(!domain);
3204
3205 iommu = domain_get_iommu(domain);
3206
3207 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3208 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3209 (unsigned long long)sglist[0].dma_address))
3210 return;
3211
3212 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3213 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3214
3215 freelist = domain_unmap(domain, start_pfn, last_pfn);
3216
3217 if (intel_iommu_strict) {
3218 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3219 last_pfn - start_pfn + 1, !freelist, 0);
3220 /* free iova */
3221 __free_iova(&domain->iovad, iova);
3222 dma_free_pagelist(freelist);
3223 } else {
3224 add_unmap(domain, iova, freelist);
3225 /*
3226 * queue up the release of the unmap to save the roughly 1/6 of
3227 * the cpu time otherwise used up by the iotlb flush operation...
3228 */
3229 }
3230 }
3231
3232 static int intel_nontranslate_map_sg(struct device *hwdev,
3233 struct scatterlist *sglist, int nelems, int dir)
3234 {
3235 int i;
3236 struct scatterlist *sg;
3237
3238 for_each_sg(sglist, sg, nelems, i) {
3239 BUG_ON(!sg_page(sg));
3240 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3241 sg->dma_length = sg->length;
3242 }
3243 return nelems;
3244 }
3245
3246 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3247 enum dma_data_direction dir, struct dma_attrs *attrs)
3248 {
3249 int i;
3250 struct pci_dev *pdev = to_pci_dev(hwdev);
3251 struct dmar_domain *domain;
3252 size_t size = 0;
3253 int prot = 0;
3254 struct iova *iova = NULL;
3255 int ret;
3256 struct scatterlist *sg;
3257 unsigned long start_vpfn;
3258 struct intel_iommu *iommu;
3259
3260 BUG_ON(dir == DMA_NONE);
3261 if (iommu_no_mapping(hwdev))
3262 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3263
3264 domain = get_valid_domain_for_dev(pdev);
3265 if (!domain)
3266 return 0;
3267
3268 iommu = domain_get_iommu(domain);
3269
3270 for_each_sg(sglist, sg, nelems, i)
3271 size += aligned_nrpages(sg->offset, sg->length);
3272
3273 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3274 pdev->dma_mask);
3275 if (!iova) {
3276 sglist->dma_length = 0;
3277 return 0;
3278 }
3279
3280 /*
3281 * Check if DMAR supports zero-length reads on write only
3282 * mappings..
3283 */
3284 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3285 !cap_zlr(iommu->cap))
3286 prot |= DMA_PTE_READ;
3287 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3288 prot |= DMA_PTE_WRITE;
3289
3290 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3291
3292 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3293 if (unlikely(ret)) {
3294 /* clear the page */
3295 dma_pte_clear_range(domain, start_vpfn,
3296 start_vpfn + size - 1);
3297 /* free page tables */
3298 dma_pte_free_pagetable(domain, start_vpfn,
3299 start_vpfn + size - 1);
3300 /* free iova */
3301 __free_iova(&domain->iovad, iova);
3302 return 0;
3303 }
3304
3305 /* it's a non-present to present mapping. Only flush if caching mode */
3306 if (cap_caching_mode(iommu->cap))
3307 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3308 else
3309 iommu_flush_write_buffer(iommu);
3310
3311 return nelems;
3312 }
3313
3314 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3315 {
3316 return !dma_addr;
3317 }
3318
3319 struct dma_map_ops intel_dma_ops = {
3320 .alloc = intel_alloc_coherent,
3321 .free = intel_free_coherent,
3322 .map_sg = intel_map_sg,
3323 .unmap_sg = intel_unmap_sg,
3324 .map_page = intel_map_page,
3325 .unmap_page = intel_unmap_page,
3326 .mapping_error = intel_mapping_error,
3327 };
3328
3329 static inline int iommu_domain_cache_init(void)
3330 {
3331 int ret = 0;
3332
3333 iommu_domain_cache = kmem_cache_create("iommu_domain",
3334 sizeof(struct dmar_domain),
3335 0,
3336 SLAB_HWCACHE_ALIGN,
3338 NULL);
3339 if (!iommu_domain_cache) {
3340 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3341 ret = -ENOMEM;
3342 }
3343
3344 return ret;
3345 }
3346
3347 static inline int iommu_devinfo_cache_init(void)
3348 {
3349 int ret = 0;
3350
3351 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3352 sizeof(struct device_domain_info),
3353 0,
3354 SLAB_HWCACHE_ALIGN,
3355 NULL);
3356 if (!iommu_devinfo_cache) {
3357 printk(KERN_ERR "Couldn't create devinfo cache\n");
3358 ret = -ENOMEM;
3359 }
3360
3361 return ret;
3362 }
3363
3364 static inline int iommu_iova_cache_init(void)
3365 {
3366 int ret = 0;
3367
3368 iommu_iova_cache = kmem_cache_create("iommu_iova",
3369 sizeof(struct iova),
3370 0,
3371 SLAB_HWCACHE_ALIGN,
3372 NULL);
3373 if (!iommu_iova_cache) {
3374 printk(KERN_ERR "Couldn't create iova cache\n");
3375 ret = -ENOMEM;
3376 }
3377
3378 return ret;
3379 }
3380
3381 static int __init iommu_init_mempool(void)
3382 {
3383 int ret;
3384 ret = iommu_iova_cache_init();
3385 if (ret)
3386 return ret;
3387
3388 ret = iommu_domain_cache_init();
3389 if (ret)
3390 goto domain_error;
3391
3392 ret = iommu_devinfo_cache_init();
3393 if (!ret)
3394 return ret;
3395
3396 kmem_cache_destroy(iommu_domain_cache);
3397 domain_error:
3398 kmem_cache_destroy(iommu_iova_cache);
3399
3400 return -ENOMEM;
3401 }
3402
3403 static void __init iommu_exit_mempool(void)
3404 {
3405 kmem_cache_destroy(iommu_devinfo_cache);
3406 kmem_cache_destroy(iommu_domain_cache);
3407 kmem_cache_destroy(iommu_iova_cache);
3408
3409 }
3410
3411 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3412 {
3413 struct dmar_drhd_unit *drhd;
3414 u32 vtbar;
3415 int rc;
3416
3417 /* We know that this device on this chipset has its own IOMMU.
3418 * If we find it under a different IOMMU, then the BIOS is lying
3419 * to us. Hope that the IOMMU for this device is actually
3420 * disabled, and it needs no translation...
3421 */
3422 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3423 if (rc) {
3424 /* "can't" happen */
3425 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3426 return;
3427 }
3428 vtbar &= 0xffff0000;
3429
3430 /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3431 drhd = dmar_find_matched_drhd_unit(pdev);
3432 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3433 TAINT_FIRMWARE_WORKAROUND,
3434 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3435 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3436 }
3437 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3438
3439 static void __init init_no_remapping_devices(void)
3440 {
3441 struct dmar_drhd_unit *drhd;
3442 struct device *dev;
3443 int i;
3444
3445 for_each_drhd_unit(drhd) {
3446 if (!drhd->include_all) {
3447 for_each_active_dev_scope(drhd->devices,
3448 drhd->devices_cnt, i, dev)
3449 break;
3450 /* ignore DMAR unit if no devices exist */
3451 if (i == drhd->devices_cnt)
3452 drhd->ignored = 1;
3453 }
3454 }
3455
3456 for_each_active_drhd_unit(drhd) {
3457 if (drhd->include_all)
3458 continue;
3459
3460 for_each_active_dev_scope(drhd->devices,
3461 drhd->devices_cnt, i, dev)
3462 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3463 break;
3464 if (i < drhd->devices_cnt)
3465 continue;
3466
3467 /* This IOMMU has *only* gfx devices. Either bypass it or
3468 set the gfx_mapped flag, as appropriate */
3469 if (dmar_map_gfx) {
3470 intel_iommu_gfx_mapped = 1;
3471 } else {
3472 drhd->ignored = 1;
3473 for_each_active_dev_scope(drhd->devices,
3474 drhd->devices_cnt, i, dev)
3475 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3476 }
3477 }
3478 }
3479
3480 #ifdef CONFIG_SUSPEND
3481 static int init_iommu_hw(void)
3482 {
3483 struct dmar_drhd_unit *drhd;
3484 struct intel_iommu *iommu = NULL;
3485
3486 for_each_active_iommu(iommu, drhd)
3487 if (iommu->qi)
3488 dmar_reenable_qi(iommu);
3489
3490 for_each_iommu(iommu, drhd) {
3491 if (drhd->ignored) {
3492 /*
3493 * we always have to disable PMRs or DMA may fail on
3494 * this device
3495 */
3496 if (force_on)
3497 iommu_disable_protect_mem_regions(iommu);
3498 continue;
3499 }
3500
3501 iommu_flush_write_buffer(iommu);
3502
3503 iommu_set_root_entry(iommu);
3504
3505 iommu->flush.flush_context(iommu, 0, 0, 0,
3506 DMA_CCMD_GLOBAL_INVL);
3507 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3508 DMA_TLB_GLOBAL_FLUSH);
3509 if (iommu_enable_translation(iommu))
3510 return 1;
3511 iommu_disable_protect_mem_regions(iommu);
3512 }
3513
3514 return 0;
3515 }
3516
3517 static void iommu_flush_all(void)
3518 {
3519 struct dmar_drhd_unit *drhd;
3520 struct intel_iommu *iommu;
3521
3522 for_each_active_iommu(iommu, drhd) {
3523 iommu->flush.flush_context(iommu, 0, 0, 0,
3524 DMA_CCMD_GLOBAL_INVL);
3525 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3526 DMA_TLB_GLOBAL_FLUSH);
3527 }
3528 }
3529
3530 static int iommu_suspend(void)
3531 {
3532 struct dmar_drhd_unit *drhd;
3533 struct intel_iommu *iommu = NULL;
3534 unsigned long flag;
3535
3536 for_each_active_iommu(iommu, drhd) {
3537 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3538 GFP_ATOMIC);
3539 if (!iommu->iommu_state)
3540 goto nomem;
3541 }
3542
3543 iommu_flush_all();
3544
3545 for_each_active_iommu(iommu, drhd) {
3546 iommu_disable_translation(iommu);
3547
3548 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3549
3550 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3551 readl(iommu->reg + DMAR_FECTL_REG);
3552 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3553 readl(iommu->reg + DMAR_FEDATA_REG);
3554 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3555 readl(iommu->reg + DMAR_FEADDR_REG);
3556 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3557 readl(iommu->reg + DMAR_FEUADDR_REG);
3558
3559 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3560 }
3561 return 0;
3562
3563 nomem:
3564 for_each_active_iommu(iommu, drhd)
3565 kfree(iommu->iommu_state);
3566
3567 return -ENOMEM;
3568 }
3569
3570 static void iommu_resume(void)
3571 {
3572 struct dmar_drhd_unit *drhd;
3573 struct intel_iommu *iommu = NULL;
3574 unsigned long flag;
3575
3576 if (init_iommu_hw()) {
3577 if (force_on)
3578 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3579 else
3580 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3581 return;
3582 }
3583
3584 for_each_active_iommu(iommu, drhd) {
3585
3586 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3587
3588 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3589 iommu->reg + DMAR_FECTL_REG);
3590 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3591 iommu->reg + DMAR_FEDATA_REG);
3592 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3593 iommu->reg + DMAR_FEADDR_REG);
3594 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3595 iommu->reg + DMAR_FEUADDR_REG);
3596
3597 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3598 }
3599
3600 for_each_active_iommu(iommu, drhd)
3601 kfree(iommu->iommu_state);
3602 }
3603
3604 static struct syscore_ops iommu_syscore_ops = {
3605 .resume = iommu_resume,
3606 .suspend = iommu_suspend,
3607 };
3608
3609 static void __init init_iommu_pm_ops(void)
3610 {
3611 register_syscore_ops(&iommu_syscore_ops);
3612 }
3613
3614 #else
3615 static inline void init_iommu_pm_ops(void) {}
3616 #endif /* CONFIG_SUSPEND */
3617
3618
3619 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3620 {
3621 struct acpi_dmar_reserved_memory *rmrr;
3622 struct dmar_rmrr_unit *rmrru;
3623
3624 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3625 if (!rmrru)
3626 return -ENOMEM;
3627
3628 rmrru->hdr = header;
3629 rmrr = (struct acpi_dmar_reserved_memory *)header;
3630 rmrru->base_address = rmrr->base_address;
3631 rmrru->end_address = rmrr->end_address;
3632 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3633 ((void *)rmrr) + rmrr->header.length,
3634 &rmrru->devices_cnt);
3635 if (rmrru->devices_cnt && rmrru->devices == NULL) {
3636 kfree(rmrru);
3637 return -ENOMEM;
3638 }
3639
3640 list_add(&rmrru->list, &dmar_rmrr_units);
3641
3642 return 0;
3643 }
3644
3645 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3646 {
3647 struct acpi_dmar_atsr *atsr;
3648 struct dmar_atsr_unit *atsru;
3649
3650 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3651 atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3652 if (!atsru)
3653 return -ENOMEM;
3654
3655 atsru->hdr = hdr;
3656 atsru->include_all = atsr->flags & 0x1;
3657 if (!atsru->include_all) {
3658 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3659 (void *)atsr + atsr->header.length,
3660 &atsru->devices_cnt);
3661 if (atsru->devices_cnt && atsru->devices == NULL) {
3662 kfree(atsru);
3663 return -ENOMEM;
3664 }
3665 }
3666
3667 list_add_rcu(&atsru->list, &dmar_atsr_units);
3668
3669 return 0;
3670 }
3671
3672 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3673 {
3674 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3675 kfree(atsru);
3676 }
3677
3678 static void intel_iommu_free_dmars(void)
3679 {
3680 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3681 struct dmar_atsr_unit *atsru, *atsr_n;
3682
3683 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3684 list_del(&rmrru->list);
3685 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3686 kfree(rmrru);
3687 }
3688
3689 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3690 list_del(&atsru->list);
3691 intel_iommu_free_atsr(atsru);
3692 }
3693 }
3694
3695 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3696 {
3697 int i, ret = 1;
3698 struct pci_bus *bus;
3699 struct pci_dev *bridge = NULL;
3700 struct device *tmp;
3701 struct acpi_dmar_atsr *atsr;
3702 struct dmar_atsr_unit *atsru;
3703
3704 dev = pci_physfn(dev);
3705 for (bus = dev->bus; bus; bus = bus->parent) {
3706 bridge = bus->self;
3707 if (!bridge || !pci_is_pcie(bridge) ||
3708 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3709 return 0;
3710 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3711 break;
3712 }
3713 if (!bridge)
3714 return 0;
3715
3716 rcu_read_lock();
3717 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3718 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3719 if (atsr->segment != pci_domain_nr(dev->bus))
3720 continue;
3721
3722 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3723 if (tmp == &bridge->dev)
3724 goto out;
3725
3726 if (atsru->include_all)
3727 goto out;
3728 }
3729 ret = 0;
3730 out:
3731 rcu_read_unlock();
3732
3733 return ret;
3734 }
3735
3736 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3737 {
3738 int ret = 0;
3739 struct dmar_rmrr_unit *rmrru;
3740 struct dmar_atsr_unit *atsru;
3741 struct acpi_dmar_atsr *atsr;
3742 struct acpi_dmar_reserved_memory *rmrr;
3743
3744 if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3745 return 0;
3746
3747 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3748 rmrr = container_of(rmrru->hdr,
3749 struct acpi_dmar_reserved_memory, header);
3750 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3751 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3752 ((void *)rmrr) + rmrr->header.length,
3753 rmrr->segment, rmrru->devices,
3754 rmrru->devices_cnt);
3755 if (ret > 0)
3756 break;
3757 else if (ret < 0)
3758 return ret;
3759 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3760 if (dmar_remove_dev_scope(info, rmrr->segment,
3761 rmrru->devices, rmrru->devices_cnt))
3762 break;
3763 }
3764 }
3765
3766 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3767 if (atsru->include_all)
3768 continue;
3769
3770 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3771 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3772 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3773 (void *)atsr + atsr->header.length,
3774 atsr->segment, atsru->devices,
3775 atsru->devices_cnt);
3776 if (ret > 0)
3777 break;
3778 else if (ret < 0)
3779 return ret;
3780 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3781 if (dmar_remove_dev_scope(info, atsr->segment,
3782 atsru->devices, atsru->devices_cnt))
3783 break;
3784 }
3785 }
3786
3787 return 0;
3788 }
3789
3790 /*
3791 * Here we only respond to a device being unbound from its driver.
3792 *
3793 * A newly added device is not attached to its DMAR domain here yet; that
3794 * happens when the device is first mapped to an iova.
3795 */
3796 static int device_notifier(struct notifier_block *nb,
3797 unsigned long action, void *data)
3798 {
3799 struct device *dev = data;
3800 struct pci_dev *pdev = to_pci_dev(dev);
3801 struct dmar_domain *domain;
3802
3803 if (iommu_dummy(dev))
3804 return 0;
3805
3806 if (action != BUS_NOTIFY_UNBOUND_DRIVER &&
3807 action != BUS_NOTIFY_DEL_DEVICE)
3808 return 0;
3809
3810 domain = find_domain(dev);
3811 if (!domain)
3812 return 0;
3813
3814 down_read(&dmar_global_lock);
3815 domain_remove_one_dev_info(domain, pdev);
3816 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3817 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3818 list_empty(&domain->devices))
3819 domain_exit(domain);
3820 up_read(&dmar_global_lock);
3821
3822 return 0;
3823 }
3824
3825 static struct notifier_block device_nb = {
3826 .notifier_call = device_notifier,
3827 };
3828
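/*
 * Keep the static identity domain in sync with memory hotplug: newly
 * onlined ranges are identity-mapped, and offlined ranges are unmapped
 * with their IOTLB entries invalidated on every active IOMMU.
 */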
3829 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3830 unsigned long val, void *v)
3831 {
3832 struct memory_notify *mhp = v;
3833 unsigned long long start, end;
3834 unsigned long start_vpfn, last_vpfn;
3835
3836 switch (val) {
3837 case MEM_GOING_ONLINE:
3838 start = mhp->start_pfn << PAGE_SHIFT;
3839 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
3840 if (iommu_domain_identity_map(si_domain, start, end)) {
3841 pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
3842 start, end);
3843 return NOTIFY_BAD;
3844 }
3845 break;
3846
3847 case MEM_OFFLINE:
3848 case MEM_CANCEL_ONLINE:
3849 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3850 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
3851 while (start_vpfn <= last_vpfn) {
3852 struct iova *iova;
3853 struct dmar_drhd_unit *drhd;
3854 struct intel_iommu *iommu;
3855 struct page *freelist;
3856
3857 iova = find_iova(&si_domain->iovad, start_vpfn);
3858 if (iova == NULL) {
3859 pr_debug("dmar: failed get IOVA for PFN %lx\n",
3860 start_vpfn);
3861 break;
3862 }
3863
3864 iova = split_and_remove_iova(&si_domain->iovad, iova,
3865 start_vpfn, last_vpfn);
3866 if (iova == NULL) {
3867 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
3868 start_vpfn, last_vpfn);
3869 return NOTIFY_BAD;
3870 }
3871
3872 freelist = domain_unmap(si_domain, iova->pfn_lo,
3873 iova->pfn_hi);
3874
3875 rcu_read_lock();
3876 for_each_active_iommu(iommu, drhd)
3877 iommu_flush_iotlb_psi(iommu, si_domain->id,
3878 iova->pfn_lo,
3879 iova->pfn_hi - iova->pfn_lo + 1,
3880 !freelist, 0);
3881 rcu_read_unlock();
3882 dma_free_pagelist(freelist);
3883
3884 start_vpfn = iova->pfn_hi + 1;
3885 free_iova_mem(iova);
3886 }
3887 break;
3888 }
3889
3890 return NOTIFY_OK;
3891 }
3892
3893 static struct notifier_block intel_iommu_memory_nb = {
3894 .notifier_call = intel_iommu_memory_notifier,
3895 .priority = 0
3896 };
3897
3898 int __init intel_iommu_init(void)
3899 {
3900 int ret = -ENODEV;
3901 struct dmar_drhd_unit *drhd;
3902 struct intel_iommu *iommu;
3903
3904 /* VT-d is required for a TXT/tboot launch, so enforce that */
3905 force_on = tboot_force_iommu();
3906
3907 if (iommu_init_mempool()) {
3908 if (force_on)
3909 panic("tboot: Failed to initialize iommu memory\n");
3910 return -ENOMEM;
3911 }
3912
3913 down_write(&dmar_global_lock);
3914 if (dmar_table_init()) {
3915 if (force_on)
3916 panic("tboot: Failed to initialize DMAR table\n");
3917 goto out_free_dmar;
3918 }
3919
3920 /*
3921 * Disable translation if already enabled prior to OS handover.
3922 */
3923 for_each_active_iommu(iommu, drhd)
3924 if (iommu->gcmd & DMA_GCMD_TE)
3925 iommu_disable_translation(iommu);
3926
3927 if (dmar_dev_scope_init() < 0) {
3928 if (force_on)
3929 panic("tboot: Failed to initialize DMAR device scope\n");
3930 goto out_free_dmar;
3931 }
3932
3933 if (no_iommu || dmar_disabled)
3934 goto out_free_dmar;
3935
3936 if (list_empty(&dmar_rmrr_units))
3937 printk(KERN_INFO "DMAR: No RMRR found\n");
3938
3939 if (list_empty(&dmar_atsr_units))
3940 printk(KERN_INFO "DMAR: No ATSR found\n");
3941
3942 if (dmar_init_reserved_ranges()) {
3943 if (force_on)
3944 panic("tboot: Failed to reserve iommu ranges\n");
3945 goto out_free_reserved_range;
3946 }
3947
3948 init_no_remapping_devices();
3949
3950 ret = init_dmars();
3951 if (ret) {
3952 if (force_on)
3953 panic("tboot: Failed to initialize DMARs\n");
3954 printk(KERN_ERR "IOMMU: dmar init failed\n");
3955 goto out_free_reserved_range;
3956 }
3957 up_write(&dmar_global_lock);
3958 printk(KERN_INFO
3959 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3960
3961 init_timer(&unmap_timer);
3962 #ifdef CONFIG_SWIOTLB
3963 swiotlb = 0;
3964 #endif
3965 dma_ops = &intel_dma_ops;
3966
3967 init_iommu_pm_ops();
3968
3969 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3970 bus_register_notifier(&pci_bus_type, &device_nb);
3971 if (si_domain && !hw_pass_through)
3972 register_memory_notifier(&intel_iommu_memory_nb);
3973
3974 intel_iommu_enabled = 1;
3975
3976 return 0;
3977
3978 out_free_reserved_range:
3979 put_iova_domain(&reserved_iova_list);
3980 out_free_dmar:
3981 intel_iommu_free_dmars();
3982 up_write(&dmar_global_lock);
3983 iommu_exit_mempool();
3984 return ret;
3985 }
3986
3987 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3988 struct device *dev)
3989 {
3990 struct pci_dev *tmp, *parent, *pdev;
3991
3992 if (!iommu || !dev || !dev_is_pci(dev))
3993 return;
3994
3995 pdev = to_pci_dev(dev);
3996
3997 /* dependent device detach */
3998 tmp = pci_find_upstream_pcie_bridge(pdev);
3999 /* Secondary interface's bus number and devfn 0 */
4000 if (tmp) {
4001 parent = pdev->bus->self;
4002 while (parent != tmp) {
4003 iommu_detach_dev(iommu, parent->bus->number,
4004 parent->devfn);
4005 parent = parent->bus->self;
4006 }
4007 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
4008 iommu_detach_dev(iommu,
4009 tmp->subordinate->number, 0);
4010 else /* this is a legacy PCI bridge */
4011 iommu_detach_dev(iommu, tmp->bus->number,
4012 tmp->devfn);
4013 }
4014 }
4015
4016 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4017 struct pci_dev *pdev)
4018 {
4019 struct device_domain_info *info, *tmp;
4020 struct intel_iommu *iommu;
4021 unsigned long flags;
4022 int found = 0;
4023
4024 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4025 pdev->devfn);
4026 if (!iommu)
4027 return;
4028
4029 spin_lock_irqsave(&device_domain_lock, flags);
4030 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4031 if (info->iommu->segment == pci_domain_nr(pdev->bus) &&
4032 info->bus == pdev->bus->number &&
4033 info->devfn == pdev->devfn) {
4034 unlink_domain_info(info);
4035 spin_unlock_irqrestore(&device_domain_lock, flags);
4036
4037 iommu_disable_dev_iotlb(info);
4038 iommu_detach_dev(iommu, info->bus, info->devfn);
4039 iommu_detach_dependent_devices(iommu, &pdev->dev);
4040 free_devinfo_mem(info);
4041
4042 spin_lock_irqsave(&device_domain_lock, flags);
4043
4044 if (found)
4045 break;
4046 else
4047 continue;
4048 }
4049
4050 /* if there are no other devices under the same iommu
4051 * owned by this domain, clear this iommu in iommu_bmp and
4052 * update the iommu count and coherency
4053 */
4054 if (info->iommu == iommu)
4055 found = 1;
4056 }
4057
4058 spin_unlock_irqrestore(&device_domain_lock, flags);
4059
4060 if (found == 0) {
4061 unsigned long tmp_flags;
4062 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
4063 clear_bit(iommu->seq_id, domain->iommu_bmp);
4064 domain->iommu_count--;
4065 domain_update_iommu_cap(domain);
4066 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
4067
4068 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
4069 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
4070 spin_lock_irqsave(&iommu->lock, tmp_flags);
4071 clear_bit(domain->id, iommu->domain_ids);
4072 iommu->domains[domain->id] = NULL;
4073 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
4074 }
4075 }
4076 }
4077
4078 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4079 {
4080 int adjust_width;
4081
4082 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
4083 domain_reserve_special_ranges(domain);
4084
4085 /* calculate AGAW */
4086 domain->gaw = guest_width;
4087 adjust_width = guestwidth_to_adjustwidth(guest_width);
4088 domain->agaw = width_to_agaw(adjust_width);
4089
4090 domain->iommu_coherency = 0;
4091 domain->iommu_snooping = 0;
4092 domain->iommu_superpage = 0;
4093 domain->max_addr = 0;
4094 domain->nid = -1;
4095
4096 /* always allocate the top pgd */
4097 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4098 if (!domain->pgd)
4099 return -ENOMEM;
4100 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4101 return 0;
4102 }
4103
4104 static int intel_iommu_domain_init(struct iommu_domain *domain)
4105 {
4106 struct dmar_domain *dmar_domain;
4107
4108 dmar_domain = alloc_domain(true);
4109 if (!dmar_domain) {
4110 printk(KERN_ERR
4111 "intel_iommu_domain_init: dmar_domain == NULL\n");
4112 return -ENOMEM;
4113 }
4114 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4115 printk(KERN_ERR
4116 "intel_iommu_domain_init() failed\n");
4117 domain_exit(dmar_domain);
4118 return -ENOMEM;
4119 }
4120 domain_update_iommu_cap(dmar_domain);
4121 domain->priv = dmar_domain;
4122
4123 domain->geometry.aperture_start = 0;
4124 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4125 domain->geometry.force_aperture = true;
4126
4127 return 0;
4128 }
4129
4130 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4131 {
4132 struct dmar_domain *dmar_domain = domain->priv;
4133
4134 domain->priv = NULL;
4135 domain_exit(dmar_domain);
4136 }
4137
4138 static int intel_iommu_attach_device(struct iommu_domain *domain,
4139 struct device *dev)
4140 {
4141 struct dmar_domain *dmar_domain = domain->priv;
4142 struct pci_dev *pdev = to_pci_dev(dev);
4143 struct intel_iommu *iommu;
4144 int addr_width;
4145
4146 /* normally pdev is not mapped */
4147 if (unlikely(domain_context_mapped(pdev))) {
4148 struct dmar_domain *old_domain;
4149
4150 old_domain = find_domain(dev);
4151 if (old_domain) {
4152 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4153 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4154 domain_remove_one_dev_info(old_domain, pdev);
4155 else
4156 domain_remove_dev_info(old_domain);
4157 }
4158 }
4159
4160 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4161 pdev->devfn);
4162 if (!iommu)
4163 return -ENODEV;
4164
4165 /* check if this iommu agaw is sufficient for max mapped address */
4166 addr_width = agaw_to_width(iommu->agaw);
4167 if (addr_width > cap_mgaw(iommu->cap))
4168 addr_width = cap_mgaw(iommu->cap);
4169
4170 if (dmar_domain->max_addr > (1LL << addr_width)) {
4171 printk(KERN_ERR "%s: iommu width (%d) is not "
4172 "sufficient for the mapped address (%llx)\n",
4173 __func__, addr_width, dmar_domain->max_addr);
4174 return -EFAULT;
4175 }
4176 dmar_domain->gaw = addr_width;
4177
4178 /*
4179 * Knock out extra levels of page tables if necessary
4180 */
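/*
 * e.g. a domain built with four-level tables (48-bit address width)
 * attached to an IOMMU that only supports three levels drops its top
 * table and continues from that table's first entry, repeating until
 * the two AGAWs agree.
 */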
4181 while (iommu->agaw < dmar_domain->agaw) {
4182 struct dma_pte *pte;
4183
4184 pte = dmar_domain->pgd;
4185 if (dma_pte_present(pte)) {
4186 dmar_domain->pgd = (struct dma_pte *)
4187 phys_to_virt(dma_pte_addr(pte));
4188 free_pgtable_page(pte);
4189 }
4190 dmar_domain->agaw--;
4191 }
4192
4193 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4194 }
4195
4196 static void intel_iommu_detach_device(struct iommu_domain *domain,
4197 struct device *dev)
4198 {
4199 struct dmar_domain *dmar_domain = domain->priv;
4200 struct pci_dev *pdev = to_pci_dev(dev);
4201
4202 domain_remove_one_dev_info(dmar_domain, pdev);
4203 }
4204
4205 static int intel_iommu_map(struct iommu_domain *domain,
4206 unsigned long iova, phys_addr_t hpa,
4207 size_t size, int iommu_prot)
4208 {
4209 struct dmar_domain *dmar_domain = domain->priv;
4210 u64 max_addr;
4211 int prot = 0;
4212 int ret;
4213
4214 if (iommu_prot & IOMMU_READ)
4215 prot |= DMA_PTE_READ;
4216 if (iommu_prot & IOMMU_WRITE)
4217 prot |= DMA_PTE_WRITE;
4218 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4219 prot |= DMA_PTE_SNP;
4220
4221 max_addr = iova + size;
4222 if (dmar_domain->max_addr < max_addr) {
4223 u64 end;
4224
4225 /* check if minimum agaw is sufficient for mapped address */
4226 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4227 if (end < max_addr) {
4228 printk(KERN_ERR "%s: iommu width (%d) is not "
4229 "sufficient for the mapped address (%llx)\n",
4230 __func__, dmar_domain->gaw, max_addr);
4231 return -EFAULT;
4232 }
4233 dmar_domain->max_addr = max_addr;
4234 }
4235 /* Round up size to next multiple of PAGE_SIZE, if it and
4236 the low bits of hpa would take us onto the next page */
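/*
 * Illustrative example (assuming aligned_nrpages() counts the 4KiB
 * pages covered by [hpa, hpa + size)): hpa = 0x1800 and size = 0x1000
 * touch bytes 0x1800-0x27ff, i.e. two pages, so two PTEs are written.
 */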
4237 size = aligned_nrpages(hpa, size);
4238 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4239 hpa >> VTD_PAGE_SHIFT, size, prot);
4240 return ret;
4241 }
4242
4243 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4244 unsigned long iova, size_t size)
4245 {
4246 struct dmar_domain *dmar_domain = domain->priv;
4247 struct page *freelist = NULL;
4248 struct intel_iommu *iommu;
4249 unsigned long start_pfn, last_pfn;
4250 unsigned int npages;
4251 int iommu_id, num, ndomains, level = 0;
4252
4253 /* Cope with the horrid API, which requires us to unmap more than
4254 the size argument if the IOVA happens to fall within a large-page mapping. */
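/*
 * e.g. a request to unmap a single 4KiB page that falls inside a 2MiB
 * superpage mapping is widened below: size is bumped to 2MiB, the whole
 * superpage is unmapped, and the enlarged size is returned to the caller.
 */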
4255 if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4256 BUG();
4257
4258 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4259 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4260
4261 start_pfn = iova >> VTD_PAGE_SHIFT;
4262 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4263
4264 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4265
4266 npages = last_pfn - start_pfn + 1;
4267
4268 for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4269 iommu = g_iommus[iommu_id];
4270
4271 /*
4272 * find the domain id (bit position) used for dmar_domain on this iommu
4273 */
4274 ndomains = cap_ndoms(iommu->cap);
4275 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4276 if (iommu->domains[num] == dmar_domain)
4277 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4278 npages, !freelist, 0);
4279 }
4280
4281 }
4282
4283 dma_free_pagelist(freelist);
4284
4285 if (dmar_domain->max_addr == iova + size)
4286 dmar_domain->max_addr = iova;
4287
4288 return size;
4289 }
4290
4291 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4292 dma_addr_t iova)
4293 {
4294 struct dmar_domain *dmar_domain = domain->priv;
4295 struct dma_pte *pte;
4296 int level = 0;
4297 u64 phys = 0;
4298
4299 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4300 if (pte)
4301 phys = dma_pte_addr(pte);
4302
4303 return phys;
4304 }
4305
4306 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4307 unsigned long cap)
4308 {
4309 struct dmar_domain *dmar_domain = domain->priv;
4310
4311 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4312 return dmar_domain->iommu_snooping;
4313 if (cap == IOMMU_CAP_INTR_REMAP)
4314 return irq_remapping_enabled;
4315
4316 return 0;
4317 }
4318
4319 #define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4320
4321 static int intel_iommu_add_device(struct device *dev)
4322 {
4323 struct pci_dev *pdev = to_pci_dev(dev);
4324 struct pci_dev *bridge, *dma_pdev = NULL;
4325 struct iommu_group *group;
4326 int ret;
4327
4328 if (!device_to_iommu(pci_domain_nr(pdev->bus),
4329 pdev->bus->number, pdev->devfn))
4330 return -ENODEV;
4331
4332 bridge = pci_find_upstream_pcie_bridge(pdev);
4333 if (bridge) {
4334 if (pci_is_pcie(bridge))
4335 dma_pdev = pci_get_domain_bus_and_slot(
4336 pci_domain_nr(pdev->bus),
4337 bridge->subordinate->number, 0);
4338 if (!dma_pdev)
4339 dma_pdev = pci_dev_get(bridge);
4340 } else
4341 dma_pdev = pci_dev_get(pdev);
4342
4343 /* Account for quirked devices */
4344 swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4345
4346 /*
4347 * If it's a multifunction device that does not support our
4348 * required ACS flags, add it to the same group as the lowest-
4349 * numbered function that also does not support the required ACS flags.
4350 */
4351 if (dma_pdev->multifunction &&
4352 !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4353 u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4354
4355 for (i = 0; i < 8; i++) {
4356 struct pci_dev *tmp;
4357
4358 tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4359 if (!tmp)
4360 continue;
4361
4362 if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4363 swap_pci_ref(&dma_pdev, tmp);
4364 break;
4365 }
4366 pci_dev_put(tmp);
4367 }
4368 }
4369
4370 /*
4371 * Devices on the root bus go through the iommu. If that's not us,
4372 * find the next upstream device and test ACS up to the root bus.
4373 * Finding the next device may require skipping virtual buses.
4374 */
4375 while (!pci_is_root_bus(dma_pdev->bus)) {
4376 struct pci_bus *bus = dma_pdev->bus;
4377
4378 while (!bus->self) {
4379 if (!pci_is_root_bus(bus))
4380 bus = bus->parent;
4381 else
4382 goto root_bus;
4383 }
4384
4385 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4386 break;
4387
4388 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4389 }
4390
4391 root_bus:
4392 group = iommu_group_get(&dma_pdev->dev);
4393 pci_dev_put(dma_pdev);
4394 if (!group) {
4395 group = iommu_group_alloc();
4396 if (IS_ERR(group))
4397 return PTR_ERR(group);
4398 }
4399
4400 ret = iommu_group_add_device(group, dev);
4401
4402 iommu_group_put(group);
4403 return ret;
4404 }
4405
4406 static void intel_iommu_remove_device(struct device *dev)
4407 {
4408 iommu_group_remove_device(dev);
4409 }
4410
4411 static struct iommu_ops intel_iommu_ops = {
4412 .domain_init = intel_iommu_domain_init,
4413 .domain_destroy = intel_iommu_domain_destroy,
4414 .attach_dev = intel_iommu_attach_device,
4415 .detach_dev = intel_iommu_detach_device,
4416 .map = intel_iommu_map,
4417 .unmap = intel_iommu_unmap,
4418 .iova_to_phys = intel_iommu_iova_to_phys,
4419 .domain_has_cap = intel_iommu_domain_has_cap,
4420 .add_device = intel_iommu_add_device,
4421 .remove_device = intel_iommu_remove_device,
4422 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4423 };
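/*
 * Editor's illustrative sketch, not part of the driver: how an external
 * kernel consumer would exercise the callbacks wired up above through
 * the generic IOMMU API (<linux/iommu.h>, already included by this file).
 * The device, IOVA and physical address below are made-up assumptions;
 * paddr is assumed to be page aligned so the iova_to_phys() round trip
 * compares equal.
 */
#if 0
static int example_use_intel_iommu(struct pci_dev *pdev, phys_addr_t paddr)
{
	struct iommu_domain *domain;
	const unsigned long iova = 0x100000;	/* arbitrary, page-aligned IOVA */
	int ret;

	/* ends up in intel_iommu_domain_init() */
	domain = iommu_domain_alloc(&pci_bus_type);
	if (!domain)
		return -ENOMEM;

	/* ends up in intel_iommu_attach_device() */
	ret = iommu_attach_device(domain, &pdev->dev);
	if (ret)
		goto out_free;

	/* ends up in intel_iommu_map(): one 4KiB page, read/write */
	ret = iommu_map(domain, iova, paddr, VTD_PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	/* ends up in intel_iommu_iova_to_phys() */
	WARN_ON(iommu_iova_to_phys(domain, iova) != paddr);

	/* ends up in intel_iommu_unmap() */
	iommu_unmap(domain, iova, VTD_PAGE_SIZE);

out_detach:
	iommu_detach_device(domain, &pdev->dev);
out_free:
	iommu_domain_free(domain);
	return ret;
}
#endif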
4424
4425 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4426 {
4427 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4428 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4429 dmar_map_gfx = 0;
4430 }
4431
4432 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4433 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4434 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4435 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4436 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4437 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4438 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4439
4440 static void quirk_iommu_rwbf(struct pci_dev *dev)
4441 {
4442 /*
4443 * Mobile 4 Series Chipset neglects to set RWBF capability,
4444 * but needs it. Same seems to hold for the desktop versions.
4445 */
4446 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4447 rwbf_quirk = 1;
4448 }
4449
4450 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4451 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4452 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4453 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4454 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4455 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4456 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4457
4458 #define GGC 0x52
4459 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4460 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4461 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4462 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4463 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4464 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4465 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4466 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4467
4468 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4469 {
4470 unsigned short ggc;
4471
4472 if (pci_read_config_word(dev, GGC, &ggc))
4473 return;
4474
4475 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4476 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4477 dmar_map_gfx = 0;
4478 } else if (dmar_map_gfx) {
4479 /* we have to ensure the gfx device is idle before we flush */
4480 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4481 intel_iommu_strict = 1;
4482 }
4483 }
4484 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4485 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4486 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4487 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4488
4489 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4490 ISOCH DMAR unit for the Azalia sound device, but not give it any
4491 TLB entries, which causes it to deadlock. Check for that. We do
4492 this in a function called from init_dmars(), instead of in a PCI
4493 quirk, because we don't want to print the obnoxious "BIOS broken"
4494 message if VT-d is actually disabled.
4495 */
4496 static void __init check_tylersburg_isoch(void)
4497 {
4498 struct pci_dev *pdev;
4499 uint32_t vtisochctrl;
4500
4501 /* If there's no Azalia in the system anyway, forget it. */
4502 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4503 if (!pdev)
4504 return;
4505 pci_dev_put(pdev);
4506
4507 /* System Management Registers. Might be hidden, in which case
4508 we can't do the sanity check. But that's OK, because the
4509 known-broken BIOSes _don't_ actually hide it, so far. */
4510 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4511 if (!pdev)
4512 return;
4513
4514 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4515 pci_dev_put(pdev);
4516 return;
4517 }
4518
4519 pci_dev_put(pdev);
4520
4521 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4522 if (vtisochctrl & 1)
4523 return;
4524
4525 /* Drop all bits other than the number of TLB entries */
4526 vtisochctrl &= 0x1c;
4527
4528 /* If we have the recommended number of TLB entries (16), fine. */
4529 if (vtisochctrl == 0x10)
4530 return;
4531
4532 /* Zero TLB entries? You get to ride the short bus to school. */
4533 if (!vtisochctrl) {
4534 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4535 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4536 dmi_get_system_info(DMI_BIOS_VENDOR),
4537 dmi_get_system_info(DMI_BIOS_VERSION),
4538 dmi_get_system_info(DMI_PRODUCT_VERSION));
4539 iommu_identity_mapping |= IDENTMAP_AZALIA;
4540 return;
4541 }
4542
4543 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4544 vtisochctrl);
4545 }